<h1>Segmenting and Clustering Neighborhoods in Toronto</h1>

In [1]:
# importing important libraries

#Data handling imports
import numpy as np
import pandas as pd

#these libraries are used for webPage scraping purpose
import urllib.request
from bs4 import BeautifulSoup
import requests

# tranform JSON file into a pandas dataframe
from pandas.io.json import json_normalize 

<h3>1. Scraping the data from Wikipedia and Exploring the dataset</h3>

In [2]:
# URL of the Wikipedia page that lists the postal codes to scrape
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page and parse it; the context manager closes the HTTP
# response as soon as BeautifulSoup has consumed the HTML
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, 'lxml')

# `find` returns the first <table> whose class is 'wikitable sortable' (or None)
pc_table = soup.find('table', class_='wikitable sortable')
if pc_table is None:
    raise ValueError("No 'wikitable sortable' table found at {}".format(url))

# Three parallel lists holding the first, second and third cell of every kept row
pc_list = [[], [], []]

# Keep only the rows that have exactly three <td> cells; other rows are skipped
for row in pc_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == 3:
        for column_values, cell in zip(pc_list, cells):
            # First text node of the cell with surrounding whitespace/newlines removed;
            # a cell without any text node is stored as an empty string
            first_text = cell.find(string=True)
            column_values.append(first_text.strip() if first_text is not None else '')

In [3]:
# Assemble the scraped lists into a DataFrame, one column per list
columnNames = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=columnNames)
for column_name, scraped_values in zip(columnNames, pc_list):
    df[column_name] = scraped_values

df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


The DataFrame has many rows whose Borough is 'Not assigned'; those rows also have an empty Neighborhood.

In [4]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
df

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
160,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


In [5]:
# Check whether any postal code appears on more than one row
postal_code_repeated = df.duplicated(subset='PostalCode')
postal_code_repeated.value_counts()

False    103
dtype: int64

This shows that no postal code appears on more than one row.

In [6]:
# List the rows whose Neighborhood was scraped as an empty string
neighborhood_missing = df['Neighborhood'] == ''
df[neighborhood_missing]

Unnamed: 0,PostalCode,Borough,Neighborhood


This shows there are no null values in the Neighborhood column, which means there are no Boroughs with unassigned Neighborhood

In [7]:
# Load the postal-code coordinates from the CSV file in the working directory
geospatial_df = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
geospatial_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [8]:
# Order both frames by postal code (ascending is the default)
df = df.sort_values(by='PostalCode')
geospatial_df = geospatial_df.sort_values(by='Postal Code')

In [9]:
# Attach latitude/longitude by matching on the postal code itself instead of on
# row position: with positional pairing, a single missing or extra row in either
# frame silently leaves every following row without coordinates.
# Duplicate postal codes in the lookup are collapsed to their first occurrence
# so that the lookup index is unique.
coords_by_postal_code = (
    geospatial_df
    .drop_duplicates(subset='Postal Code')
    .set_index('Postal Code')
)

# Postal codes that are absent from the lookup end up as NaN
df['Latitude'] = df['PostalCode'].map(coords_by_postal_code['Latitude'])
df['Longitude'] = df['PostalCode'].map(coords_by_postal_code['Longitude'])
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
9,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
18,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
27,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
36,M1G,Scarborough,Woburn,43.770992,-79.216917
45,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
107,M9P,Etobicoke,Westmount,43.696319,-79.532242
116,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.688905,-79.554724
143,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,43.739416,-79.588437


In [10]:
# Replace the ' / ' separator between neighborhood names with ', '.
# Vectorised and addressed by column name, so it does not depend on the
# Neighborhood column being at position 2; regex=False treats the pattern literally.
df['Neighborhood'] = df['Neighborhood'].str.replace(' / ', ', ', regex=False)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
27,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
36,M1G,Scarborough,Woburn,43.770992,-79.216917
45,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
54,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
63,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
72,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
81,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
90,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


#### Creating a map of Toronto with neighborhoods superimposed on top.

In [11]:
#importing necessary libraries
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans#for clustering
import folium # map rendering library
from geopy.geocoders import Nominatim#to convert an address to longitude and latitude

In [12]:
# Geocode the city so that the map can be centred on it
address = 'Toronto, Ontario, CA'

geolocator = Nominatim(user_agent="toronto_data")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

# Base map centred on the geocoded coordinates
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row of df, with a "<neighborhood>, <borough>" popup
marker_rows = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(toronto_map)

toronto_map

#### Define Foursquare Credentials and Version

In [None]:
# @hidden_cell
# Credentials are read from the environment, with an interactive prompt as a
# fallback, so that no secret is stored in the notebook source.
# NOTE(review): any key pair that was previously committed in this cell should be
# treated as compromised and rotated.
import os
from getpass import getpass

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

For clustering and getting all the venues, we'll look for the places in Scarborough

In [14]:
# Restrict the frame to the rows whose Borough is 'Scarborough'
in_scarborough = df['Borough'] == 'Scarborough'
scarborough_df = df[in_scarborough]

scarborough_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
27,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
36,M1G,Scarborough,Woburn,43.770992,-79.216917
45,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
54,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
63,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
72,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
81,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
90,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


Let's get the coordinates for Scarborough

In [15]:
# Address of Scarborough to geocode. It must be bound to `address` (the name
# passed to geocode below); a misspelled variable name here would make the
# lookup silently reuse the city-wide Toronto address from the earlier cell.
address = 'Scarborough, Toronto, ON, Canada'

geolocator = Nominatim(user_agent="Scarborough_area")
location = geolocator.geocode(address)
if location is None:
    # geocode() returned no match; fail with a clear message instead of an AttributeError
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

# Map centred on Scarborough, with one blue circle marker per row of scarborough_df
scarborough_map = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, borough, neighborhood in zip(scarborough_df['Latitude'], scarborough_df['Longitude'], 
                                           scarborough_df['Borough'], scarborough_df['Neighborhood']):
    # Popup text is "<neighborhood>, <borough>"
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(scarborough_map)  
    
scarborough_map

Picking one of the neighborhoods in Scarborough

In [16]:
# Neighborhood stored on the row whose index label is 36
scarborough_df.at[36, 'Neighborhood']

'Woburn'

Getting the Longitude and Latitude of Woburn

In [17]:
# Read the coordinates of the row whose index label is 36 and report them
neighborhood_longitude = scarborough_df.at[36, 'Longitude']
neighborhood_latitude = scarborough_df.at[36, 'Latitude']
woburn_message = 'The Longitude and Latitude values of Woburn are: {}, {}'.format(
    neighborhood_longitude, neighborhood_latitude)
print(woburn_message)

The Longitude and Latitude values of Woburn are: -79.21691740000001, 43.7709921


#### Now getting the top 100 venues in Woburn

In [18]:
# Parameters of the Foursquare "explore" request
limit = 100 # limit for the number of venues returned by Foursquare API
radius=500 # define radius

# Fill the URL template with credentials, API version, coordinates, radius and limit
explore_url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
url = explore_url_template.format(
    CLIENT_ID, CLIENT_SECRET, VERSION,
    neighborhood_latitude, neighborhood_longitude,
    radius, limit)

'https://api.foursquare.com/v2/venues/explore?&client_id=<REDACTED>&client_secret=<REDACTED>&v=20180605&ll=43.7709921,-79.21691740000001&radius=500&limit=100'

In [19]:
# Send the GET request and decode the JSON body of the response
response = requests.get(url)
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5eae8601b9a389001e27f516'},
 'response': {'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.7754921045, 'lng': -79.21069729639068},
   'sw': {'lat': 43.7664920955, 'lng': -79.22313750360935}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4cc1d28c06c254815ac18547',
       'name': 'Starbucks',
       'location': {'address': '300 Borough Dr',
        'crossStreet': 'Scarborough Town Centre',
        'lat': 43.770037201625215,
        'lng': -79.22115586641958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.770037201625215,
          'lng': -79.22115586641958}],
        'distance': 356,
        'cc': 'CA

In [20]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like
        Record that holds the category list under the key 'categories' or,
        when that key is absent, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [21]:
# The venue records are the items of the first group in the explore response
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON into columns. pd.json_normalize is the top-level pandas
# function; the pandas.io.json.json_normalize alias is deprecated and emits a warning.
nearby_venues = pd.json_normalize(venues)

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# Reduce each category list to the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the last dotted component of every column name (e.g. 'venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  nearby_venues = json_normalize(venues) # flatten JSON


Unnamed: 0,name,categories,lat,lng
0,Starbucks,Coffee Shop,43.770037,-79.221156
1,Tim Hortons,Coffee Shop,43.770827,-79.223078
2,Korean Grill House,Korean Restaurant,43.770812,-79.214502
3,Jessies Variety Store,Convenience Store,43.772778,-79.2225


<h3>2. Exploring the neighborhoods in Scarborough</h3>

In [22]:
#creating a function that gets the nearby venues for a collection of neighborhoods
def getVenues(name,latitude,longitude,radius=500,limit=100):
    """Query the Foursquare explore endpoint once per neighborhood.

    Parameters
    ----------
    name, latitude, longitude : iterables
        Neighborhood names and their coordinates, iterated in parallel.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    limit : int, default 100
        Value sent as the ``limit`` query parameter. Passed explicitly so the
        function no longer depends on a notebook-level ``limit`` variable.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        'Venue Category' is None for a venue with an empty category list.
        The frame is empty (but keeps its columns) when nothing is returned.
    """
    column_names = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']

    rows = []
    # The loop variable gets its own name so that it does not shadow the `name` parameter
    for hood_name, lat, lng in zip(name, latitude, longitude):
        #creating API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)

        #creating a GET request; the timeout keeps an unresponsive server from blocking forever
        items = requests.get(url, timeout=30).json()["response"]['groups'][0]['items']

        #appending the relevant values, one tuple per venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                hood_name, 
                lat, 
                lng, 
                venue['name'], 
                venue['location']['lat'], 
                venue['location']['lng'],  
                # a venue without categories would otherwise raise IndexError
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=column_names)

    return(nearby_venues)

#### Getting the venues in Scarborough using the function defined above

In [23]:
# Fetch the venues around every Scarborough neighborhood (default radius)
scarborough_venues = getVenues(
    scarborough_df['Neighborhood'],
    scarborough_df['Latitude'],
    scarborough_df['Longitude'],
)
scarborough_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"Malvern, Rouge",43.806686,-79.194353,Interprovincial Group,43.805630,-79.200378,Print Shop
2,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,SEBS Engineering Inc. (Sustainable Energy and ...,43.782371,-79.156820,Construction & Landscaping
4,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
...,...,...,...,...,...,...,...
97,"Steeles West, L'Amoreaux West",43.799525,-79.318389,Nails for You,43.798750,-79.318768,Cosmetics Shop
98,"Steeles West, L'Amoreaux West",43.799525,-79.318389,Rogers Plus,43.798911,-79.318277,Electronics Store
99,"Steeles West, L'Amoreaux West",43.799525,-79.318389,A Buck or Two,43.798286,-79.318485,Thrift / Vintage Store
100,"Steeles West, L'Amoreaux West",43.799525,-79.318389,Presotea,43.799397,-79.319014,Bubble Tea Shop


In [24]:
# (number of rows, number of columns) of the venues DataFrame returned by getVenues
scarborough_venues.shape

(102, 7)

In [25]:
# Count the non-null values of every column within each neighborhood
venues_by_neighborhood = scarborough_venues.groupby('Neighborhood')
venues_by_neighborhood.count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Birch Cliff, Cliffside West",4,4,4,4,4,4
Cedarbrae,8,8,8,8,8,8
"Clarks Corners, Tam O'Shanter, Sullivan",15,15,15,15,15,15
"Cliffside, Cliffcrest, Scarborough Village West",2,2,2,2,2,2
"Dorset Park, Wexford Heights, Scarborough Town Centre",7,7,7,7,7,7
"Golden Mile, Clairlea, Oakridge",10,10,10,10,10,10
"Guildwood, Morningside, West Hill",7,7,7,7,7,7
"Kennedy Park, Ionview, East Birchmount Park",6,6,6,6,6,6
"Malvern, Rouge",2,2,2,2,2,2


In [26]:
# Report how many distinct values the 'Venue Category' column holds
unique_categories = scarborough_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(unique_categories)))

There are 62 uniques categories.


<h3>3. Analysing each neighborhood</h3>

In [27]:
# One indicator column per venue category, named after the category (no prefix)
category_column = scarborough_venues[['Venue Category']]
scarborough_onehot = pd.get_dummies(category_column, prefix="", prefix_sep="")

# Append the neighborhood of every venue as the last column ...
scarborough_onehot['Neighborhood'] = scarborough_venues['Neighborhood'] 

# ... then reorder so that this last column comes first
last_column = scarborough_onehot.columns[-1]
fixed_columns = [last_column] + list(scarborough_onehot.columns[:-1])
scarborough_onehot = scarborough_onehot[fixed_columns]

scarborough_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Brewery,...,Sandwich Place,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Supermarket,Thai Restaurant,Thrift / Vintage Store,Train Station,Vietnamese Restaurant
0,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# (number of rows, number of columns) of the one-hot encoded frame
scarborough_onehot.shape

(102, 63)

#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [29]:
# Average every indicator column within each neighborhood, then turn the
# Neighborhood index back into a regular column
category_means = scarborough_onehot.groupby('Neighborhood').mean()
scarborough_grouped = category_means.reset_index()
scarborough_grouped

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Brewery,...,Sandwich Place,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Supermarket,Thai Restaurant,Thrift / Vintage Store,Train Station,Vietnamese Restaurant
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cedarbrae,0.0,0.0,0.125,0.0,0.125,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0
3,"Clarks Corners, Tam O'Shanter, Sullivan",0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,...,0.0,0.066667,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0
4,"Cliffside, Cliffcrest, Scarborough Village West",0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Dorset Park, Wexford Heights, Scarborough Town...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.142857
6,"Golden Mile, Clairlea, Oakridge",0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0
7,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Kennedy Park, Ionview, East Birchmount Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0
9,"Malvern, Rouge",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# (number of rows, number of columns) of the per-neighborhood frequency frame
scarborough_grouped.shape

(16, 63)

#### Printing each neighborhood along with the top 5 most common venues

In [31]:
num_top_venues = 5

# For every neighborhood, print its `num_top_venues` highest category frequencies
for hood in scarborough_grouped['Neighborhood']:
    print("----"+hood+"----")
    hood_rows = scarborough_grouped[scarborough_grouped['Neighborhood'] == hood]
    # Transpose so that every column becomes a row, then name the two resulting columns
    freq_table = hood_rows.T.reset_index()
    freq_table.columns = ['venue','freq']
    # The first row is the Neighborhood entry itself, not a category
    freq_table = freq_table.iloc[1:]
    freq_table['freq'] = freq_table['freq'].astype(float)
    freq_table = freq_table.round({'freq': 2})
    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0             Clothing Store   0.2
1  Latin American Restaurant   0.2
2                     Lounge   0.2
3             Breakfast Spot   0.2
4               Skating Rink   0.2


----Birch Cliff, Cliffside West----
                   venue  freq
0           Skating Rink  0.25
1  General Entertainment  0.25
2                   Café  0.25
3        College Stadium  0.25
4     Mexican Restaurant  0.00


----Cedarbrae----
                venue  freq
0    Hakka Restaurant  0.12
1         Gas Station  0.12
2  Athletics & Sports  0.12
3              Bakery  0.12
4                Bank  0.12


----Clarks Corners, Tam O'Shanter, Sullivan----
                 venue  freq
0          Pizza Place  0.13
1             Pharmacy  0.13
2        Shopping Mall  0.07
3          Gas Station  0.07
4  Fried Chicken Joint  0.07


----Cliffside, Cliffcrest, Scarborough Village West----
                 venue  freq
0                Motel   0.5
1  American Restaura

#### Converting above result into a DataFrame

In [32]:
#function that ranks the entries of a row in descending order
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest entries of ``row``.

    The first entry of ``row`` is skipped, the remaining entries are sorted in
    descending order, and the labels of the first ``num_top_venues`` of them
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [33]:
#creating a new dataframe and displaying the top 10 venues for each neighborhood
num_top_venues = 10

#indicators list for 1st, 2nd and 3rd
indicators = ['st', 'nd', 'rd']

# Build the column names: '1st', '2nd', '3rd', then 'th' for every later rank.
# The suffix is chosen with an explicit bounds check rather than by catching the
# IndexError with a bare `except`, which would also hide unrelated errors.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scarborough_grouped['Neighborhood']

# Fill the ranked category labels row by row (every column after 'Neighborhood')
for ind in np.arange(scarborough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scarborough_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Lounge,Skating Rink,Clothing Store,Breakfast Spot,Vietnamese Restaurant,College Stadium,Construction & Landscaping,Convenience Store,Cosmetics Shop
1,"Birch Cliff, Cliffside West",General Entertainment,Skating Rink,College Stadium,Café,Vietnamese Restaurant,Clothing Store,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
2,Cedarbrae,Caribbean Restaurant,Thai Restaurant,Athletics & Sports,Gas Station,Bakery,Bank,Fried Chicken Joint,Hakka Restaurant,Department Store,Construction & Landscaping
3,"Clarks Corners, Tam O'Shanter, Sullivan",Pizza Place,Pharmacy,Noodle House,Fast Food Restaurant,Intersection,Italian Restaurant,Coffee Shop,Chinese Restaurant,Gas Station,Fried Chicken Joint
4,"Cliffside, Cliffcrest, Scarborough Village West",American Restaurant,Motel,Vietnamese Restaurant,Clothing Store,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store


<h3>4. Cluster neighborhoods</h3>

Here we're using k-means to cluster the neighborhoods into 5 clusters

In [34]:
# set number of clusters
kclusters = 5

# Feature matrix: every column except the neighborhood name.
# `columns=` is used because passing the axis positionally (drop('Neighborhood', 1))
# is deprecated and no longer accepted by recent pandas versions.
scarborough_grouped_clustering = scarborough_grouped.drop(columns='Neighborhood')

# run k-means clustering (random_state fixed so the result is repeatable)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarborough_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 4])

Creating a new dataframe that includes the cluster along with top 10 venues for each neighborhood.

In [35]:
# add clustering labels as the first column; a copy left over from a previous
# run of this cell is dropped first, because insert() raises on an existing column
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_.astype(int))

# attach the cluster label and the ranked venue columns to every Scarborough row,
# matching on Neighborhood; rows without a match get NaN in the joined columns
scarborough_merged = scarborough_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

scarborough_merged.head() 

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,4.0,Fast Food Restaurant,Print Shop,Vietnamese Restaurant,Chinese Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Electronics Store,Discount Store,Department Store
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,1.0,History Museum,Bar,Construction & Landscaping,Clothing Store,Grocery Store,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
27,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1.0,Mexican Restaurant,Medical Center,Electronics Store,Bank,Intersection,Breakfast Spot,Rental Car Location,Vietnamese Restaurant,Cosmetics Shop,Construction & Landscaping
36,M1G,Scarborough,Woburn,43.770992,-79.216917,3.0,Coffee Shop,Korean Restaurant,Convenience Store,Vietnamese Restaurant,Clothing Store,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
45,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1.0,Caribbean Restaurant,Thai Restaurant,Athletics & Sports,Gas Station,Bakery,Bank,Fried Chicken Joint,Hakka Restaurant,Department Store,Construction & Landscaping


In [36]:
# Show the last rows of the merged DataFrame to check for unmatched neighborhoods
scarborough_merged.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
117,M1S,Scarborough,Agincourt,43.7942,-79.262029,1.0,Latin American Restaurant,Lounge,Skating Rink,Clothing Store,Breakfast Spot,Vietnamese Restaurant,College Stadium,Construction & Landscaping,Convenience Store,Cosmetics Shop
126,M1T,Scarborough,"Clarks Corners, Tam O'Shanter, Sullivan",43.781638,-79.304302,1.0,Pizza Place,Pharmacy,Noodle House,Fast Food Restaurant,Intersection,Italian Restaurant,Coffee Shop,Chinese Restaurant,Gas Station,Fried Chicken Joint
135,M1V,Scarborough,"Milliken, Agincourt North, Steeles East, L'Amo...",43.815252,-79.284577,3.0,Park,Playground,Coffee Shop,Chinese Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
144,M1W,Scarborough,"Steeles West, L'Amoreaux West",43.799525,-79.318389,1.0,Chinese Restaurant,Fast Food Restaurant,Sandwich Place,Cosmetics Shop,Bubble Tea Shop,Pharmacy,Pizza Place,Breakfast Spot,Coffee Shop,Grocery Store
153,M1X,Scarborough,Upper Rouge,43.836125,-79.205636,,,,,,,,,,,


Row 153 (Upper Rouge) has null values in the cluster label and in every venue column, because it has no matching row in the venue table.

In [37]:
# Drop the rows that received no cluster label. Filtering on the missing value
# instead of the hardcoded index label 153 keeps the cell correct if a different
# set of rows is unmatched, and lets it be re-run without a KeyError.
scarborough_merged = scarborough_merged.dropna(subset=['Cluster Labels'])

#### Creating a map to visualize the clusters

In [38]:
# The cluster labels are displayed as floats (4.0, 1.0, ...) after the join; convert
# them back to integers so that they can be used as list indices below
scarborough_merged['Cluster Labels'] = scarborough_merged['Cluster Labels'].astype(int)

# create map centred on the most recently geocoded coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per neighborhood, coloured by its cluster
for lat, lon, poi, cluster in zip(scarborough_merged['Latitude'], scarborough_merged['Longitude'], scarborough_merged['Neighborhood'], scarborough_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 is kept on purpose: label 0 wraps around to the last colour,
    # so every cluster still gets its own distinct colour
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<h3>5. Examining each cluster</h3>

Separating the neighborhoods according to the cluster they were assigned to

#### Cluster 1

In [39]:
# Rows whose k-means label is 0 (presented as "Cluster 1")
in_cluster1 = scarborough_merged['Cluster Labels'] == 0
cluster1 = scarborough_merged[in_cluster1]
cluster1

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
81,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476,0,American Restaurant,Motel,Vietnamese Restaurant,Clothing Store,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store


#### Cluster 2

In [40]:
# Rows whose k-means label is 1 (presented as "Cluster 2")
in_cluster2 = scarborough_merged['Cluster Labels'] == 1
cluster2 = scarborough_merged[in_cluster2]
cluster2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,1,History Museum,Bar,Construction & Landscaping,Clothing Store,Grocery Store,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
27,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1,Mexican Restaurant,Medical Center,Electronics Store,Bank,Intersection,Breakfast Spot,Rental Car Location,Vietnamese Restaurant,Cosmetics Shop,Construction & Landscaping
45,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1,Caribbean Restaurant,Thai Restaurant,Athletics & Sports,Gas Station,Bakery,Bank,Fried Chicken Joint,Hakka Restaurant,Department Store,Construction & Landscaping
63,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029,1,Discount Store,Train Station,Hobby Shop,Department Store,Coffee Shop,Hakka Restaurant,Grocery Store,General Entertainment,Gas Station,Fried Chicken Joint
72,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577,1,Bakery,Bus Line,Park,Metro Station,Ice Cream Shop,Soccer Field,Intersection,Bus Station,Cosmetics Shop,Construction & Landscaping
90,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,1,General Entertainment,Skating Rink,College Stadium,Café,Vietnamese Restaurant,Clothing Store,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
99,M1P,Scarborough,"Dorset Park, Wexford Heights, Scarborough Town...",43.75741,-79.273304,1,Indian Restaurant,Vietnamese Restaurant,Pet Store,Brewery,Chinese Restaurant,Thrift / Vintage Store,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
108,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849,1,Accessories Store,Smoke Shop,Breakfast Spot,Sandwich Place,Shopping Mall,Middle Eastern Restaurant,Bakery,Auto Garage,Fast Food Restaurant,Fried Chicken Joint
117,M1S,Scarborough,Agincourt,43.7942,-79.262029,1,Latin American Restaurant,Lounge,Skating Rink,Clothing Store,Breakfast Spot,Vietnamese Restaurant,College Stadium,Construction & Landscaping,Convenience Store,Cosmetics Shop
126,M1T,Scarborough,"Clarks Corners, Tam O'Shanter, Sullivan",43.781638,-79.304302,1,Pizza Place,Pharmacy,Noodle House,Fast Food Restaurant,Intersection,Italian Restaurant,Coffee Shop,Chinese Restaurant,Gas Station,Fried Chicken Joint


#### Cluster 3

In [41]:
# Rows whose k-means label is 2 (presented as "Cluster 3")
in_cluster3 = scarborough_merged['Cluster Labels'] == 2
cluster3 = scarborough_merged[in_cluster3]
cluster3

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
54,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,2,Playground,Vietnamese Restaurant,Chinese Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store


#### Cluster 4

In [42]:
# Rows whose k-means label is 3 (presented as "Cluster 4")
in_cluster4 = scarborough_merged['Cluster Labels'] == 3
cluster4 = scarborough_merged[in_cluster4]
cluster4

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
36,M1G,Scarborough,Woburn,43.770992,-79.216917,3,Coffee Shop,Korean Restaurant,Convenience Store,Vietnamese Restaurant,Clothing Store,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
135,M1V,Scarborough,"Milliken, Agincourt North, Steeles East, L'Amo...",43.815252,-79.284577,3,Park,Playground,Coffee Shop,Chinese Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store


#### Cluster 5

In [43]:
# Rows whose k-means label is 4 (presented as "Cluster 5")
in_cluster5 = scarborough_merged['Cluster Labels'] == 4
cluster5 = scarborough_merged[in_cluster5]
cluster5

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,4,Fast Food Restaurant,Print Shop,Vietnamese Restaurant,Chinese Restaurant,General Entertainment,Gas Station,Fried Chicken Joint,Electronics Store,Discount Store,Department Store


#### Displaying the shape of the Toronto DataFrame

In [44]:
# (number of rows, number of columns) of the full Toronto DataFrame
df.shape

(103, 5)