# Segmenting and Clustering Neighborhoods in Toronto - Part 3

Part 1:
Build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe. Clean the data.

Part 2:
Collect latitude and longitude for every postal code in the dataframe, and append to the dataframe

Part 3:
Perform k-means clustering analysis

Perform imports

In [65]:
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

import geocoder

import os 

import matplotlib.cm as cm
import matplotlib.colors as colors


print('Libraries imported.')

Libraries imported.


Read in wikipedia data into pandas dataframe. This can be done with read_html.

In [37]:
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
table = table[0]
table.dropna(axis=0, subset=['Neighborhood'], inplace=True)

mysep='\n---------------------------------------------------------------------------------------------------\n'

print(table)
print(mysep)
print(table[table['Borough']=='Not assigned'])

    Postal Code           Borough  \
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
5           M6A        North York   
6           M7A  Downtown Toronto   
..          ...               ...   
160         M8X         Etobicoke   
165         M4Y  Downtown Toronto   
168         M7Y      East Toronto   
169         M8Y         Etobicoke   
178         M8Z         Etobicoke   

                                          Neighborhood  
2                                            Parkwoods  
3                                     Victoria Village  
4                            Regent Park, Harbourfront  
5                     Lawrence Manor, Lawrence Heights  
6          Queen's Park, Ontario Provincial Government  
..                                                 ...  
160      The Kingsway, Montgomery Road, Old Mill North  
165                               Church and Wellesley  
168              Business reply mail Proce

In [38]:
table = table.reset_index(drop=True)
table

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


A snapshot of dataframe statistics

In [39]:
table.describe()

Unnamed: 0,Postal Code,Borough,Neighborhood
count,103,103,103
unique,103,10,98
top,M9N,North York,Downsview
freq,1,24,4


In [40]:
table[table['Postal Code']=='M5A']

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [41]:
table.shape

(103, 3)

## Use persistent requests to obtain geocoordinates for Postal Codes, then append to dataframe.

*Geocoder didn't work*

In [31]:
# initialize your variable to None
lat_lng_coords = None
postal_code='M5G'

# # loop until you get the coordinates
# i=0
# while(lat_lng_coords is None):
#     g = geocoder.google(f'{postal_code}, Toronto, Ontario')
#     print(i)
#     lat_lng_coords = g.latlng
#     i+=1

g = geocoder.google('M5G, Toronto, Ontario')
lat_lng_coords = g.latlng
    
print(g)

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

<[REQUEST_DENIED] Google - Geocode [empty]>


*Try Nominatim*

In [42]:
from geopy.geocoders import Nominatim
geolocator = Nominatim('ca',user_agent="myapp1")
postal_code='M5A'
location = geolocator.geocode({"postalcode": postal_code,'countryRegion': 'CA'})
print(location)
# print((location.latitude, location.longitude))

None


*Try pgeocode*

In [42]:
import pgeocode
nomi = pgeocode.Nominatim('ca')
postal_code='M5A'
location = nomi.query_postal_code(postal_code)
print(location.latitude, location.longitude)

43.6555 -79.3626


Loop through and add lat and long to dataframe

In [43]:
for index, row in table.iterrows():
#     print(table.at[index,'Postal Code'])
    location = nomi.query_postal_code(table.at[index,'Postal Code'])
#     print(location.latitude, location.longitude)
    table.at[index,'latitude'] = location.latitude
    table.at[index,'longitude'] = location.longitude

In [44]:
table

Unnamed: 0,Postal Code,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.3300
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6518,-79.5076
99,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.3830
100,M7Y,East Toronto,Business reply mail Processing Centre,43.7804,-79.2505
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.6325,-79.4939


In [45]:
print(f'The dataframe has {len(table["Borough"].unique())} boroughs and {table.shape[0]} neighborhoods.')

The dataframe has 10 boroughs and 103 neighborhoods.


# Cluster Analysis

## FourSquare API Data

In [46]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(f'The geograpical coordinates of Toronto, Ontario are {latitude}, {longitude}.')

The geograpical coordinates of Toronto, Ontario are 43.6534817, -79.3839347.


In [47]:
#drop lat/long nan values
table.dropna(axis=0, subset=['latitude'], inplace=True)

In [48]:
print(table)

    Postal Code           Borough  \
0           M3A        North York   
1           M4A        North York   
2           M5A  Downtown Toronto   
3           M6A        North York   
4           M7A  Downtown Toronto   
..          ...               ...   
98          M8X         Etobicoke   
99          M4Y  Downtown Toronto   
100         M7Y      East Toronto   
101         M8Y         Etobicoke   
102         M8Z         Etobicoke   

                                          Neighborhood  latitude  longitude  
0                                            Parkwoods   43.7545   -79.3300  
1                                     Victoria Village   43.7276   -79.3148  
2                            Regent Park, Harbourfront   43.6555   -79.3626  
3                     Lawrence Manor, Lawrence Heights   43.7223   -79.4504  
4          Queen's Park, Ontario Provincial Government   43.6641   -79.3889  
..                                                 ...       ...        ...  
98       

In [49]:
# create map of Canada using latitude and longitude values
map_canada = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(table['latitude'], table['longitude'], table['Borough'], table['Neighborhood']):
    label = f'{neighborhood}, {borough}'
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_canada)  
    
map_canada

#### Define Foursquare Credentials and Version

In [50]:
CLIENT_ID = os.environ.get('foursq_client_id') # your Foursquare ID
CLIENT_SECRET = os.environ.get('foursq_client_secret') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# print('Your credentails:')
# print('CLIENT_ID: ' + CLIENT_ID)
# print('CLIENT_SECRET:' + CLIENT_SECRET)

#### Now, let's get the top 100 venues that are around the center coordinate of Toronto within a radius of 500 meters.

In [70]:
LIMIT = 400 # limit of number of venues returned by Foursquare API

radius = 800 # define radius

url = f'https://api.foursquare.com/v2/venues/explore?&client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&v={VERSION}&ll={latitude},{longitude}&radius={radius}&limit={LIMIT}'

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ebebe0c47b43d0023a277b8'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 185,
  'suggestedBounds': {'ne': {'lat': 43.660681707200006,
    'lng': -79.37400202895091},
   'sw': {'lat': 43.646281692799995, 'lng': -79.39386737104908}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5227bb01498e17bf485e6202',
       'name': 'Downtown Toronto',
       'location': {'lat': 43.65323167517444,
        'lng': -79.38529600606677,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.65323167517444,


In [71]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [72]:
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# # filter the category for each row, using the pre-defined function above
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# # clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Downtown Toronto,Neighborhood,43.653232,-79.385296
1,Nathan Phillips Square,Plaza,43.652270,-79.383516
2,Indigo,Bookstore,43.653515,-79.380696
3,Chatime 日出茶太,Bubble Tea Shop,43.655542,-79.384684
4,Textile Museum of Canada,Art Museum,43.654396,-79.386500
...,...,...,...,...
95,Versus Coffee,Coffee Shop,43.651213,-79.375236
96,Kupfert & Kim (First Canadian Place),Gluten-free Restaurant,43.648547,-79.381624
97,Page One Cafe,Café,43.657772,-79.376073
98,Kojin,Colombian Restaurant,43.649398,-79.386091


#### Write a function to repeat the same process to all the neighborhoods in Manhattan

In [95]:
def getNearbyVenues(names, latitudes, longitudes, radius=800, LIMIT=400):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        print(lat, lng)
            
        # create the API request URL
        url = f'https://api.foursquare.com/v2/venues/explore?&client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&v={VERSION}&ll={lat},{lng}&radius={radius}&limit={LIMIT}'
            
        # make the GET request
        results = requests.get(url).json()
        results_venues = results['response']['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results_venues])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [96]:
toronto_venues = getNearbyVenues(names=table['Neighborhood'],
                                   latitudes=table['latitude'],
                                   longitudes=table['longitude']
                                  )

Parkwoods
43.7545 -79.33
Victoria Village
43.7276 -79.3148
Regent Park, Harbourfront
43.6555 -79.3626
Lawrence Manor, Lawrence Heights
43.7223 -79.4504
Queen's Park, Ontario Provincial Government
43.6641 -79.3889
Islington Avenue
43.6662 -79.5282
Malvern, Rouge
43.8113 -79.193
Don Mills
43.745 -79.359
Parkview Hill, Woodbine Gardens
43.7063 -79.3094
Garden District, Ryerson
43.6572 -79.3783
Glencairn
43.7081 -79.4479
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
43.6505 -79.5517
Rouge Hill, Port Union, Highland Creek
43.7878 -79.1564
Don Mills
43.7334 -79.3329
Woodbine Heights
43.6913 -79.3116
St. James Town
43.6513 -79.3756
Humewood-Cedarvale
43.6915 -79.4307
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
43.6437 -79.5767
Guildwood, Morningside, West Hill
43.7678 -79.1866
The Beaches
43.6784 -79.2941
Berczy Park
43.6456 -79.3754
Caledonia-Fairbanks
43.6889 -79.4507
Woburn
43.7712 -79.2144
Leaside
43.7124 -79.3644
Central Bay Street
43.6564 -79.3

In [97]:
print(toronto_venues.shape)
toronto_venues.groupby('Neighborhood').count()

(3931, 7)


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,30,30,30,30,30,30
"Alderwood, Long Branch",11,11,11,11,11,11
"Bathurst Manor, Wilson Heights, Downsview North",23,23,23,23,23,23
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",36,36,36,36,36,36
...,...,...,...,...,...,...
"Willowdale, Newtonbrook",26,26,26,26,26,26
Woburn,4,4,4,4,4,4
Woodbine Heights,44,44,44,44,44,44
York Mills West,6,6,6,6,6,6


In [98]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Zoo Exhibit,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Arcade,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Zoo Exhibit,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Animal Shelter,Antique Shop,Aquarium,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,...,0.0,0.0,0.043478,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,...,0.0,0.0,0.027778,0.0,0.0,0.0,0.027778,0.027778,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,"Willowdale, Newtonbrook",0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
92,Woburn,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
93,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
94,York Mills West,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


Print neighborhood with 5 most common venues

In [100]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                venue  freq
0       Shopping Mall  0.10
1      Sandwich Place  0.07
2  Chinese Restaurant  0.07
3         Pizza Place  0.07
4  Seafood Restaurant  0.03


----Alderwood, Long Branch----
         venue  freq
0  Pizza Place  0.18
1          Pub  0.09
2          Gym  0.09
3         Pool  0.09
4     Pharmacy  0.09


----Bathurst Manor, Wilson Heights, Downsview North----
           venue  freq
0    Coffee Shop  0.09
1    Pizza Place  0.09
2           Bank  0.09
3  Deli / Bodega  0.04
4       Pharmacy  0.04


----Bayview Village----
                venue  freq
0  Golf Driving Range  0.25
1               Trail  0.25
2                Park  0.25
3    Asian Restaurant  0.25
4         Zoo Exhibit  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0         Coffee Shop  0.08
1  Italian Restaurant  0.08
2      Sandwich Place  0.06
3        Intersection  0.03
4                Bank  0.03


----Berczy Park----
                 venue  freq
0 

4                 Park  0.03


----Studio District----
                 venue  freq
0                 Café  0.09
1          Coffee Shop  0.06
2       Sandwich Place  0.04
3  American Restaurant  0.04
4                  Bar  0.04


----Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park----
                venue  freq
0         Coffee Shop  0.10
1  Italian Restaurant  0.07
2    Sushi Restaurant  0.05
3          Restaurant  0.05
4                Park  0.03


----The Annex, North Midtown, Yorkville----
                           venue  freq
0                    Coffee Shop  0.10
1                           Café  0.06
2             Italian Restaurant  0.05
3                            Pub  0.05
4  Vegetarian / Vegan Restaurant  0.04


----The Beaches----
                  venue  freq
0                   Pub  0.11
1           Coffee Shop  0.11
2  Caribbean Restaurant  0.07
3             Gastropub  0.04
4                 Trail  0.04


----The Danforth West, Riverdale----
      

In [101]:
#function to sort venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [121]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Shopping Mall,Sandwich Place,Pizza Place,Chinese Restaurant,Malay Restaurant,Sushi Restaurant,Supermarket,Seafood Restaurant,Bank,Bakery
1,"Alderwood, Long Branch",Pizza Place,Pool,Skating Rink,Gas Station,Gym,Coffee Shop,Pharmacy,Sandwich Place,Pub,Print Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Pizza Place,Bank,Coffee Shop,Ice Cream Shop,Sandwich Place,Sushi Restaurant,Supermarket,Frozen Yogurt Shop,Fried Chicken Joint,Diner
3,Bayview Village,Golf Driving Range,Trail,Park,Asian Restaurant,Yoga Studio,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Elementary School,Ethiopian Restaurant
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Sandwich Place,Café,Bank,Thai Restaurant,Intersection,Baby Store,Bagel Shop,Bakery


In [122]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0,
       0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,
       3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,
       0, 0, 0, 0, 1, 0, 4, 2], dtype=int32)

Create a new dataframe that includes the cluster as well as the top 50 venues for each neighborhood.

In [123]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = table

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.7545,-79.33,0.0,Park,Bus Stop,Coffee Shop,Caribbean Restaurant,Chinese Restaurant,Café,Discount Store,Train Station,Road,Supermarket
1,M4A,North York,Victoria Village,43.7276,-79.3148,0.0,Portuguese Restaurant,Coffee Shop,Hockey Arena,Playground,Pizza Place,Park,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Elementary School
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,0.0,Coffee Shop,Restaurant,Park,Café,Theater,Bakery,Italian Restaurant,Breakfast Spot,Diner,Pub
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,0.0,Clothing Store,Restaurant,Fast Food Restaurant,Furniture / Home Store,Cosmetics Shop,Coffee Shop,Dessert Shop,Bookstore,Toy / Game Store,Food Court
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889,0.0,Coffee Shop,Park,Sushi Restaurant,Japanese Restaurant,Restaurant,Café,Clothing Store,Women's Store,Italian Restaurant,Gastropub


In [124]:
toronto_merged['Cluster Labels'].describe()

count    101.000000
mean       0.554455
std        1.307480
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        4.000000
Name: Cluster Labels, dtype: float64

In [125]:
toronto_merged.shape

(102, 16)

In [126]:
for i in toronto_merged['Cluster Labels']:
    print(i)

0.0
0.0
0.0
0.0
0.0
3.0
4.0
4.0
0.0
0.0
0.0
4.0
4.0
4.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
3.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
4.0
0.0
0.0
0.0
0.0
0.0
2.0
0.0
0.0
0.0
3.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
4.0
0.0
0.0
0.0
3.0
0.0
0.0
0.0
4.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
4.0
0.0
0.0
0.0
0.0
0.0
4.0
0.0
0.0
0.0
nan
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [127]:
toronto_merged.dropna(axis=0, subset=['Cluster Labels'], inplace=True)

In [128]:
for i in toronto_merged['Cluster Labels']:
    print(i)

0.0
0.0
0.0
0.0
0.0
3.0
4.0
4.0
0.0
0.0
0.0
4.0
4.0
4.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
3.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
4.0
0.0
0.0
0.0
0.0
0.0
2.0
0.0
0.0
0.0
3.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
4.0
0.0
0.0
0.0
3.0
0.0
0.0
0.0
4.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
4.0
0.0
0.0
0.0
0.0
0.0
4.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [129]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters