# Segmenting and Clustering Neighborhoods in Toronto

Hello and welcome to this example. In this notebook I'll learn the process of scraping Web-sources and building real-data model. It will be well commented and must be a very useful for further researches as a template.

Let's have a look at the object of the research as an inspiration.

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/5/5d/Toronto_skyline_16.jpg/1100px-Toronto_skyline_16.jpg" />

# Here is a territory of city from above

<img src="https://upload.wikimedia.org/wikipedia/commons/9/96/Toronto_by_Sentinel-2.jpg" />

# Import section

In [39]:
# All required libraries for this notebook
import pandas as pd

#!conda install -c conda-forge geopy --yes # uncomment this line if library isn't installed
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if library isn't installed
import folium # map rendering library

import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import numpy as np

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

### Credentials for Foursquare

In [40]:
CLIENT_ID = 'G1U1XU3C5MMJWI1Q23TGMKQ4WVRJS5NV5CR10K2CK5R31A0I' # Foursquare ID
CLIENT_SECRET = 'M25Y55LQSBPQTVM5RYYBGNH1R4OES0ALTRBTHM3PSQL10PE3' # Foursquare Secret
VERSION = '20200501' # Foursquare API version
LIMIT = 100 # Limits for queries

### Upload data from previous lab as dataframe

In [41]:
with open('geo_canada.csv','r') as file:
    canada_df = pd.DataFrame(pd.read_csv(file, index_col=0))

canada_df

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Get coordinates of Toronto, Canada via converter

In [42]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="coursera_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(f'The geograpical coordinate of Toronto are {latitude}, {longitude}.')

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Creating a map of Toronto with markers on it

In [43]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(canada_df['Latitude'], canada_df['Longitude'], canada_df['Borough'], canada_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Function for getting nearby venues with geo coordinates

In [83]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [84]:
''''''
# Uncomment this to download data with Foursquare API
toronto_venues = getNearbyVenues(names=canada_df['Neighborhood'],
                                   latitudes=canada_df['Latitude'],
                                   longitudes=canada_df['Longitude']
                                  )


Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview
The Danforth West, Ri

In [37]:
#with open('toronto_venues.csv', 'r') as file:
#    toronto_venues = pd.DataFrame(pd.read_csv(file, index_col=0))

In [85]:
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.759840,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
2,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
3,Parkwoods,43.753259,-79.329656,A&W,43.760643,-79.326865,Fast Food Restaurant
4,Parkwoods,43.753259,-79.329656,Bruno's valu-mart,43.746143,-79.324630,Grocery Store
...,...,...,...,...,...,...,...
4874,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Rainbow Convenience,43.635901,-79.520464,Convenience Store
4875,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Fresh & Tasty,43.635899,-79.520534,Deli / Bodega
4876,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,High Seas Restaurant,43.636058,-79.520502,Mediterranean Restaurant
4877,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Mr.Sub,43.636174,-79.520655,Restaurant


#### Display venues shape

In [86]:
toronto_venues.shape

(4879, 7)

In [87]:
# Tech cell for saving df
#toronto_venues.to_csv('toronto_venues_1km_radius.csv')

In [88]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,46,46,46,46,46,46
"Alderwood, Long Branch",28,28,28,28,28,28
"Bathurst Manor, Wilson Heights, Downsview North",27,27,27,27,27,27
Bayview Village,12,12,12,12,12,12
"Bedford Park, Lawrence Manor East",42,42,42,42,42,42
...,...,...,...,...,...,...
"Willowdale, Newtonbrook",31,31,31,31,31,31
Woburn,8,8,8,8,8,8
Woodbine Heights,29,29,29,29,29,29
York Mills West,21,21,21,21,21,21


In [89]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Zoo,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
toronto_onehot.shape

(4879, 325)

In [91]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Zoo,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.021739,0.0,0.0,...,0.0,0.000000,0.021739,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.037037,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.023810,0.0,0.0,...,0.0,0.023810,0.000000,0.0,0.0,0.0,0.0,0.02381,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,"Willowdale, Newtonbrook",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
93,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
94,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.034483,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0
95,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0


In [92]:
toronto_grouped.shape

(97, 325)

#### Print each neighborhood along with the top 5 most common venues

In [93]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                  venue  freq
0    Chinese Restaurant  0.13
1         Shopping Mall  0.07
2                Bakery  0.04
3  Caribbean Restaurant  0.04
4           Pizza Place  0.04


----Alderwood, Long Branch----
               venue  freq
0     Discount Store  0.11
1           Pharmacy  0.11
2  Convenience Store  0.07
3        Pizza Place  0.07
4               Park  0.07


----Bathurst Manor, Wilson Heights, Downsview North----
              venue  freq
0       Coffee Shop  0.07
1              Bank  0.07
2    Ice Cream Shop  0.04
3     Shopping Mall  0.04
4  Sushi Restaurant  0.04


----Bayview Village----
                 venue  freq
0                 Bank  0.17
1  Japanese Restaurant  0.17
2          Gas Station  0.17
3                 Park  0.08
4   Chinese Restaurant  0.08


----Bedford Park, Lawrence Manor East----
                venue  freq
0  Italian Restaurant  0.07
1         Coffee Shop  0.07
2                Park  0.05
3          Restaurant  0.05
4      Co

4           Gastropub  0.04


----St. James Town, Cabbagetown----
                 venue  freq
0                 Park  0.09
1                 Café  0.06
2           Restaurant  0.06
3            Gastropub  0.06
4  Japanese Restaurant  0.06


----Steeles West, L'Amoreaux West----
                  venue  freq
0    Chinese Restaurant  0.17
1  Fast Food Restaurant  0.07
2           Coffee Shop  0.07
3                  Bank  0.07
4                Bakery  0.07


----Stn A PO Boxes----
                 venue  freq
0          Coffee Shop  0.12
1                 Café  0.07
2                Hotel  0.05
3           Restaurant  0.04
4  Japanese Restaurant  0.04


----Studio District----
                   venue  freq
0            Coffee Shop  0.07
1                    Bar  0.06
2                Brewery  0.05
3                   Café  0.05
4  Vietnamese Restaurant  0.04


----Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park----
                venue  freq
0         Coffee Shop  0.

First, let's write a function to sort the venues in descending order.

In [94]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [149]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Chinese Restaurant,Shopping Mall,Caribbean Restaurant,Pizza Place,Bakery,Cantonese Restaurant,Sri Lankan Restaurant,Latin American Restaurant,Breakfast Spot,Bubble Tea Shop
1,"Alderwood, Long Branch",Discount Store,Pharmacy,Convenience Store,Pizza Place,Park,Trail,Sandwich Place,Garden Center,Gas Station,Liquor Store
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Pizza Place,Sandwich Place,Gas Station,Middle Eastern Restaurant,Dog Run,Fried Chicken Joint,Sushi Restaurant,Deli / Bodega
3,Bayview Village,Gas Station,Bank,Japanese Restaurant,Chinese Restaurant,Park,Grocery Store,Trail,Restaurant,Café,Yoga Studio
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Sushi Restaurant,Cosmetics Shop,Sandwich Place,Park,Bank,Restaurant,Pub,Bagel Shop
...,...,...,...,...,...,...,...,...,...,...,...
92,"Willowdale, Newtonbrook",Korean Restaurant,Café,Pizza Place,Middle Eastern Restaurant,Diner,Coffee Shop,Trail,Shopping Mall,Fried Chicken Joint,Supermarket
93,Woburn,Park,Coffee Shop,Mobile Phone Shop,Indian Restaurant,Fast Food Restaurant,Chinese Restaurant,Curling Ice,Farm,Eastern European Restaurant,Electronics Store
94,Woodbine Heights,Coffee Shop,Pizza Place,Sandwich Place,Athletics & Sports,Park,Convenience Store,Pastry Shop,Farmers Market,Skating Rink,Café
95,York Mills West,Coffee Shop,Park,Restaurant,Tennis Court,Grocery Store,Gym,Gas Station,Intersection,French Restaurant,Dentist's Office


## Cluster Neighborhoods

In [150]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([4, 4, 4, 0, 0, 0, 1, 0, 1, 0, 1, 0, 4, 0, 0, 0, 4, 1, 0, 0, 0, 4,
       0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 0, 0, 4, 1, 4, 4, 0, 4, 4,
       0, 4, 0, 0, 0, 0, 4, 4, 0, 0, 1, 4, 0, 3, 4, 2, 0, 4, 4, 0, 0, 0,
       4, 0, 2, 0, 4, 4, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 4, 1,
       4, 4, 4, 0, 0, 4, 4, 4, 2])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [151]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = canada_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head(10) # check the last columns!

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4.0,Park,Convenience Store,Pharmacy,Bus Stop,Shopping Mall,Fish & Chips Shop,Pizza Place,Skating Rink,Food & Drink Shop,Road
1,M4A,North York,Victoria Village,43.725882,-79.315572,4.0,Coffee Shop,Golf Course,French Restaurant,Intersection,Gym / Fitness Center,Men's Store,Park,Grocery Store,Pizza Place,Portuguese Restaurant
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Coffee Shop,Theater,Café,Pub,Park,Diner,Breakfast Spot,Restaurant,Italian Restaurant,Bakery
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0.0,Furniture / Home Store,Clothing Store,Restaurant,Coffee Shop,Fast Food Restaurant,Dessert Shop,Fried Chicken Joint,Sushi Restaurant,Vietnamese Restaurant,Women's Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0.0,Coffee Shop,Sushi Restaurant,Park,Ramen Restaurant,Gastropub,Café,Clothing Store,Italian Restaurant,Japanese Restaurant,Thai Restaurant
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242,4.0,Pharmacy,Café,Bakery,Shopping Mall,Golf Course,Park,Grocery Store,Skating Rink,Bank,Playground
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,4.0,Fast Food Restaurant,Trail,Restaurant,Supermarket,Chinese Restaurant,Bank,Bakery,Caribbean Restaurant,Paper / Office Supplies Store,Coffee Shop
7,M3B,North York,Don Mills,43.745906,-79.352188,0.0,Coffee Shop,Restaurant,Japanese Restaurant,Asian Restaurant,Burger Joint,Gym,Supermarket,Bank,Beer Store,Office
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,4.0,Fast Food Restaurant,Construction & Landscaping,Pizza Place,Brewery,Bakery,Intersection,Breakfast Spot,Gastropub,Bank,Rock Climbing Spot
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0.0,Coffee Shop,Gastropub,Japanese Restaurant,Café,Hotel,Restaurant,Diner,Theater,Ramen Restaurant,Plaza


In [152]:
# Check dataframe for NaN values
toronto_merged.groupby(toronto_merged['Cluster Labels'].isnull()).count()

Unnamed: 0_level_0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
False,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102
True,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0


In [153]:
# Drop NaN rows as non-informative
toronto_merged.dropna(subset=['Cluster Labels'], inplace = True)
toronto_merged.groupby(toronto_merged['Cluster Labels'].isnull()).count()

Unnamed: 0_level_0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
False,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102


In [154]:
# Count cluster units
toronto_merged.groupby('Cluster Labels').count()

Unnamed: 0_level_0,Postal code,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.0,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52
1.0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
2.0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
3.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4.0,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39


Finally, let's visualize the resulting clusters

In [155]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], 
                                  toronto_merged['Longitude'], 
                                  toronto_merged['Neighborhood'], 
                                  toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters