## Segmenting and Clustering Neighborhoods in Toronto

- import required library

In [1]:
import numpy as np
import pandas as pd
import json                               # library to handle JSON files
from geopy.geocoders import Nominatim     # convert an address into latitude and longitude values
import requests                           # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans        # import k-means from clustering stage
import folium                             # map rendering library

- Scrapping data from given webpage and display it.

In [2]:
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
data

[    Postcode           Borough          Neighbourhood
 0        M1A      Not assigned           Not assigned
 1        M2A      Not assigned           Not assigned
 2        M3A        North York              Parkwoods
 3        M4A        North York       Victoria Village
 4        M5A  Downtown Toronto           Harbourfront
 ..       ...               ...                    ...
 282      M8Z         Etobicoke              Mimico NW
 283      M8Z         Etobicoke     The Queensway West
 284      M8Z         Etobicoke  Royal York South West
 285      M8Z         Etobicoke         South of Bloor
 286      M9Z      Not assigned           Not assigned
 
 [287 rows x 3 columns],
                                                   0   \
 0                                                NaN   
 1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
 2                                                 NL   
 3                                                  A   
 
                          

- Selecting required data and display it.

In [3]:
df = data[0]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


- Renaming the column name from 'Postcode' to 'Postalcode'

In [4]:
df.rename(columns={'Postcode':'PostalCode'},inplace=True)
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


- Dropping cells with a borough that is 'Not assigned'.

In [5]:
df = df[df['Borough'] != 'Not assigned']
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


- Reset index

In [6]:
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
...,...,...,...
205,M8Z,Etobicoke,Kingsway Park South West
206,M8Z,Etobicoke,Mimico NW
207,M8Z,Etobicoke,The Queensway West
208,M8Z,Etobicoke,Royal York South West


- Reassigning Neighbourhood as Borough if Neighbourhood is 'Not assigned'

In [7]:
for i in df.index:
    if df.iloc[i, 2] == 'Not assigned':
        df.iloc[i, 2] = df.iloc[i,1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[i, 2] = df.iloc[i,1]


- Shape of the processed data

In [8]:
df.shape

(210, 3)

In [9]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


- filtering data with Borough contains Toronto

In [10]:
toronto = pd.DataFrame(columns=['PostalCode', 'Borough', 'Neighborhood']) 

In [11]:
for i in df.index:
    if str(df.iloc[i, 1]).find('Toronto') != -1:
        
        toronto = toronto.append({
            'PostalCode' : df.iloc[i, 0],
            'Borough' : df.iloc[i,1],
            'Neighborhood' : df.iloc[i,2]
        }, ignore_index=True)

toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5A,Downtown Toronto,Harbourfront
1,M7A,Downtown Toronto,Queen's Park
2,M5B,Downtown Toronto,Ryerson
3,M5B,Downtown Toronto,Garden District
4,M5C,Downtown Toronto,St. James Town
5,M4E,East Toronto,The Beaches
6,M5E,Downtown Toronto,Berczy Park
7,M5G,Downtown Toronto,Central Bay Street
8,M6G,Downtown Toronto,Christie
9,M5H,Downtown Toronto,Adelaide


These below mentioned Neighborhood need to be renamed as Latitude and longitude is unavailable for the given name.

In [12]:
toronto.loc[67, 'Neighborhood'] = 'The Esplanade'
toronto.loc[73,'Neighborhood'] = 'Centre 969 Eastern'

In [13]:
toronto.shape

(74, 3)

Create new DataFrame to store latitude and longitude for each neighborhoods.

In [29]:
lat_lng = pd.DataFrame(columns=['Latitude', 'Longitude']) 

Finding latitude and longitude for neighborhoods.

In [30]:
geolocator = Nominatim(user_agent="ny_explorer")
j = 1

for i in toronto.index:
    
    location = geolocator.geocode('{}, Toronto'.format(str(toronto.iloc[i,2])))
    latitude = location.latitude
    longitude = location.longitude
    print('{}) ----- {} -----'.format(j, toronto.iloc[i, 2]))
    j+=1
    lat_lng = lat_lng.append({
        'Latitude' : latitude,
        'Longitude' : longitude
        }, ignore_index=True)

1) ----- Harbourfront -----
2) ----- Queen's Park -----
3) ----- Ryerson -----
4) ----- Garden District -----
5) ----- St. James Town -----
6) ----- The Beaches -----
7) ----- Berczy Park -----
8) ----- Central Bay Street -----
9) ----- Christie -----
10) ----- Adelaide -----
11) ----- King -----
12) ----- Richmond -----
13) ----- Dovercourt Village -----
14) ----- Dufferin -----
15) ----- Harbourfront East -----
16) ----- Toronto Islands -----
17) ----- Union Station -----
18) ----- Little Portugal -----
19) ----- Trinity -----
20) ----- The Danforth West -----
21) ----- Riverdale -----
22) ----- Design Exchange -----
23) ----- Toronto Dominion Centre -----
24) ----- Brockton -----
25) ----- Exhibition Place -----
26) ----- Parkdale Village -----
27) ----- The Beaches West -----
28) ----- India Bazaar -----
29) ----- Commerce Court -----
30) ----- Victoria Hotel -----
31) ----- Studio District -----
32) ----- Lawrence Park -----
33) ----- Roselawn -----
34) ----- Davisville North ----

In [31]:
lat_lng.shape

(74, 2)

Combining neighborhood data with their corresponding latitude and longitude.

In [32]:
toronto = pd.concat([toronto, lat_lng], axis=1)
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015
1,M7A,Downtown Toronto,Queen's Park,43.659659,-79.39034
2,M5B,Downtown Toronto,Ryerson,43.658469,-79.378993
3,M5B,Downtown Toronto,Garden District,43.6565,-79.377114
4,M5C,Downtown Toronto,St. James Town,43.669403,-79.372704


Get the geolocation of Toronto

In [33]:
address = 'Toronto, Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Create map of Toronto with neighborhood using latitude and longitude values

In [34]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Defining and mentaining Foursquare credentials

In [35]:
CLIENT_ID = 'YQ2PMOH5JBAXNLXZXID4U2N5WXMXXPFWAKSR231LXML2G0EI' # your Foursquare ID
CLIENT_SECRET = 'NBLCRJTGZHAJOAAETKETGWFHZEMIP5AJRRIQR1GCALMLFW5U' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: YQ2PMOH5JBAXNLXZXID4U2N5WXMXXPFWAKSR231LXML2G0EI
CLIENT_SECRET:NBLCRJTGZHAJOAAETKETGWFHZEMIP5AJRRIQR1GCALMLFW5U


Creating GET request URL

In [36]:
radius = 500
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)

Send the GET request and store result

In [37]:
results = requests.get(url).json()

In [38]:
#results

Definig the function to explore the category of venues from Foursquare lab.

In [39]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Exploring the venue and its category and storing the result in a new dataframe.

In [40]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Downtown Toronto,Neighborhood,43.653232,-79.385296
1,Japango,Sushi Restaurant,43.655268,-79.385165
2,Rolltation,Japanese Restaurant,43.654918,-79.387424
3,Sansotei Ramen 三草亭,Ramen Restaurant,43.655157,-79.386501
4,Poke Guys,Poke Place,43.654895,-79.385052


Defining an ietrative function to explore the neighborhoods in Toronto.

In [41]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [42]:
toronto_venues = getNearbyVenues(names=toronto['Neighborhood'],
                                   latitudes=toronto['Latitude'],
                                   longitudes=toronto['Longitude']
                                  )

Harbourfront
Queen's Park
Ryerson
Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide
King
Richmond
Dovercourt Village
Dufferin
Harbourfront East
Toronto Islands
Union Station
Little Portugal
Trinity
The Danforth West
Riverdale
Design Exchange
Toronto Dominion Centre
Brockton
Exhibition Place
Parkdale Village
The Beaches West
India Bazaar
Commerce Court
Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North
Forest Hill West
High Park
The Junction South
North Toronto West
The Annex
North Midtown
Yorkville
Parkdale
Roncesvalles
Davisville
Harbord
University of Toronto
Runnymede
Swansea
Moore Park
Summerhill East
Chinatown
Grange Park
Kensington Market
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Rosedale
The Esplanade
Cabbagetown
St. James Town
First Canadian Place
Underground city
Church and Welles

In [43]:
print(toronto_venues.shape)
toronto_venues.head()

(3903, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.64008,-79.38015,Harbour Square Park,43.639253,-79.378395,Park
1,Harbourfront,43.64008,-79.38015,Lake Ontario,43.638945,-79.379665,Lake
2,Harbourfront,43.64008,-79.38015,Harbourfront,43.639526,-79.380688,Neighborhood
3,Harbourfront,43.64008,-79.38015,Miku,43.641374,-79.377531,Japanese Restaurant
4,Harbourfront,43.64008,-79.38015,Natrel Pond/Rink,43.638431,-79.382528,Skating Rink


Aggregating neighborhoods with their corresponding number of venues.

In [44]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Bathurst Quay,23,23,23,23,23,23
Berczy Park,100,100,100,100,100,100
Brockton,18,18,18,18,18,18
CN Tower,87,87,87,87,87,87
...,...,...,...,...,...,...
Underground city,12,12,12,12,12,12
Union Station,78,78,78,78,78,78
University of Toronto,32,32,32,32,32,32
Victoria Hotel,38,38,38,38,38,38


Analysing each neighborhood and transforming venue category by onehot encoding.

In [45]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Getting dimension of new data.

In [46]:
toronto_onehot.shape

(3903, 300)

Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category.

In [47]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,Adelaide,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.030000,0.00,...,0.0,0.0,0.010000,0.00000,0.00,0.000000,0.0,0.010000,0.000000,0.00
1,Bathurst Quay,0.000000,0.0,0.00,0.0,0.0,0.043478,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.00000,0.00,0.000000,0.0,0.000000,0.000000,0.00
2,Berczy Park,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.010000,0.01,...,0.0,0.0,0.010000,0.00000,0.00,0.000000,0.0,0.000000,0.000000,0.00
3,Brockton,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.00000,0.00,0.111111,0.0,0.000000,0.000000,0.00
4,CN Tower,0.011494,0.0,0.00,0.0,0.0,0.000000,0.0,0.022989,0.00,...,0.0,0.0,0.000000,0.00000,0.00,0.000000,0.0,0.011494,0.011494,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Underground city,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.00000,0.00,0.000000,0.0,0.000000,0.000000,0.00
68,Union Station,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.025641,0.00,...,0.0,0.0,0.012821,0.00000,0.00,0.000000,0.0,0.000000,0.000000,0.00
69,University of Toronto,0.031250,0.0,0.00,0.0,0.0,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.03125,0.00,0.000000,0.0,0.000000,0.000000,0.00
70,Victoria Hotel,0.000000,0.0,0.00,0.0,0.0,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.00000,0.00,0.000000,0.0,0.000000,0.000000,0.00


In [48]:
toronto_grouped.shape

(72, 300)

Print each neighborhood along with the top 5 most common venues.

In [50]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['Venue','Frequency']
    temp = temp.iloc[1:]
    temp['Frequency'] = temp['Frequency'].astype(float)
    temp = temp.round({'Frequency': 2})
    print(temp.sort_values('Frequency', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
                 Venue  Frequency
0          Coffee Shop       0.09
1           Restaurant       0.05
2                 Café       0.05
3            Gastropub       0.04
4  American Restaurant       0.03


----Bathurst Quay----
                 Venue  Frequency
0          Coffee Shop       0.17
1                 Café       0.13
2                 Park       0.09
3        Grocery Store       0.04
4  Japanese Restaurant       0.04


----Berczy Park----
                 Venue  Frequency
0          Coffee Shop       0.09
1                 Café       0.06
2           Restaurant       0.04
3  Japanese Restaurant       0.04
4   Italian Restaurant       0.04


----Brockton----
                   Venue  Frequency
0                    Bar       0.17
1  Vietnamese Restaurant       0.11
2                   Park       0.11
3      French Restaurant       0.06
4                   Café       0.06


----CN Tower----
                Venue  Frequency
0               Hotel       0.08
1    

Sort the venues in descending order

In [51]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create the new dataframe and display the top 10 venues for each neighborhood.

In [52]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Restaurant,Gastropub,Breakfast Spot,American Restaurant,Hotel,Cosmetics Shop,Seafood Restaurant,Gym
1,Bathurst Quay,Coffee Shop,Café,Park,Japanese Restaurant,Grocery Store,Caribbean Restaurant,Diner,Bank,Ramen Restaurant,Pub
2,Berczy Park,Coffee Shop,Café,Japanese Restaurant,Restaurant,Hotel,Italian Restaurant,Gym,Beer Bar,Seafood Restaurant,Bakery
3,Brockton,Bar,Vietnamese Restaurant,Park,Jazz Club,Portuguese Restaurant,Gastropub,Bakery,Korean Restaurant,Salon / Barbershop,Café
4,CN Tower,Hotel,Coffee Shop,Pizza Place,Italian Restaurant,Bar,Aquarium,Scenic Lookout,Baseball Stadium,Restaurant,Bistro


Running *k*-means to cluster the neighborhood into 4 clusters.

In [53]:
# number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 2, 0, 0, 0, 2, 2, 0])

Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [54]:
# adding clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015,0.0,Coffee Shop,Café,Hotel,Pizza Place,Restaurant,Italian Restaurant,Park,Sushi Restaurant,Steakhouse,Sports Bar
1,M7A,Downtown Toronto,Queen's Park,43.659659,-79.39034,0.0,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Chinese Restaurant,Indian Restaurant,Japanese Restaurant,Thai Restaurant,Bubble Tea Shop,Ice Cream Shop
2,M5B,Downtown Toronto,Ryerson,43.658469,-79.378993,0.0,Coffee Shop,Japanese Restaurant,Clothing Store,Burger Joint,Café,Diner,Ramen Restaurant,Spa,Tea Room,Middle Eastern Restaurant
3,M5B,Downtown Toronto,Garden District,43.6565,-79.377114,0.0,Coffee Shop,Clothing Store,Fast Food Restaurant,Restaurant,Cosmetics Shop,Hotel,Theater,Movie Theater,Sandwich Place,Café
4,M5C,Downtown Toronto,St. James Town,43.669403,-79.372704,0.0,Coffee Shop,Pizza Place,Café,Grocery Store,Filipino Restaurant,Beer Store,Bike Rental / Bike Share,Restaurant,Bistro,Caribbean Restaurant


print the cluster labels

In [55]:
toronto_merged['Cluster Labels']

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
     ... 
69    0.0
70    0.0
71    2.0
72    0.0
73    NaN
Name: Cluster Labels, Length: 74, dtype: float64

There is NaN lable so checking unique cluster lables

In [56]:
toronto_merged['Cluster Labels'].unique()

array([ 0.,  2.,  1.,  3., nan])

Droping row which has NaN cluster label

In [57]:
toronto_merged.drop(73, axis=0, inplace=True)

In [58]:
toronto_merged['Cluster Labels'].unique()

array([0., 2., 1., 3.])

Changing data types of cluster labels

In [59]:
toronto_merged = toronto_merged.astype(dtype={'Cluster Labels': np.int})

Let's visualize the resulting clusters

In [60]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters