# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import numpy as np, pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

## Part 1: Parsing html data and generating dataframe

In [2]:
d1 = pd.read_html(url)

In [3]:
html_table =  d1[0]
html_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Removing areas which do not have Borough assigned to it

In [4]:
html_table = html_table[html_table['Borough']!='Not assigned']
html_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Replacing 'Not assigned' Neighborhoods with Borough names

In [5]:
for ind in np.where(html_table['Neighbourhood']=='Not assigned'):
    html_table.iloc[ind,2] = html_table.iloc[ind,1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Combining Neighborhoods by Postcode

In [6]:
res = []
for postcode in list(set(html_table['Postcode'])):
    postcode_ind = np.where(html_table['Postcode']==postcode)[0]
    if(len(postcode_ind)==1):
        res.append(list(html_table.iloc[postcode_ind,].values[0]))
    else:
        res.append(list(html_table.iloc[postcode_ind[0],:2])+[', '.join(html_table.iloc[postcode_ind,2])])

In [7]:
df = pd.DataFrame(res,columns=html_table.columns,dtype='str')
print('Shape of Resulting Postal Code dataframe:',df.shape)
df.head()

Shape of Resulting Postal Code dataframe: (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9W,Etobicoke,Northwest
1,M5S,Downtown Toronto,"Harbord, University of Toronto"
2,M3J,North York,"Northwood Park, York University"
3,M2H,North York,Hillcrest Village
4,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."


## Part 2: Fetching lat,long values for each Postcode


In [8]:
#!conda install -c conda-forge geopy
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="canada_explorer")

### After exhausting geolocator requests, the lat_lng data is fetched from the csv file and df is populated

In [9]:
d = pd.read_csv('Geospatial_Coordinates.csv')
d.set_index('Postal Code',inplace=True)
d  = d.reindex(df['Postcode'])
df['Latitude'] = d['Latitude'].values
df['Longitude'] = d['Longitude'].values
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M9W,Etobicoke,Northwest,43.706748,-79.594054
1,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
2,M3J,North York,"Northwood Park, York University",43.76798,-79.487262
3,M2H,North York,Hillcrest Village,43.803762,-79.363452
4,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol...",43.643515,-79.577201


## Part 3 Cluster Analysis

### Generating map of Toronto

In [10]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

import folium
map_toronto = folium.Map(location=[latitude,longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Analyzing Downtown Toronto Borough of Toronto

In [11]:
df_dt = df[df['Borough']=='Downtown Toronto']
df_dt.index = range(df_dt.shape[0])

address = 'Downtown Toronto, Toronto'

geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_downtown = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, label in zip(df_dt['Latitude'], df_dt['Longitude'], df_dt['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
map_downtown

### Finding top 100 locations near each neighborhood of Downtown Toronto borough

In [12]:
import requests
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
downtown_venues = getNearbyVenues(names=df_dt['Neighbourhood'],
                                   latitudes=df_dt['Latitude'],
                                   longitudes=df_dt['Longitude'])

Harbord, University of Toronto
Berczy Park
St. James Town
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Church and Wellesley
Rosedale
Harbourfront, Regent Park
Ryerson, Garden District
Stn A PO Boxes 25 The Esplanade
Harbourfront East, Toronto Islands, Union Station
Chinatown, Grange Park, Kensington Market
Cabbagetown, St. James Town
First Canadian Place, Underground city
Christie
Central Bay Street
Adelaide, King, Richmond


In [14]:
print(downtown_venues.shape)
downtown_venues.head()

(486, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbord, University of Toronto",43.662696,-79.400049,Yasu,43.662837,-79.403217,Japanese Restaurant
1,"Harbord, University of Toronto",43.662696,-79.400049,Rasa,43.662757,-79.403988,Restaurant
2,"Harbord, University of Toronto",43.662696,-79.400049,Piano Piano,43.662949,-79.402898,Italian Restaurant
3,"Harbord, University of Toronto",43.662696,-79.400049,The Dessert Kitchen,43.662823,-79.402746,Dessert Shop
4,"Harbord, University of Toronto",43.662696,-79.400049,Almond Butterfly,43.662836,-79.403365,Bakery


In [15]:
print('There are {} uniques categories.'.format(len(downtown_venues['Venue Category'].unique())))

There are 143 uniques categories.


### Analyzing each neighborhood

In [16]:
# one hot encoding
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_onehot['Neighbourhood'] = downtown_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
downtown_onehot = downtown_onehot[fixed_columns]

print(downtown_onehot.shape)
downtown_onehot.head()

(486, 144)


Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,...,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,"Harbord, University of Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbord, University of Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbord, University of Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbord, University of Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbord, University of Toronto",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Finding 10-most frequently visited venue-categories in a neighborhood

In [17]:
downtown_grouped = downtown_onehot.groupby('Neighbourhood').mean().reset_index()
print(downtown_grouped.shape)

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = downtown_grouped['Neighbourhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

(18, 144)


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Steakhouse,Café,Coffee Shop,Pizza Place,Asian Restaurant,Hotel,Seafood Restaurant,Noodle House,Monument / Landmark,Opera House
1,Berczy Park,Cocktail Bar,Coffee Shop,Café,Beer Bar,Seafood Restaurant,Farmers Market,Steakhouse,Museum,Park,Concert Hall
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Sculpture Garden,Airport Food Court,Airport Gate,Bar,Boat or Ferry,Boutique
3,"Cabbagetown, St. James Town",Coffee Shop,Italian Restaurant,Bakery,Café,Restaurant,Indian Restaurant,Japanese Restaurant,Jewelry Store,Diner,Deli / Bodega
4,Central Bay Street,Coffee Shop,Bubble Tea Shop,Spa,Italian Restaurant,Park,Ice Cream Shop,Café,Modern European Restaurant,Miscellaneous Shop,Poke Place


### Cluster Neighborhoods


In [18]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

downtown_grouped_clustering = downtown_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

downtown_merged = df_dt

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
downtown_merged = downtown_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

downtown_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049,3,Café,Japanese Restaurant,Restaurant,Bookstore,Bar,Bakery,Chinese Restaurant,Coffee Shop,College Arts Building,College Gym
1,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Cocktail Bar,Coffee Shop,Café,Beer Bar,Seafood Restaurant,Farmers Market,Steakhouse,Museum,Park,Concert Hall
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Coffee Shop,Gastropub,Restaurant,Japanese Restaurant,Italian Restaurant,Hotel,Church,Cosmetics Shop,Middle Eastern Restaurant,Diner
3,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,1,Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Sculpture Garden,Airport Food Court,Airport Gate,Bar,Boat or Ferry,Boutique
4,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",43.647177,-79.381576,3,Café,Coffee Shop,Restaurant,Deli / Bodega,Hotel Bar,Museum,Pizza Place,Concert Hall,Pub,Sandwich Place


### Mapping clusters on map

In [19]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighbourhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examining the Clusters

### Cluster 1

In [20]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 0, downtown_merged.columns[[1,2] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,Berczy Park,0,Cocktail Bar,Coffee Shop,Café,Beer Bar,Seafood Restaurant,Farmers Market,Steakhouse,Museum,Park,Concert Hall
6,Downtown Toronto,Church and Wellesley,0,Gay Bar,Italian Restaurant,Juice Bar,Creperie,Salon / Barbershop,Bookstore,Restaurant,Ramen Restaurant,Pub,Breakfast Spot
9,Downtown Toronto,"Ryerson, Garden District",0,Café,Clothing Store,Steakhouse,Plaza,Pizza Place,Ramen Restaurant,Beer Bar,Japanese Restaurant,Sandwich Place,Movie Theater
10,Downtown Toronto,Stn A PO Boxes 25 The Esplanade,0,Café,Beer Bar,Seafood Restaurant,Cocktail Bar,Farmers Market,Art Gallery,Concert Hall,Jazz Club,Museum,Fish Market
11,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",0,Hotel,Park,Café,Japanese Restaurant,Skating Rink,Performing Arts Venue,Ice Cream Shop,Basketball Stadium,Deli / Bodega,Plaza
12,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",0,Café,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Bakery,Caribbean Restaurant,Mexican Restaurant,Wine Bar,Noodle House,Cheese Shop,Organic Grocery
16,Downtown Toronto,Central Bay Street,0,Coffee Shop,Bubble Tea Shop,Spa,Italian Restaurant,Park,Ice Cream Shop,Café,Modern European Restaurant,Miscellaneous Shop,Poke Place
17,Downtown Toronto,"Adelaide, King, Richmond",0,Steakhouse,Café,Coffee Shop,Pizza Place,Asian Restaurant,Hotel,Seafood Restaurant,Noodle House,Monument / Landmark,Opera House


### CLuster 2

In [21]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 1, downtown_merged.columns[[1,2] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",1,Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Sculpture Garden,Airport Food Court,Airport Gate,Bar,Boat or Ferry,Boutique


### Cluster 3

In [22]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 2, downtown_merged.columns[[1,2] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Downtown Toronto,Rosedale,2,Park,Playground,Trail,Building,Comfort Food Restaurant,Dance Studio,Creperie,Cosmetics Shop,Convenience Store,Concert Hall


### Cluster 4


In [23]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 3, downtown_merged.columns[[1,2] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Harbord, University of Toronto",3,Café,Japanese Restaurant,Restaurant,Bookstore,Bar,Bakery,Chinese Restaurant,Coffee Shop,College Arts Building,College Gym
2,Downtown Toronto,St. James Town,3,Coffee Shop,Gastropub,Restaurant,Japanese Restaurant,Italian Restaurant,Hotel,Church,Cosmetics Shop,Middle Eastern Restaurant,Diner
4,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",3,Café,Coffee Shop,Restaurant,Deli / Bodega,Hotel Bar,Museum,Pizza Place,Concert Hall,Pub,Sandwich Place
5,Downtown Toronto,"Commerce Court, Victoria Hotel",3,Café,Coffee Shop,Restaurant,Gastropub,Deli / Bodega,Bakery,Pub,American Restaurant,Museum,Art Gallery
8,Downtown Toronto,"Harbourfront, Regent Park",3,Coffee Shop,Park,Bakery,Café,Mexican Restaurant,Gym / Fitness Center,Pub,Breakfast Spot,Dessert Shop,Spa
13,Downtown Toronto,"Cabbagetown, St. James Town",3,Coffee Shop,Italian Restaurant,Bakery,Café,Restaurant,Indian Restaurant,Japanese Restaurant,Jewelry Store,Diner,Deli / Bodega
14,Downtown Toronto,"First Canadian Place, Underground city",3,Café,Coffee Shop,Deli / Bodega,Restaurant,Steakhouse,Gluten-free Restaurant,Food Court,Pizza Place,Pub,Salad Place


### Cluster 5

In [24]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 4, downtown_merged.columns[[1,2] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,Downtown Toronto,Christie,4,Grocery Store,Café,Park,Nightclub,Convenience Store,Baby Store,Italian Restaurant,Diner,Coffee Shop,Restaurant
