## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

### Data Acquisition and Cleaning

Get the data from the Wikipedia website

In [2]:
# Bound the wait with a timeout and stop on an HTTP error status, so a failed
# download is reported here rather than surfacing later while parsing the page.
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
page.raise_for_status()
page.status_code

200

In [3]:
soup = BeautifulSoup(page.content, 'html.parser')

# Locate the postal-code table and collect its rows
table = soup.find('table', attrs={'class':'wikitable sortable'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')

# One list per table row holding the stripped, non-empty text of its <td> cells;
# a row without any <td> text contributes an empty list.
data = [
    [text for text in (cell.text.strip() for cell in row.find_all('td')) if text]
    for row in rows
]

data[:5]

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

Load the raw data into a pandas dataframe

In [4]:
# Column names for the three scraped fields
columns = ['PostalCode', 'Borough', 'Neighborhood']
# data[0] is skipped: the first scraped row yielded no cell text
df_raw = pd.DataFrame(data[1:], columns=columns)
df_raw.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


Clean the raw data and check for 'Not assigned' values

In [5]:
# Keep only the postal codes whose borough has been assigned
has_borough = df_raw['Borough'].ne('Not assigned')
df = df_raw.loc[has_borough].reset_index(drop=True)
# Tally how many of the remaining rows still have an unassigned neighborhood
df['Neighborhood'].eq('Not assigned').value_counts()

False    103
Name: Neighborhood, dtype: int64

In [6]:
# Row and column count after dropping postal codes with no assigned borough
df.shape

(103, 3)

### Assignment of Latitude and Longitude Coordinates

In [7]:
# Load the per-postal-code latitude/longitude lookup from a local CSV file
df_coordinates = pd.read_csv('data/geospatial_coordinates.csv')
df_coordinates.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# Give the key column the same name it has in `df` so the two frames can be joined on it
df_coordinates = df_coordinates.rename(columns={'Postal Code': 'PostalCode'})
df_coordinates.head(1)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353


Join the dataframes

In [9]:
# Attach coordinates to each postal code via the shared 'PostalCode' column
df_combined = df.merge(df_coordinates, on='PostalCode')
df_combined.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [10]:
# Shape after the join on 'PostalCode'
df_combined.shape

(103, 5)

### Exploration and Clustering of Neighborhoods in Toronto

Create a map of Toronto with neighborhoods superimposed on top

In [11]:
address = 'Toronto, CA'

# Look up the city centre; it is used as the initial view of the maps below
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match; stop with a clear
    # message instead of an AttributeError on the attribute access below.
    raise ValueError(f'Could not geocode address {address!r}')
latitude = location.latitude
longitude = location.longitude
print(f'The geograpical coordinates of Toronto are {latitude}, {longitude}.')

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [12]:
# Base map centred on the geocoded city coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add markers to map
for lat, lng, borough, neighborhood in zip(df_combined['Latitude'], df_combined['Longitude'],
                                           df_combined['Borough'], df_combined['Neighborhood']):
    # Label each marker with its own neighborhood and borough (previously every
    # popup showed the city-wide latitude/longitude, identical for all markers)
    label = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

Define Foursquare credentials and version

In [1]:
# Foursquare credentials are read from the environment so that real keys never
# have to be saved in the notebook; the masked placeholders remain as fallbacks.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '*****')            # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '*****')    # Foursquare Secret
VERSION = '20180605'            # Foursquare API version
LIMIT = 100                     # Sent as the `limit` query parameter of each venue request

Explore neighborhoods in Toronto

In [14]:
def get_nearby_venues(names, latitudes, longitudes, radius=500, timeout=10):
    """Query the Foursquare explore endpoint around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighborhood names and their coordinates, iterated in parallel.
    radius : int, default 500
        Value sent as the `radius` query parameter.
    timeout : float, default 10
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the neighborhood's name and coordinates,
        the venue's name and coordinates, and the venue's first category name
        (None when the venue lists no category). The frame has the seven
        expected columns even when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    column_names = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
        ]

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # Let requests build and encode the query string instead of formatting
        # the credentials into the URL by hand
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': f'{lat},{lng}',
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout)
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        # Keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Guard against venues that come back without any category
                categories[0]['name'] if categories else None,
                ))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards would raise a length mismatch
    return pd.DataFrame(venue_rows, columns=column_names)

In [15]:
# Collect venues for every row of df_combined; get_nearby_venues issues one
# Foursquare request per neighborhood it is given
toronto_venues = get_nearby_venues(names=df_combined['Neighborhood'],
                                   latitudes=df_combined['Latitude'],
                                   longitudes=df_combined['Longitude'])

In [16]:
# Preview the first ten venue rows returned by get_nearby_venues
toronto_venues.head(10)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,Corrosion Service Company Limited,43.752432,-79.334661,Construction & Landscaping
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
5,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
6,Victoria Village,43.725882,-79.315572,The Frig,43.727051,-79.317418,French Restaurant
7,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
8,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
9,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


Check how many venues were returned for each neighborhood

In [17]:
# Number of venue rows per neighborhood
toronto_venues[['Neighborhood', 'Venue']].groupby('Neighborhood').count()

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
Agincourt,4
"Alderwood, Long Branch",8
"Bathurst Manor, Wilson Heights, Downsview North",22
Bayview Village,4
"Bedford Park, Lawrence Manor East",22
...,...
"Willowdale, Willowdale East",33
"Willowdale, Willowdale West",6
Woburn,4
Woodbine Heights,7


Find out how many unique categories can be curated from all the returned venues

In [18]:
# Count the distinct values in the 'Venue Category' column
n_venues = len(pd.unique(toronto_venues['Venue Category']))
print(f'There are {n_venues} unique categories.')

There are 270 unique categories.


Prepare the data to analyze each neighborhood

In [48]:
# One indicator column per venue category; each column name is the category
# name with a leading underscore
toronto_dummies = pd.get_dummies(
    toronto_venues['Venue Category'],
    prefix='_',
    prefix_sep='',
)

# Put the neighborhood name back in front of the indicator columns
toronto_onehot = pd.concat(
    [toronto_venues[['Neighborhood']], toronto_dummies],
    axis='columns',
)

toronto_onehot.head()

Unnamed: 0,Neighborhood,_Accessories Store,_Afghan Restaurant,_Airport,_Airport Food Court,_Airport Gate,_Airport Lounge,_Airport Service,_Airport Terminal,_American Restaurant,...,_Vegetarian / Vegan Restaurant,_Video Game Store,_Video Store,_Vietnamese Restaurant,_Warehouse Store,_Wine Bar,_Wine Shop,_Wings Joint,_Women's Store,_Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Average the indicator columns per neighborhood, i.e. the share of each
# neighborhood's venues that falls into every category
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()

toronto_grouped.shape

(95, 271)

In [50]:
# Show the Agincourt row as a tall table (one line per column of toronto_grouped)
toronto_grouped.loc[toronto_grouped['Neighborhood'].eq('Agincourt')].T.reset_index()

Unnamed: 0,index,0
0,Neighborhood,Agincourt
1,_Accessories Store,0
2,_Afghan Restaurant,0
3,_Airport,0
4,_Airport Food Court,0
...,...,...
266,_Wine Bar,0
267,_Wine Shop,0
268,_Wings Joint,0
269,_Women's Store,0


Print each neighborhood along with the top 5 most common venues

In [51]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print('---' + hood + '----')
    # Turn this neighborhood's single row into a (venue, freq) table
    is_hood = toronto_grouped['Neighborhood'] == hood
    freq_table = toronto_grouped[is_hood].T.reset_index()
    freq_table.columns = ['venue', 'freq']
    # The first line holds the neighborhood name rather than a frequency
    freq_table = freq_table.iloc[1:]
    freq_table['freq'] = freq_table['freq'].astype(float)
    ranked = freq_table.round({'freq': 2}).sort_values('freq', ascending=False)
    print(ranked.reset_index(drop=True).head(num_top_venues))
    print('\n')

---Agincourt----
                        venue  freq
0  _Latin American Restaurant  0.25
1                     _Lounge  0.25
2             _Breakfast Spot  0.25
3               _Skating Rink  0.25
4                _Men's Store  0.00


---Alderwood, Long Branch----
             venue  freq
0     _Pizza Place  0.25
1        _Pharmacy  0.12
2     _Coffee Shop  0.12
3  _Sandwich Place  0.12
4             _Gym  0.12


---Bathurst Manor, Wilson Heights, Downsview North----
            venue  freq
0           _Bank  0.09
1    _Coffee Shop  0.09
2    _Pizza Place  0.05
3       _Pharmacy  0.05
4  _Deli / Bodega  0.05


---Bayview Village----
                  venue  freq
0  _Japanese Restaurant  0.25
1   _Chinese Restaurant  0.25
2                 _Bank  0.25
3                 _Café  0.25
4    _Accessories Store  0.00


---Bedford Park, Lawrence Manor East----
                 venue  freq
0         _Coffee Shop  0.09
1  _Italian Restaurant  0.09
2      _Sandwich Place  0.09
3          _Restaura

             venue  freq
0  _Breakfast Spot  0.14
1       _Gift Shop  0.14
2         _Dog Run  0.07
3    _Dessert Shop  0.07
4   _Movie Theater  0.07


---Parkview Hill, Woodbine Gardens----
                   venue  freq
0           _Pizza Place  0.18
1              _Pharmacy  0.09
2          _Intersection  0.09
3  _Fast Food Restaurant  0.09
4                  _Café  0.09


---Parkwoods----
                         venue  freq
0  _Construction & Landscaping  0.33
1                        _Park  0.33
2           _Food & Drink Shop  0.33
3           _Accessories Store  0.00
4               _Metro Station  0.00


---Queen's Park, Ontario Provincial Government----
                 venue  freq
0         _Coffee Shop  0.23
1    _Sushi Restaurant  0.06
2         _Yoga Studio  0.03
3  _Mexican Restaurant  0.03
4            _Beer Bar  0.03


---Regent Park, Harbourfront----
             venue  freq
0     _Coffee Shop  0.17
1            _Park  0.07
2             _Pub  0.07
3          _Bakery  

Define a helper that returns a neighborhood's most common venue categories, so that the results can be put into a pandas dataframe

In [56]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries in `row`, largest first.

    The first element of `row` is skipped (the caller passes rows whose first
    entry is the neighborhood name). At most `num_top_venues` labels are
    returned, as an array of index labels.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Create a new dataframe and display the top 10 venues for each neighborhood

In [57]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create columns according to number of top venues
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    # Ranks 1-3 take their own suffix and every later rank falls back to "th";
    # an explicit check replaces the bare `except:` previously used for this
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append(f'{rank}{suffix} Most Common Venue')

# Rank each neighborhood's categories once, then build the frame in a single
# step instead of assigning into it row by row
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[pos, :], num_top_venues)
    for pos in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:],
                                           index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,_Lounge,_Breakfast Spot,_Latin American Restaurant,_Skating Rink,_Dumpling Restaurant,_Distribution Center,_Dog Run,_Doner Restaurant,_Donut Shop,_Drugstore
1,"Alderwood, Long Branch",_Pizza Place,_Gym,_Skating Rink,_Pharmacy,_Coffee Shop,_Sandwich Place,_Pub,_Dog Run,_Diner,_Discount Store
2,"Bathurst Manor, Wilson Heights, Downsview North",_Coffee Shop,_Bank,_Grocery Store,_Mobile Phone Shop,_Bridal Shop,_Sandwich Place,_Diner,_Restaurant,_Deli / Bodega,_Supermarket
3,Bayview Village,_Café,_Japanese Restaurant,_Chinese Restaurant,_Bank,_Yoga Studio,_Discount Store,_Dog Run,_Doner Restaurant,_Donut Shop,_Drugstore
4,"Bedford Park, Lawrence Manor East",_Restaurant,_Sandwich Place,_Italian Restaurant,_Coffee Shop,_Grocery Store,_Juice Bar,_Indian Restaurant,_Sushi Restaurant,_Pub,_Liquor Store


Run k-means to cluster the neighborhoods into 5 clusters

In [61]:
# Set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', axis=1)

# Run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=42)
kmeans.fit(toronto_grouped_clustering)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 1, 3, 3, 3, 2, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3,
       1, 3, 2, 3, 3, 4, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 1, 2, 3, 3, 3, 2,
       3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 1,
       3, 3, 3, 1, 3, 3, 2], dtype=int32)

Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [72]:
# Add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_combined

# Merge toronto_grouped with df_combined to add latitude / longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'),
                                     on='Neighborhood', how='right')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,_Park,_Construction & Landscaping,_Food & Drink Shop,_Fabric Shop,_Falafel Restaurant,_Event Space,_Ethiopian Restaurant,_Electronics Store,_Diner,_Eastern European Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,3,_Portuguese Restaurant,_French Restaurant,_Hockey Arena,_Coffee Shop,_Yoga Studio,_Discount Store,_Distribution Center,_Dog Run,_Doner Restaurant,_Donut Shop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3,_Coffee Shop,_Bakery,_Park,_Pub,_Breakfast Spot,_Café,_Restaurant,_Theater,_French Restaurant,_Event Space
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,3,_Clothing Store,_Furniture / Home Store,_Accessories Store,_Boutique,_Gift Shop,_Event Space,_Miscellaneous Shop,_Coffee Shop,_Women's Store,_Vietnamese Restaurant
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,3,_Coffee Shop,_Sushi Restaurant,_Yoga Studio,_Bar,_Beer Bar,_Smoothie Shop,_Sandwich Place,_Burrito Place,_Café,_College Auditorium


Visualize the resulting clusters

In [73]:
# Create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'],
                                  toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters