# Segmenting and Clustering Neighborhoods in Toronto

#### Importing the required libraries

In [1]:
import pandas as pd # Library for data structures
import requests # library to handle requests
import numpy as np # Library to handle arrays and vectors

# Libraries for scraping web data
import bs4 as bs
import urllib.request

# import k-means from clustering stage
from sklearn.cluster import KMeans

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you need to install Folium
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print ('Libraries imported successfully.')

Libraries imported successfully.


#### Scrape the Wikipedia page and populate the postal codes data in a pandas dataframe

In [2]:
# URL corresponding to the Wikipedia page to be scraped
wiki_page_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read data form the Wikipedia page
page = urllib.request.urlopen(wiki_page_url).read()
page_data = bs.BeautifulSoup(page, 'lxml')

# Look for postal codes table and extract the table data
postal_codes_table = page_data.find('table')
table_rows = postal_codes_table.find_all('tr') # reading the rows from the table

# Create a blank pandas dataframe for storing the Postal codes data
df_postal_codes = pd.DataFrame(columns = ['PostalCode', 'Borough', 'Neighbourhood'])

# initiate variables to be used below
i = 0
prev_row = ''

# Loop through the table rows and read the table data
for row in table_rows:
    table_data = row.find_all('td') # reading the table data
    row = [i.text for i in table_data]
    
    if (len(row) > 0): # checking for any empty rows to ignore them
        if row[1] != 'Not assigned': # checking to see if the borough is not assigned to ignore that data row
            
            # Read the postal code, borough and nighbourhood from the data row and store them in local variables
            row_postal_code = row[0] # row[0] corresponds to postal code
            row_borough = row[1] # row[1] corresponds to borough
            row_neighbourhood = row[2] # row[2] corresponds to neighbourhood

            # For a Not assigned neighborhood, the neighborhood will be the same as the borough
            if row[2][0:12] == 'Not assigned':
                row_neighbourhood = row[1]
            
            if prev_row != '': # If the current row is not the first row then the following will be executed
                
                if prev_row[0] == row [0]: # Condition to check if postal code for the current and previous rows is the same
                    
                    # If the postal codes are the same then the neighbourhood values will be concatenated to form a single string
                    # instead of adding a new data row in the pandas dataframe
                    row_neighbourhood = df_postal_codes.xs(i-1)['Neighbourhood'] + ', ' + row_neighbourhood
                    df_postal_codes.xs(i-1)['Neighbourhood'] = row_neighbourhood
               
                else: # if postal code for the current and previous rows is the different
                
                    df_postal_codes = df_postal_codes.append(pd.DataFrame([[row_postal_code, row_borough, row_neighbourhood]], columns = ['PostalCode', 'Borough', 'Neighbourhood']), ignore_index=True)
                    i = i + 1
                    prev_row = row
            
            else: # if the current row is the first row then the data is added to the pandas dataframe
                df_postal_codes = df_postal_codes.append(pd.DataFrame([[row_postal_code, row_borough, row_neighbourhood]], columns = ['PostalCode', 'Borough', 'Neighbourhood']), ignore_index=True)
                i = i + 1
                prev_row = row
        
df_postal_codes = df_postal_codes.replace('\n',' ', regex=True)
df_postal_codes

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront , Regent Park"
3,M6A,North York,"Lawrence Heights , Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge , Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens , Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson , Garden District"


#### Print the number of datarows in the dataframe

In [3]:
df_postal_codes.shape

(103, 3)

#### Load Geopspatial csv file with geographical coordinates for Toronto

In [4]:
geo_data = pd.read_csv('http://cocl.us/Geospatial_data')
geo_data.set_index('Postal Code', inplace=True)
geo_data.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


#### Find and populate the Postal codes table with the Latitude and Longitude for each Postal Code in Toronto

In [5]:
# Loop through the Postal Codes Dataframe
for index, row in df_postal_codes.iterrows():
    # Get the coordinates for each Postal code from the geo_data dataframe and add the Latitude and Longitude 
    # values to the postal codes dataframe
    df_postal_codes.loc[index, 'Latitude'] = geo_data.loc[row[0]].Latitude
    df_postal_codes.loc[index, 'Longitude'] = geo_data.loc[row[0]].Longitude

df_postal_codes

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront , Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights , Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson , Garden District",43.657162,-79.378937


#### Generate a map of Toronto showing the superimposed neigbourhoods

In [6]:
# Find the latitude and Longitude for Toronto, Ontario
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="Ontario_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# Create a map of Toronto using the latitude and longitude values
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 10)

# Add neighbourhood markers to the Toronto map
for lat, lang, label in zip (df_postal_codes['Latitude'], df_postal_codes['Longitude'], df_postal_codes['Neighbourhood']):
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker (
        [lat, lang],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_toronto)

# Display the map of Toronto
map_toronto

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Define the Foursquare credentials for exploring the venues in Toronto

In [7]:
# @hidden_cell
CLIENT_ID = '532CJWN2YHXD2TR0JF2J32FFAJ1OOSVBUAJ4G3BUFXV5ZURB' # your Foursquare ID
CLIENT_SECRET = 'AHAELISONAQKGDESXTPGT1O04GRTOZFMJYVY11I4TZIYLSVT' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

#### Define function to explore the Neighbourhoods of Toronto listed in the Postal codes table

In [8]:
# Function to find venues in the neighborhood
def exploreNeighbourhood (names, latitudes, longitudes, radius = 500):
    
    # initiate venues list
    venues_list = []
    LIMIT = 100
    
    # for loop to loop through the neighbourhood list
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        # Create URL for fetching the venues from the Foursquare API
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # fetch the results for the neighbourhood venues by calling the Foursquare API
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # extract relevant information for each nearby venue from the results
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    # End For Loop

    # Populate the venues and corresponsing details received from the Foursuare API into a pandas dataframe
    df_nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    df_nearby_venues.columns = ['Neighbourhood',
                             'Neighbourhood Latitude',
                             'Neighbourhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    
    return (df_nearby_venues)

#### Invoke the above function for fetching the venues for the Toronto Neighbourhood

In [9]:
# Fetch venues for the Toronto Neighbourhood
df_toronto_venues = exploreNeighbourhood (names = df_postal_codes['Neighbourhood'],
                                       latitudes = df_postal_codes['Latitude'],
                                       longitudes = df_postal_codes['Longitude'])
print (df_toronto_venues.shape)
df_toronto_venues.head()

(2255, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


#### Check number of venues for each neighbourhood

In [10]:
df_toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide , King , Richmond",100,100,100,100,100,100
Agincourt,3,3,3,3,3,3
"Agincourt North , L'Amoreaux East , Milliken , Steeles East",2,2,2,2,2,2
"Albion Gardens , Beaumond Heights , Humbergate , Jamestown , Mount Olive , Silverstone , South Steeles , Thistletown",12,12,12,12,12,12
"Alderwood , Long Branch",11,11,11,11,11,11
"Bathurst Manor , Downsview North , Wilson Heights",16,16,16,16,16,16
Bayview Village,4,4,4,4,4,4
"Bedford Park , Lawrence Manor East",24,24,24,24,24,24
Berczy Park,56,56,56,56,56,56
"Birch Cliff , Cliffside West",5,5,5,5,5,5


#### Analyze each neighbourhood

In [11]:
# Find number of unique categories
print ('Number of unique categories is ', len(df_toronto_venues['Venue Category'].unique()))

# Normalize the Toronto Venues Dataframe
df_normalized_toronto_venues = pd.get_dummies(df_toronto_venues[['Venue Category']], prefix = '', prefix_sep = '')

# Add nieghbourhood back to the normalized dataframe
df_normalized_toronto_venues['Neighbourhood'] = df_toronto_venues['Neighbourhood']

# Make Neighbourhood the first column in the normalized dataframe
fixed_columns = [df_normalized_toronto_venues.columns[-1]] + list(df_normalized_toronto_venues.columns[:-1])
df_normalized_toronto_venues = df_normalized_toronto_venues[fixed_columns]

print ('Shape of the normalized dataframe is ', df_normalized_toronto_venues.shape)
df_normalized_toronto_venues.head()

Number of unique categories is  282
Shape of the normalized dataframe is  (2255, 283)


Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let us group the data by neighbourhood and take the mean of the frequency of occurancefor each category

In [12]:
df_toronto_grouped = df_normalized_toronto_venues.groupby('Neighbourhood').mean().reset_index()
df_toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide , King , Richmond",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.010000,0.000000,0.000000,0.000000,0.000000,0.010000,0.0,0.000000,0.010000,0.000000
1,Agincourt,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
2,"Agincourt North , L'Amoreaux East , Milliken ,...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
3,"Albion Gardens , Beaumond Heights , Humbergate...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.083333,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
4,"Alderwood , Long Branch",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
5,"Bathurst Manor , Downsview North , Wilson Heig...",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.062500,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
6,Bayview Village,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
7,"Bedford Park , Lawrence Manor East",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
8,Berczy Park,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
9,"Birch Cliff , Cliffside West",0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


Define function to find most common venues

In [13]:
# define function to sort venues in descending order
def most_common_venues (row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)

    return (row_categories_sorted.index.values[0:num_top_venues])

#### Create a dataframe and display the top 10 venues for each nieghbourhood 

In [14]:
# define the number of venues to be displayed
num_top_venues = 10

# Create dataframe with columns for top 10 venues
columns = ['Neighbourhood']
for i in range(0,num_top_venues):
    columns.append('No. {} Most Common Venue'.format(i+1))

df_toronto_venues_sorted = pd.DataFrame(columns = columns)
df_toronto_venues_sorted['Neighbourhood'] = df_toronto_grouped['Neighbourhood']

for i in range(0,df_toronto_grouped.shape[0]):
    df_toronto_venues_sorted.iloc[i, 1:] = most_common_venues (df_toronto_grouped.iloc[1, :], num_top_venues)

print ('Shape: ', df_toronto_venues_sorted.shape)
df_toronto_venues_sorted.head()

Shape:  (100, 11)


Unnamed: 0,Neighbourhood,No. 1 Most Common Venue,No. 2 Most Common Venue,No. 3 Most Common Venue,No. 4 Most Common Venue,No. 5 Most Common Venue,No. 6 Most Common Venue,No. 7 Most Common Venue,No. 8 Most Common Venue,No. 9 Most Common Venue,No. 10 Most Common Venue
0,"Adelaide , King , Richmond",Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
1,Agincourt,Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,"Agincourt North , L'Amoreaux East , Milliken ,...",Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
3,"Albion Gardens , Beaumond Heights , Humbergate...",Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
4,"Alderwood , Long Branch",Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant


#### Cluster the neighberhoods using K-Means algorithm

In [15]:
# set number of clusters
clusters = 5

# Create a new data frame
df_toronto_grouped_clustering = df_toronto_grouped.drop ('Neighbourhood', axis = 1)

# run K-Means clustering algorithm
kmeans = KMeans(n_clusters = clusters, random_state=0).fit(df_toronto_grouped_clustering)

# Display the cluster labels
kmeans.labels_

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 4, 3, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1], dtype=int32)

#### Create a new dataframe that adds the clusters and the top 10 venues to the neighborhoods of toronto

In [16]:
# add cluster labels
df_toronto_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Merge the Nighborhood data with the cluster labels and the top 10 venues
df_toronto_merged = df_postal_codes
df_toronto_merged = df_toronto_merged.join(df_toronto_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
df_toronto_merged['Cluster Labels'].fillna(0, inplace=True)
df_toronto_merged['Cluster Labels'] = df_toronto_merged['Cluster Labels'].astype('int')
df_toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,No. 1 Most Common Venue,No. 2 Most Common Venue,No. 3 Most Common Venue,No. 4 Most Common Venue,No. 5 Most Common Venue,No. 6 Most Common Venue,No. 7 Most Common Venue,No. 8 Most Common Venue,No. 9 Most Common Venue,No. 10 Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1,Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,M5A,Downtown Toronto,"Harbourfront , Regent Park",43.65426,-79.360636,0,Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
3,M6A,North York,"Lawrence Heights , Lawrence Manor",43.718518,-79.464763,0,Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,0,Lounge,Sandwich Place,Breakfast Spot,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant


#### Generate the map of Toronto to view the clustered nighbourhoods

In [18]:
# create map
map_toronto_clustered = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_toronto_merged['Latitude'], df_toronto_merged['Longitude'], df_toronto_merged['Neighbourhood'], df_toronto_merged['Cluster Labels'].astype('int64')):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_toronto_clustered)
       
map_toronto_clustered