# Segmenting and Clustering Neighborhoods in Toronto

## By: Nigel Burrows

# Part 1: Scrape and Clean Wiki Canada Postal data

First, we import the libraries we need.

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
import json # library to handle JSON files

!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium 
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


Next, we scrape the Canadian postal code data (codes beginning with "M") from Wikipedia.

In [2]:
# pd.read_html returns a list of DataFrames parsed from the page's HTML tables.
wiki_postal_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
CanadaWikiDfList = pd.read_html(wiki_postal_url)
CanadaWikiDfList

[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 ..          ...               ...   
 175         M5Z      Not assigned   
 176         M6Z      Not assigned   
 177         M7Z      Not assigned   
 178         M8Z         Etobicoke   
 179         M9Z      Not assigned   
 
                                          Neighbourhood  
 0                                         Not assigned  
 1                                         Not assigned  
 2                                            Parkwoods  
 3                                     Victoria Village  
 4                            Regent Park, Harbourfront  
 ..                                                 ...  
 175                                       Not assigned  
 176                                       Not assigned  
 177                

Select the postal code table (the first data frame in the list) and store it as the data frame that we will clean up.

In [3]:
# Use the first table returned by pd.read_html as the working data frame.
can_postal = CanadaWikiDfList[0]
# Preview the first rows to confirm the expected columns are present.
can_postal.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Get the new data frame's shape

In [4]:
# (rows, columns) of the table before any cleaning is applied
can_postal.shape

(180, 3)

Remove all records where the Borough is "Not assigned"

In [5]:
# Keep only the rows whose Borough is an actual borough name.
# An exact comparison is used instead of str.contains: the placeholder is a
# whole-cell value, not a substring/regex pattern, and `!=` also behaves
# sensibly if a Borough cell is missing (the row is kept rather than the
# boolean negation failing on a non-boolean mask).
# Chaining reset_index (rather than calling it with inplace=True on the
# filtered slice) yields a fresh frame with a 0..n-1 index, so later .loc
# assignments on can_postal do not operate on a slice of the scraped table.
can_postal = (
    can_postal[can_postal['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
can_postal.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Replace all Neighborhoods that are Not assigned with the name of the Borough

In [11]:
# Flag the rows whose Neighbourhood still holds the "Not assigned" placeholder
# and report how many there are before the replacement.
unassigned_mask = can_postal['Neighbourhood'] == 'Not assigned'
print('Neighborhoods that are "Not assigned" before: ', int(unassigned_mask.sum()))

# Overwrite the placeholder with the Borough name of the same row
# (the right-hand Series is aligned to the selected rows by index).
can_postal.loc[unassigned_mask, 'Neighbourhood'] = can_postal['Borough']

# Recount after the replacement to confirm that no placeholder remains.
remaining_mask = can_postal['Neighbourhood'] == 'Not assigned'
print('Neighborhoods that are "Not assigned" after : ', int(remaining_mask.sum()))


Neighborhoods that are "Not assigned" before:  0
Neighborhoods that are "Not assigned" after :  0


Merge neighborhoods that have the same postal code and delimit by a comma

In [7]:
# Check for duplicates

# duplicated() flags every repeat of a postal code after its first occurrence;
# summing the boolean Series counts those repeats directly, without a
# Python-level loop or an `== True` comparison.
dup_postal_count = int(can_postal.duplicated(subset=['Postal Code']).sum())

print("Duplicate Postal Codes: ", dup_postal_count)

# A count of 0 means every postal code appears once, so there is nothing to merge.

Duplicate Postal Codes:  0


Print the shape of the cleaned-up data frame

In [8]:
# (rows, columns) of the frame after the cleaning steps above
can_postal.shape

(103, 3)

# Part 2: Get Geo Location Data and Merge to Original Data Frame

Import CSV with geospatial location data

In [9]:
# Read the CSV that maps each postal code to a latitude/longitude pair.
geo_csv_url = "http://cocl.us/Geospatial_data"
can_postal_latlng = pd.read_csv(geo_csv_url)
can_postal_latlng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Index the coordinate table by postal code so it can be joined on that key.
coords_by_code = can_postal_latlng.set_index('Postal Code')

# Attach Latitude/Longitude to each postal code row (join keeps every row of
# can_postal), then turn 'Postal Code' back into a regular column.
can_postal = (
    can_postal
    .set_index('Postal Code')
    .join(coords_by_code)
    .reset_index(drop=False)
)
can_postal.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [12]:
# Look up the row(s) for a single postal code.
# NOTE(review): 'MG5A' has four characters, while the postal codes shown in
# this notebook have three (e.g. 'M5A'), and the recorded output is empty.
# This looks like a typo for another code -- confirm the intended value.
can_postal.loc[can_postal['Postal Code'] == 'MG5A']

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude


# Part 3: Create Cluster