# Segmenting and Clustering Neighbourhoods in Toronto

First, import the required packages.

In [3]:
import requests
import pandas as pd

Now we must scrape the list of Canadian postal codes from Wikipedia and clean up the data to remove missing values and group related neighbourhoods.

In [9]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Bound the wait so the cell cannot hang indefinitely, and fail loudly on an
# HTTP error instead of handing an error page to the HTML table parser.
wikipedia_page = requests.get(wiki_url, timeout=30)
wikipedia_page.raise_for_status()

# read_html returns a list with one dataframe per HTML table; take the first
# table and use its first row as the column header.
df_raw = pd.read_html(wikipedia_page.content, header=0)[0]

# remove rows where the borough is not assigned.
df_new = df_raw[df_raw.Borough != 'Not assigned']

# preview the filtered rows (grouping by postal code happens in a later cell)
df_new.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [10]:
# Sanity check: select the rows whose neighbourhood is still the placeholder
# "Not assigned". The selection comes back empty for this data.
df_new[df_new['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [11]:
# Collapse rows that share a postal code and borough into one row whose
# Neighbourhood value is the comma-separated list of the group's names,
# then expose the group keys as columns and rename the postal code column.
df_toronto = (
    df_new
    .groupby(['Postal Code', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
    .rename(columns={'Postal Code': 'PostalCode'})
)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
# Report the size of the grouped table as (rows, columns).
df_toronto.shape

(103, 3)

## Acquire latitude and longitude values for each neighbourhood
**Note:** After running into some issues with the geocoder API, I have manually attached the latitude and longitude values from a CSV file instead.

In [13]:
# Import the coordinates from a CSV file fetched by URL.
# NOTE(review): this is a shortened, plain-http link — confirm it still
# resolves to the intended geospatial CSV before relying on it.
latlong = 'http://cocl.us/Geospatial_data'
df_cord = pd.read_csv(latlong)

# Preview the first rows of the coordinate table.
df_cord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Report the size of the coordinate table as (rows, columns).
df_cord.shape

(103, 3)

In [15]:
# Join the two dataframes on the postal code: df_cord is re-indexed by its
# 'Postal Code' column and matched against df_toronto's 'PostalCode' column.
#
# Any coordinate columns already present on df_toronto are dropped first so
# the cell can be re-run safely; without this, a second run raises a
# "columns overlap" ValueError because Latitude/Longitude already exist.
df_toronto = (
    df_toronto
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .join(df_cord.set_index('Postal Code'), on='PostalCode')
)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
