This notebook explores, segments, and clusters the neighborhoods in the city of Toronto.

<h4>Import Library and Parser</h4>

In [1]:
# Installed beautifulsoup4
# Installed the lxml and html5lib parsers and the requests library
# Note: To simplify, outputs for 'pip install' were cleared out

In [None]:
pip install beautifulsoup4

In [None]:
pip install lxml

In [None]:
pip install html5lib

In [None]:
pip install requests

In [6]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

<h4>Reading the data set from the URL</h4>

In [7]:
# Download the page source from Wikipedia using the `requests` library.
# The timeout prevents the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() fails fast on an HTTP error (4xx/5xx)
# instead of handing an error page to the parser.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# Pass the source code to BeautifulSoup4, parsing it with the lxml parser
soup = BeautifulSoup(source, 'lxml')

# print(soup.prettify()) - This will format our output
# Scroll through and copy tag '<table class="wikitable sortable">'
# Note: to simplify, the long output was cleared out

In [8]:
# Collect every <table> element found in the parsed page and keep the first
# one; this is the table that gets turned into a dataframe
all_tables = soup.find_all('table')
table = all_tables[0]

In [9]:
# Read the HTML table into a dataframe.
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated (pandas >= 2.1 emits a FutureWarning); a
# file-like object works on old and new pandas alike.
# read_html returns a list of dataframes, so we take the first item.
df = pd.read_html(StringIO(str(table)))[0]

In [10]:
# Now, let's see what our table looks like
# (for a long frame the rendered output shows only the first and last rows)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


<h4>Evaluating and Cleaning Data</h4>

In [11]:
# Pick out the rows of the table whose Borough is exactly 'Not assigned'
df_Na = df.loc[df['Borough'] == 'Not assigned']

In [12]:
# Let's see all rows with a 'Not assigned' Borough value
df_Na

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
8,M8A,Not assigned,Not assigned
12,M2B,Not assigned,Not assigned
19,M7B,Not assigned,Not assigned
...,...,...,...
277,M4Z,Not assigned,Not assigned
278,M5Z,Not assigned,Not assigned
279,M6Z,Not assigned,Not assigned
280,M7Z,Not assigned,Not assigned


In [13]:
# Having inspected the 'Not assigned' rows, we now leave them out:
# keep every row whose Borough is anything other than 'Not assigned'
df_Na = df.loc[df['Borough'] != 'Not assigned']

In [14]:
# Here, we have 287 rows less the 77 rows with a 'Not assigned' Borough value,
# so this should return 210 rows
df_Na

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


In [15]:
# Collapse rows that share the same Postcode and Borough into a single row
# whose Neighbourhood holds the group's neighbourhoods joined by commas
grouped_neighbourhoods = df_Na.groupby(['Postcode', 'Borough'])['Neighbourhood']
df_same_postcode = grouped_neighbourhoods.agg(','.join).reset_index()
df_same_postcode.tail(18)

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto"
89,M8W,Etobicoke,"Alderwood,Long Branch"
90,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
91,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."
92,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."
93,M9A,Queen's Park,Queen's Park
94,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar..."


In [16]:
# Notice that the first row shown (Postcode M7A) still has 'Not assigned' as its Neighbourhood value.

<h4>Identify and Handle values</h4>


In [17]:
# Wherever the Neighbourhood is 'Not assigned', copy the row's Borough into it
unassigned_mask = df_same_postcode['Neighbourhood'] == "Not assigned"
df_same_postcode.loc[unassigned_mask, 'Neighbourhood'] = df_same_postcode.loc[unassigned_mask, 'Borough']

In [18]:
# Here, we can see that for Postcode M7A the Neighbourhood value is now the same as the Borough
df_same_postcode.tail(18)

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto"
89,M8W,Etobicoke,"Alderwood,Long Branch"
90,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
91,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."
92,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."
93,M9A,Queen's Park,Queen's Park
94,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar..."


<h4>Checking the Data</h4>

In [19]:
# Finally, we can use the '.shape' attribute of our dataframe to check its
# size; it is a (rows, columns) tuple
df_same_postcode.shape

(103, 3)

<h4>Latitude and Longitude coordinates of each neighborhood</h4>

In [20]:
# Retrieving Geospatial_data
# The coordinates are read from the CSV file 'Geo_data.csv', given as a
# relative path (resolved against the kernel's working directory)
# Let's examine the table
df_gdata = pd.read_csv('Geo_data.csv')
df_gdata

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [21]:
# Left-join the geospatial data onto our dataframe by postal code
# ('Postcode' on the left side, 'Postal Code' on the right side), discard the
# now-redundant 'Postal Code' key column, and rename 'Postcode' to 'PostalCode'

df_gsd = (
    df_same_postcode
    .merge(df_gdata, how='left', left_on='Postcode', right_on='Postal Code')
    .drop(columns='Postal Code')
    .rename(columns={'Postcode': 'PostalCode'})
)
df_gsd

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437
