Segmenting and Clustering Neighborhoods in Toronto

Scrape the wiki table

In [3]:
import requests
#access the url; use a timeout so a stalled connection cannot hang the kernel,
#and raise on HTTP errors so an error page is never parsed as the article
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
#NOTE: despite its name, website_url holds the page's HTML text, not a URL
website_url = response.text

In [4]:
from bs4 import BeautifulSoup
#parse the downloaded HTML text into a navigable tree using the lxml parser
soup = BeautifulSoup(markup=website_url, features="lxml")

In [5]:
#extract the first table whose class attribute is "wikitable sortable"
fsa_table = soup.find("table", {"class": "wikitable sortable"})
#find() returns None when nothing matches; fail here with a clear message
#rather than with an AttributeError on None in a later cell
if fsa_table is None:
    raise ValueError('no <table class="wikitable sortable"> found in the page')

In [6]:
#the data we're looking for are in <td> tags
#find_all is the current name of the method; findAll is a deprecated legacy alias
links = fsa_table.find_all("td")

In [7]:
#collect the whitespace-stripped text of every <td> cell, in the order found
elements = [cell.get_text(strip=True) for cell in links]

In [8]:
#the flat cell list is read as consecutive (postcode, borough, neighborhood)
#triples, so every third element starting at offset 0, 1 and 2 forms one column
postcode = elements[0::3]
borough = elements[1::3]
neighborhood = elements[2::3]

#combine the three columns into a dict keyed by column name
table = {
    "Postcode": postcode,
    "Borough": borough,
    "Neighborhood": neighborhood,
}

In [9]:
import pandas as pd
#build a dataframe from the column dict: one column per key
df = pd.DataFrame(data=table)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
#keep only the rows whose Borough is assigned, then renumber the rows from 0
has_borough = df["Borough"] != "Not assigned"
df = df.loc[has_borough].reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [11]:
#collapse rows sharing a postcode into one row, keeping first-seen order;
#the distinct values of every other column are joined with ", "
by_postcode = df.astype(str).groupby("Postcode", sort=False, as_index=False)
df = by_postcode.agg(lambda values: ", ".join(values.unique()))
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [12]:
#where the Neighborhood is "Not assigned", fall back to the row's Borough
unnamed = df["Neighborhood"] == "Not assigned"
df.loc[unnamed, "Neighborhood"] = df.loc[unnamed, "Borough"]
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [13]:
#sanity check: (rows, columns) of the cleaned table, one row per postcode after grouping
df.shape

(103, 3)

Adding latitude and longitude data to each postal code

In [25]:
#load the coordinates from the local csv; the file's first line is skipped
#and the columns are named explicitly so the key column is called "Postcode"
file = "Geospatial_Coordinates.csv"
headers = ["Postcode", "Latitude", "Longitude"]
loc_data = pd.read_csv(
    file,
    names=headers,
    skiprows=1,
)
loc_data.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [29]:
#attach Latitude/Longitude to each postcode row; this is an inner join,
#so only postcodes present in both tables are kept
df_with_loc = pd.merge(left=df, right=loc_data, how="inner", on="Postcode")
df_with_loc.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
