Import the required libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

Use BeautifulSoup to extract the values from the postal-code table on the Wikipedia page

In [2]:
# Download the Wikipedia page "List of postal codes of Canada: M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
page.raise_for_status()
# Create a BeautifulSoup object
soup = BeautifulSoup(page.text, 'html.parser')
# Take the first element whose class attribute is "wikitable sortable".
table = soup.find(class_="wikitable sortable")
if table is None:
    # Fail with a clear message here rather than with an AttributeError later.
    raise ValueError("Could not find a 'wikitable sortable' table in the downloaded page")

In [3]:
# Get the table information: one (postcode, borough, neighbourhood) tuple per row.
def _first_text(cell):
    """Return the first text node inside `cell`, stripped of surrounding whitespace.

    Returns an empty string when the cell contains no text at all, so an empty
    table cell does not raise AttributeError on `.strip()`.
    """
    node = cell.find(string=True)
    return node.strip() if node is not None else ""


data = []
for row in table.find_all("tr"):
    cells = row.find_all("td")
    # Only rows with exactly three <td> cells are data rows; anything else
    # (presumably the header row) is skipped.
    if len(cells) == 3:
        # Strip all three values (removes the trailing '\n') so later
        # comparisons such as == 'Not assigned' are not defeated by whitespace.
        Postcode, borough, Neighbourhood = (_first_text(cell) for cell in cells)
        data.append((Postcode, borough, Neighbourhood))
data[0:10]

[('M1A', 'Not assigned', 'Not assigned'),
 ('M2A', 'Not assigned', 'Not assigned'),
 ('M3A', 'North York', 'Parkwoods'),
 ('M4A', 'North York', 'Victoria Village'),
 ('M5A', 'Downtown Toronto', 'Harbourfront'),
 ('M5A', 'Downtown Toronto', 'Regent Park'),
 ('M6A', 'North York', 'Lawrence Heights'),
 ('M6A', 'North York', 'Lawrence Manor'),
 ('M7A', "Queen's Park", 'Not assigned'),
 ('M8A', 'Not assigned', 'Not assigned')]

In [4]:
# Transform the list of (postal code, borough, neighbourhood) tuples into a data frame
df = pd.DataFrame(np.array(data), columns=("PostalCode", "Borough", "Neighborhood"))  # 289 rows
print(df.head())
print(df.shape)

# Delete rows where the borough is 'Not assigned' and renumber the index from 0.
# reset_index(drop=True) returns a new frame, so the column assignment below
# does not write into a filtered slice (the pattern behind SettingWithCopyWarning)
# and nothing is mutated in place.
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)
print(df.shape)

# Replace a 'Not assigned' neighborhood with the borough of the same row
df['Neighborhood'] = np.where(df['Neighborhood'] == 'Not assigned', df['Borough'], df['Neighborhood'])
df.head()

  PostalCode           Borough      Neighborhood
0        M1A      Not assigned      Not assigned
1        M2A      Not assigned      Not assigned
2        M3A        North York         Parkwoods
3        M4A        North York  Victoria Village
4        M5A  Downtown Toronto      Harbourfront
(289, 3)
(212, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [5]:
# Combine the neighborhoods that share the same postal code and borough.
# Each group's neighborhoods are collected into a list; reset_index() turns the
# group keys back into ordinary columns. This avoids the tuple -> list round
# trip through applymap, which recent pandas versions deprecate.
df_new = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
      .apply(list)
      .reset_index()
)
print(df_new.shape)
df_new.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [6]:
# Change each list of neighborhoods into one comma-separated string.
# The isinstance guard makes the cell safe to re-run: a value that is already a
# string is left alone (joining a string would split it into single characters).
df_new['Neighborhood'] = df_new['Neighborhood'].apply(
    lambda names: ', '.join(str(name) for name in names)
    if isinstance(names, (list, tuple))
    else names
)
df_new.head(12)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [7]:
# Show the (rows, columns) shape of df_new
df_new.shape

(103, 3)

In [8]:
# Load the latitude/longitude table and rename its "Postal Code" column to
# "PostalCode".
coor = (
    pd.read_csv('Geospatial_Coordinates.csv')
      .rename(columns={"Postal Code": "PostalCode"})
)
coor.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Attach latitude/longitude to each postal code.
# Dropping coordinate columns left over from an earlier run keeps the cell
# re-runnable; without it a second merge would produce Latitude_x/Latitude_y
# style columns. validate='many_to_one' makes pandas raise if a postal code
# appears more than once in `coor`, instead of silently duplicating rows.
df_new = pd.merge(
    df_new.drop(columns=['Latitude', 'Longitude'], errors='ignore'),
    coor,
    on='PostalCode',
    validate='many_to_one',
)
df_new.shape

(103, 5)

In [11]:
# Preview the first 12 rows of df_new
df_new.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
