In [16]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [17]:
# Scrape the Toronto ("M") postal-code table from Wikipedia into `table`,
# one row per table row, with columns postcode / borough / neighborhood.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
columns = ["postcode", "borough", "neighborhood"]

# Fail loudly on network/HTTP problems instead of parsing an error page,
# and never hang forever on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.text
soup = BeautifulSoup(res, 'lxml')

wiki_table = soup.find('table', class_='wikitable')
if wiki_table is None:
    raise ValueError("No table with class 'wikitable' found at " + url)

# Collect plain dicts and build the DataFrame once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
records = []
for row in wiki_table.find_all('tr')[1:]:  # [1:] skips the header row
    cells = row.find_all(['th', 'td'])
    if len(cells) < len(columns):
        # Malformed/short row: skip it rather than re-appending the
        # previous row's values (which the old try/except-pass did).
        continue
    records.append({name: cell.text.rstrip() for name, cell in zip(columns, cells)})

# Passing `columns` keeps the three expected columns even if no rows matched.
table = pd.DataFrame(records, columns=columns)


In [18]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
has_borough = table["borough"] != 'Not assigned'
table = table.loc[has_borough].reset_index(drop=True)

In [19]:
# Preview the first rows of `table` (head() defaults to five rows).
table.head()

Unnamed: 0,postcode,borough,neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [20]:
# Count rows per borough; "Not assigned" should no longer appear in the result.
table["borough"].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
East Toronto         5
York                 5
Mississauga          1
Name: borough, dtype: int64

In [21]:
# Rows that have a borough but a "Not assigned" neighborhood: for these the
# neighborhood should fall back to the borough name. List them for inspection.
borough_known = table["borough"].ne("Not assigned")
neighborhood_missing = table["neighborhood"].eq("Not assigned")
dftemp = table.loc[borough_known & neighborhood_missing]
dftemp

Unnamed: 0,postcode,borough,neighborhood


In [22]:
# Where the borough is known but the neighborhood is 'Not assigned',
# use the borough name as the neighborhood; leave every other row untouched.
needs_fill = table["borough"].ne('Not assigned') & table["neighborhood"].eq('Not assigned')
table["neighborhood"] = table["neighborhood"].mask(needs_fill, table["borough"])

In [23]:
# Collapse rows that share a (postcode, borough) pair into a single row whose
# neighborhood is the " , "-joined list of the group's neighborhoods.
def f_merge_comma(names):
    """Join the neighborhood names of one group with " , "."""
    return " , ".join(names)


table = (
    table
    .groupby(['postcode', 'borough'])
    .agg({'neighborhood': f_merge_comma})
    .reset_index()
)

In [24]:
# Preview the first ten rows of the grouped table.
table.head(10)

Unnamed: 0,postcode,borough,neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


In [25]:
# (rows, columns) of the grouped table: one row per (postcode, borough) group.
table.shape

(103, 3)

In [26]:
# Work on an independent copy so adding geocode columns leaves `table` unchanged.
postcodes_df = table.copy(deep=True)

In [27]:
# Postal-code geocoding via pgeocode: https://pypi.org/project/pgeocode/
import pgeocode

# Lookup object for Canadian ('ca') postal codes, shared by get_geocode below.
nomi = pgeocode.Nominatim('ca')


def get_geocode(post_code):
    """Return the ``(latitude, longitude)`` pair pgeocode reports for ``post_code``.

    The values are passed through exactly as returned by
    ``nomi.query_postal_code``; no validation is done here, so callers
    should check the result for missing values.
    """
    record = nomi.query_postal_code(post_code)
    return (record["latitude"], record["longitude"])

In [28]:
# Spot-check the geocoder on a single postcode.
get_geocode('M5G')

(43.6564, -79.38600000000002)

In [29]:
# Geocode every postcode, then split the (latitude, longitude) pairs
# into two parallel sequences and store each as its own column.
geocoded = postcodes_df['postcode'].apply(get_geocode)
latitudes, longitudes = zip(*geocoded)
postcodes_df['latitude'] = latitudes
postcodes_df['longitude'] = longitudes

In [30]:
# Show the postcodes for which the geocoder returned no longitude.
postcodes_df.loc[postcodes_df["longitude"].isna()]

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude
86,M7R,Mississauga,Canada Post Gateway Processing Centre,,


In [36]:
# Preview the first twelve rows, now including the latitude/longitude columns.
postcodes_df.head(12)

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude
0,M1B,Scarborough,Malvern / Rouge,43.8113,-79.193
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.7878,-79.1564
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.7712,-79.2144
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389
5,M1J,Scarborough,Scarborough Village,43.7464,-79.2323
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.7298,-79.2639
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.7122,-79.2843
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.7247,-79.2312
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.6952,-79.2646
