In [24]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [25]:
# Scrape the Toronto postal-code table from Wikipedia into a DataFrame with
# one row per table row: postcode, borough, neighborhood.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the page. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.text
soup = BeautifulSoup(res, 'lxml')

wikitable = soup.find('table', class_='wikitable')
if wikitable is None:
    raise ValueError("No table with class 'wikitable' found at " + url)

# Collect the rows in a plain list and build the DataFrame once at the end:
# growing a DataFrame row by row copies it on every iteration, and
# DataFrame.append no longer exists in pandas >= 2.0.
records = []
for items in wikitable.find_all('tr')[1:]:  # [1:] skips the header row
    data = items.find_all(['th', 'td'])
    if len(data) < 3:
        # Short or malformed row: skip it. Previously the IndexError was
        # swallowed and the values of the preceding row were appended again.
        continue
    records.append({
        "postcode": data[0].text.strip(),
        "borough": data[1].text.strip(),
        "neighborhood": data[2].text.strip(),
    })

table = pd.DataFrame(records, columns=["postcode", "borough", "neighborhood"])


In [26]:
# Keep only the rows whose borough is something other than 'Not assigned',
# then renumber the index from zero so it is contiguous again.
has_borough = table["borough"].ne("Not assigned")
table = table.loc[has_borough].reset_index(drop=True)

In [27]:
# Preview the first rows of the table after the 'Not assigned' boroughs were dropped.
table.head()

Unnamed: 0,postcode,borough,neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [4]:
# Sanity check: tally the rows per borough; 'Not assigned' must not show up in the result.
table["borough"].value_counts()

Etobicoke           44
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: borough, dtype: int64

In [5]:
# Find the rows that have a real borough but a 'Not assigned' neighborhood.
# For these rows the neighborhood is meant to take the borough's name.
borough_known = table["borough"] != "Not assigned"
neighborhood_missing = table["neighborhood"] == "Not assigned"
dftemp = table[borough_known & neighborhood_missing]
dftemp

Unnamed: 0,postcode,borough,neighborhood
5,M7A,Queen's Park,Not assigned


In [6]:
# Where the neighborhood is 'Not assigned' but the borough is known, copy the
# borough name into the neighborhood column; every other row is left as is.
needs_fill = (table["neighborhood"] == "Not assigned") & (table["borough"] != "Not assigned")
table["neighborhood"] = table["neighborhood"].mask(needs_fill, table["borough"])

In [7]:
# Collapse all rows that share a postcode (and borough) into a single row whose
# neighborhood column lists every neighborhood, separated by " , ".
def f_merge_comma(x):
    """Join the given neighborhood names into one string with ' , ' between them."""
    return " , ".join(x)


table = (
    table.groupby(['postcode', 'borough'])['neighborhood']
    .agg(f_merge_comma)
    .reset_index()
)

In [8]:
# Show the first ten rows of the grouped table (one row per postcode/borough pair).
table.head(10)

Unnamed: 0,postcode,borough,neighborhood
0,M1B,Scarborough,"Rouge , Malvern"
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park , Ionview , Kennedy Park"
7,M1L,Scarborough,"Clairlea , Golden Mile , Oakridge"
8,M1M,Scarborough,"Cliffcrest , Cliffside , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


In [9]:
# Size of the grouped table as (rows, columns).
table.shape

(103, 3)

In [15]:
# Work on an independent copy so that adding coordinate columns leaves `table` unchanged.
postcodes_df = table.copy(deep=True)

In [11]:
# Postal-code geocoding with pgeocode: https://pypi.org/project/pgeocode/
import pgeocode

# Shared geocoder instance, created for the 'ca' country code.
nomi = pgeocode.Nominatim('ca')


def get_geocode(post_code):
    """Look up ``post_code`` with the module-level ``nomi`` geocoder.

    Returns a ``(latitude, longitude)`` tuple read from the record that
    ``nomi.query_postal_code`` gives back. The values are passed through
    unchanged, so they may be NaN (as the M7R row in this notebook's
    output shows).
    """
    record = nomi.query_postal_code(post_code)
    latitude = record.latitude
    longitude = record.longitude
    return latitude, longitude

In [12]:
# Spot-check the geocoder on a single postcode before applying it to the whole table.
get_geocode('M5G')

(43.6564, -79.38600000000002)

In [19]:
# Geocode every postcode, then split the (latitude, longitude) pairs into two
# parallel sequences and store each one as its own column.
coordinate_pairs = [get_geocode(code) for code in postcodes_df['postcode']]
latitudes, longitudes = zip(*coordinate_pairs)
postcodes_df['latitude'] = latitudes
postcodes_df['longitude'] = longitudes

In [23]:
# List the postcodes for which the geocoder produced no longitude.
postcodes_df.loc[postcodes_df["longitude"].isna()]

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude
86,M7R,Mississauga,Canada Post Gateway Processing Centre,,
