Import Libraries

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Use BeautifulSoup to parse table

In [2]:
# Download the Wikipedia page "List of postal codes of Canada: M".
# The timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error here instead of letting an
# error page be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

# Use lxml parser
soup = BeautifulSoup(source, 'lxml')

# find() returns the first <table> with class "wikitable", or None if nothing matches;
# fail here with a clear message rather than later with an AttributeError on None
table = soup.find('table', class_='wikitable')
if table is None:
    raise ValueError("No table with class 'wikitable' found in the downloaded page")

In [3]:
# Every <tr> element inside the table body
content = table.tbody.find_all('tr')

# One list of strings per table row: the text of its <th> cells first,
# followed by the text of its <td> cells
content_list = [
    [header.text for header in tr.find_all('th')] + [cell.text for cell in tr.find_all('td')]
    for tr in content
]

Load the scraped data into a DataFrame

In [13]:
# Build the frame from the scraped rows; row 0 holds the column titles
df = pd.DataFrame(content_list)

# Promote row 0 to the column header, then remove it from the data
df = df.rename(columns=df.iloc[0]).drop(index=[0])

# Strip the newline characters surrounding each neighbourhood name
df['Neighbourhood\n'] = df['Neighbourhood\n'].map(lambda name: name.strip('\n'))

# Give the columns their final names
df = df.rename(columns={'Postcode': 'Postal Code', 'Neighbourhood\n': 'Neighbourhood'})

# Keep only the rows whose borough is not 'Not assigned'
df = df.loc[df['Borough'] != 'Not assigned']
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor


In [22]:
# Collapse rows that share a postal code and borough into a single row,
# joining their neighbourhood names with ', '

df2 = (
    df.groupby(['Postal Code', 'Borough'])['Neighbourhood']
      .apply(lambda names: ', '.join(names))
      .reset_index()
)
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [23]:
# Read the coordinates CSV from the working directory into its own frame
df3 = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
df3

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [24]:
# Attach coordinates to each postal code by aligning both frames on a
# 'Postal Code' index. The set_index calls are guarded so this cell can be
# re-run: once 'Postal Code' has been moved into the index it is no longer a
# column, and an unguarded set_index('Postal Code') would raise KeyError.
if 'Postal Code' in df2.columns:
    df2 = df2.set_index('Postal Code')
if 'Postal Code' in df3.columns:
    df3 = df3.set_index('Postal Code')

# The assignment aligns on the index, so rows are matched by postal code
# rather than by position; codes missing from df3 end up as NaN
df2[['Latitude', 'Longitude']] = df3[['Latitude', 'Longitude']]

# Display with 'Postal Code' as a regular column; the result is not assigned,
# so df2 itself stays indexed by 'Postal Code'
df2.reset_index()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


In [26]:
# Dimensions of df2 as a (rows, columns) tuple
df2.shape

(103, 4)