In [3]:
import pandas as pd
import numpy as np

In [4]:
# Download the Wikipedia "List of postal codes of Canada: M" page and save it
# in the working directory as 'postal_code_canada.html'
# (wget flags: -q = no progress output, -O = output file name).
# NOTE(review): wget's exit status is not checked, so the message below is
# printed even when the download failed.
!wget -q -O 'postal_code_canada.html' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
print('Data downloaded!')

Data downloaded!


In [None]:
#install beautiful soup
!pip install beautifulsoup4

In [5]:
from bs4 import BeautifulSoup

# Parse the downloaded page.
# - The parser is named explicitly so the result does not depend on which
#   optional parsers happen to be installed (and no "guessed parser" warning
#   is raised).
# - The encoding is named explicitly so the platform default is not used.
with open("postal_code_canada.html", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp, "html.parser")


def _cell_text(cell):
    """Return the whitespace-stripped text of a <td>, or None.

    None is returned when the cell has no single string (`cell.string` is
    None), e.g. when it mixes text and tags.
    """
    text = cell.string
    return None if text is None else text.strip()


# postal code -> borough (if a code occurs in several rows, the last row wins)
borough_map = {}
# postal code -> alphabetically sorted list of neighborhood names
neighborhood_map = {}

# Walk the rows of the first table on the page and fill the two maps.
for row in soup.table.find_all('tr'):
    cells = row.find_all('td')

    # Keep only rows with exactly three <td> cells (code, borough,
    # neighborhood); anything else, e.g. a header row, is skipped.
    if len(cells) != 3:
        continue

    # Strip every cell *before* comparing, so a trailing newline can neither
    # defeat the "Not assigned" checks nor end up in a postal code.
    code, borough, neighborhood = (_cell_text(cell) for cell in cells)

    # A row with an unreadable cell cannot be used: a None code would become
    # a dictionary key and a None name would break the later ','.join.
    if code is None or borough is None or neighborhood is None:
        continue

    # Postal codes without a borough are not in use.
    if borough == "Not assigned":
        continue

    # A borough without a named neighborhood stands in for its own neighborhood.
    if neighborhood == "Not assigned":
        neighborhood = borough

    borough_map[code] = borough
    neighborhood_map.setdefault(code, []).append(neighborhood)

# Sort each neighborhood list once, after all rows have been collected.
for names in neighborhood_map.values():
    names.sort()


In [6]:
# define the dataframe columns
column_names = ["PostalCode", 'Borough', 'Neighborhood']

# Build one record per postal code and create the frame in a single call.
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
# Each record joins the neighborhoods of a postal code into one
# comma-separated string.
records = [
    {
        'PostalCode': code,
        'Borough': borough,
        'Neighborhood': ','.join(neighborhood_map[code]),
    }
    for code, borough in borough_map.items()
]

# `columns=` fixes the column order and keeps the three columns even when
# there are no records at all.
neighborhoods = pd.DataFrame(records, columns=column_names)

neighborhoods


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3B,North York,Don Mills North
1,M5B,Downtown Toronto,"Garden District,Ryerson"
2,M6B,North York,Glencairn
3,M9B,Etobicoke,"Cloverdale,Martin Grove"
4,M3C,North York,Don Mills South
5,M9C,Etobicoke,"Bloordale Gardens,Eringate,Old Burnhamthorpe"
6,M1E,Scarborough,Guildwood
7,M6E,York,Caledonia-Fairbanks
8,M5G,Downtown Toronto,Central Bay Street
9,M6G,Downtown Toronto,Christie


In [7]:
# Read the latitude/longitude table and, in the same expression, rename its
# key column 'Postal Code' to 'PostalCode' so that it carries the same name
# as the key column of the scraped frame it is merged with.
neighborhoods_geo = pd.read_csv(
    "http://cocl.us/Geospatial_data",
    sep=",",
).rename(columns={'Postal Code': 'PostalCode'})

neighborhoods_geo


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [11]:
# Left-join the coordinates onto the scraped postal codes: every row of
# `neighborhoods` is kept, and Latitude/Longitude are taken from the
# `neighborhoods_geo` row with the same PostalCode (NaN when there is none).
# NOTE(review): the result name `neigborhoods` (no "h" after the "g") is a
# different variable from `neighborhoods`; the spelling is kept unchanged in
# case later cells refer to it.
neigborhoods = pd.merge(
    left=neighborhoods,
    right=neighborhoods_geo,
    on="PostalCode",
    how="left",
)
neigborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3B,North York,Don Mills North,43.745906,-79.352188
1,M5B,Downtown Toronto,"Garden District,Ryerson",43.657162,-79.378937
2,M6B,North York,Glencairn,43.709577,-79.445073
3,M9B,Etobicoke,"Cloverdale,Martin Grove",43.650943,-79.554724
4,M3C,North York,Don Mills South,43.7259,-79.340923
5,M9C,Etobicoke,"Bloordale Gardens,Eringate,Old Burnhamthorpe",43.643515,-79.577201
6,M1E,Scarborough,Guildwood,43.763573,-79.188711
7,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512
8,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
9,M6G,Downtown Toronto,Christie,43.669542,-79.422564
