In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, lxml
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
# import folium # map rendering library
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [2]:
try:
    import folium
except:
    !pip install folium
    import folium

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 6.1 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url).text
soup = BeautifulSoup(source)

table_data = soup.find('div', class_='mw-parser-output')
table = table_data.table.tbody

columns = ['PostalCode', 'Borough', 'Neighbourhood']
data = dict({key:[]*len(columns) for key in columns})

for row in table.find_all('tr'):
    for i,column in zip(row.find_all('td'),columns):
        i = i.text
        i = i.replace('\n', '')
        data[column].append(i)

df = pd.DataFrame.from_dict(data=data)[columns]
print(df.shape)
df.head()

(20, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park)
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned


In [4]:
df = df[df['Borough'] != 'Not assigned'].reset_index(drop = True)
print('After dropping rows where borough is "Not assigned", Shape is: ',df.shape)
print('Number of rows where Neighbourhood is "Not assigned" but borough has value: ', 
      df[df['Neighbourhood'] == 'Not assigned'].shape[0])

After dropping rows where borough is "Not assigned", Shape is:  (20, 3)
Number of rows where Neighbourhood is "Not assigned" but borough has value:  0


In [5]:
p, b, n = [], [], []
for postcode, borough, neigh in zip(df['PostalCode'], df['Borough'], df['Neighbourhood']):
    p.append(postcode)
    b.append(borough)
    if neigh == 'Not assigned':
        n.append(borough)
    else:
        n.append(neigh)

df = pd.DataFrame({'PostalCode': p, 'Borough': b, 'Neighbourhood':n})[columns]
print(df.shape)
df.head()

(20, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park)
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned


In [6]:
postcodes = df['PostalCode'].values
boroughs = df['Borough'].values
neighs = df['Neighbourhood'].values

#create a dictionary with keys as Postcode and Borough, keys of dictioaries are unique
dic = dict({(key1,key2): [] for key1, key2 in zip(postcodes, boroughs)})
print('Number of keys in the dictionary are: ', len(dic.keys()))

#filling the values of keys of dictionary
for postcode, borough, neigh in zip(postcodes,boroughs, neighs):
    key = (postcode, borough)
    dic[key].append(neigh)

df = pd.DataFrame(columns = ['Postal Code', 'Borough', 'Neighbourhood'])
for key, value in dic.items():
    postcode, borough, neig = key[0], key[1], value
    neig = ', '.join(neig)
    df = df.append({'Postal Code': postcode,
                     'Borough': borough,
                     'Neighbourhood': neig}, ignore_index = True)
print('Shape of final data is: ', df.shape)
df.head(10)

Number of keys in the dictionary are:  20
Shape of final data is:  (20, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park)
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned
5,M1HScarborough(Cedarbrae),M2HNorth York(Hillcrest Village),M3HNorth York(Bathurst Manor / Wilson Heights ...
6,M1JScarborough(Scarborough Village),M2JNorth York(Fairview / Henry Farm / Oriole),M3JNorth York(Northwood Park / York University)
7,M1KScarborough(Kennedy Park / Ionview / East B...,M2KNorth York(Bayview Village),M3KNorth York(Downsview)East (CFB Toronto)
8,M1LScarborough(Golden Mile / Clairlea / Oakridge),M2LNorth York(York Mills / Silver Hills),M3LNorth York(Downsview)West
9,M1MScarborough(Cliffside / Cliffcrest / Scarbo...,M2MNorth York(Willowdale / Newtonbrook),M3MNorth York(Downsview)Central
