We import pandas and NumPy.

In [1]:
import pandas as pd

In [2]:
import numpy as np 

We install lxml, the HTML parser that `pandas.read_html` uses to extract tables from a web page.

In [3]:
!pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/79/37/d420b7fdc9a550bd29b8cfeacff3b38502d9600b09d7dfae9a69e623b891/lxml-4.5.2-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 5.3MB/s eta 0:00:01     |████████▎                       | 1.4MB 5.3MB/s eta 0:00:01     |██████████████████████████▎     | 4.5MB 5.3MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.2


We read the table of postal codes from the Wikipedia page.

In [4]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
site = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns one DataFrame per HTML table found on the page;
# keep the first one, using its first row as the column header.
table = pd.read_html(
    io=site,
    header=0,
)[0]

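We clean the data: cells marked "Not assigned" are treated as missing, rows without a borough are dropped, and a missing neighbourhood is replaced by the borough name.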

In [5]:
# Normalise the scraped table in one chain that is safe to re-run:
#   1. treat the placeholder text 'Not assigned' as missing (NaN),
#   2. keep only the rows that have a borough,
#   3. give the postal-code column a name without a space,
#   4. where the neighbourhood is missing, fall back to the borough name.
# Step 4 uses a vectorised fillna instead of a row-wise apply that returned a
# plain list for some rows and a Series for others, which pandas cannot
# reliably reassemble into a DataFrame.
table = (
    table
    .replace(to_replace='Not assigned', value=np.nan)
    .dropna(subset=['Borough'])
    .rename(columns={"Postal Code": "PostalCode"})
    .assign(Neighbourhood=lambda df: df['Neighbourhood'].fillna(df['Borough']))
)
table.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Number of rows each postal code occupies in the cleaned table.
counts = table['PostalCode'].value_counts()

# Show only the postal codes that occur on more than one row
# (an empty Series means every code is already unique).
counts.loc[counts.gt(1)]

Series([], Name: PostalCode, dtype: int64)

In [7]:
# Collapse the table to one row per postal code: keep the first borough of
# each group and join all of its neighbourhoods into one comma-separated
# string. `as_index=False` keeps PostalCode as a regular column and yields a
# fresh 0..n-1 index, so no separate reset_index is needed.
# A direct aggregation replaces the nested groupby.apply, which relied on the
# grouping column being handed to the applied function (deprecated in recent
# pandas) and would otherwise lose PostalCode when the index was dropped.
table = (
    table
    .groupby('PostalCode', as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)

# Preview the rows whose postal code appeared more than once before grouping
# (`counts` was computed on the ungrouped table).
table.loc[table['PostalCode'].isin(counts[counts > 1].index), :].head()

Unnamed: 0,PostalCode,Borough,Neighbourhood


In [8]:
# Preview the rows whose postal code was counted exactly once in `counts`.
table.loc[
    table['PostalCode'].isin(counts.loc[counts.eq(1)].index)
].head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Dimensions of the table as a (rows, columns) tuple.
table.shape

(103, 3)

In [10]:
# Load the coordinates file, which is expected to sit next to the notebook.
geo_coords = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

# Relabel the columns positionally so the key column is called 'PostalCode',
# the same name used in `table`.
geo_coords.columns = [
    'PostalCode',
    'Latitude',
    'Longitude',
]

In [11]:
# Print the (rows, columns) tuple of the coordinates frame, then display its
# first rows as the cell output.
print(geo_coords.shape)
geo_coords.head()

(103, 3)


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Attach latitude/longitude to each postal code. The inner join keeps only
# the postal codes that are present in both frames.
table_n = table.merge(
    right=geo_coords,
    how='inner',
    on='PostalCode',
)
table_n.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
# Display the first rows of the merged frame.
table_n.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [16]:
# Drop the postal-code key. Rebinding (instead of inplace=True) together with
# errors='ignore' makes the cell safe to re-run: a second run finds the
# column already gone and simply does nothing.
table_n = table_n.drop(columns='PostalCode', errors='ignore')

# Print the summary once; the previous nested print(print(...)) also printed
# the inner call's return value, producing a stray "None" line.
# Note: the second number is the row count of the frame.
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
    table_n['Borough'].nunique(),
    table_n.shape[0]))
table_n.head()

The dataframe has 10 boroughs and 103 neighborhoods.
None


Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476
