## Import libraries

In [1]:
import numpy as np
import pandas as pd

## Download the postal code tables from Wikipedia

In [2]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Parse the tables found on the page; the result is a list (see the type below).
table = pd.read_html(io=url)
type(table)

list

## Extract the neighbourhood table from the downloaded data

In [3]:
# The first parsed table holds the postal code data. Work on a copy so the
# parsed original in `table` stays untouched by the cleaning steps below.
first_table = table[0]
neigh_df = first_table.copy()
print(neigh_df.shape)
neigh_df.head()

(288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Drop rows with a borough that is Not assigned

In [4]:
# Index labels of every row whose Borough is the placeholder "Not assigned".
unassigned_rows = neigh_df.index[neigh_df["Borough"] == "Not assigned"]
# Remove those rows from neigh_df in place (the original index labels are kept).
neigh_df.drop(index = unassigned_rows, inplace = True)
print(neigh_df.shape)
neigh_df.head()

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## Rename the Postcode column to PostalCode and reset the index

In [5]:
# Use the column name 'PostalCode' from here on.
neigh_df.rename(columns = {'Postcode' : 'PostalCode'}, inplace = True)
# Dropping rows left gaps in the index; renumber from 0 and discard the old labels.
neigh_df.reset_index(drop = True, inplace = True)
neigh_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## Merge Neighbourhood records that share the same PostalCode

In [6]:
def getNeighbors(group):
    """Join the neighbourhood names of one postal code group into one string.

    Parameters
    ----------
    group : mapping-like
        Any object whose ``group['Neighbourhood']`` is an iterable of strings,
        for example the DataFrame of a single postal code sub-group.

    Returns
    -------
    str
        The names in their original order separated by ``', '``. A group with
        a single name returns that name unchanged; a group with no names
        returns an empty string.

    Raises
    ------
    TypeError
        If any name is not a string (``str.join`` only accepts strings).

    Notes
    -----
    ``str.join`` treats every element the same way, so an empty name is kept
    wherever it occurs instead of being silently skipped only when it happens
    to come first.
    """
    return ', '.join(group['Neighbourhood'])

In [7]:
# Collapse the table to one row per postal code with a single grouped
# aggregation instead of looking up every group in a Python loop.
# sort=False keeps the postal codes in order of first appearance, which is the
# order the previous loop over neigh_df['PostalCode'].unique() produced.
grouping = neigh_df.groupby('PostalCode', sort = False)
neigh_df_new = (
    grouping
    .agg({
        # first (non-null) borough listed for the postal code
        'Borough': 'first',
        # all neighbourhood names of the postal code, comma separated
        'Neighbourhood': ', '.join,
    })
    # move PostalCode from the index back to the first column
    .reset_index()
)
# Sanity check: every postal code must now appear exactly once.
assert neigh_df_new['PostalCode'].is_unique
neigh_df_new.shape

(103, 3)

In [8]:
# Preview the first five merged rows.
neigh_df_new.head(n = 5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


## Clean Neighbourhood data with 'Not assigned' value

In [9]:
# Show the rows whose Neighbourhood is still the placeholder 'Not assigned'
is_unassigned = neigh_df_new['Neighbourhood'] == 'Not assigned'
neigh_df_new[is_unassigned]

Unnamed: 0,PostalCode,Borough,Neighbourhood
4,M7A,Queen's Park,Not assigned


In [10]:
# Replace every 'Not assigned' Neighbourhood with the Borough of the same row.
# A boolean mask with .loc updates all matching rows in one vectorized step,
# with no need to iterate over the rows one at a time.
not_assigned = neigh_df_new['Neighbourhood'] == 'Not assigned'
neigh_df_new.loc[not_assigned, 'Neighbourhood'] = neigh_df_new.loc[not_assigned, 'Borough']

In [11]:
# Verify the fix: this selection should now come back empty
still_unassigned = neigh_df_new['Neighbourhood'] == 'Not assigned'
neigh_df_new[still_unassigned]

Unnamed: 0,PostalCode,Borough,Neighbourhood


In [12]:
# Preview the first five rows after the Neighbourhood fix.
neigh_df_new.head(n = 5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


## Number of rows of my dataframe

In [15]:
# shape is (rows, columns); unpack it once and report both pieces.
n_rows, n_cols = neigh_df_new.shape
print((n_rows, n_cols))
print("Number of rows: ", n_rows)

(103, 3)
Number of rows:  103
