In [1]:
# import libraries
import pandas as pd
import numpy as np


Get the data from Wikipedia with pandas


In [2]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html parses the HTML tables on the page and returns them as a list of DataFrames.
list_df = pd.read_html(io=url)

The created list of DataFrames contains 3 DataFrames. The first one is the one we need, so let's assign it to a new DataFrame.



In [3]:
# Take the first table from the list. Work on a copy so that modifying `df`
# never alters `list_df[0]`, which keeps this cell safe to re-run.
df = list_df[0].copy()
print(df.shape)
# Display the first 5 rows (the default for head())
df.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Let's rename the first column to PostalCode



In [4]:
# Rename "Postal Code" to "PostalCode" (returns a new frame instead of mutating in place).
df = df.rename(columns={"Postal Code": "PostalCode"})

# Get rid of rows with no Borough.
# A boolean mask is used rather than np.where(...) + drop(): np.where yields
# row *positions*, whereas drop() expects index *labels*, so the two only
# agree while the index is the default 0..n-1 range.
df = df[df['Borough'] != 'Not assigned']
df.shape

(103, 3)

Let's check if any "Not assigned" values are still in Borough...

In [5]:
# Count the rows whose Borough is still the placeholder string.
df['Borough'].eq('Not assigned').sum()

0

Let's reset the index and check what the new DataFrame looks like...

In [6]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as an extra column.
df = df.reset_index(drop=True)
# Preview the first rows of the re-indexed frame.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Let's check now for "Not assigned" values in Neighbourhood

In [7]:
# Count the rows whose Neighbourhood is the placeholder string.
df['Neighbourhood'].eq("Not assigned").sum()

0

Now we combine rows that have the same PostalCode value

In [8]:
# Collapse rows sharing the same (PostalCode, Borough) pair into one row,
# joining their Neighbourhood strings with a comma, then turn the group keys
# back into ordinary columns.
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighbourhood']
      .apply(','.join)
      .reset_index()
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Show (rows, columns) of the grouped frame.
df.shape

(103, 3)

# ==== End of question 1 ====

# ==== Question 2 =====

The geocoder package didn't work, so we will load the CSV data directly.

In [10]:
# Load the coordinates from the local CSV and rename its key column to
# "PostalCode" in the same expression.
coord_data = (
    pd.read_csv('Geospatial_Coordinates.csv')
      .rename(columns={"Postal Code": "PostalCode"})
)

In [11]:
# Preview the first 10 rows of the coordinates table.
coord_data.head(10)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Now we merge both DataFrames into a single DataFrame...

In [12]:
# Join the neighbourhood table with the coordinates on PostalCode.
# how='inner' is merge's default, written out here: only postal codes
# present in both frames are kept.
new_df = df.merge(right=coord_data, how='inner', on='PostalCode')
# Preview the first 10 merged rows.
new_df.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# ==== End of question 2 ====

# ==== Question 3 ====