In [1]:
from io import StringIO
from pprint import pprint

import numpy as np
import pandas as pd
import requests

### Get the data from the Wikipedia page

In [2]:
# Source page: Wikipedia's "List of postal codes of Canada: M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Without a timeout, requests waits indefinitely on an unresponsive server.
webpage = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page further down.
webpage.raise_for_status()
webpage

<Response [200]>

In [3]:
# Peek at the beginning of the raw HTML only: printing the whole page floods
# the notebook output and bloats the saved file.
HTML_PREVIEW_CHARS = 2000
pprint(webpage.text[:HTML_PREVIEW_CHARS])

('<!DOCTYPE html>\n'
 '<html class="client-nojs" lang="en" dir="ltr">\n'
 '<head>\n'
 '<meta charset="UTF-8"/>\n'
 '<title>List of postal codes of Canada: M - Wikipedia</title>\n'
 '<script>document.documentElement.className = '
 'document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, '
 '"$1client-js$2" );</script>\n'
 '<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List '
 'of postal codes of Canada: '
 'M","wgCurRevisionId":876823784,"wgRevisionId":876823784,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications '
 'in Ontario","Postal codes in Canada","Toronto","Ontario-related '
 'lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgD

### Extract table of data from webpage

In [4]:
# Locate the first sortable wikitable in the page source.
TABLE_OPEN = '<table class="wikitable sortable">'
TABLE_CLOSE = '</table>'

start = webpage.text.find(TABLE_OPEN)
if start == -1:
    # str.find reports "not found" as -1, which would otherwise be used as a
    # slice index and silently select the wrong text.
    raise ValueError('postal code table not found in the page source')

# Search for the closing tag only from the opening tag onwards, so that a table
# appearing earlier in the page cannot put `end` before `start`.
# `end` is the index where the closing tag begins (the tag itself is excluded).
end = webpage.text.find(TABLE_CLOSE, start)
if end == -1:
    raise ValueError('closing </table> tag not found after the postal code table')

In [5]:
# `end` is the index where the closing tag begins, so the slice stops just short
# of it; append the tag again to make the fragment a complete <table> element.
table_text = ''.join((webpage.text[start:end], '\n</table>'))

### Transform the HTML table into a pandas DataFrame

In [6]:
# read_html returns a list with one DataFrame per <table> it parses; the
# fragment is expected to contain a single table, so take the first entry.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(table_text))[0]

In [7]:
# Promote the first row to column labels, but only when the labels are not
# already in place. The guard keeps the cell safe to re-run: an unguarded second
# pass would overwrite the headers with a data row and then fail to drop
# index 0. It also tolerates a frame whose header row was already picked up
# when the table was parsed (NOTE(review): depends on the pandas version).
if 'Postcode' not in df.columns:
    header_labels = df.iloc[0].tolist()
    # Rebind instead of inplace=True so the step reads as a plain transformation.
    df = df.drop(index=0)
    df.columns = header_labels
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [8]:
# Row count per borough; 'Not assigned' rows have not been filtered out yet.
borough_counts = df['Borough'].value_counts()
borough_counts

Not assigned        77
Etobicoke           45
North York          38
Scarborough         38
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

In [9]:
# Row count per neighbourhood name, most frequent first.
neighbourhood_counts = df['Neighbourhood'].value_counts()
neighbourhood_counts

Not assigned                                         78
Runnymede                                             2
St. James Town                                        2
North Midtown                                         1
Golden Mile                                           1
Wexford Heights                                       1
York University                                       1
Bathurst Quay                                         1
Oriole                                                1
Woodbine Heights                                      1
Central Bay Street                                    1
Bathurst Manor                                        1
King's Mill Park                                      1
Island airport                                        1
Harbord                                               1
Wexford                                               1
The Beaches West                                      1
Deer Park                                       

### Merge Neighbourhoods with the same Postcode and Borough

In [13]:
# Keep only rows that have a real borough, then collapse rows sharing the same
# Postcode/Borough pair into one row, joining their neighbourhood names with
# ', '. sort=False keeps the groups in order of first appearance.
has_borough = df['Borough'] != 'Not assigned'
grouped = df[has_borough].groupby(['Postcode', 'Borough'], as_index=False, sort=False)
df = grouped.agg(', '.join)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


### Replace a "Not assigned" Neighbourhood with the name of its Borough

In [26]:
# Where a postcode has a borough but no neighbourhood, use the borough name as
# the neighbourhood.
# A single .loc assignment replaces the per-row loop that used chained indexing
# (df['Neighbourhood'][i] = ...): that form triggers SettingWithCopyWarning and
# leaves df unchanged when pandas Copy-on-Write is enabled.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [27]:
# Final size of the cleaned frame as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(103, 3)