In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# first step is to get the url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
link = requests.get(url).text
toronto = BeautifulSoup(link,'lxml')

In [12]:
# extracting the information and putting it in the dataframe

# creating the dataframe
column_name = ['Postalcode', 'Borough', 'Neighborhood']
toronto_df = pd.DataFrame(columns = column_name)

# going through toronto to extract the Postal Code, Borough, and Neighborhood
info = toronto.find('div', class_='mw-parser-output')
table = info.table.tbody
postcode = 0
borough = 0
neighborhood = 0

for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text
            i = i + 1
        elif i == 1:
            borough = td.text
            i = i + 1
        elif i == 2: 
            neighborhood = td.text.strip('\n').replace(']','')
    toronto_df = toronto_df.append({'Postalcode': postcode,'Borough': borough,'Neighborhood': neighborhood},ignore_index=True)

In [13]:
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,0,0,0
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


Getting rid of Boroughs with a value of "Not assigned"

In [14]:
toronto_df = toronto_df[toronto_df.Borough != 'Not assigned']
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,0,0,0
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park


Getting rid of the first row (all 0's) and re-naming Postalcode to Postal Code

In [15]:
toronto_df = toronto_df[toronto_df.Borough != 0]
toronto_df.rename(columns = {'Postalcode': 'Postal Code'}, inplace = True)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


Removing Neighborhoods that have a value of "Not assigned" and consolidating Borough's which have multiple Neighborhoods listed

In [19]:
toronto_df[toronto_df.Neighborhood == "Not assigned"]

Unnamed: 0,Postal Code,Borough,Neighborhood
9,M7A,Queen's Park,Not assigned


In [22]:
toronto_df[toronto_df.Borough == "Queen's Park"]

# Borough was erased in code that was previously run

Unnamed: 0,Postal Code,Borough,Neighborhood


In [23]:
# Consolidating the neighborhoods

toronto_df = toronto_df.groupby(["Postal Code", "Borough"])['Neighborhood'].apply(', '.join).reset_index()

In [24]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
