# Import packages

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Get Wikipedia Page Source

In [5]:
# Download the Wikipedia article that lists the Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the article.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
source = response.text

In [6]:
# Parse the downloaded page text with Python's built-in HTML parser.
soup = BeautifulSoup(markup=source, features='html.parser')

# Scrape HTML To Get Postal Code, Borough, and Neighborhood Lists

In [7]:
# Walk the rows of the first <table> in the page and collect the text of the
# first three <td> cells of each row into three parallel lists.
postalCodeL = []
boroughL = []
neighborhoodL = []

table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError('No <table> element found in the downloaded page')

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows with fewer than three <td> cells (for example a row made only of
    # <th> cells) cannot supply all three values, so skip them rather than
    # raise IndexError.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace from every value so that later exact
    # comparisons such as == 'Not assigned' are not defeated by stray newlines.
    postalCodeL.append(cells[0].text.strip())
    boroughL.append(cells[1].text.strip())
    neighborhoodL.append(cells[2].text.replace("\n", "").strip())

# Add PostalCode, Borough, and Neighborhood Lists into a pandas DataFrame & Remove Not Assigned Rows

In [8]:
# Build the DataFrame straight from the three scraped lists. Passing `columns`
# fixes the column order explicitly, so no separate reordering step is needed.
toronto_df = pd.DataFrame(
    {
        'PostalCode': postalCodeL,
        'Borough': boroughL,
        'Neighborhood': neighborhoodL,
    },
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

# Keep only rows whose borough is not the 'Not assigned' placeholder, then
# renumber the index from zero.
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned'].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


# Remove Duplicate PostalCode By Grouping Neighborhoods by PostalCode & Borough

In [9]:
# Collapse rows that share the same postal code and borough into a single row,
# joining the remaining column values with commas.
by_code_and_borough = toronto_df.groupby(['PostalCode', 'Borough'], as_index=False)
toronto_df = by_code_and_borough.agg(','.join)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Replace 'Not assigned' Neighborhoods with the Borough Name

In [10]:
# Boolean mask of the rows whose neighborhood is the 'Not assigned' placeholder.
na_neigh_rows = toronto_df['Neighborhood'].eq('Not assigned')
# For those rows, use the borough name as the neighborhood; other rows are unchanged.
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].mask(na_neigh_rows, toronto_df['Borough'])
# Display the rows that were just filled in.
toronto_df.loc[na_neigh_rows]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


# Clean up and Show Shape of Data Frame

In [11]:
# Take an independent copy so that later changes to toronto_df_cleaned do not
# silently modify toronto_df (a plain assignment would only create a second
# name for the same object).
toronto_df_cleaned = toronto_df.copy()
toronto_df_cleaned.shape

(103, 3)