# Toronto Neighborhoods
This notebook retrieves the postal codes for neighborhoods in Toronto.

In [1]:
#%pip install bs4      # install bs4 to use beautiful soup for web parsing table
#%pip install lxml    # install lxml for pandas to read html file (when using pandas to extract)
# import required libraries
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup


In [2]:
# Extract the postal-code table with pandas.read_html (needs lxml installed).
# read_html returns one DataFrame per <table> on the page; [0] picks the first.
# Intended column names, one string per column (not referenced by the visible cells).
columns = ['PostalCode', 'Borough', 'Neighborhood']
pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


In [3]:
# Scrape the same table with BeautifulSoup.
# The context manager closes the HTTP response once its body has been read.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") as url:
    html = url.read()

soup = BeautifulSoup(html, 'html.parser')

# soup.table is the first <table> element in the document; collect its rows.
htmltable = soup.table.find_all('tr')

# The first row supplies the <th> header cells; every later row supplies <td> data cells.
table = [[th.get_text(strip=True) for th in htmltable[0].find_all('th')]]
for line in htmltable[1:]:
    row = [td.get_text(strip=True) for td in line.find_all('td')]
    table.append(row)

# Header row becomes the column labels, the remaining rows become the data.
df = pd.DataFrame(table[1:], columns=table[0])
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


In [4]:
# Drop rows whose borough is 'Not assigned', then count the non-null entries
# per postcode; the index of that result holds each remaining postcode once.
df = df.loc[df['Borough'].ne('Not assigned')]
group = df.groupby('Postcode').count()
indices = group.index

In [5]:
# Merge rows that share a postcode into a single row.
#
# One groupby/agg replaces the row-by-row DataFrame.append loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame one row at a
# time re-copies it on every iteration.
clean_df = (
    df.groupby('Postcode', as_index=False)
    .agg({
        # Borough taken from the first row of each postcode group.
        'Borough': lambda boroughs: boroughs.iloc[0],
        # Non-null neighbourhood names of the group, comma-separated in row order.
        'Neighbourhood': lambda names: ', '.join(names.dropna()),
    })
    # Keep the same column set and order as the source frame.
    .reindex(columns=df.columns)
)

In [6]:
# Display the merged frame.
clean_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [7]:
# Where a neighbourhood is 'Not assigned', fall back to the borough name.
# A single .loc assignment replaces the chained clean_df[col].iloc[i] = ... form,
# which writes through an intermediate object (SettingWithCopyWarning) and
# leaves clean_df unchanged when pandas copy-on-write is enabled.
unassigned = clean_df['Neighbourhood'] == 'Not assigned'
clean_df.loc[unassigned, 'Neighbourhood'] = clean_df.loc[unassigned, 'Borough']

In [8]:
# (number of rows, number of columns) of clean_df.
clean_df.shape

(103, 3)

In [9]:
# Display clean_df again, after the 'Not assigned' neighbourhood replacement.
clean_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
