In [34]:
from bs4 import BeautifulSoup #web scraping lib
import requests 
import lxml #HTML parsing lib
import pandas as pd

# Scraping Toronto neighborhood data from Wikipedia.

In [35]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() fails fast on an HTTP error instead of letting an
# error page be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
wiki_file = response.text
soup = BeautifulSoup(wiki_file, 'lxml')  # parse the HTML with the lxml backend

In [36]:
# Locate the table holding the postal-code data.
table = soup.find('table', class_="wikitable sortable")
if table is None:
    # Without this guard the failure would be an opaque AttributeError on None.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

# One list per <tr>: the row's text split on newlines, with empty fragments
# discarded. The first row holds the column headers.
data = [
    [cell for cell in tr.text.split('\n') if cell != ""]
    for tr in table.find_all('tr')
]


In [41]:
# Load the scraped rows into a DataFrame. At this point the header row is
# still an ordinary data row and the columns are numbered 0, 1, 2.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [42]:
# Promote the first scraped row to column headers and drop it from the data.
# The frame is rebuilt from `data` instead of being mutated in place, so
# re-running this cell always gives the same result (an in-place drop would
# consume one more data row as "header" on every re-run).
raw = pd.DataFrame(data)
df = raw.iloc[1:].copy()           # all rows after the header; index labels still start at 1
df.columns = raw.iloc[0].tolist()  # plain list, so the columns Index does not inherit the row label 0 as its name
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


In [44]:
# Print the (rows, columns) tuple, then preview the first five rows.
print(f"{df.shape}")
df.head(n=5)

(288, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [None]:
# Switch the column name to the American spelling used by the rest of the notebook.
df = df.rename(columns={'Neighbourhood': 'Neighborhood'})

In [45]:
# Remove entries whose 'Borough' is 'Not assigned'.
# .copy() makes the filtered result an independent frame; without it the later
# .loc assignment into 'Neighborhood' operates on a slice of the old frame and
# can trigger pandas' SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()
df.shape

(211, 3)

In [47]:
# Where a row's Neighborhood is 'Not assigned', use that row's Borough name instead.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

In [48]:
# Preview the first ten rows after cleaning.
df.head(n=10)

Unnamed: 0,Postcode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [49]:
# Number of rows per borough, most frequent first.
df.Borough.value_counts()

Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

In [50]:
# Count the distinct postcodes remaining in the cleaned frame.
df['Postcode'].nunique()

103

Check how many unique postcodes exist in the cleaned data.

In [52]:
# Build a new frame with one row per (Postcode, Borough) pair.
# Neighbourhood names that share a pair are concatenated with ', ' as separator.
df_new = (
    df.groupby(['Postcode', 'Borough'])['Neighborhood']
      .apply(', '.join)
      .reset_index()
)

In [53]:
# Invariant: grouping on (Postcode, Borough) must yield exactly one row per
# postcode. A mismatch means some postcode is listed under more than one borough.
assert len(df_new) == df['Postcode'].nunique(), "a postcode maps to more than one borough"
df_new.shape

(103, 3)

In [54]:
# Export the grouped table for use in other notebooks.
# index=True is the pandas default, spelled out here because it writes the row
# index as an unnamed first column of the CSV.
df_new.to_csv('Toronto_NH.csv', index=True)
df_new.head(n=10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"
