# This is the notebook for scraping the table from the Wikipedia page.

In [1]:
import pandas as pd # library for data analysis

## Install beautifulsoup 4 package to scrape the webpage

In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


## Install the lxml package, which BeautifulSoup uses as its parser

In [3]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


## Save wikipedia page into page variable and parse it using BeautifulSoup

In [4]:
# urlopen fetches the page over HTTP; BeautifulSoup parses the returned HTML.
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Wikipedia page that holds the postal-code table to scrape.
wiki = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Open the page in a context manager so the HTTP response is closed as soon as the HTML
# has been parsed. `soup` keeps the parsed document for the following cells.
with urlopen(wiki) as page:
    soup = BeautifulSoup(page, 'lxml')

## By inspecting the source of the wikipedia page, we know we want to scrape the table with the class "wikitable sortable"

In [5]:
# Grab the first <table> element whose class attribute is "wikitable sortable".
My_table = soup.find('table', class_='wikitable sortable')

In [6]:
# One empty list per table column (three columns); each list is a separate object.
A, B, C = [], [], []

## For each table row (HTML tag `tr`), append the value of each cell (HTML tag `td`) to the matching column list

In [7]:
# Empty the column lists first so that re-running this cell rebuilds them instead of
# appending every table row a second time.
A.clear()
B.clear()
C.clear()

# find_all / string= are the current Beautiful Soup spellings of the legacy findAll / text=.
for row in My_table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without exactly three <td> cells (e.g. the heading row) are skipped.
    if len(cells) == 3:
        # find(string=True) returns the first text node found inside the cell.
        A.append(cells[0].find(string=True))
        B.append(cells[1].find(string=True))
        C.append(cells[2].find(string=True))

## Define the headers of dataframe columns

In [8]:
# Assemble the three scraped column lists into a single frame, naming each column.
df = pd.DataFrame({'Postcode': A, 'Borough': B, 'Neighbourhood': C})

## Remove rows which contain "Not assigned" in the column Borough

In [9]:
# Keep only the rows whose Borough does not contain the literal text 'Not assigned'.
# regex=False makes this a plain substring test, and .copy() hands the following cells an
# independent frame so assigning to its columns does not raise SettingWithCopyWarning.
df = df[~df['Borough'].str.contains('Not assigned', regex=False)].copy()

## When scraping the table, the line-break characters were read as part of the text. Therefore, remove the trailing "\n" using rstrip

In [10]:
# Strip trailing "\n" characters with the vectorised string accessor. Unlike a Python-level
# lambda, it leaves missing values untouched instead of raising on them.
df['Neighbourhood'] = df['Neighbourhood'].str.rstrip('\n')

## For each row containing "Not assigned" in the Neighbourhood column, use the value in the Borough column

In [11]:
# Wherever the neighbourhood is the placeholder 'Not assigned', use the borough name instead.
# The result is assigned back to the column: rows yielded by iterrows() may be copies, so
# editing them inside a loop is not guaranteed to change df.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned', df['Borough']
)

## For each Postcode/Borough pair, join the neighbourhoods into a single row, separated by ","

In [12]:
def strJoin(x):
    """Return the values of `x` cast to str and joined with commas."""
    return ",".join(x.astype(str))


# Group on Postcode and Borough and collapse each group's neighbourhoods into one string.
new_df = df.groupby(['Postcode', 'Borough']).agg({"Neighbourhood": strJoin})

In [13]:
# Display the shape of the grouped frame as (rows, columns).
new_df.shape

(103, 1)