# Website Scraping
### Use this Notebook to scrape the following Wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [91]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

### Read data from Wikipedia

In [92]:
# Address of the Wikipedia page that contains the postal-code table
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Download the page; the timeout keeps the cell from hanging forever on a stalled connection
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page as if it were the table
page.raise_for_status()
# Parse the raw response bytes into an lxml element tree
doc = lh.fromstring(page.content)
# Collect every <tr> element found anywhere in the document
tr_elements = doc.xpath('//tr')

In [93]:
# Count the child cells of each of the first 12 rows
list(map(len, tr_elements[:12]))

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

#### Get the Columns for the new table

In [94]:
# Query all <tr> elements of the parsed document
tr_elements = doc.xpath('//tr')
# col holds one (header, values) pair per table column; every values list starts out empty
col = []
# The first <tr> is the header row: each of its children is one header cell
for position, header_cell in enumerate(tr_elements[0], start=1):
    # rstrip("\n") removes the trailing new-line characters from the header text
    header = header_cell.text_content().rstrip("\n")
    print("%d: %s" % (position, header))
    col.append((header, []))

1: Postal Code
2: Borough
3: Neighbourhood


##### Retrieve each row from the web page

In [95]:
# Start from empty column lists so that re-running this cell does not append the same rows twice
for _title, values in col:
    values.clear()

# The first row is the header, so the data is stored from the second row onwards
for row in tr_elements[1:]:
    # A row that does not have exactly 3 cells is not from our table: stop at the first such row
    if len(row) != 3:
        break

    # Walk the cells of the row; i is the index of the column the cell belongs to
    for i, cell in enumerate(row.iterchildren()):
        # rstrip("\n") removes the trailing new-line characters from the cell text
        data = cell.text_content().rstrip("\n")
        # For every column except the first, convert purely numerical values to integers
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the text unchanged. Only ValueError is caught so that
                # unrelated failures (and KeyboardInterrupt) are not silently swallowed.
                pass
        # Append the value to the list of the i'th column
        col[i][1].append(data)

In [96]:
# Every column list should have received the same number of values
[len(values) for _title, values in col]

[181, 181, 181]

In [97]:
# Turn the (title, values) pairs into a {title: values} mapping and build the dataframe from it
Dict = dict(col)
df = pd.DataFrame(Dict)

#### Delete rows that don't have an assigned borough. 

In [98]:
# Keep only the rows whose borough is assigned. The explicit copy makes the result an independent
# dataframe, so the later assignment to its 'Neighbourhood' column does not operate on a slice of
# the original frame (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

In [99]:
# (rows, columns) remaining after the rows with a 'Not assigned' borough were removed
df.shape

(104, 3)

#### If a row has a borough but its neighbourhood is "Not assigned", use the borough as the neighbourhood

In [104]:
# Rows whose neighbourhood is the 'Not assigned' placeholder take the value of their borough instead
unassigned = df['Neighbourhood'] == 'Not assigned'
df['Neighbourhood'] = np.where(unassigned, df['Borough'], df['Neighbourhood'])

In [105]:
# Preview the first rows of the cleaned table
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [106]:
# The previous step only rewrites values in the 'Neighbourhood' column, so the shape should be unchanged
df.shape

(104, 3)