**Using the browser's Inspect Element tool, we see that the data we are interested in  
is stored in the `wikitable` table, and we can access it through the `table`,   
`tr`, and `td` tags. Let us begin by installing and importing the necessary libraries.**

In [66]:
# Import requests to fetch the HTML contents, lxml.html to parse them, and pandas to build the dataframe
!pip install lxml
import requests
import lxml.html as lh
import pandas as pd



**We send an HTTP request to the URL of the webpage we wish to access,  
and the server responds by returning the HTML content of the webpage.**

In [67]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops here on a 4xx/5xx response
# instead of silently parsing an error page.
tor = requests.get(URL, timeout=30)
tor.raise_for_status()

# Parse the raw response bytes into an lxml element tree
tor_M = lh.fromstring(tor.content)

# Collect every <tr> element in the document. The XPath '//tr' matches rows
# anywhere in the page, not only in the postal-code table, so later cells
# have to filter out rows that do not belong to it.
tor_M_tr = tor_M.xpath('//tr')

In [68]:
# Number of child elements (cells) in each of the first 12 <tr> rows
list(map(len, tor_M_tr[:12]))

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [70]:
# Treat the first <tr> as the header row: read the text of each of its cells,
# stripping the surrounding whitespace/newline.
header_names = [cell.text_content().strip() for cell in tor_M_tr[0]]

# Print the headers with their 1-based position
for position, header in enumerate(header_names, start=1):
    print("%d: %s" % (position, header))

# Pair every header with its own empty list, to be filled with column values later
col = [(header, []) for header in header_names]

col # display the (name, values) pairs to confirm the headers look right


1: Postal code
2: Borough
3: Neighborhood


[('Postal code', []), ('Borough', []), ('Neighborhood', [])]

**Now we begin to read data from the webpage and store the results row by row.**

In [71]:
# Start from empty columns so that re-running this cell does not append the
# same rows a second time (a no-op on a fresh kernel, where the lists are empty).
for _, values in col:
    values.clear()

# Since our first row is the header, data is stored on the second row onwards
for T in tor_M_tr[1:]:
    # A row that does not have exactly 3 cells is not from our table;
    # stop reading at the first such row
    if len(T)!=3:
        break

    # Walk the cells of the row together with their column index i
    for i, t in enumerate(T.iterchildren()):
        data=t.text_content()
        # For every column except the first, convert purely numeric text to int
        if i>0:
            try:
                data=int(data)
            except ValueError:
                # Not a number: keep the original text unchanged
                pass
        # Append the value to the list of the i'th column
        col[i][1].append(data)

**Create the pandas dataframe**

In [73]:
# Turn the (name, values) pairs into a {column name: values} mapping
Dict = dict(col)
# Build the dataframe, then remove every \n from its entries
df = pd.DataFrame(Dict).replace('\n', '', regex=True)
df.head() # preview the first five rows


Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [75]:
# Keep only the rows whose Borough is not "Not assigned", and renumber the
# index from 0 without keeping the old index as a column.
has_borough = df['Borough'] != "Not assigned"
df = df[has_borough].reset_index(drop=True)

# Look at the last five rows of the table.
df.tail()


Unnamed: 0,Postal code,Borough,Neighborhood
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...
102,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...
103,,Canadian postal codes,


In [76]:
# The last row of df should be deleted.
# Slice by position instead of the hard-coded label 102, so the last row is
# the one excluded whatever the current row count is. .copy() makes df_tor an
# independent dataframe, so later edits to it neither touch df nor trigger
# SettingWithCopyWarning.
df_tor = df.iloc[:-1].copy()
df_tor.shape

(103, 3)