In [None]:
# Install the scraping dependencies with the %pip magic so they land in the
# environment of the running kernel (a plain `!pip` may target another Python).
# "beautifulsoup4" is the real distribution that provides the `bs4` module.
%pip install beautifulsoup4
%pip install lxml

# Imports and function definitions to get the table as a DataFrame

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
def get_request(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the response.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        requests.Response: The successful response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here, with a clear message, instead of later while parsing an
    # error page that does not contain the expected table.
    response.raise_for_status()
    return response

def get_empty_table_df():
    """Return an empty DataFrame with the columns of the postal-code table.

    Returns:
        pd.DataFrame: Zero rows and the columns 'Postcode', 'Borough' and
        'Neighbourhood', in that order.
    """
    columns = ['Postcode', 'Borough', 'Neighbourhood']
    return pd.DataFrame(columns=columns)

def get_df_line(line):
    """Turn one table row into a ``[postcode, borough, neighbourhood]`` list.

    Args:
        line: A row element exposing ``findAll('td')``; each returned cell
            must expose ``findAll(text=True)``.

    Returns:
        list: The first text node of each of the first three cells. A single
        trailing newline is removed from the third value only.
    """
    cells = line.findAll('td')

    # Only the first text node of each of the first three cells is kept.
    postcode, borough, neighbourhood = (
        cells[position].findAll(text=True)[0] for position in range(3)
    )

    # The last cell of a row usually carries the line break of the markup.
    if neighbourhood.endswith('\n'):
        neighbourhood = neighbourhood[:-1]

    return [postcode, borough, neighbourhood]

def treat_not_assigned(df):
    """Clean the 'Not assigned' placeholders of the postal-code table.

    The frame is modified in place and also returned:

    1. every 'Not assigned' value becomes NaN;
    2. a missing 'Neighbourhood' is filled with the row's 'Borough' when the
       borough is known;
    3. rows that still contain a NaN are dropped;
    4. the index is reset to 0..n-1.

    Args:
        df: DataFrame with at least the columns 'Borough' and 'Neighbourhood'.
            Any index is accepted.

    Returns:
        pd.DataFrame: The same object as ``df``, cleaned.
    """
    df.replace('Not assigned', np.nan, inplace=True)

    # Vectorised fill: works on the row positions selected by the mask, so it
    # does not depend on the index labels being 0..n-1.
    needs_borough = df['Neighbourhood'].isna() & df['Borough'].notna()
    df.loc[needs_borough, 'Neighbourhood'] = df.loc[needs_borough, 'Borough'].to_numpy()

    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

def group_by_postcode(df):
    """Group the rows of ``df`` by their 'Postcode' value.

    Args:
        df: DataFrame containing a 'Postcode' column.

    Returns:
        The pandas GroupBy object for that column (created with
        ``observed=True``); no aggregation is applied here.
    """
    return df.groupby(observed=True, by='Postcode')
    

def get_table_df(response):
    """Parse the first HTML table of ``response`` into a cleaned DataFrame.

    Args:
        response: Object whose ``text`` attribute holds the HTML of the page.

    Returns:
        pd.DataFrame: One row per table line (header line excluded) with the
        columns of ``get_empty_table_df()``, after ``treat_not_assigned`` has
        removed/filled the 'Not assigned' placeholders.

    Raises:
        IndexError: If the page contains no ``<table>`` element, or a data row
            has fewer than three cells.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.findAll('table')[0]

    # 'html.parser' does not invent a <tbody>; fall back to the table itself
    # when the markup has none.
    bodies = table.findAll('tbody')
    container = bodies[0] if bodies else table
    lines = container.findAll('tr')

    # Collect every row first and build the frame once: enlarging a DataFrame
    # one row at a time copies the data on each insertion.
    rows = [get_df_line(line) for line in lines[1:]]  # Skip header line
    table_df = pd.DataFrame(rows, columns=get_empty_table_df().columns)

    return treat_not_assigned(table_df)

# Get the page and clean the data

In [3]:
# Wikipedia page "List of postal codes of Canada: M", the source of the table.
POSTAL_CODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = get_request(POSTAL_CODES_URL)

In [4]:
# Parse the first table of the downloaded page into a DataFrame and clean
# its 'Not assigned' entries.
table_df = get_table_df(response)

# Function definitions to group the DataFrame rows by postcode

In [5]:
def get_list_of_postcodes(df):
    """Return the distinct values of the 'Postcode' column of ``df``.

    Args:
        df: DataFrame containing a 'Postcode' column.

    Returns:
        The array produced by ``Series.unique()``; each postcode appears once.
    """
    postcode_column = df['Postcode']
    return postcode_column.unique()

def get_empty_final_df_with_postcodes(postcodes):
    """Build the accumulator frame: one row per postcode, empty lists as cells.

    Args:
        postcodes: Iterable of postcode labels. A label given more than once
            still produces a single row.

    Returns:
        pd.DataFrame: Indexed by postcode (index name 'Postcode') with the
        columns 'Borough' and 'Neighbourhood'. Every cell holds its own,
        distinct empty list so that values can be appended to it later.
    """
    # dict.fromkeys removes repeated labels while keeping their first position.
    labels = list(dict.fromkeys(postcodes))

    # Built in one step instead of enlarging the frame row by row; the
    # comprehension creates a separate list object for every cell.
    df = pd.DataFrame(
        {column: [[] for _ in labels] for column in ('Borough', 'Neighbourhood')},
        index=pd.Index(labels, name='Postcode'),
    )

    return df

def fill_final_df(table_df, postcodes, final_df):
    """Append every borough/neighbourhood of ``table_df`` to its postcode row.

    Args:
        table_df: DataFrame with the columns 'Postcode', 'Borough' and
            'Neighbourhood'. Any index is accepted.
        postcodes: Postcodes to process, in order.
        final_df: Frame indexed by postcode whose 'Borough' and
            'Neighbourhood' cells are lists. It is modified in place.

    Returns:
        pd.DataFrame: ``final_df`` itself, with the lists filled in table
        order and the postcode index moved to a 'Postcode' column.
    """
    # Single pass over the table: remember the rows of each postcode instead
    # of rescanning the whole table once per postcode.
    rows_by_postcode = {}
    for code, borough, neighbourhood in zip(
        table_df['Postcode'], table_df['Borough'], table_df['Neighbourhood']
    ):
        if pd.isna(code):
            # A missing postcode never compares equal to a requested one.
            continue
        rows_by_postcode.setdefault(code, []).append((borough, neighbourhood))

    for code in postcodes:
        for borough, neighbourhood in rows_by_postcode.get(code, ()):
            # .loc hands back the list stored in the cell; append mutates it.
            final_df.loc[code, 'Borough'].append(borough)
            final_df.loc[code, 'Neighbourhood'].append(neighbourhood)

    final_df.reset_index(inplace=True)
    return final_df

def format_final_df(final_df):
    """Collapse the list cells of ``final_df`` into plain strings.

    'Borough' keeps the first element of its list; 'Neighbourhood' becomes
    the elements of its list joined with ', '. The frame is modified in place
    and also returned.

    Args:
        final_df: DataFrame whose 'Borough' and 'Neighbourhood' cells are
            lists of strings. Any index is accepted.

    Returns:
        pd.DataFrame: The same object as ``final_df``.

    Raises:
        IndexError: If a 'Borough' list is empty.
    """
    # Column-wise mapping: no assumption that the index labels are 0..n-1.
    final_df['Borough'] = final_df['Borough'].map(lambda boroughs: boroughs[0])
    final_df['Neighbourhood'] = final_df['Neighbourhood'].map(', '.join)

    return final_df

def get_final_df(table_df):
    """Return one row per postcode with its neighbourhoods joined together.

    Args:
        table_df: DataFrame with the columns 'Postcode', 'Borough' and
            'Neighbourhood' (string values, no missing entries). It is not
            modified.

    Returns:
        pd.DataFrame: Columns 'Postcode', 'Borough', 'Neighbourhood' and a
        fresh 0..n-1 index. Postcodes keep the order of their first
        appearance; 'Borough' is the borough of the postcode's first row and
        'Neighbourhood' is every neighbourhood of the postcode joined with
        ', ' in table order.
    """
    columns = ['Postcode', 'Borough', 'Neighbourhood']
    if table_df.empty:
        return pd.DataFrame(columns=columns)

    # sort=False keeps the groups in order of first appearance rather than
    # sorting the postcodes alphabetically.
    grouped = table_df.groupby('Postcode', sort=False)
    final_df = grouped.agg({
        'Borough': lambda boroughs: boroughs.iloc[0],
        'Neighbourhood': ', '.join,
    })

    return final_df.reset_index()



In [6]:


# Collapse table_df to one row per postcode, with that postcode's
# neighbourhoods joined by ', '.
final_df = get_final_df(table_df)

# Show the first 12 rows and the shape of the final DataFrame

In [7]:
# Preview the first 12 rows of the grouped table.
# NOTE(review): the saved output below lists only 10 rows — re-run to refresh.
final_df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [9]:
# (number of rows, number of columns) of the grouped table.
final_df.shape

(103, 3)