# Toronto postal codes scraping from the wikipedia page

In [7]:
#import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Data acquistion: Scrape Toronto postal codes from the wikipedia page

In [8]:
# get the data 
link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_text= requests.get(link).text

# Parse the raw text using Beautiful Soup package
soup = BeautifulSoup(raw_text)

### Preprocessing : Extract the data from the tags

In [9]:
# extracting the raw table inside that webpage
table = soup.find('table')

Postcode      = []
Borough       = []
Neighbourhood = []

# extracting elements from the <tr> tag and loop td each element inside 
for tr_cell in table.find_all('tr'):
    
    counter = 1
    Postcode_var      = -1
    Borough_var       = -1
    Neighbourhood_var = -1
    
    for td_cell in tr_cell.find_all('td'):
        if counter == 1: 
            Postcode_var = td_cell.text
        if counter == 2: 
            Borough_var = td_cell.text
            tag_a_Borough = td_cell.find('a')
            
        if counter == 3: 
            Neighbourhood_var = str(td_cell.text).strip()
            tag_a_Neighbourhood = td_cell.find('a')
            
        counter +=1
    
    #ignoring the codes which have missing information in any of fields
    
    if (Postcode_var == 'Not assigned' or Borough_var == 'Not assigned' or Neighbourhood_var == 'Not assigned'): 
        continue
    try:
        if ((tag_a_Borough is None) or (tag_a_Neighbourhood is None)):
            continue
    except:
        pass
    if(Postcode_var == -1 or Borough_var == -1 or Neighbourhood_var == -1):
        continue
    
    #append each individual elements
    Postcode.append(Postcode_var)
    Borough.append(Borough_var)
    Neighbourhood.append(Neighbourhood_var)

### Preparation: Appending more than one neighbor into one cell

In [10]:
unique_codes = set(Postcode)

postalcodes      = []
borough       = []
neighbourhood = []

for unique_element in unique_codes:
    code_item = ''
    borough_item = ''
    neighbor_item = ''    
        
    for code_idx, code_element in enumerate(Postcode):
        if code_element == unique_element:
            code_item = code_element
            borough_item = Borough[code_idx]
            
            if neighbor_item == '': 
                neighbor_item = Neighbourhood[code_idx]
            else:
                neighbor_item = neighbor_item + ', ' + Neighbourhood[code_idx]
                
    postalcodes.append(code_item)
    borough.append(borough_item)
    neighbourhood.append(neighbor_item)

### Create the dataframe and push the data

In [11]:
toronto_dict = {'PostalCode':postalcodes, 'Borough':borough, 'Neighbourhood':neighbourhood}
df_toronto = pd.DataFrame.from_dict(toronto_dict)
df_toronto.to_csv('toronto_postal_codes.csv')
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M5E,Downtown Toronto,Berczy Park
1,M4W,Downtown Toronto,Rosedale
2,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor"
3,M9M,North York,"Emery, Humberlea"
4,M1R,Scarborough,"Maryvale, Wexford"


In [12]:
df_toronto.shape

(84, 3)