Install the libraries used to extract data from the web

In [102]:
!pip install beautifulsoup4
!pip install lxml
!pip install html5lib



Import libraries

In [103]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

Extract the table from the web page and convert it to a list of rows

In [104]:
# Download the Wikipedia "List of postal codes of Canada: M" page.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
table = soup.find('table')  # first <table> element in the document
if table is None:
    # Fail with a clear message rather than an AttributeError further down.
    raise ValueError('No <table> element found in the downloaded page')

# One inner list per <tr>, holding the right-stripped text of each <td> cell.
# A row that has no <td> cells produces an empty list.
data_list = [
    [cell.text.rstrip() for cell in row.find_all('td')]
    for row in table.find_all('tr')
]

Convert the list to a DataFrame and drop the first row, which contains only None values

In [105]:
# Build the frame from the scraped rows under fixed column names.
column_names = ['Postcode', 'Borough', 'Neighbourhood']
df = pd.DataFrame(data_list, columns=column_names)
# Discard the first row; the remaining rows keep their original index labels.
df = df.drop(index=df.index[0])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


Drop all rows whose 'Borough' column is 'Not assigned'

In [106]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].copy()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor


Group rows that have the same postcode and borough

In [121]:
# Collapse rows sharing a (Postcode, Borough) pair into a single row whose
# Neighbourhood is the comma-joined list of that group's names.
joined_names = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].agg(','.join)
df = joined_names.reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Replace the Neighbourhood value in the row where it is 'Not assigned'

In [108]:
# Select the rows by condition instead of by a hard-coded position: a
# positional index such as 93 only points at the intended row for one
# particular ordering of the frame, and silently edits a different row when
# earlier cells are re-run or reordered.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = "Queen's Park"
# NOTE(review): every matching row receives the same name; if more than one
# row can be unassigned, a per-row replacement may be needed - confirm.

# Show the rows that were just filled in (empty if nothing matched).
df[unassigned]

In [122]:
# Display the (rows, columns) dimensions of df at this point in the notebook.
df.shape

(103, 3)

Download the geospatial data and load it into a DataFrame

In [123]:
# Read the remote CSV straight into a DataFrame and preview its first rows.
geospatial_url = 'http://cocl.us/Geospatial_data'
df_geospatial = pd.read_csv(geospatial_url)
df_geospatial.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the neighbourhood DataFrame with the geospatial data

In [124]:
# Join the two frames by matching 'Postcode' to 'Postal Code' (merge's default
# join type), then remove the right-hand key column, which duplicates 'Postcode'.
merged = df.merge(df_geospatial, left_on='Postcode', right_on='Postal Code')
df = merged.drop(columns=['Postal Code'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
