In [1]:
# import all necessary libraries including BeautifulSoup and requests for 
# reading in html text from Wikipedia webpage and parsing the text for the 
# table of Toronto area postcodes, boroughs and neighborhoods

#!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup

#!conda install -c conda-forge lxml --yes
import lxml

#!conda install -c conda-forge geocoder --yes
import geocoder 

import pandas as pd
import numpy as np

import requests

from sklearn.cluster import KMeans


In [4]:
# Wikipedia page holding the table of postcodes, boroughs and neighbourhoods
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# fetch the page; the timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops on an HTTP error response
# instead of silently parsing an error page
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
wikipage = response.text

# parse html text using BeautifulSoup and lxml
soup = BeautifulSoup(wikipage, 'lxml')

# the first <table> element in the document is taken to be the postcode table
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at ' + WIKI_URL)

# one list of cell strings per table row ('tr'): split the row text on '\n'
# and drop the first and last pieces of the split
# (presumably the empty strings around the cells -- verify against the page)
tablebody = [row.text.split('\n')[1:-1] for row in table.find_all('tr')]
if not tablebody:
    raise ValueError('The table found at ' + WIKI_URL + ' contains no rows')

# create pandas DataFrame to store table; the first row supplies the headers
df = pd.DataFrame(tablebody[1:], columns=tablebody[0])

# only keep those rows that have a Borough identified by name; take an
# explicit copy so the assignment below modifies an independent frame
# (avoids pandas' SettingWithCopyWarning)
df = df.loc[df['Borough'] != 'Not assigned', :].copy()

# replace the neighbourhood name with the borough name for those
# neighbourhoods with unassigned names (mask computed once and reused)
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']

# helper used as an aggregator: collapse a column of names into a list of
# its distinct values
def f2(x):
    """Return the distinct values of ``x`` as a plain Python list.

    ``x`` must expose a ``.unique()`` method (e.g. a pandas Series); the
    values are listed in the order ``unique()`` yields them.
    """
    distinct = x.unique()
    return list(distinct)

# collapse the dataframe to one row per Postcode, gathering the distinct
# Borough and Neighbourhood values of each postcode into lists via f2
collectors = {'Borough': f2, 'Neighbourhood': f2}
df = df.groupby(['Postcode']).agg(collectors).reset_index()

# report how many postcodes remain after grouping
row_count = df.shape[0]
print('Number of rows in dataframe (i.e. number of postcodes) ', row_count)

Number of rows in dataframe (i.e. number of postcodes)  103


In [None]:
# upper bound on geocoding attempts per postcode, so that a service which
# never answers cannot stall the notebook forever
MAX_GEOCODE_ATTEMPTS = 10

# find lat/long for every postcode in the dataframe and store the result in
# "Latitude" and "Longitude" columns of the dataframe
for ind in df.index:
    # reset for EVERY postcode: if the previous result were kept, the retry
    # loop below would be skipped and each row would receive the coordinates
    # of the first postcode that was geocoded
    lat_long_coords = None
    attempts = 0

    # retry while there is no usable answer; `not lat_long_coords` treats
    # both None and an empty result as "no answer"
    while not lat_long_coords and attempts < MAX_GEOCODE_ATTEMPTS:
        g = geocoder.google('{}, Toronto, Ontario'.format(df.loc[ind,'Postcode']))
        lat_long_coords = g.latlng
        attempts += 1

    if not lat_long_coords:
        # give up on this postcode: leave its coordinates missing and move on
        df.loc[ind,"Latitude"] = np.nan
        df.loc[ind,"Longitude"] = np.nan
        print(df.loc[ind,"Postcode"] + ' Toronto, Ontario could not be geocoded after '
              + str(MAX_GEOCODE_ATTEMPTS) + ' attempts')
        continue

    df.loc[ind,"Latitude"] = lat_long_coords[0]
    df.loc[ind,"Longitude"] = lat_long_coords[1]
    print(df.loc[ind,"Postcode"] + ' Toronto, Ontario has lat/long ' + str(lat_long_coords))

# NOTE: getting the lat/long coordinates via geocoder was taking too long,
# so the coordinates for the postcodes are instead imported from the
# provided CSV file (Geospatial_Coordinates.csv) in the next cell

In [7]:
# read the CSV file of lat/long coordinates into a second dataframe and
# rename its 'Postal Code' column to 'Postcode' so that it carries the same
# key name as df
df_latlong = pd.read_csv('Geospatial_Coordinates.csv').rename(
    columns={'Postal Code': 'Postcode'}
)

# index the coordinate table by postcode, then attach its columns to the
# matching rows of df
coords_by_postcode = df_latlong.set_index('Postcode')
df_toronto = df.join(coords_by_postcode, on='Postcode')