In [10]:
# import all necessary libraries including BeautifulSoup and requests for 
# reading in html text from Wikipedia webpage and parsing the text for the 
# table of Toronto area postcodes, boroughs and neighborhoods

#!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup

#!conda install -c conda-forge lxml --yes
import lxml

#!conda install -c conda-forge geocoder --yes
import geocoder 
from geopy.geocoders import Nominatim

import pandas as pd
import numpy as np

import requests

from sklearn.cluster import KMeans

import matplotlib.colors as colors
import matplotlib.cm as cm
import folium

In [11]:
# download the Wikipedia page listing Toronto ("M") postal codes;
# use a timeout and raise on HTTP errors so that a hung connection or an
# error page is reported immediately instead of being parsed as data
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30)
response.raise_for_status()
wikipage = response.text

# parse the html text using BeautifulSoup and lxml
soup = BeautifulSoup(wikipage,'lxml')

# the postcode table is taken to be the first <table> element on the page
table = soup.find('table')
if table is None:
    raise ValueError('no <table> element found on the Wikipedia page')

# turn every table row (tag 'tr') into a list of cell texts: the row text is
# split on newline '\n' and the empty first/last pieces are discarded
tablebody = [row.text.split('\n')[1:-1] for row in table.find_all('tr')]

# create pandas DataFrame to store the table; the first row holds the headers
df = pd.DataFrame(tablebody[1:],columns=tablebody[0])

# only keep those rows that have a Borough identified by name;
# .copy() gives an independent frame so the assignment below does not
# write into a slice of the unfiltered table (SettingWithCopyWarning)
df = df.loc[df['Borough'] != 'Not assigned',:].copy()

# replace the neighbourhood name with the borough name for those
# neighbourhoods with unassigned names
unnamed_neighbourhood = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed_neighbourhood,'Neighbourhood'] = df.loc[unnamed_neighbourhood,'Borough']

# helper used as a groupby aggregator: collapse a column of (possibly
# repeated) names into a plain list of its distinct values
def f2(x):
    """Return the distinct values of the pandas Series ``x`` as a Python list."""
    distinct_values = x.unique()
    return [value for value in distinct_values]

# group the DataFrame by Postcode and collapse the Borough and Neighbourhood
# columns of each group into lists of their distinct values
aggregations = dict.fromkeys(['Borough', 'Neighbourhood'], f2)
grouped_by_postcode = df.groupby(['Postcode'])
df = grouped_by_postcode.agg(aggregations).reset_index()

# after grouping there is exactly one row per postcode
n_postcodes = df.shape[0]
print('Number of rows in dataframe (i.e. number of postcodes) ', n_postcodes)

Number of rows in dataframe (i.e. number of postcodes)  103


In [5]:
# look up lat/long for every postcode in the dataframe via geocoder and
# store the result in "Latitude" and "Longitude" columns of the dataframe

# give up on a postcode after this many failed lookups so the cell cannot
# spin forever when the geocoding service keeps returning nothing
MAX_GEOCODE_ATTEMPTS = 5

for ind in df.index:
    postcode = df.loc[ind,'Postcode']

    # reset for every postcode: a result left over from the previous row
    # must never be written to this row
    lat_long_coords = None
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
        lat_long_coords = g.latlng
        if lat_long_coords:
            break

    if not lat_long_coords:
        # no usable answer (None or empty): leave this row without coordinates
        print(postcode + ' Toronto, Ontario could not be geocoded after '
              + str(MAX_GEOCODE_ATTEMPTS) + ' attempts')
        continue

    df.loc[ind,"Latitude"] = lat_long_coords[0]
    df.loc[ind,"Longitude"] = lat_long_coords[1]
    print(postcode + ' Toronto, Ontario has lat/long ' + str(lat_long_coords))

# NOTE: getting the lat/long coordinates via geocoder was taking too long,
# so the lat/long coordinates for the various postcodes are imported from
# the provided CSV file in the next cell instead

KeyboardInterrupt: 

In [22]:
# import CSV file of lat/long coordinates into a second dataframe and
# rename its key column so that it matches the postcode dataframe
df_latlong = pd.read_csv('Geospatial_Coordinates.csv').rename(
    columns={'Postal Code':'Postcode'})

# merge two dataframes on postcode; any Latitude/Longitude columns already
# present in df (e.g. left behind by a partial geocoder run) are dropped
# first, because join raises on overlapping column names
df_toronto = (
    df.drop(columns=['Latitude','Longitude'],errors='ignore')
      .join(df_latlong.set_index('Postcode'),on='Postcode')
)

In [23]:
# preview the first rows of the merged postcode / coordinates dataframe
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,[Scarborough],"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,[Scarborough],"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,[Scarborough],"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,[Scarborough],[Woburn],43.770992,-79.216917
4,M1H,[Scarborough],[Cedarbrae],43.773136,-79.239476


In [24]:
# set up kmeans algorithm with 5 clusters
kclusters = 5

# cluster on the coordinates only; selecting the two columns explicitly
# (instead of dropping the text columns) keeps the features in
# [Latitude, Longitude] order and keeps a previously added label column
# out of the input when this cell is re-run
df_toronto_clustering = df_toronto[['Latitude','Longitude']]

# run kmeans (fixed random_state for reproducible cluster assignments)
k_means = KMeans(n_clusters=kclusters,random_state=1).fit(df_toronto_clustering)

# extract result of kmeans clustering
cluster_labels = k_means.labels_
cluster_centroids = k_means.cluster_centers_

# update toronto data with cluster labels; DataFrame.insert raises if the
# column already exists, so overwrite it in place on a re-run
if 'Cluster labels' in df_toronto.columns:
    df_toronto['Cluster labels'] = cluster_labels
else:
    df_toronto.insert(0,'Cluster labels',cluster_labels)

In [25]:
# get latitude and longitude of Toronto for plotting

address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent='Toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('the latitude/longitude of Toronto, Ontario is {}'.format([latitude,longitude]))

the latitude/longitude of Toronto, Ontario is [43.653963, -79.387207]


In [35]:
# create folium map of Toronto
toronto_map = folium.Map(location=[latitude,longitude],zoom_start=10)

# set color scheme for clusters: one hex colour per cluster id, sampled
# evenly from matplotlib's rainbow colormap
x = np.arange(kclusters)
colors_array = cm.rainbow(np.linspace(0,1,kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# plot one marker per postcode, colour coded by cluster
for lat,lng,cluster,neigh,postcode in zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Cluster labels'],
    df_toronto['Neighbourhood'],
    df_toronto['Postcode']):

        # cluster labels are 0-based, so they index the colour list directly
        cluster_color = rainbow[int(cluster)]

        label = folium.Popup('Postcode: ' + str(postcode) + ' contains neighborhoods: ' + str(neigh),parse_html=True)
        folium.CircleMarker(
            [lat,lng],
            radius=5,
            popup=label,
            color=cluster_color,
            fill=True,
            fill_color=cluster_color,
            fill_opacity=0.7).add_to(toronto_map)

# plot cluster centroids as larger black markers labelled with the cluster id
for lat_lng,cluster in zip(cluster_centroids,x):

    folium.CircleMarker(
        # centroid rows are numpy arrays in [latitude, longitude] order;
        # hand folium plain Python floats
        [float(lat_lng[0]),float(lat_lng[1])],
        radius=10,
        popup=str(cluster),
        # folium expects CSS colour names; 'k' is a matplotlib shorthand
        color='black',
        fill=True,
        fill_color='black',
        fill_opacity=0.7).add_to(toronto_map)


# display the map as the cell output
toronto_map

In [31]:
# display the k-means cluster centres, one [latitude, longitude] row per cluster
cluster_centroids

array([[ 43.66283488, -79.39607721],
       [ 43.72115859, -79.31706874],
       [ 43.74893391, -79.41817098],
       [ 43.68279116, -79.53037109],
       [ 43.77845083, -79.22767767]])