# Segmenting & Clustering Neighborhoods in Toronto: Part 3

### Scrape Wikipedia page

In [1]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
import requests
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
website_url = response.text

# use BeautifulSoup to get the table
from bs4 import BeautifulSoup
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning, so results vary by environment.
soup = BeautifulSoup(website_url, 'html.parser')
my_table = soup.find('table',{'class':'wikitable sortable'})
if my_table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than letting read_html choke on the string "None".
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

# read the table to a DataFrame
from io import StringIO
from IPython.display import display_html
import pandas as pd
# read_html accepts file-like objects; wrapping the markup in StringIO avoids
# the deprecated "literal HTML string" input path in recent pandas versions.
df = pd.read_html(StringIO(str(my_table)))[0]

In [2]:
# Keep only the postcodes that have been assigned to a borough.
# .copy() makes the result an independent frame, so the .loc assignment below
# cannot trigger pandas' SettingWithCopyWarning.
df = df.loc[df['Borough'] != 'Not assigned'].copy()

# if the neighbourhood is "Not assigned", assign the Borough name as the Neighbourhood
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']

# Add the coordinate columns, initialised to empty strings.
df = df.assign(Latitude = "", Longitude = "")

### Geocode to get latitude and longitude

We use the `Nominatim` geocoder from the `geopy` package.

Geocoding is first attempted with "Neighbourhood, Borough" and, if that is not found, with "Neighbourhood, Toronto ON".

In [3]:
!conda install -c conda-forge geocoder --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
from ipywidgets import IntProgress
from IPython.display import display

n_neighborhoods = df.shape[0]

# progress bar
f = IntProgress(min=0, max=n_neighborhoods) # instantiate the bar
display(f) # display the bar

# geocode neighborhoods
geolocator = Nominatim(user_agent="my_agent")
for i in range(n_neighborhoods):
    row = df.iloc[i]

    try: 
        address = (row.Neighbourhood + ", " + row.Borough)# + row.Postcode)
        #    address = address.replace("Downtown","")
        location = geolocator.geocode(address)
        df.loc[df['Neighbourhood']==row.Neighbourhood,'Latitude'] = location.latitude
        df.loc[df['Neighbourhood']==row.Neighbourhood,'Longitude'] = location.longitude
    except:
        try:
            address = (row.Neighbourhood + ", Toronto ON")
            location = geolocator.geocode(address)
            df.loc[df['Neighbourhood']==row.Neighbourhood,'Latitude'] = location.latitude
            df.loc[df['Neighbourhood']==row.Neighbourhood,'Longitude'] = location.longitude
        except:
            print(row.Neighbourhood + ", " + row.Borough + " not found")

    f.value = i #progressbar

IntProgress(value=0, max=211)

Humewood-Cedarvale, York not found
Caledonia-Fairbanks, York not found
CFB Toronto, North York not found
Canada Post Gateway Processing Centre, Mississauga not found
Island airport, Downtown Toronto not found
Railway Lands, Downtown Toronto not found
Humber Bay Shores, Etobicoke not found
Beaumond Heights, Etobicoke not found
Stn A PO Boxes 25 The Esplanade, Downtown Toronto not found
Business Reply Mail Processing Centre 969 Eastern, East Toronto not found


### Clustering and analysis

We are following DP0701EN-3-3-2-Neighbourhoods-New-York-py-v1.0.ipynb

First clustering. In this case, clustering is by location (latitude and longitude) only.

I use 10 clusters, because as we will see, a few of the clusters are populated by just a single outlier - probably because there are errors in the geocoding.

In [14]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 10

# Convert the coordinate columns to numeric; errors='coerce' turns anything
# that is not a number into NaN.
coord_cols = ['Latitude', 'Longitude']
df = df.assign(**{col: pd.to_numeric(df[col], errors='coerce') for col in coord_cols})

# Remove non-geocoded records. Restricting dropna to the coordinate columns
# keeps rows that merely have a missing value in some unrelated column.
df = df.dropna(subset=coord_cols)

# run k-means clustering
# n_init is pinned because its default differs between scikit-learn versions;
# together with random_state this keeps the clustering reproducible.
neighbourhoods_geo = df[coord_cols]
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(neighbourhoods_geo)

In [16]:
# Store the cluster id that k-means assigned to each row in a new column,
# then preview the first rows of the labelled frame.
cluster_ids = kmeans.labels_
df['Cluster Labels'] = cluster_ids
df.head()

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,7,M3A,North York,Parkwoods,43.7588,-79.320197
3,7,M4A,North York,Victoria Village,43.732658,-79.311189
4,9,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015
5,9,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457
6,0,M6A,North York,Lawrence Heights,43.722778,-79.450933


In [20]:
# Count how many rows carry each cluster label (largest cluster first).
cluster_sizes = df['Cluster Labels'].value_counts()

print("Number of neighbourhoods in each cluster")
cluster_sizes

Number of neighbourhoods in each cluster


9    60
6    45
7    39
0    27
5    13
4    13
8     1
3     1
2     1
1     1
Name: Cluster Labels, dtype: int64

Now, map the clusters.

I am using the colourmap `tab10` rather than `rainbow` because it shows up better on this folium map.
I suspect `tab10` is the Tableau 10 colormap, so it is not surprising that it works well.

In [19]:
!conda install -c conda-forge folium=0.5.0
import folium # map rendering library
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors


# create map
toronto = geolocator.geocode('Toronto, ON')
map_clusters = folium.Map(location=[toronto.latitude, toronto.longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.tab10(np.linspace(0, 1, len(ys)))
tab10 = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighbourhood'], df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=tab10[cluster-1],
        fill=True,
        fill_color=tab10[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Clustering does a good job at separating the dense downtown from the surrounding boroughs.

Some outliers formed their own cluster -- perhaps a density-based cluster method would be better than k-means?