# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
#!pip install folium
import folium 

## Part I: build a dataframe of the postal code of each neighborhood in Toronto

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'


def _parse_postal_cell(text):
    """Split one cell of the scraped postal-code grid into its three fields.

    The first three characters of the cell are taken as the postal code, the
    text up to the first '(' as the borough, and the text inside the first
    pair of parentheses as the neighborhood list (with ' /' separators turned
    into commas).

    Parameters
    ----------
    text : object
        Raw cell value from the scraped table.

    Returns
    -------
    dict or None
        ``{'PostalCode', 'Borough', 'Neighborhood'}`` for an assigned cell;
        ``None`` when the value is not a string (e.g. NaN) or contains
        'Not assigned'.
    """
    if not isinstance(text, str) or 'Not assigned' in text:
        return None
    pieces = text[3:].split('(')
    borough = pieces[0]
    if len(pieces) > 1:
        neighborhood = pieces[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    else:
        # No parenthesised neighborhood list: reuse the borough text instead
        # of failing with an IndexError.
        neighborhood = borough
    return {'PostalCode': text[:3], 'Borough': borough, 'Neighborhood': neighborhood}


table = pd.read_html(url)[0]
# Walk the cell values directly (row by row, left to right) rather than
# feeding index/column labels to the positional .iloc indexer, which only
# works while both axes happen to be default integer ranges.
table_contents = []
for value in table.to_numpy().ravel():
    cell = _parse_postal_cell(value)
    if cell is not None:
        table_contents.append(cell)
df = pd.DataFrame(table_contents)
# Some cells run the borough name straight into extra address text;
# map those fused strings to readable borough names.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
    'EtobicokeNorthwest':'Etobicoke Northwest',
    'East YorkEast Toronto':'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre':'Mississauga',
})
df.shape

(103, 3)

## Part II: add the geographical coordinates of each neighborhood to the dataframe

In [3]:
# Latitude/longitude lookup table, keyed by a 'Postal Code' column.
data = pd.read_csv('Geospatial_Coordinates.csv')
df = df.sort_values(by=['PostalCode']).reset_index(drop=True)
# Join on the postal code itself instead of relying on both tables being in
# the same row order: a positional assignment would silently attach the wrong
# coordinates if the CSV were ordered differently or had extra/missing rows.
# Any existing coordinate columns are dropped first so the cell can be re-run.
df = (
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
      .merge(data[['Postal Code', 'Latitude', 'Longitude']],
             how='left', left_on='PostalCode', right_on='Postal Code',
             validate='many_to_one')
      .drop(columns=['Postal Code'])
)
# Fail here, with a clear message, rather than later inside the clustering step.
assert df[['Latitude', 'Longitude']].notna().all().all(), 'some postal codes have no coordinates'
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part III: cluster the neighborhoods in Toronto and visualize the resulting clusters

In [5]:
# set number of clusters and run k-means clustering on the coordinates only
nbr_clusters = 5
# n_init is pinned so the number of restarts (and therefore the labels) does
# not depend on which scikit-learn version is installed.
kmeans = KMeans(n_clusters=nbr_clusters, random_state=0, n_init=10).fit(df[['Latitude','Longitude']])
# keep df untouched; the cluster labels live on a copy
new_df = df.copy()
new_df['Cluster'] = kmeans.labels_
# set the geographical coordinates of Toronto
latlng = [43.651070,-79.347015]
# create map
map_clusters = folium.Map(location=latlng, zoom_start=11)
# set color scheme for the clusters: one hex color per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, nbr_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# add one circle marker per postal code, colored by its cluster
for lat, lng, neighborhood, code, cluster in zip(new_df['Latitude'],
                                                 new_df['Longitude'],
                                                 new_df['Neighborhood'],
                                                 new_df['PostalCode'],
                                                 new_df['Cluster']):
    label = folium.Popup(f'{neighborhood} PostalCode:{code} Cluster {cluster}', parse_html=True)
    # Labels are 0-based, so they index the palette directly; the previous
    # `cluster-1` only worked because -1 wraps around to the last color.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
map_clusters