## This Notebook will be used for the Capstone Project

### Import required packages

In [44]:
import pandas as pd
import numpy as np

#! pip install sklearn
from sklearn.cluster import KMeans

# ! pip install requests geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests
import matplotlib.cm as cm
import matplotlib.colors as colors

#! pip install folium
import folium # map rendering library


### Scraping of table on the webpage

In [2]:
# Download the Wikipedia page and parse every HTML table found on it;
# the first table in the returned list is the one kept as `df`.
wiki_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
df = wiki_tables[0]
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [3]:
# Keep only the rows whose Borough does not contain "Not assigned".
# The comparison against False (rather than a plain negation) means a row
# whose match result is missing is excluded too, since it is not equal to False.
has_borough = df["Borough"].str.contains("Not assigned").eq(False)

# Renumber the surviving rows from 0 and discard the old index.
df = df.loc[has_borough].reset_index(drop=True)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Dataframe shape

In [4]:
# (number of rows, number of columns) of the current postal-code table `df`
df.shape

(103, 3)

### Adding Coordinates for each  Postal Code

We merge the latitude and longitude coordinates into the table by matching rows on their Postal Code.

In [6]:
# Load the coordinate lookup table from the local CSV file.
geo = pd.read_csv("Geospatial_Coordinates.csv")

# Inner join on the shared 'Postal Code' column: only postal codes present
# in both tables appear in the result.
new_df = df.merge(geo, how="inner", on="Postal Code")
new_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Preprocessing of Data
We apply one-hot encoding to the Borough column, restricted to the rows whose Borough contains the word "Toronto".

In [41]:
# Restrict the table to the rows whose Borough name contains "Toronto".
a = new_df[new_df['Borough'].str.contains("Toronto")]

# One indicator column per borough, named after the borough itself
# (empty prefix and separator).
df_onehot = pd.get_dummies(a[['Borough']], prefix="", prefix_sep="")

# Put the neighborhood name in front of the indicator columns. The values are
# aligned on the row index, which df_onehot shares with `a`.
df_onehot.insert(0, 'Neighborhood', a['Neighborhood'])

# Average the indicators within each neighborhood, then turn the group key
# back into an ordinary column.
neighborhood_means = df_onehot.groupby('Neighborhood').mean()
df_grouped = neighborhood_means.reset_index()
df_grouped

Unnamed: 0,Neighborhood,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,Berczy Park,0,1,0,0
1,"Brockton, Parkdale Village, Exhibition Place",0,0,0,1
2,"Business reply mail Processing Centre, South C...",0,0,1,0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0,1,0,0
4,Central Bay Street,0,1,0,0
5,Christie,0,1,0,0
6,Church and Wellesley,0,1,0,0
7,"Commerce Court, Victoria Hotel",0,1,0,0
8,Davisville,1,0,0,0
9,Davisville North,1,0,0,0


We also obtain the coordinates for Toronto.

In [11]:
address = 'Toronto, Ontario'

# Resolve the address to coordinates with the Nominatim geocoder.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when no match is found; fail with a clear message
# here instead of an AttributeError on the attribute access below.
if location is None:
    raise RuntimeError(f"Could not geocode address: {address!r}")

latitude = location.latitude
longitude = location.longitude
print(latitude,",",longitude)

43.6534817 , -79.3839347


### Clustering

We run K-means clustering with K=5.

In [22]:
# set number of clusters
kclusters = 5

# Feature matrix: everything except the neighborhood name. The column is
# dropped by keyword because newer pandas no longer accepts `axis` as a
# positional argument. 'Cluster Labels' is also removed when present, so that
# re-running this cell after labels were inserted into df_grouped does not
# feed the previous labels back in as a feature.
df_grouped_clustering = (
    df_grouped
    .drop(columns='Neighborhood')
    .drop(columns='Cluster Labels', errors='ignore')
)

# NOTE(review): the features are one-hot borough indicators, so there may be
# fewer distinct rows than kclusters; scikit-learn warns in that case.
# Consider lowering kclusters - TODO confirm.

# run k-means clustering; n_init is pinned to the long-standing default of 10
# so the result does not depend on the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

  return_n_iter=True)


array([4, 2, 3, 4, 4, 4, 4, 4, 1, 1], dtype=int32)

In [42]:
# Attach each neighborhood's cluster label as the first column of df_grouped.
# A label column left over from an earlier run of this cell is removed first,
# because DataFrame.insert raises ValueError when the column already exists.
df_grouped = df_grouped.drop(columns='Cluster Labels', errors='ignore')
df_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

# Look up the label and borough indicators for every Toronto row by its
# neighborhood name. reset_index() keeps the old row index as an 'index' column.
df_merged = a.join(df_grouped.set_index('Neighborhood'), on='Neighborhood').reset_index()

df_merged.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4,0,1,0,0
1,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4,0,1,0,0
2,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,4,0,1,0,0
3,15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,4,0,1,0,0
4,19,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,0,0,1,0


We then create the map with clusters.

In [45]:
# Base map centered on the latitude/longitude obtained for Toronto.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex color per cluster, evenly spaced along matplotlib's rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per row of df_merged, colored by its cluster label.
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # K-means labels run from 0 to kclusters-1, so they index the palette
    # directly (no reliance on negative-index wrap-around for label 0).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters