# Coursera Data Science Capstone Project

Week 3 Assignment - Segmenting and Clustering Neighborhoods in the city of Toronto, Canada

Import all necessary libraries

In [20]:
import pandas as pd
import numpy as np
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim

## Part I
### Web Scraping

Get html page content

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M" (Toronto).
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the HTTP connection; the response body is read in the next cell.
u_client = ureq(url=wiki_url)

In [3]:
# Read the whole response body. The connection is closed in `finally` so it
# is released even when the read fails part-way through.
try:
    page_html = u_client.read()
finally:
    u_client.close()

Extract data from html page

In [4]:
# Parse the raw HTML with Python's built-in parser so the table can be queried.
page_soup = soup(markup=page_html, features='html.parser')

Read the column names from the table header; they define the columns of the pandas DataFrame that stores the data

In [5]:
# Column names come from the header cells of the first table on the page.
# get_text().strip() removes surrounding whitespace (including the trailing
# newline) without assuming exactly one trailing character, and it still
# works when a header cell wraps its text in nested tags, where `.string`
# may be None.
df_columns = [th.get_text().strip() for th in page_soup.table.find_all('th')]

print(df_columns)

['Postal Code', 'Borough', 'Neighbourhood']


In [6]:
# Collect one record per table row and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copies the whole frame on every iteration.
records = []

for tr in page_soup.table.find_all('tr')[1:]:  # [1:] skips the header row
    cells = tr.find_all('td')
    if len(cells) < 3:
        # Not a data row (e.g. an empty or malformed row): nothing to extract.
        continue

    # get_text().strip() drops the trailing newline and also copes with
    # cells whose text is wrapped in nested tags such as links.
    postcode, borough, neighbourhood = (
        cell.get_text().strip() for cell in cells[:3]
    )

    # Postal codes that have no borough assigned are ignored.
    if borough == 'Not assigned':
        continue

    # A neighbourhood without a name of its own takes the borough's name.
    if neighbourhood == 'Not assigned':
        neighbourhood = borough

    records.append(
        {
            'Postal Code': postcode,
            'Borough': borough,
            'Neighbourhood': neighbourhood
        }
    )

df = pd.DataFrame(records, columns=df_columns)

In [7]:
# Sanity check: (number of rows, number of columns) of the scraped table.
print((len(df.index), len(df.columns)))

(103, 3)


In [8]:
# Preview the first five scraped rows.
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Part II
### Geolocation data

Unfortunately, the geocoder library didn't work for me, so I'll load the coordinates from the `Geospatial_Coordinates.csv` file instead

In [9]:
# Load the latitude/longitude of every postal code from the local CSV file.
coords_path = './Geospatial_Coordinates.csv'
df_coords = pd.read_csv(coords_path)
df_coords.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Attach the coordinates to each scraped row, matching on the postal code
# (default inner join: only postal codes present in both frames are kept).
df = pd.merge(df, df_coords, on='Postal Code')
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part III
### Cluster data
I chose to cluster by neighbourhood first, followed by borough

Group data by **neighbourhood**

In [11]:
# Average the coordinates of all postal codes that share a neighbourhood name.
# The numeric columns are selected explicitly: since pandas 2.0, mean() no
# longer silently drops string columns ('Postal Code', 'Borough') and raises
# a TypeError instead.
grouped_neighbourhood_df = (
    df.groupby('Neighbourhood')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)
grouped_neighbourhood_df.shape

(99, 3)

Select the number of clusters

In [30]:
kclusters = 5

# k-means needs numeric features only, so the name column is removed.
# `columns=` is used because the positional axis argument of drop() was
# removed in pandas 2.0.
grouped_clustering = grouped_neighbourhood_df.drop(columns='Neighbourhood')

# random_state is fixed so the cluster assignment is reproducible.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)
kmeans.labels_[0:10]

array([4, 2, 0, 0, 0, 3, 1, 3, 1, 3], dtype=int32)

Create a DataFrame to hold the cluster id and the geo location

In [32]:
# Put the cluster id of each row in the first column, next to its coordinates.
# Note: insert() modifies the frame in place, so this cell can only be run
# once per kernel session.
grouped_clustering.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)
grouped_clustering

Unnamed: 0,Cluster Labels,Latitude,Longitude
0,4,43.794200,-79.262029
1,2,43.602414,-79.543484
2,0,43.754328,-79.442259
3,0,43.786947,-79.385975
4,0,43.733283,-79.419750
...,...,...,...
94,0,43.782736,-79.442259
95,4,43.770992,-79.216917
96,1,43.695344,-79.318389
97,0,43.752758,-79.400049


Get the geographic coordinates of central Toronto, used to centre the map

In [33]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of Toronto, Ontario are 43.6534817, -79.3839347.


Create a map and plot the clusters

In [36]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One distinct colour per cluster id (0 .. kclusters-1).
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# The cluster labels were computed on grouped_neighbourhood_df (one row per
# neighbourhood, sorted by name), so the markers must be drawn from that same
# frame. Zipping the labels with the rows of `df` (one row per postal code,
# in scraping order) pairs each marker with the label of an unrelated row.
for lat, lon, poi, cluster in zip(
        grouped_neighbourhood_df['Latitude'],
        grouped_neighbourhood_df['Longitude'],
        grouped_neighbourhood_df['Neighbourhood'],
        grouped_clustering['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so they index the colour list directly.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Now I'll group by **borough**

In [57]:
# Average the coordinates of all postal codes that belong to each borough.
# The numeric columns are selected explicitly: since pandas 2.0, mean() no
# longer silently drops string columns ('Postal Code', 'Neighbourhood') and
# raises a TypeError instead.
grouped_borough_df = (
    df.groupby('Borough')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)
grouped_borough_df

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654597,-79.383972
2,East Toronto,43.669436,-79.324654
3,East York,43.700303,-79.335851
4,Etobicoke,43.660043,-79.542074
5,Mississauga,43.636966,-79.615819
6,North York,43.750727,-79.429338
7,Scarborough,43.766229,-79.249085
8,West Toronto,43.652653,-79.44929
9,York,43.690797,-79.472633


In [58]:
# NOTE(review): the borough table displayed above has 10 rows, so k = 10 puts
# every borough in a cluster of its own - confirm this is the intended k.
kclusters = 10

# k-means needs numeric features only, so the name column is removed.
# `columns=` is used because the positional axis argument of drop() was
# removed in pandas 2.0.
grouped_clustering = grouped_borough_df.drop(columns='Borough')

# random_state is fixed so the cluster assignment is reproducible.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)
kmeans.labels_[0:10]

array([4, 7, 9, 0, 1, 5, 6, 3, 8, 2], dtype=int32)

In [59]:
# Put the cluster id of each borough in the first column, next to its
# coordinates. insert() modifies the frame in place, so this cell can only be
# run once per kernel session.
grouped_clustering.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)
grouped_clustering

Unnamed: 0,Cluster Labels,Latitude,Longitude
0,4,43.70198,-79.398954
1,7,43.654597,-79.383972
2,9,43.669436,-79.324654
3,0,43.700303,-79.335851
4,1,43.660043,-79.542074
5,5,43.636966,-79.615819
6,6,43.750727,-79.429338
7,3,43.766229,-79.249085
8,8,43.652653,-79.44929
9,2,43.690797,-79.472633


In [74]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One distinct colour per cluster id (0 .. kclusters-1), taken from the first
# 80% of the rainbow colour map.
colors_array = cm.rainbow(np.linspace(0, 0.8, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# The cluster labels were computed on grouped_borough_df (one row per
# borough), so the markers must be drawn from that same frame. Zipping the
# labels with the rows of `df` (one row per postal code) stops after the
# first few postal codes and pairs them with labels of unrelated boroughs.
for lat, lon, poi, cluster in zip(
        grouped_borough_df['Latitude'],
        grouped_borough_df['Longitude'],
        grouped_borough_df['Borough'],
        grouped_clustering['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so they index the colour list directly.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters