# Segmenting and Clustering Neighborhoods in Toronto

## Applied Data Science Capstone - Week 3

### 1. Data Pre-Processing

In [1]:
# Importing some libraries
import numpy as np
import pandas as pd
import requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
# Wikipedia page's url
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Getting the first table from Wikipedia page
df = pd.read_html(url)[0]

In [4]:
# Checking if df is correct
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Postal Code    180 non-null    object
 1   Borough        180 non-null    object
 2   Neighbourhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB


In [6]:
# Dropping those rows whose Borough is "Not assigned"
df = df[df['Borough'] != 'Not assigned']

In [7]:
# Checking df once more
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 2 to 178
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Postal Code    103 non-null    object
 1   Borough        103 non-null    object
 2   Neighbourhood  103 non-null    object
dtypes: object(3)
memory usage: 3.2+ KB


In [9]:
df['Postal Code'].unique().size

103

There are 103 entries, and also 103 unique Postal Code. Therefore, every postal code indicates its neighborhoods in one row. There is no need of further pre-processing.

In [10]:
# Ckecking for 'Not assigned' Neighborhoods
df[df['Neighbourhood'] =='Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


There is no 'Not assigned' Neighborhood in df.

In [11]:
df.shape

(103, 3)

### 2. Getting Neighborhoods Coordinates

In [12]:
# Using CSV file provided by Coursera
coord = pd.read_csv('Geospatial_Coordinates.csv')

In [13]:
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Merging df and coord dataframes on 'Postal Code' label
result = pd.merge(df,coord,on='Postal Code')

In [15]:
result

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [28]:
result_toronto = result[result['Borough'].str.contains('Toronto')]

In [32]:
result_toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


### 3. Clustering Toronto Neighborhoods

#### 3.1 Showing Postal Code on Toronto's map

In [33]:
from geopy.geocoders import Nominatim

from sklearn.cluster import KMeans

import folium

In [34]:
address = 'Toronto, On'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [37]:
# create map of Toronto using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add Postal Code markers to map
for pc, lat, lng, borough, neighborhood in zip(result['Postal Code'], 
                                               result['Latitude'], result['Longitude'], result['Borough'], result['Neighbourhood']):
    label = f'{pc}, {neighborhood}, {borough}'
    label = folium.Popup(label, parse_html=True)
    folium.vector_layers.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
#map_newyork

#### 3.2 Organizing for Clustering

Foursquare Credentials and Version

In [38]:
CLIENT_ID = 'UVQYTO1U3J0FFQAHZIVP2ZXP5M3EYSCVPOMOEXYQ0DL1XDRK' # your Foursquare ID
CLIENT_SECRET = 'AQL0YLQ3WQAHQPIDNWXPC1EEJ5QW5C0Z3ZQG4U4DGRIASYZY' # your Foursquare Secret
VERSION = '20180605'
LIMIT = 200
radius = 500

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: UVQYTO1U3J0FFQAHZIVP2ZXP5M3EYSCVPOMOEXYQ0DL1XDRK
CLIENT_SECRET:AQL0YLQ3WQAHQPIDNWXPC1EEJ5QW5C0Z3ZQG4U4DGRIASYZY


The function below was copied form Manhatan example.

In [39]:
def getNearbyVenues(postalcodes, names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for postalcode, name, lat, lng in zip(postalcodes, names, latitudes, longitudes):
        # print(postalcode, ' ', name)    
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            postalcode,
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code',
                             'Neighborhood', 
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    
    return(nearby_venues)

Applying the function above to every Postal Code

In [40]:
toronto_venues = getNearbyVenues(postalcodes = result['Postal Code'],
                                 names=result['Borough'],
                                 latitudes=result['Latitude'],
                                 longitudes=result['Longitude']
                                  )

In [41]:
toronto_venues.size

17296

In [42]:
toronto_venues.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,North York,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,North York,43.753259,-79.329656,Sun Life,43.75476,-79.332783,Construction & Landscaping
2,M3A,North York,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M4A,North York,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,M4A,North York,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [43]:
toronto_venues.shape

(2162, 8)

#### Let's see how many venues there are in each Postal Code

In [44]:
toronto_venues.groupby('Postal Code').count()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M1B,1,1,1,1,1,1,1
M1C,3,3,3,3,3,3,3
M1E,8,8,8,8,8,8,8
M1G,3,3,3,3,3,3,3
M1H,9,9,9,9,9,9,9
...,...,...,...,...,...,...,...
M9N,2,2,2,2,2,2,2
M9P,7,7,7,7,7,7,7
M9R,4,4,4,4,4,4,4
M9V,9,9,9,9,9,9,9


It is necessary to get_dummies form 'Venue Category' column. 

In [68]:
#one hot encoding
toronto_venues_categories = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

In [69]:
toronto_venues_categories.shape

(2162, 272)

In [70]:
toronto_venues_categories_merged = toronto_venues.merge(toronto_venues_categories,on=toronto_venues.index)

In [71]:
toronto_venues_categories_merged.drop(['key_0'],axis=1,inplace=True)

In [72]:
toronto_venues_categories_merged.shape

(2162, 280)

Lets group rows by Postal Code, taking the total amount of a given venue

In [81]:
toronto_grouped = toronto_venues_categories_merged.groupby('Postal Code').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,43.806686,-79.194353,43.807448,-79.199056,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,43.784535,-79.160497,43.783618,-79.160541,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,43.763573,-79.188711,43.76632,-79.191291,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,43.770992,-79.216917,43.770559,-79.219579,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,43.773136,-79.239476,43.774519,-79.240678,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
toronto_grouped.shape

(100, 277)

#### 3.3 Clustering

In [83]:
# set number of clusters
kclusters = 6

toronto_grouped_clustering = toronto_grouped.drop(['Postal Code','Neighborhood Latitude', 'Neighborhood Longitude', 'Venue Latitude', 'Venue Longitude'], axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([5, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [84]:
toronto_grouped = toronto_grouped.join(pd.DataFrame(kmeans.labels_,columns=['Cluster Label']))

In [85]:
toronto_grouped.head()

Unnamed: 0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Cluster Label
0,M1B,43.806686,-79.194353,43.807448,-79.199056,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,M1C,43.784535,-79.160497,43.783618,-79.160541,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,M1E,43.763573,-79.188711,43.76632,-79.191291,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,M1G,43.770992,-79.216917,43.770559,-79.219579,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,M1H,43.773136,-79.239476,43.774519,-79.240678,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [86]:
toronto_grouped.shape

(100, 278)

In [87]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_grouped['Neighborhood Latitude'], toronto_grouped['Neighborhood Longitude'], toronto_grouped['Postal Code'], toronto_grouped['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.vector_layers.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters