# Segmenting and Clustering Neighborhoods in Toronto pt. 3

Install required packages.

In [None]:
%%capture
!pip install -r requirements.txt

Load packages

In [1]:
import pandas as pd
import numpy as np
import geocoder
import requests
import os
import json
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

Set constants

In [2]:
FOURSQUARE_API_URL = 'https://api.foursquare.com/v2/venues/explore'
POST_CODES_FILE = 'out/postal_codes.pickle'
VENUES_FILE = 'data/toronto_venues.pickle'

Select only the postcodes in boroughs which have "Toronto" in their names.

In [3]:
postal_codes = pd.read_pickle(POST_CODES_FILE)
postal_codes = postal_codes[postal_codes.Borough.str.contains('Toronto')]
postal_codes.shape

(38, 5)

Define function to retrieve venues in (500 m) radius around the post code centres.

In [None]:
def getNearbyVenues(post_codes, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for post_code, lat, lng in zip(post_codes, latitudes, longitudes):

        params = dict(
            client_id=os.getenv('FOURSQUARE_CLIENT_ID'),
            client_secret=os.getenv('FOURSQUARE_SECRET'),
            v='20190920',
            ll=f'{lat},{lng}',
            radius=radius,
            limit=100
        )
            
        # make the GET request
        results = requests.get(url=FOURSQUARE_API_URL, params=params).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            post_code, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [35]:
toronto_venues = getNearbyVenues(postal_codes.Postcode, postal_codes.latitude, postal_codes.longitude, 
                                 radius=500)
toronto_venues.head()
toronto_venues.to_pickle(VENUES_FILE)

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4N,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,M4N,43.72802,-79.38879,Dim Sum Deluxe,43.726953,-79.39426,Dim Sum Restaurant
2,M4N,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
3,M4N,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
4,M4P,43.712751,-79.390197,Sherwood Park,43.716551,-79.387776,Park


In [38]:
toronto_venues = pd.read_pickle(VENUES_FILE)
toronto_venues.shape

(832, 7)

Let's check how many venues were returned for each neighborhood

In [39]:
toronto_venues.Postcode.value_counts()

M6S    30
M5E    30
M5K    30
M5B    30
M5W    30
M4K    30
M5G    30
M5T    30
M4M    30
M4Y    30
M4X    30
M5J    30
M6J    30
M5A    30
M5H    30
M5L    30
M5X    30
M5S    30
M5C    30
M4S    30
M6P    23
M5R    23
M6K    22
M4L    20
M4R    20
M7Y    18
M6G    17
M6R    15
M4V    15
M6H    14
M5V    14
M4P     8
M5P     5
M4W     5
M4E     4
M4T     4
M4N     4
M5N     1
Name: Postcode, dtype: int64

#### Let's find out how many unique categories can be curated from all the returned venues

In [41]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 186 uniques categories.


<a id='item3'></a>

## Analyze Each Postcode

In [43]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add postcode column back to dataframe
toronto_onehot['Postcode'] = toronto_venues['Postcode'] 

# move postcode column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Postcode,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,M4N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4P,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [45]:
toronto_onehot.shape

(832, 187)

#### Next, let's group rows by postcode and by taking the mean of the frequency of occurrence of each category

In [46]:
toronto_grouped = toronto_onehot.groupby('Postcode').mean().reset_index()
toronto_grouped

Unnamed: 0,Postcode,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.033333
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05
7,M4S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0


#### Let's confirm the new size

In [47]:
toronto_grouped.shape

(38, 187)

#### Let's print each postcode along with the top 5 most common venues

In [48]:
num_top_venues = 5

for hood in toronto_grouped['Postcode']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Postcode'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M4E----
               venue  freq
0       Neighborhood  0.25
1  Health Food Store  0.25
2                Pub  0.25
3              Trail  0.25
4             Museum  0.00


----M4K----
                    venue  freq
0        Greek Restaurant  0.30
1          Ice Cream Shop  0.07
2      Italian Restaurant  0.07
3             Yoga Studio  0.03
4  Furniture / Home Store  0.03


----M4L----
              venue  freq
0       Pizza Place  0.10
1    Sandwich Place  0.10
2  Sushi Restaurant  0.05
3         Pet Store  0.05
4              Park  0.05


----M4M----
                 venue  freq
0                 Café  0.13
1          Coffee Shop  0.10
2  American Restaurant  0.07
3   Italian Restaurant  0.07
4               Bakery  0.07


----M4N----
                venue  freq
0  Dim Sum Restaurant  0.25
1                Park  0.25
2         Swim School  0.25
3            Bus Line  0.25
4             Airport  0.00


----M4P----
               venue  freq
0                Gym  0.12
1           

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [49]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each postcode.

In [61]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postcode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postcode'] = toronto_grouped['Postcode']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Trail,Health Food Store,Pub,Neighborhood,Yoga Studio,Cuban Restaurant,Eastern European Restaurant,Dog Run,Discount Store,Diner
1,M4K,Greek Restaurant,Italian Restaurant,Ice Cream Shop,Yoga Studio,Spa,Pub,Pizza Place,Juice Bar,Indian Restaurant,Furniture / Home Store
2,M4L,Sandwich Place,Pizza Place,Pet Store,Food & Drink Shop,Fish & Chips Shop,Steakhouse,Ice Cream Shop,Sushi Restaurant,Brewery,Pub
3,M4M,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Sandwich Place,Fish Market,Neighborhood,Bookstore,Seafood Restaurant
4,M4N,Bus Line,Park,Dim Sum Restaurant,Swim School,Yoga Studio,Eastern European Restaurant,Dog Run,Discount Store,Diner,Dessert Shop


<a id='item4'></a>

## Cluster Postcodes

Run *k*-means to cluster the postcodes into 5 clusters.

In [55]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Postcode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 3, 2, 2, 2, 0, 2], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each postcode.

In [62]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = postal_codes

# merge toronto_grouped with toronto_data to add latitude/longitude for each postcode
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Postcode'), on='Postcode')

toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Postcode,Neighbourhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879,3,Bus Line,Park,Dim Sum Restaurant,Swim School,Yoga Studio,Eastern European Restaurant,Dog Run,Discount Store,Diner,Dessert Shop
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197,2,Park,Gym,Clothing Store,Breakfast Spot,Dog Run,Hotel,Sandwich Place,Food & Drink Shop,Yoga Studio,Dance Studio
2,Central Toronto,M4R,North Toronto West,43.715383,-79.405678,2,Coffee Shop,Sporting Goods Shop,Yoga Studio,Bagel Shop,Dessert Shop,Spa,Burger Joint,Salon / Barbershop,Metro Station,Restaurant
3,Central Toronto,M4S,Davisville,43.704324,-79.38879,2,Dessert Shop,Sandwich Place,Sushi Restaurant,Coffee Shop,Italian Restaurant,Gym,Café,Pizza Place,Brewery,Diner
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316,0,Gym,Playground,Trail,Restaurant,Yoga Studio,Creperie,Dog Run,Discount Store,Diner,Dim Sum Restaurant


Finally, let's visualize the resulting clusters

In [67]:
# Coordinates of Toronto's center
g = geocoder.google('Toronto, Ontario', components='country:CA')
print(g.geojson['features'][0]['properties']['lat'], g.geojson['features'][0]['properties']['lng'])

43.653226 -79.3831843


In [71]:
# create map
map_clusters = folium.Map(location=[43.653226, -79.3831843], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['Postcode'],
                                  toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<a id='item5'></a>

## Examine Clusters

Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster.

#### Cluster 1

In [73]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4T,0,Gym,Playground,Trail,Restaurant,Yoga Studio,Creperie,Dog Run,Discount Store,Diner,Dim Sum Restaurant


#### Cluster 2

In [74]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,M5N,1,Garden,Yoga Studio,Cuban Restaurant,Ethiopian Restaurant,Eastern European Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop


#### Cluster 3

In [75]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4P,2,Park,Gym,Clothing Store,Breakfast Spot,Dog Run,Hotel,Sandwich Place,Food & Drink Shop,Yoga Studio,Dance Studio
2,M4R,2,Coffee Shop,Sporting Goods Shop,Yoga Studio,Bagel Shop,Dessert Shop,Spa,Burger Joint,Salon / Barbershop,Metro Station,Restaurant
3,M4S,2,Dessert Shop,Sandwich Place,Sushi Restaurant,Coffee Shop,Italian Restaurant,Gym,Café,Pizza Place,Brewery,Diner
5,M4V,2,Coffee Shop,Pub,American Restaurant,Sushi Restaurant,Sports Bar,Restaurant,Bagel Shop,Pizza Place,Fried Chicken Joint,Liquor Store
8,M5R,2,Coffee Shop,Sandwich Place,Café,Pizza Place,Cosmetics Shop,Liquor Store,Burger Joint,Jewish Restaurant,Indian Restaurant,Pub
10,M4X,2,Coffee Shop,Restaurant,Italian Restaurant,Café,Japanese Restaurant,Sandwich Place,Jewelry Store,Liquor Store,Butcher,Beer Store
11,M4Y,2,Gay Bar,Park,Burger Joint,Indian Restaurant,Ethiopian Restaurant,Italian Restaurant,Bookstore,Breakfast Spot,Bubble Tea Shop,Salon / Barbershop
12,M5A,2,Coffee Shop,Bakery,Park,Gym / Fitness Center,Mexican Restaurant,Breakfast Spot,Performing Arts Venue,Chocolate Shop,Pub,Café
13,M5B,2,Clothing Store,Coffee Shop,Café,Ramen Restaurant,Hotel,Beer Bar,Sandwich Place,Diner,Japanese Restaurant,Burger Joint
14,M5C,2,Coffee Shop,Gastropub,Restaurant,Japanese Restaurant,Italian Restaurant,Hotel,New American Restaurant,Cosmetics Shop,Café,Middle Eastern Restaurant


#### Cluster 4

In [76]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,3,Bus Line,Park,Dim Sum Restaurant,Swim School,Yoga Studio,Eastern European Restaurant,Dog Run,Discount Store,Diner,Dessert Shop
7,M5P,3,Bus Line,Park,Trail,Jewelry Store,Sushi Restaurant,Yoga Studio,Eastern European Restaurant,Dog Run,Discount Store,Diner


#### Cluster 5

In [78]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,M4W,4,Park,Playground,Trail,Building,Yoga Studio,Cuban Restaurant,Dog Run,Discount Store,Diner,Dim Sum Restaurant
