# Segmenting and Clustering Neighborhoods in Toronto
## Segmenting and Clustering Neighborhoods with Foursquare information
---
**Xu Qianyi**

Data Scientist

## 1. Load location of neighborhoods in Toronta

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('http://cocl.us/Geospatial_data')
df_loc = df.rename(columns={'Postal Code' : 'PostalCode'})
print(df_loc.shape)
df_loc.head()

(103, 3)


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## 2. Fetch all neighborhood information from wikipedia

Import **requests** library for scraping data from wikipedia, and import **lxml** library for html parsing

In [3]:
import requests
from lxml import etree
import pandas as pd

Get the html content of the website page with requests.get() function

In [4]:
# using requests.get(url) to get html content
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
resp = requests.get(wiki_url)

#delete all '\n' in the content for further string proces
resp_str=resp.text.replace('\n', '')
 

Get table column names and data with xpah function in lxml library.

In [5]:
#Parse from html string
root = etree.fromstring(resp_str)
trs = root.xpath('//table[contains(@class, "wikitable")]/.//tr')

#Get table headers, and use them to construct headers of a new dataframe
ths = trs[0].xpath('th/text()')
df_original = pd.DataFrame(columns=[th for th in ths])

#Get all Postcode, Borough, Neighborhood from the table 
loc_idx = 0
for tr in trs[1:]:
    tds = tr.xpath('td/text() | td/a/text()')
    df_original.loc[loc_idx] = [td for td in tds]
    loc_idx += 1
    

In [6]:
print('We get {} rows of neighborhoods in Toronto.'.format(df_original.shape[0]))
df_original.head()

We get 289 rows of neighborhoods in Toronto.


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Process the Neighborhood dataframe, replace 'Not assigned' cells

In [7]:
df_neighborhood = df_original[df_original["Borough"] != 'Not assigned']
df_neighborhood.head()

for index, row in df_neighborhood.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']

df_postcodes = df_neighborhood.groupby(['Postcode', 'Borough']).agg({'Neighbourhood':lambda x: '%s'%', '.join(x)}).reset_index()
df_postcodes = df_postcodes.rename(columns={'Postcode' : 'PostalCode'})

print('We finally get {} rows of different postcodes.'.format(df_postcodes.shape))
df_postcodes.head()

We finally get (103, 3) rows of different postcodes.


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Inner join two tables into one

In [8]:
df_infos = df_postcodes.join(df_loc.set_index('PostalCode'), on='PostalCode')
print(df_infos.shape)
df_infos.head()

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## 3. Explore one Borough in Toronto

We only work with borough name to do clustering task

In [16]:
df_boroughs = df_infos[["Borough", "Latitude", "Longitude"]].groupby('Borough').mean().reset_index()
df_boroughs

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654169,-79.383665
2,East Toronto,43.669436,-79.324654
3,East York,43.700303,-79.335851
4,Etobicoke,43.660043,-79.542074
5,Mississauga,43.636966,-79.615819
6,North York,43.750727,-79.429338
7,Queen's Park,43.662301,-79.389494
8,Scarborough,43.766229,-79.249085
9,West Toronto,43.652653,-79.44929


Before we get the data and start exploring it, let's download all the dependencies that we will need.

In [18]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [None]:
# import geocoder # import geocoder

# # initialize your variable to None
# lat_lng_coords = None

# # loop until you get the coordinates
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

Let's get the geographical coordinates of Toronto

In [19]:
address = 'Toronto, Ontario'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto are 43.653963, -79.387207.


Show all boroughs with blue markers on the map

In [21]:
# create map of Boroughs in Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_boroughs['Latitude'], df_boroughs['Longitude'], df_boroughs['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Next, we are going to start utilizing the Foursquare API to explore the Boroughs and segment them.

#### Define Foursquare Credentials and Version

In [25]:
CLIENT_ID = '3ES0INYJN3CE5SBJGBTUNGWY0OQM30YD11UMKKFXJTG2P1XP' # your Foursquare ID
CLIENT_SECRET = 'BGQ1TEAYASKLBAKAC3YCO15KRW4RT1AFRLBATI3XJT00JPLE' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3ES0INYJN3CE5SBJGBTUNGWY0OQM30YD11UMKKFXJTG2P1XP
CLIENT_SECRET:BGQ1TEAYASKLBAKAC3YCO15KRW4RT1AFRLBATI3XJT00JPLE


#### Let's explore the first borough in our dataframe.

In [22]:
df_boroughs.loc[0, 'Borough']

'Central Toronto'

Get the borough's latitude and longitude values.

In [23]:
borough_latitude = df_boroughs.loc[0, 'Latitude'] # borough latitude value
borough_longitude = df_boroughs.loc[0, 'Longitude'] # borough longitude value

borough_name = df_boroughs.loc[0, 'Borough'] # borough name

print('Latitude and longitude values of {} are {}, {}.'.format(borough_name, 
                                                               borough_latitude, 
                                                               borough_longitude))

Latitude and longitude values of Central Toronto are 43.701979788888885, -79.39895405555556.


#### Now, let's get the top 100 venues that are in Central Toronto within a radius of 500 meters.

In [26]:
radius = 500
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(\
    CLIENT_ID, CLIENT_SECRET, borough_latitude, borough_longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=3ES0INYJN3CE5SBJGBTUNGWY0OQM30YD11UMKKFXJTG2P1XP&client_secret=BGQ1TEAYASKLBAKAC3YCO15KRW4RT1AFRLBATI3XJT00JPLE&ll=43.701979788888885,-79.39895405555556&v=20180605&radius=500&limit=100'

In [27]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c31d3cd4c1f67404f4a8f83'},
 'response': {'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 29,
  'suggestedBounds': {'ne': {'lat': 43.70647979338889,
    'lng': -79.39274111652504},
   'sw': {'lat': 43.69747978438888, 'lng': -79.40516699458608}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '514740dfe4b07022fc5f1395',
       'name': 'Boar Sandwiches',
       'location': {'address': '3 Glebe Rd E',
        'crossStreet': 'at Yonge St',
        'lat': 43.70115480524192,
        'lng': -79.3967198293826,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.70115480524192,
          'lng': -79.3967198293826}],
        'distance': 201,
        'postalC

define get_category_type() function to get the category type of the venue

In [28]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [29]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Boar Sandwiches,Sandwich Place,43.701155,-79.39672
1,Balsamico,Italian Restaurant,43.701505,-79.397162
2,Little Sister,Indonesian Restaurant,43.701552,-79.397163
3,Tabülè,Middle Eastern Restaurant,43.700932,-79.397078
4,Zezafoun Syrian Cousine,Restaurant,43.702667,-79.39712


In [30]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

29 venues were returned by Foursquare.


## 4. Explore Boroughs in Toronto

#### Let's create a function to repeat the same process to all the boroughs in Toronto

In [36]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Borough Latitude', 
                  'Borough Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [37]:
toronto_venues = getNearbyVenues(names=df_boroughs["Borough"], 
                                   latitudes=df_boroughs["Latitude"], 
                                   longitudes=df_boroughs["Longitude"])

Central Toronto
Downtown Toronto
East Toronto
East York
Etobicoke
Mississauga
North York
Queen's Park
Scarborough
West Toronto
York


#### Let's check the size of the resulting dataframe

In [38]:
print(toronto_venues.shape)
toronto_venues.head()

(273, 7)


Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Central Toronto,43.70198,-79.398954,Boar Sandwiches,43.701155,-79.39672,Sandwich Place
1,Central Toronto,43.70198,-79.398954,Balsamico,43.701505,-79.397162,Italian Restaurant
2,Central Toronto,43.70198,-79.398954,Little Sister,43.701552,-79.397163,Indonesian Restaurant
3,Central Toronto,43.70198,-79.398954,Tabülè,43.700932,-79.397078,Middle Eastern Restaurant
4,Central Toronto,43.70198,-79.398954,Zezafoun Syrian Cousine,43.702667,-79.39712,Restaurant


Let's check how many venues were returned for each borough

In [39]:
toronto_venues.groupby('Borough').count()

Unnamed: 0_level_0,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,29,29,29,29,29,29
Downtown Toronto,100,100,100,100,100,100
East Toronto,26,26,26,26,26,26
East York,4,4,4,4,4,4
Etobicoke,4,4,4,4,4,4
Mississauga,11,11,11,11,11,11
North York,2,2,2,2,2,2
Queen's Park,47,47,47,47,47,47
Scarborough,1,1,1,1,1,1
West Toronto,43,43,43,43,43,43


In [40]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))
print('There are {} uniques Venues.'.format(len(toronto_venues['Venue'].unique())))

There are 117 uniques categories.
There are 251 uniques Venues.


## 5. Analyze Each Neighborhood

In [41]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add borough column back to dataframe
toronto_onehot['Borough'] = toronto_venues['Borough'] 

# move borough column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Borough,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Beer Bar,Bistro,Bookstore,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Business Service,Café,Chinese Restaurant,Clothing Store,Coffee Shop,College Auditorium,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Creperie,Cuban Restaurant,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Falafel Restaurant,Fast Food Restaurant,Flower Shop,Food Court,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Gas Station,Gastropub,General Entertainment,General Travel,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Hobby Shop,Hockey Arena,Hotel,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Indie Theater,Indonesian Restaurant,Italian Restaurant,Japanese Restaurant,Juice Bar,Lingerie Store,Liquor Store,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Music Venue,Neighborhood,New American Restaurant,Nightclub,Office,Opera House,Pakistani Restaurant,Park,Persian Restaurant,Pizza Place,Plaza,Poke Place,Portuguese Restaurant,Pub,Public Art,Ramen Restaurant,Restaurant,Sandwich Place,Seafood Restaurant,Shoe Store,Shopping Mall,Skating Rink,Ski Area,Ski Chalet,Smoothie Shop,Snack Place,Spa,Sporting Goods Shop,Steakhouse,Sushi Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Central Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Central Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Central Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Central Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
toronto_onehot.shape

(273, 118)

#### Next, let's group rows by borough and by taking the mean of the frequency of occurrence of each category

In [44]:
toronto_grouped = toronto_onehot.groupby('Borough').mean().reset_index()
toronto_grouped

Unnamed: 0,Borough,Accessories Store,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Beer Bar,Bistro,Bookstore,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Business Service,Café,Chinese Restaurant,Clothing Store,Coffee Shop,College Auditorium,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Creperie,Cuban Restaurant,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Falafel Restaurant,Fast Food Restaurant,Flower Shop,Food Court,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Gas Station,Gastropub,General Entertainment,General Travel,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Hobby Shop,Hockey Arena,Hotel,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Indie Theater,Indonesian Restaurant,Italian Restaurant,Japanese Restaurant,Juice Bar,Lingerie Store,Liquor Store,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Music Venue,Neighborhood,New American Restaurant,Nightclub,Office,Opera House,Pakistani Restaurant,Park,Persian Restaurant,Pizza Place,Plaza,Poke Place,Portuguese Restaurant,Pub,Public Art,Ramen Restaurant,Restaurant,Sandwich Place,Seafood Restaurant,Shoe Store,Shopping Mall,Skating Rink,Ski Area,Ski Chalet,Smoothie Shop,Snack Place,Spa,Sporting Goods Shop,Steakhouse,Sushi Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.068966,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.034483,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.068966,0.0,0.034483,0.137931,0.0,0.0,0.0,0.0,0.0,0.034483,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.068966,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.103448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0
1,Downtown Toronto,0.01,0.03,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.01,0.01,0.01,0.0,0.01,0.01,0.0,0.01,0.01,0.01,0.0,0.03,0.02,0.08,0.07,0.0,0.01,0.02,0.0,0.02,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.02,0.0,0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.02,0.03,0.01,0.0,0.0,0.0,0.01,0.03,0.0,0.02,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.02,0.01,0.01,0.03,0.01,0.02,0.01,0.0,0.02,0.01,0.0,0.01,0.0
2,East Toronto,0.0,0.0,0.038462,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.038462,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.384615,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.038462,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
5,Mississauga,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Queen's Park,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.021277,0.021277,0.021277,0.0,0.021277,0.021277,0.0,0.212766,0.021277,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.042553,0.021277,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.042553,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021277,0.042553,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.021277,0.0,0.0,0.0,0.021277,0.021277,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.021277,0.021277,0.021277,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.021277,0.0,0.0,0.042553,0.0,0.0,0.0,0.021277,0.0,0.0,0.021277,0.0,0.021277,0.0,0.021277
8,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,West Toronto,0.0,0.0,0.023256,0.023256,0.0,0.0,0.023256,0.023256,0.023256,0.023256,0.023256,0.0,0.0,0.046512,0.046512,0.046512,0.0,0.023256,0.0,0.0,0.023256,0.0,0.023256,0.046512,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.023256,0.0,0.0,0.023256,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.023256,0.023256,0.0,0.046512,0.023256,0.0,0.0,0.046512,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.046512,0.0,0.0,0.069767,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0


In [45]:
toronto_grouped.shape

(11, 118)

In [54]:
num_top_venues = 8

for hood in toronto_grouped['Borough']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Borough'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Toronto----
                   venue  freq
0     Italian Restaurant  0.14
1       Sushi Restaurant  0.10
2            Coffee Shop  0.07
3      Indian Restaurant  0.07
4             Restaurant  0.07
5      Convenience Store  0.03
6  General Entertainment  0.03
7         Sandwich Place  0.03


----Downtown Toronto----
                 venue  freq
0       Clothing Store  0.08
1          Coffee Shop  0.07
2                 Café  0.03
3           Restaurant  0.03
4             Tea Room  0.03
5  American Restaurant  0.03
6                Plaza  0.03
7          Pizza Place  0.02


----East Toronto----
               venue  freq
0  Indian Restaurant  0.38
1               Café  0.08
2      Grocery Store  0.08
3      Indie Theater  0.04
4               Park  0.04
5       Skating Rink  0.04
6             Bistro  0.04
7                Bar  0.04


----East York----
                  venue  freq
0                  Park  0.50
1      Department Store  0.25
2            Public Art  0.25
3  

First, let's write a function to sort the venues in descending order.

In [55]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [56]:
num_top_venues = 8

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
boroughs_venues_sorted = pd.DataFrame(columns=columns)
boroughs_venues_sorted['Borough'] = toronto_grouped['Borough']

for ind in np.arange(toronto_grouped.shape[0]):
    boroughs_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

boroughs_venues_sorted

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,Central Toronto,Italian Restaurant,Sushi Restaurant,Restaurant,Coffee Shop,Indian Restaurant,Bank,Middle Eastern Restaurant,Mexican Restaurant
1,Downtown Toronto,Clothing Store,Coffee Shop,Tea Room,Restaurant,Plaza,Café,American Restaurant,Concert Hall
2,East Toronto,Indian Restaurant,Café,Grocery Store,Pizza Place,Skating Rink,Egyptian Restaurant,Snack Place,Bistro
3,East York,Park,Public Art,Department Store,Yoga Studio,Cosmetics Shop,Cuban Restaurant,Deli / Bodega,Dessert Shop
4,Etobicoke,Women's Store,Park,Clothing Store,Flower Shop,Creperie,Deli / Bodega,Department Store,Dessert Shop
5,Mississauga,Hotel,Coffee Shop,Burrito Place,Fried Chicken Joint,Mediterranean Restaurant,Middle Eastern Restaurant,Sandwich Place,Gym / Fitness Center
6,North York,Ski Area,Ski Chalet,Yoga Studio,Cosmetics Shop,Cuban Restaurant,Deli / Bodega,Department Store,Dessert Shop
7,Queen's Park,Coffee Shop,Gym,Japanese Restaurant,Diner,Sushi Restaurant,Yoga Studio,Nightclub,Burger Joint
8,Scarborough,Business Service,Yoga Studio,Food Court,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store
9,West Toronto,Restaurant,Coffee Shop,Gym,Brewery,Breakfast Spot,Bookstore,Gift Shop,Pub


In [57]:
# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 2, 0, 1, 0], dtype=int32)

In [58]:
toronto_grouped.shape

(11, 118)

In [61]:
toronto_merged = df_boroughs

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(boroughs_venues_sorted.set_index('Borough'), on='Borough')

toronto_merged # check the last columns!

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,Central Toronto,43.70198,-79.398954,0,Italian Restaurant,Sushi Restaurant,Restaurant,Coffee Shop,Indian Restaurant,Bank,Middle Eastern Restaurant,Mexican Restaurant
1,Downtown Toronto,43.654169,-79.383665,0,Clothing Store,Coffee Shop,Tea Room,Restaurant,Plaza,Café,American Restaurant,Concert Hall
2,East Toronto,43.669436,-79.324654,0,Indian Restaurant,Café,Grocery Store,Pizza Place,Skating Rink,Egyptian Restaurant,Snack Place,Bistro
3,East York,43.700303,-79.335851,0,Park,Public Art,Department Store,Yoga Studio,Cosmetics Shop,Cuban Restaurant,Deli / Bodega,Dessert Shop
4,Etobicoke,43.660043,-79.542074,0,Women's Store,Park,Clothing Store,Flower Shop,Creperie,Deli / Bodega,Department Store,Dessert Shop
5,Mississauga,43.636966,-79.615819,0,Hotel,Coffee Shop,Burrito Place,Fried Chicken Joint,Mediterranean Restaurant,Middle Eastern Restaurant,Sandwich Place,Gym / Fitness Center
6,North York,43.750727,-79.429338,2,Ski Area,Ski Chalet,Yoga Studio,Cosmetics Shop,Cuban Restaurant,Deli / Bodega,Department Store,Dessert Shop
7,Queen's Park,43.662301,-79.389494,0,Coffee Shop,Gym,Japanese Restaurant,Diner,Sushi Restaurant,Yoga Studio,Nightclub,Burger Joint
8,Scarborough,43.766229,-79.249085,1,Business Service,Yoga Studio,Food Court,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store
9,West Toronto,43.652653,-79.44929,0,Restaurant,Coffee Shop,Gym,Brewery,Breakfast Spot,Bookstore,Gift Shop,Pub


visualize the map

In [64]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 6. Examine Clusters

Examine each cluster and determine the discriminating venue categories that distinguish each cluster.

#### Cluster 1

In [77]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[0] + list(range(3, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,Central Toronto,0,Italian Restaurant,Sushi Restaurant,Restaurant,Coffee Shop,Indian Restaurant,Bank,Middle Eastern Restaurant,Mexican Restaurant
1,Downtown Toronto,0,Clothing Store,Coffee Shop,Tea Room,Restaurant,Plaza,Café,American Restaurant,Concert Hall
2,East Toronto,0,Indian Restaurant,Café,Grocery Store,Pizza Place,Skating Rink,Egyptian Restaurant,Snack Place,Bistro
3,East York,0,Park,Public Art,Department Store,Yoga Studio,Cosmetics Shop,Cuban Restaurant,Deli / Bodega,Dessert Shop
4,Etobicoke,0,Women's Store,Park,Clothing Store,Flower Shop,Creperie,Deli / Bodega,Department Store,Dessert Shop
5,Mississauga,0,Hotel,Coffee Shop,Burrito Place,Fried Chicken Joint,Mediterranean Restaurant,Middle Eastern Restaurant,Sandwich Place,Gym / Fitness Center
7,Queen's Park,0,Coffee Shop,Gym,Japanese Restaurant,Diner,Sushi Restaurant,Yoga Studio,Nightclub,Burger Joint
9,West Toronto,0,Restaurant,Coffee Shop,Gym,Brewery,Breakfast Spot,Bookstore,Gift Shop,Pub
10,York,0,Convenience Store,Turkish Restaurant,Coffee Shop,Dessert Shop,Restaurant,Sandwich Place,Fast Food Restaurant,Cuban Restaurant


Keywords: Restorant

#### Cluster 2

In [80]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[0] + list(range(3, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
8,Scarborough,1,Business Service,Yoga Studio,Food Court,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store


Keywords: Business, Yoga, Store

#### Cluster 3

In [81]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[0] + list(range(3, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
6,North York,2,Ski Area,Ski Chalet,Yoga Studio,Cosmetics Shop,Cuban Restaurant,Deli / Bodega,Department Store,Dessert Shop


Keywords: Sport, Ski, Store