Segmenting and Clustering Neighborhoods in Toronto

In [1]:
#In this notebook, we will scrape a wikipedia page using pandas pd.read_html with a data table consisting of postal code, borough, and neighborhood 
#for the city of Torornto and will clean it up by removing rows that contain a value of "Not assigned" in the Borough column, index the dataframe  
#and show of rowns and columns.

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]# syntax(df=pd.read_html(url, header=0))

In [4]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [5]:
df=df[df.Borough != "Not assigned"]

In [6]:
df2=df.reset_index(drop=True)

In [7]:
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [8]:
df2.shape

(103, 3)

In [9]:
df2.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [10]:
pip install geocoder


Note: you may need to restart the kernel to use updated packages.


In [11]:
import geocoder # import geocoder
postal_code = df2['Postal Code']

# Using while Statement
latitude = []
longitude = []
n = 0

while n < len(postal_code):
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code[n]))
    lat_lng_coords = g.latlng
    print('The geograpical coordinate of {} are {}, {}.'.format(postal_code[n], lat_lng_coords[0], lat_lng_coords[1]))
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])
    n = n + 1

df2['Latitude'] = latitude
df2['Longitude'] = longitude

df2

The geograpical coordinate of M3A are 43.75245000000007, -79.32990999999998.
The geograpical coordinate of M4A are 43.73057000000006, -79.31305999999995.
The geograpical coordinate of M5A are 43.65512000000007, -79.36263999999994.
The geograpical coordinate of M6A are 43.72327000000007, -79.45041999999995.
The geograpical coordinate of M7A are 43.66253000000006, -79.39187999999996.
The geograpical coordinate of M9A are 43.662630000000036, -79.52830999999998.
The geograpical coordinate of M1B are 43.811390000000074, -79.19661999999994.
The geograpical coordinate of M3B are 43.74923000000007, -79.36185999999998.
The geograpical coordinate of M4B are 43.70718000000005, -79.31191999999999.
The geograpical coordinate of M5B are 43.65739000000008, -79.37803999999994.
The geograpical coordinate of M6B are 43.70687000000004, -79.44811999999996.
The geograpical coordinate of M9B are 43.65034000000003, -79.55361999999997.
The geograpical coordinate of M1C are 43.78574000000003, -79.1587499999999

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65319,-79.51113
99,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.64869,-79.38544
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63278,-79.48945


In [12]:
import requests #library to handle requests
import json #library to handle JSON files
from pandas.io.json import json_normalize #to transform JSON file into a pandas dataframe

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#Matplotlib and associated plotting modules
import matplotlib.cm as cm 
import matplotlib.colors as colors

#import k-means from clustering stage
from sklearn.cluster import KMeans

import folium #map rendering library


In [13]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df2['Borough'].unique()),
        df2.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [14]:
# use geopy to get the latitudes and longitudes of Toronto
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [15]:
# create map of Toronto using the latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

#add markers to map
for lat, lng, borough, neighborhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [16]:
df2.Borough.unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [17]:
#I chose 'Downtown Toronto' as one of the Boroughs to do analysis, explore, and cluster.
DowntownToronto_data = df2[df2['Borough']=='Downtown Toronto'].reset_index(drop=True)
DowntownToronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
4,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306


In [18]:
#let's get the geographical coordinates of 'Downtown Toronto'
address = 'Downtown Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6563221, -79.3809161.


In [19]:
# let' visualize 'Downtown Toronto'
map_DowntownToronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(DowntownToronto_data['Latitude'], DowntownToronto_data['Longitude'], DowntownToronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_DowntownToronto)  
    
map_DowntownToronto

Use foursquare API to explore neighborhoods

In [20]:
CLIENT_ID = 'TB1CA3QB3CCLCMRUHUERFQTGH4LCUVZPDL2MJGHIJKFWUPPZ' # your Foursquare ID
CLIENT_SECRET = 'VX2HS53FWAGG301SPTOGL44GSEOVGSMN514ZT5XZIOEMSK5J' # your Foursquare Secret
VERSION = '20201201' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [21]:
# let's explore the first neigbourhood in our dataframe
DowntownToronto_data.loc[0,'Neighbourhood']

'Regent Park, Harbourfront'

In [22]:
# get geographical coordinates for the neighbourhoods, they are close to each other
neighborhood_latitude = DowntownToronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = DowntownToronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = DowntownToronto_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.65512000000007, -79.36263999999994.


Now let's get the top 100 venues that are in Regent Park or/and Harbour front within 500 meters

In [23]:
# create the GET request url
radius = 500
LIMIT = 100


url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=TB1CA3QB3CCLCMRUHUERFQTGH4LCUVZPDL2MJGHIJKFWUPPZ&client_secret=VX2HS53FWAGG301SPTOGL44GSEOVGSMN514ZT5XZIOEMSK5J&ll=43.65512000000007,-79.36263999999994&v=20201201&radius=500&limit=100'

Send the get request and examine the results

In [24]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60131a66c395a9504b3fb178'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 23,
  'suggestedBounds': {'ne': {'lat': 43.65962000450007,
    'lng': -79.3564319112327},
   'sw': {'lat': 43.650619995500065, 'lng': -79.36884808876718}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label':

In [25]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [26]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Figs Breakfast & Lunch,Breakfast Spot,43.655675,-79.364503
3,The Yoga Lounge,Yoga Studio,43.655515,-79.364955
4,Body Blitz Spa East,Spa,43.654735,-79.359874


In [27]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

23 venues were returned by Foursquare.


Let's create a function to repeat the same process to all neighbourhoods in 'Downtown Toronto'

In [28]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Now write the code to run the above function on each neigbourhood and create a new dataframe called DowntownToronto_venues

In [29]:
DowntownToronto_venues= getNearbyVenues(names=DowntownToronto_data['Neighbourhood'],
                                 latitudes=DowntownToronto_data['Latitude'],
                                 longitudes=DowntownToronto_data['Longitude']
                                 )

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


In [30]:
# check the size of the resulting dataframe
print(DowntownToronto_venues.shape)
DowntownToronto_venues.head()

(1213, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65512,-79.36264,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65512,-79.36264,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65512,-79.36264,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
3,"Regent Park, Harbourfront",43.65512,-79.36264,The Yoga Lounge,43.655515,-79.364955,Yoga Studio
4,"Regent Park, Harbourfront",43.65512,-79.36264,Body Blitz Spa East,43.654735,-79.359874,Spa


Let's check how many venues were returned for each neighborhood


In [31]:
DowntownToronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,61,61,61,61,61,61
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",78,78,78,78,78,78
Central Bay Street,61,61,61,61,61,61
Christie,11,11,11,11,11,11
Church and Wellesley,81,81,81,81,81,81
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",54,54,54,54,54,54
"Kensington Market, Chinatown, Grange Park",47,47,47,47,47,47


Let's find out how many unique categories can be curated from all the returned venues

In [32]:
# one hot encoding
DowntownToronto_onehot = pd.get_dummies(DowntownToronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
DowntownToronto_onehot['Neighborhood'] = DowntownToronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [DowntownToronto_onehot.columns[-1]] + list(DowntownToronto_onehot.columns[:-1])
DowntownToronto_onehot = DowntownToronto_onehot[fixed_columns]

DowntownToronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Theme Restaurant,Thrift / Vintage Store,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
DowntownToronto_onehot.shape

(1213, 191)

Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [34]:
DowntownToronto_grouped = DowntownToronto_onehot.groupby('Neighborhood').mean().reset_index()
DowntownToronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Theme Restaurant,Thrift / Vintage Store,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,Berczy Park,0.016393,0.0,0.0,0.016393,0.0,0.016393,0.0,0.0,0.0,...,0.0,0.0,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,...,0.0,0.0,0.012821,0.0,0.012821,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.016393,0.0,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.012346,0.0,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,...,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Commerce Court, Victoria Hotel",0.0,0.0,0.04,0.0,0.0,0.01,0.0,0.0,0.01,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0
6,"First Canadian Place, Underground city",0.0,0.0,0.03,0.0,0.0,0.01,0.0,0.0,0.03,...,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0
7,"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0
8,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.037037,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018519,0.018519,0.0,0.0,0.0,0.0,0.018519,0.0
9,"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.021277,0.0,...,0.0,0.0,0.0,0.06383,0.0,0.0,0.042553,0.021277,0.0,0.0


In [35]:
DowntownToronto_grouped.shape

(19, 191)

Let's print each neighborhood along with the top 5 most common venues

In [36]:
num_top_venues = 5

for hood in DowntownToronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = DowntownToronto_grouped[DowntownToronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.08
1        Cocktail Bar  0.05
2  Seafood Restaurant  0.05
3      Breakfast Spot  0.03
4            Beer Bar  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0  Italian Restaurant  0.06
1         Coffee Shop  0.06
2                Café  0.05
3                Park  0.04
4                 Bar  0.04


----Central Bay Street----
                       venue  freq
0                Coffee Shop  0.13
1             Clothing Store  0.07
2                Pizza Place  0.03
3  Middle Eastern Restaurant  0.03
4                       Café  0.03


----Christie----
           venue  freq
0           Café  0.27
1  Grocery Store  0.18
2     Playground  0.09
3    Coffee Shop  0.09
4    Candy Store  0.09


----Church and Wellesley----
                  venue  freq
0           Coffee Shop  0.11
1   Japanese Restaurant  0.06
2      Sus

 Let's put that into a _pandas_ dataframe

In [37]:
# First, let's write a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.


In [38]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = DowntownToronto_grouped['Neighborhood']

for ind in np.arange(DowntownToronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(DowntownToronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Breakfast Spot,Farmers Market,Beer Bar,Restaurant,Bakery,Cheese Shop,Eastern European Restaurant
1,"CN Tower, King and Spadina, Railway Lands, Har...",Italian Restaurant,Coffee Shop,Café,Park,French Restaurant,Bar,Bakery,Restaurant,Sandwich Place,Grocery Store
2,Central Bay Street,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Pizza Place,Plaza,Bubble Tea Shop,Sandwich Place,Middle Eastern Restaurant,Art Gallery
3,Christie,Café,Grocery Store,Italian Restaurant,Candy Store,Playground,Athletics & Sports,Baby Store,Coffee Shop,Distribution Center,Falafel Restaurant
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Restaurant,Sushi Restaurant,Fast Food Restaurant,Gay Bar,Café,Hotel,Pub,Dance Studio


### Cluster Neigbourhoods

Run _k_-means to cluster the neighborhood into 5 clusters.

In [39]:
# set number of clusters
kclusters = 5

DowntownToronto_grouped_clustering = DowntownToronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(DowntownToronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 0, 3, 0, 2, 2, 0, 0, 2])

In [40]:
DowntownToronto_grouped_clustering.head()

Unnamed: 0,Yoga Studio,Adult Boutique,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Theme Restaurant,Thrift / Vintage Store,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0.016393,0.0,0.0,0.016393,0.0,0.016393,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.0
1,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,...,0.0,0.0,0.012821,0.0,0.012821,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016393,0.016393,0.016393,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.012346,0.0,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
DowntownToronto_data.rename(columns={'Neighbourhood':'Neighborhood'}, inplace=True)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.


In [42]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

DowntownToronto_merged = DowntownToronto_data

# merge DowntownToronto_grouped with DowntownToronto_data to add latitude/longitude for each neighborhood
DowntownToronto_merged = DowntownToronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

DowntownToronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264,0,Coffee Shop,Breakfast Spot,Yoga Studio,Thai Restaurant,Italian Restaurant,Pub,Bakery,Event Space,Playground,Spa
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188,4,Coffee Shop,Sandwich Place,Park,Gastropub,Mediterranean Restaurant,Burrito Place,Café,Fried Chicken Joint,Bank,Italian Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804,0,Coffee Shop,Clothing Store,Café,Hotel,Cosmetics Shop,Japanese Restaurant,Middle Eastern Restaurant,Chinese Restaurant,Bubble Tea Shop,Ramen Restaurant
3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587,2,Coffee Shop,Hotel,Cocktail Bar,Café,Italian Restaurant,Cosmetics Shop,Clothing Store,Gastropub,Restaurant,Theater
4,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306,2,Coffee Shop,Cocktail Bar,Seafood Restaurant,Breakfast Spot,Farmers Market,Beer Bar,Restaurant,Bakery,Cheese Shop,Eastern European Restaurant


In [44]:
# drop columns 'Postal Code' from the merge from 'DowntownToronto_data' and drop the duplicate cluster labels from above
DowntownToronto_merged.drop(columns=['Postal Code'], inplace =True)

In [45]:
DowntownToronto_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264,0,Coffee Shop,Breakfast Spot,Yoga Studio,Thai Restaurant,Italian Restaurant,Pub,Bakery,Event Space,Playground,Spa
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188,4,Coffee Shop,Sandwich Place,Park,Gastropub,Mediterranean Restaurant,Burrito Place,Café,Fried Chicken Joint,Bank,Italian Restaurant
2,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804,0,Coffee Shop,Clothing Store,Café,Hotel,Cosmetics Shop,Japanese Restaurant,Middle Eastern Restaurant,Chinese Restaurant,Bubble Tea Shop,Ramen Restaurant
3,Downtown Toronto,St. James Town,43.65215,-79.37587,2,Coffee Shop,Hotel,Cocktail Bar,Café,Italian Restaurant,Cosmetics Shop,Clothing Store,Gastropub,Restaurant,Theater
4,Downtown Toronto,Berczy Park,43.64536,-79.37306,2,Coffee Shop,Cocktail Bar,Seafood Restaurant,Breakfast Spot,Farmers Market,Beer Bar,Restaurant,Bakery,Cheese Shop,Eastern European Restaurant


Let's visualize the clusters

In [46]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(DowntownToronto_merged['Latitude'], DowntownToronto_merged['Longitude'], DowntownToronto_merged['Neighborhood'], DowntownToronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster.

In [47]:
# Examine cluster 1
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 0, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Regent Park, Harbourfront",Coffee Shop,Breakfast Spot,Yoga Studio,Thai Restaurant,Italian Restaurant,Pub,Bakery,Event Space,Playground,Spa
2,"Garden District, Ryerson",Coffee Shop,Clothing Store,Café,Hotel,Cosmetics Shop,Japanese Restaurant,Middle Eastern Restaurant,Chinese Restaurant,Bubble Tea Shop,Ramen Restaurant
5,Central Bay Street,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Pizza Place,Plaza,Bubble Tea Shop,Sandwich Place,Middle Eastern Restaurant,Art Gallery
8,"Harbourfront East, Union Station, Toronto Islands",Coffee Shop,Hotel,Theater,Japanese Restaurant,Plaza,Boat or Ferry,Deli / Bodega,Park,Aquarium,Roof Deck
18,Church and Wellesley,Coffee Shop,Japanese Restaurant,Restaurant,Sushi Restaurant,Fast Food Restaurant,Gay Bar,Café,Hotel,Pub,Dance Studio


In [48]:
# Examine cluster 2
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 1, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Rosedale,Park,Playground,Bike Trail,Wings Joint,Distribution Center,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


In [49]:
# Examine cluster 3
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 2, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,St. James Town,Coffee Shop,Hotel,Cocktail Bar,Café,Italian Restaurant,Cosmetics Shop,Clothing Store,Gastropub,Restaurant,Theater
4,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Breakfast Spot,Farmers Market,Beer Bar,Restaurant,Bakery,Cheese Shop,Eastern European Restaurant
7,"Richmond, Adelaide, King",Coffee Shop,Hotel,Café,Restaurant,Gym,Japanese Restaurant,Steakhouse,Asian Restaurant,Salad Place,American Restaurant
9,"Toronto Dominion Centre, Design Exchange",Coffee Shop,Hotel,Café,Salad Place,Restaurant,American Restaurant,Japanese Restaurant,Italian Restaurant,Seafood Restaurant,Beer Bar
10,"Commerce Court, Victoria Hotel",Coffee Shop,Restaurant,Hotel,Italian Restaurant,Café,Gym,American Restaurant,Japanese Restaurant,Seafood Restaurant,Deli / Bodega
11,"University of Toronto, Harbord",Café,Bakery,Coffee Shop,Restaurant,Bar,Bookstore,Japanese Restaurant,Gym,Wings Joint,French Restaurant
12,"Kensington Market, Chinatown, Grange Park",Café,Vegetarian / Vegan Restaurant,Coffee Shop,Mexican Restaurant,Gaming Cafe,Farmers Market,Grocery Store,Caribbean Restaurant,Vietnamese Restaurant,Record Shop
13,"CN Tower, King and Spadina, Railway Lands, Har...",Italian Restaurant,Coffee Shop,Café,Park,French Restaurant,Bar,Bakery,Restaurant,Sandwich Place,Grocery Store
15,Stn A PO Boxes,Coffee Shop,Café,Hotel,Asian Restaurant,Sushi Restaurant,Gym,Sandwich Place,Bar,Salon / Barbershop,Salad Place
16,"St. James Town, Cabbagetown",Park,Coffee Shop,Café,Restaurant,Pub,Italian Restaurant,Pizza Place,Pet Store,Bakery,Deli / Bodega


In [50]:
# Examine cluster 4
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 3, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Christie,Café,Grocery Store,Italian Restaurant,Candy Store,Playground,Athletics & Sports,Baby Store,Coffee Shop,Distribution Center,Falafel Restaurant


In [51]:
# Examine cluster 5
DowntownToronto_merged.loc[DowntownToronto_merged['Cluster Labels'] == 4, DowntownToronto_merged.columns[[1] + list(range(5, DowntownToronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,"Queen's Park, Ontario Provincial Government",Coffee Shop,Sandwich Place,Park,Gastropub,Mediterranean Restaurant,Burrito Place,Café,Fried Chicken Joint,Bank,Italian Restaurant
