## This workbook will be used for the Coursera Capstone project

In [1]:
import pandas as pd
import numpy as np
print("Hello Project Capstone Course!")

Hello Project Capstone Course!


## This rest of this workbook is about Toronto Neighborhood Exploration and Clustering 

### Part I - Creating an initial df of Toronto postcodes/neighborhoods

In [2]:
#read in the df from a stored .csv file scraped from the wikipedia page (code was auto-generated via Notebook option)
import types
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
# Credentials removed
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df = pd.read_csv(body)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9Z,Not assigned,Not assigned
1,M9Y,Not assigned,Not assigned
2,M9X,Not assigned,Not assigned
3,M9W,Etobicoke,Northwest
4,M9V,Etobicoke,Albion Gardens


In [3]:
# Get names of Boroughs for which column value is 'Not assigned'
naBoroughs = df[df['Borough'] == 'Not assigned'].index
 
# Delete these row indexes from df
df.drop(naBoroughs, inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M9W,Etobicoke,Northwest
4,M9V,Etobicoke,Albion Gardens
5,M9V,Etobicoke,Beaumond Heights
6,M9V,Etobicoke,Humbergate
7,M9V,Etobicoke,Jamestown


In [4]:
#group df on unique postcodes and concatenate Neighbourhood names where multiple in original dataset, reset index
df_postcodes=df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ', '.join(x)).reset_index()
df_postcodes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
#replace any neighborhoods having a value of 'Not assigned' with the name of the Borough
df_postcodes['Neighbourhood'].mask(df_postcodes['Neighbourhood'] == 'Not assigned', df_postcodes['Borough'], inplace=True)
df_postcodes.tail(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Queen's Park
94,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
95,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."
96,M9L,North York,Humber Summit
97,M9M,North York,"Emery, Humberlea"
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
102,M9W,Etobicoke,Northwest


In [6]:
df_postcodes.shape

(103, 3)

### Part II - Adding Latitude and Longditude to df_postcodes

In [7]:
#Loading the location data from the link provided and renaming 'Postcode' column to facilitate df merge
path='http://cocl.us/Geospatial_data/geospatial_coordinates.csv'
latlong_df=pd.read_csv(path)
latlong_df.columns = ['Postcode', 'Latitude','Longditude']
latlong_df.head()

Unnamed: 0,Postcode,Latitude,Longditude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
#Merge lat & long coordinates with the postcodes df
neighborhoods=pd.merge(df_postcodes, latlong_df, on="Postcode", how='left')
neighborhoods.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longditude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [9]:
neighborhoods.shape

(103, 5)

### Part III - Exploring Toronto Neighborhoods

In [10]:
import requests 
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
print('Folium installed')
print('Libraries imported')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported


In [11]:
#to explore the immediate area around the University of Toronto, get co-ordinates
address = "27 King's College Cir, Toronto, ON"

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.6607225 -79.3958255


In [48]:
#This cell had credentials information

In [49]:
#Let's search for bars close to the University of Toronto
search_query = 'Bar'
radius = 1000
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
#url

In [14]:
#Retrieve results
results = requests.get(url).json()

# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = json_normalize(venues)

# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
nearby_bars = dataframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
nearby_bars['categories'] = nearby_bars.apply(get_category_type, axis=1)

# clean column names by keeping only last term
nearby_bars.columns = [column.split('.')[-1] for column in nearby_bars.columns]

nearby_bars

Unnamed: 0,name,categories,address,cc,city,country,crossStreet,distance,formattedAddress,labeledLatLngs,lat,lng,neighborhood,postalCode,state,id
0,St. Louis Bar and Grill,Wings Joint,376 Bloor St. W.,CA,Toronto,Canada,at Major St.,1034,"[376 Bloor St. W. (at Major St.), Toronto ON M...","[{'label': 'display', 'lat': 43.66623296658398...",43.666233,-79.406167,,M5S 1X2,ON,4b11c343f964a5201f8423e3
1,The Rex Hotel Jazz & Blues Bar,Jazz Club,194 Queen St W,CA,Toronto,Canada,Queen & St. Patrick,1278,"[194 Queen St W (Queen & St. Patrick), Toronto...","[{'label': 'display', 'lat': 43.65050475544005...",43.650505,-79.388577,,M5V 1Z1,ON,4b68aed1f964a520de862be3
2,Nespresso Boutique - Bar,Coffee Shop,159 Cumberland St.,CA,Toronto,Canada,at Avenue Rd.,1038,"[159 Cumberland St. (at Avenue Rd.), Toronto O...","[{'label': 'display', 'lat': 43.66988316570838...",43.669883,-79.393359,,M5R 1A2,ON,524c6ea511d207bb0817c28e
3,Cibo Wine Bar,Wine Bar,133 Yorkville Ave,CA,Toronto,Canada,btwn Bay St and Avenue Rd,1113,[133 Yorkville Ave (btwn Bay St and Avenue Rd)...,"[{'label': 'display', 'lat': 43.67060682276035...",43.670607,-79.39369,,M5R 1C4,ON,4b6b4014f964a52025fc2be3
4,Dark Horse Espresso Bar,Coffee Shop,215 Spadina Ave,CA,Toronto,Canada,at Sullivan St.,1134,"[215 Spadina Ave (at Sullivan St.), Toronto ON...","[{'label': 'display', 'lat': 43.65056381026066...",43.650564,-79.397018,,M5T 2C7,ON,4ad76dadf964a520530a21e3
5,St. Louis Bar and Grill,Wings Joint,528 Yonge St.,CA,Toronto,Canada,at Breadalbane St.,999,"[528 Yonge St. (at Breadalbane St.), Toronto O...","[{'label': 'display', 'lat': 43.66385323730235...",43.663853,-79.384187,,M4Y 1X9,ON,4de11393e4cd846e40987a4d
6,Bar Mercurio,Restaurant,270 Bloor St West,CA,Toronto,Canada,St George,829,"[270 Bloor St West (St George), Toronto ON M5S...","[{'label': 'display', 'lat': 43.66742849463208...",43.667428,-79.40033,,M5S 1V8,ON,4b5e2f40f964a520618229e3
7,Blo Blow Dry Bar,Cosmetics Shop,21 Avenue Road,CA,Toronto,Canada,Avenue Road & Cumberland Street,1102,[21 Avenue Road (Avenue Road & Cumberland Stre...,"[{'label': 'display', 'lat': 43.67053239214243...",43.670532,-79.393983,,M5R 2G1,ON,4b4a0236f964a520cc7726e3
8,Jun Jun Bar,Karaoke Bar,374 College St.,CA,Toronto,Canada,,836,"[374 College St., Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65705954806553...",43.65706,-79.404894,,,ON,4f3dc140e4b01fd8583a5b63
9,Graffiti's Bar & Grill,Bar,170 Baldwin St.,CA,Toronto,Canada,,763,"[170 Baldwin St., Toronto ON M5T 1L8, Canada]","[{'label': 'display', 'lat': 43.65477810299509...",43.654778,-79.40056,,M5T 1L8,ON,4ad4c05df964a52062f620e3


In [15]:
#my search for bars has picked up any venue with the string 'bar' in the name. 
#I'm worried I won't be able to get a drink at all these places, so I'm going to
#drop all rows where 'categories'<>'Bar', 'Karaoke Bar', 'Hotel Bar', 'Wine Bar', 'Beer Bar' or 'Pub'

myList=['Bar','Karaoke Bar','Hotel Bar','Wine Bar','Beer Bar','Pub']

nearby_bars=pd.DataFrame({'categories':myList}).merge(nearby_bars)
nearby_bars

Unnamed: 0,categories,name,address,cc,city,country,crossStreet,distance,formattedAddress,labeledLatLngs,lat,lng,neighborhood,postalCode,state,id
0,Bar,Graffiti's Bar & Grill,170 Baldwin St.,CA,Toronto,Canada,,763,"[170 Baldwin St., Toronto ON M5T 1L8, Canada]","[{'label': 'display', 'lat': 43.65477810299509...",43.654778,-79.40056,,M5T 1L8,ON,4ad4c05df964a52062f620e3
1,Bar,Jang bang bar and grill,430.5 college st,CA,Toronto,Canada,Bathurst st,1019,"[430.5 college st (Bathurst st), Toronto ON, C...","[{'label': 'display', 'lat': 43.65657408486233...",43.656574,-79.40711,,,ON,4e35c3e2fa7656ba3174b564
2,Bar,Erl's Bistro and Bar,700 University Avenue,CA,Toronto,Canada,,182,"[700 University Avenue, Toronto ON M5G 1Z5, Ca...","[{'label': 'display', 'lat': 43.65912705545884...",43.659127,-79.395308,,M5G 1Z5,ON,4ad4c05cf964a520d3f520e3
3,Bar,The Embassy Bar,223 Augusta Ave,CA,Toronto,Canada,,863,"[223 Augusta Ave, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65430848069026...",43.654308,-79.401852,,,ON,4ae2620df964a520da8d21e3
4,Karaoke Bar,Jun Jun Bar,374 College St.,CA,Toronto,Canada,,836,"[374 College St., Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65705954806553...",43.65706,-79.404894,,,ON,4f3dc140e4b01fd8583a5b63
5,Wine Bar,Cibo Wine Bar,133 Yorkville Ave,CA,Toronto,Canada,btwn Bay St and Avenue Rd,1113,[133 Yorkville Ave (btwn Bay St and Avenue Rd)...,"[{'label': 'display', 'lat': 43.67060682276035...",43.670607,-79.39369,,M5R 1C4,ON,4b6b4014f964a52025fc2be3
6,Beer Bar,Bar Volo,17 St. Nicholas St.,CA,Toronto,Canada,,971,"[17 St. Nicholas St., Toronto ON M4Y 3G4, Canada]","[{'label': 'display', 'lat': 43.66546184849369...",43.665462,-79.385692,,M4Y 3G4,ON,5d9399e49b61d90008bac7b0
7,Beer Bar,Her Father's Cider Bar + Kitchen,119 Harbord Street,CA,Toronto,Canada,at Major St,740,"[119 Harbord Street (at Major St), Toronto ON ...","[{'label': 'display', 'lat': 43.66244824890102...",43.662448,-79.404703,Downtown Toronto,M5S 1G7,ON,5707bcb9498e709b544e20b2
8,Pub,The 460 Bistro Bar,460 Spadina Ave.,CA,Toronto,Canada,College St.,446,"[460 Spadina Ave. (College St.), Toronto ON L4...","[{'label': 'display', 'lat': 43.65781812743475...",43.657818,-79.399645,,L4M 6N8,ON,5390f9dd498e1f669af53326


In [16]:
#display bar venues on a map of Toronto
venues_map = folium.Map(location=[latitude, longitude], zoom_start=13) # generate map centred around the University of Toronto

# add a red circle marker to represent the Univesity of Toronto
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Univ of Toronto',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

# add the bars as blue circle markers
for lat, lng, label in zip(nearby_bars.lat, nearby_bars.lng, nearby_bars.categories):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map


In [50]:
#let's find out more about one of the bars
venue_id = '4ad4c05df964a52062f620e3' # ID of Graffitti Bar & Grill
url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)
#url

In [18]:
#retrieve information & see how this venue is rated (5.0 or better is good enough for me!)
result = requests.get(url).json()

try:
    print(result['response']['venue']['rating'])
except:
    print('This venue has not been rated yet.')

6.0


In [19]:
#get tips on the Graffiti Bar & Grill - will probably only be able to retrieve one of them due to FourSquare account limitation
result['response']['venue']['tips']['count']

4

In [20]:
limit = 4 # set limit to be greater than or equal to the total number of tips
url = 'https://api.foursquare.com/v2/venues/{}/tips?client_id={}&client_secret={}&v={}&limit={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION, limit)

results = requests.get(url).json()

In [21]:
tips = results['response']['tips']['items']

tip = results['response']['tips']['items'][0]
tip.keys()

dict_keys(['id', 'createdAt', 'text', 'type', 'canonicalUrl', 'lang', 'likes', 'logView', 'agreeCount', 'disagreeCount', 'todo', 'user'])

In [22]:
pd.set_option('display.max_colwidth', -1)

tips_df = json_normalize(tips) # json normalize tips

# columns to keep
filtered_columns = ['text', 'agreeCount', 'disagreeCount', 'id', 'user.firstName', 'user.lastName', 'user.gender', 'user.id']
tips_filtered = tips_df.loc[:, filtered_columns]

# display tips
tips_filtered

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,text,agreeCount,disagreeCount,id,user.firstName,user.lastName,user.gender,user.id
0,Everopne looks good in Graffiti's lighting. Say hi to Steve!,1,0,4f280e79e4b0fa5ce5900fa0,Stephanie,M,,337382


#### I like the sound of this place!
#### Now let's do a clustering analysis for Toronto Neighborhoods

In [23]:
import matplotlib.pyplot as plt # plotting library
%matplotlib inline 

from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

print('Libraries imported')

Libraries imported


#### Steps followed in the clustering analysis
###### 1. Get a df with base listing of neighborhoods and their co-ordinates (we already have the 'neighborhoods' df created above based on Toronto postcodes)
###### 2. Filter the 'neighborhoods' df to narrow the scope of neighborhoods of interest 
###### 3. Get information (json) about amenities available in those neighborhoods; define a program to generate requests for venue information for each neighborhood
###### 4. Create a df with venue information for all neighborhoods 
###### 5. Perform one-hot encoding of the venue information to facilitate clustering
###### 6. Perform cluster analysis

In [24]:
#filter list of Toronto neighborhoods to get something more manageable to work with
boroughs=df_postcodes[['Borough']]
boroughs['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke', "Queen's Park"], dtype=object)

In [25]:
#a new df 'T_hoods' wil contain location information only for Boroughs with the word 'Toronto'
myList2=['East Toronto','Central Toronto','Downtown Toronto','West Toronto']

T_hoods=pd.DataFrame({'Borough':myList2}).merge(neighborhoods)
T_hoods.head()

Unnamed: 0,Borough,Postcode,Neighbourhood,Latitude,Longditude
0,East Toronto,M4E,The Beaches,43.676357,-79.293031
1,East Toronto,M4K,"The Danforth West, Riverdale",43.679557,-79.352188
2,East Toronto,M4L,"The Beaches West, India Bazaar",43.668999,-79.315572
3,East Toronto,M4M,Studio District,43.659526,-79.340923
4,East Toronto,M7Y,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558


In [26]:
#summarize the T_hoods df
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(T_hoods['Borough'].unique()),
        T_hoods.shape[0]
    )
)

The dataframe has 4 boroughs and 39 neighborhoods.


In [27]:
#Reset central location as City of Toronto for map plotting purposes (vs. Univ of Toronto), not that it makes much difference...
address = "Toronto, ON"

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.653963 -79.387207


In [28]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(T_hoods['Latitude'], T_hoods['Longditude'], T_hoods['Borough'], T_hoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto


In [29]:
#function to get venue information for each neighborhood, limited to 100 items within 0.6 miles of centre of each neighborhood
def getNearbyVenues(names, latitudes, longitudes, radius=1000,LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [30]:
Toronto_venues = getNearbyVenues(names=T_hoods['Neighbourhood'],
                                   latitudes=T_hoods['Latitude'],
                                   longitudes=T_hoods['Longditude']
                                  )


In [31]:
print(Toronto_venues.shape)
Toronto_venues.head()

(3185, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Beech Tree,43.680493,-79.288846,Gastropub
2,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,Indie Movie Theater
3,The Beaches,43.676357,-79.293031,Beaches Bake Shop,43.680363,-79.289692,Bakery
4,The Beaches,43.676357,-79.293031,Ed's Real Scoop,43.67263,-79.287993,Ice Cream Shop


In [32]:
Toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
"Brockton, Exhibition Place, Parkdale Village",100,100,100,100,100,100
Business Reply Mail Processing Centre 969 Eastern,47,47,47,47,47,47
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",15,15,15,15,15,15
"Cabbagetown, St. James Town",35,35,35,35,35,35
Central Bay Street,100,100,100,100,100,100
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,100,100,100,100,100,100
Church and Wellesley,100,100,100,100,100,100


In [33]:
#it looks like the limit of 100 venues was found for many of the neighborhoods. How many unique categories are there? 
print('There are {} unique categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 270 unique categories.


In [34]:
#create one hot encoded df for each of the categories, to facilitate clustering analysis
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Zoo,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,University,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
Toronto_onehot.shape

(3185, 270)

In [36]:
#Create a df showing the relative frequency of each venue category in each neighborhood
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighborhood,Zoo,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,University,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,...,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",0.0,0.0,0.0,0.066667,0.066667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [38]:
#create a df with the Top 10 most common venue category for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
T_neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
T_neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    T_neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

T_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Café,Coffee Shop,Hotel,Theater,Steakhouse,Cosmetics Shop,Restaurant,Clothing Store,Deli / Bodega,Sushi Restaurant
1,Berczy Park,Coffee Shop,Café,Hotel,Beer Bar,Japanese Restaurant,Restaurant,Park,Cocktail Bar,BBQ Joint,Bakery
2,"Brockton, Exhibition Place, Parkdale Village",Café,Coffee Shop,Bar,Restaurant,Furniture / Home Store,Bakery,Tibetan Restaurant,Lounge,Gift Shop,Hotel
3,Business Reply Mail Processing Centre 969 Eastern,Park,Coffee Shop,Pizza Place,Brewery,Pet Store,Sushi Restaurant,Italian Restaurant,Burrito Place,Snack Place,French Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",Harbor / Marina,Coffee Shop,Café,Scenic Lookout,Park,Garden,Airport,Airport Lounge,Dog Run,Sculpture Garden


In [39]:
# Additional Matplotlib modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means
from sklearn.cluster import KMeans

print('Libraries imported')

Libraries imported


In [40]:
# let's cluster the neighborhoods - set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([4, 4, 1, 3, 0, 4, 4, 1, 1, 4], dtype=int32)

In [41]:
# add clustering labels
T_neighborhoods_venues_sorted.insert(0,'ClusterLabels',kmeans.labels_,allow_duplicates=True)

#merge Toronto_grouped with T_hoods to add latitude/longitude for each neighborhood
Toronto_merged = T_hoods.join(T_neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

Toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Postcode,Neighbourhood,Latitude,Longditude,ClusterLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,M4E,The Beaches,43.676357,-79.293031,1,Pub,Pizza Place,Coffee Shop,Breakfast Spot,Japanese Restaurant,Beach,Burger Joint,Bar,Tea Room,Bakery
1,East Toronto,M4K,"The Danforth West, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Pub,Café,Ice Cream Shop,Italian Restaurant,Fast Food Restaurant,Pizza Place,Spa,Bakery
2,East Toronto,M4L,"The Beaches West, India Bazaar",43.668999,-79.315572,1,Indian Restaurant,Café,Coffee Shop,Beach,Fast Food Restaurant,Bakery,Sandwich Place,Butcher,Restaurant,Burrito Place
3,East Toronto,M4M,Studio District,43.659526,-79.340923,1,Coffee Shop,Bar,American Restaurant,Café,Bakery,Italian Restaurant,Brewery,Vietnamese Restaurant,French Restaurant,Diner
4,East Toronto,M7Y,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,3,Park,Coffee Shop,Pizza Place,Brewery,Pet Store,Sushi Restaurant,Italian Restaurant,Burrito Place,Snack Place,French Restaurant


In [42]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longditude'], Toronto_merged['Neighbourhood'], Toronto_merged['ClusterLabels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [43]:
Toronto_merged.loc[Toronto_merged['ClusterLabels'] == 0, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,ClusterLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
28,M5V,0,Harbor / Marina,Coffee Shop,Café,Scenic Lookout,Park,Garden,Airport,Airport Lounge,Dog Run,Sculpture Garden


In [44]:
Toronto_merged.loc[Toronto_merged['ClusterLabels'] == 1, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,ClusterLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,1,Pub,Pizza Place,Coffee Shop,Breakfast Spot,Japanese Restaurant,Beach,Burger Joint,Bar,Tea Room,Bakery
1,M4K,1,Greek Restaurant,Coffee Shop,Pub,Café,Ice Cream Shop,Italian Restaurant,Fast Food Restaurant,Pizza Place,Spa,Bakery
2,M4L,1,Indian Restaurant,Café,Coffee Shop,Beach,Fast Food Restaurant,Bakery,Sandwich Place,Butcher,Restaurant,Burrito Place
3,M4M,1,Coffee Shop,Bar,American Restaurant,Café,Bakery,Italian Restaurant,Brewery,Vietnamese Restaurant,French Restaurant,Diner
13,M5R,1,Café,Vegetarian / Vegan Restaurant,Coffee Shop,Italian Restaurant,Bakery,Gym,Grocery Store,Museum,Mexican Restaurant,Restaurant
26,M5S,1,Café,Vegetarian / Vegan Restaurant,Bar,Bakery,Mexican Restaurant,Coffee Shop,Bookstore,Restaurant,Comfort Food Restaurant,Park
27,M5T,1,Café,Vegetarian / Vegan Restaurant,Bar,Coffee Shop,Art Gallery,Vietnamese Restaurant,Ice Cream Shop,Dessert Shop,Mexican Restaurant,Cocktail Bar
31,M6G,1,Korean Restaurant,Café,Coffee Shop,Grocery Store,Ice Cream Shop,Mexican Restaurant,Cocktail Bar,Diner,Sandwich Place,Pizza Place
34,M6J,1,Café,Bar,Restaurant,Bakery,Coffee Shop,Pizza Place,Men's Store,Vegetarian / Vegan Restaurant,Cocktail Bar,Italian Restaurant
35,M6K,1,Café,Coffee Shop,Bar,Restaurant,Furniture / Home Store,Bakery,Tibetan Restaurant,Lounge,Gift Shop,Hotel


In [45]:
Toronto_merged.loc[Toronto_merged['ClusterLabels'] == 2, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,ClusterLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,M4N,2,Café,Trail,Bookstore,College Quad,Gym / Fitness Center,College Gym,Coffee Shop,Park,Yoga Studio,Doner Restaurant


In [46]:
Toronto_merged.loc[Toronto_merged['ClusterLabels'] == 3, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,ClusterLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M7Y,3,Park,Coffee Shop,Pizza Place,Brewery,Pet Store,Sushi Restaurant,Italian Restaurant,Burrito Place,Snack Place,French Restaurant
7,M4R,3,Sporting Goods Shop,Coffee Shop,Skating Rink,Italian Restaurant,Café,Fast Food Restaurant,Mexican Restaurant,Diner,Park,Spa
9,M4T,3,Coffee Shop,Italian Restaurant,Grocery Store,Thai Restaurant,Park,Gym,Café,Sandwich Place,Restaurant,Pub
10,M4V,3,Coffee Shop,Park,Sushi Restaurant,Italian Restaurant,Thai Restaurant,Gym / Fitness Center,Grocery Store,Liquor Store,Gym,Pizza Place
11,M5N,3,Bank,Pharmacy,Sushi Restaurant,Italian Restaurant,Coffee Shop,Café,Bakery,Skating Rink,Dance Studio,Clothing Store
12,M5P,3,Café,Park,Coffee Shop,Gym / Fitness Center,Italian Restaurant,Skating Rink,Japanese Restaurant,Burger Joint,Liquor Store,Bank
14,M4W,3,Park,Coffee Shop,Grocery Store,Convenience Store,Athletics & Sports,Filipino Restaurant,Metro Station,Breakfast Spot,Candy Store,Bistro
17,M5A,3,Coffee Shop,Café,Pub,Park,Diner,Breakfast Spot,Restaurant,Sushi Restaurant,Theater,Italian Restaurant
33,M6H,3,Café,Park,Coffee Shop,Sushi Restaurant,Italian Restaurant,Bar,Grocery Store,Brewery,Restaurant,Portuguese Restaurant


In [47]:
Toronto_merged.loc[Toronto_merged['ClusterLabels'] == 4, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Postcode,ClusterLabels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,M4P,4,Coffee Shop,Italian Restaurant,Fast Food Restaurant,Café,Sushi Restaurant,Dessert Shop,Gym,Pizza Place,Restaurant,Sporting Goods Shop
8,M4S,4,Coffee Shop,Italian Restaurant,Sushi Restaurant,Pizza Place,Middle Eastern Restaurant,Gym,Dessert Shop,Pub,Indian Restaurant,Café
15,M4X,4,Coffee Shop,Gastropub,Japanese Restaurant,Café,Diner,Park,Farm,Performing Arts Venue,Taiwanese Restaurant,Jewelry Store
16,M4Y,4,Coffee Shop,Japanese Restaurant,Park,Gym,Restaurant,Italian Restaurant,Café,Men's Store,Hotel,Gay Bar
18,M5B,4,Coffee Shop,Cosmetics Shop,Restaurant,Clothing Store,Middle Eastern Restaurant,Café,Tea Room,Italian Restaurant,Japanese Restaurant,Gastropub
19,M5C,4,Coffee Shop,Café,Restaurant,Seafood Restaurant,Hotel,Bakery,Cosmetics Shop,Breakfast Spot,Gastropub,BBQ Joint
20,M5E,4,Coffee Shop,Café,Hotel,Beer Bar,Japanese Restaurant,Restaurant,Park,Cocktail Bar,BBQ Joint,Bakery
21,M5G,4,Coffee Shop,Café,Japanese Restaurant,Italian Restaurant,Ice Cream Shop,Cosmetics Shop,Art Gallery,Chinese Restaurant,Park,Tea Room
22,M5H,4,Café,Coffee Shop,Hotel,Theater,Steakhouse,Cosmetics Shop,Restaurant,Clothing Store,Deli / Bodega,Sushi Restaurant
23,M5J,4,Coffee Shop,Hotel,Café,Aquarium,Restaurant,Italian Restaurant,Brewery,Park,Scenic Lookout,Theater


#### So what do we conclude about the clusters? Clusters 0(Red) and 1(Purple) have a lot of coffee shops and restaurants; Cluster 0 seems to have more amenities of everyday life e.g. fast food, gym, stores; there are presumably more people living in these neighborhoods which are further out from downtown (Cluster 1). Cluster 2(Blue) is the neighborhoods on the waterfront, distinguished by marinas. Cluster 3(Green) seems to be a somewhat unique neighborhood (Lawrence Park) dominated by the venue categories 'Trail','Bookstore' and various college buildings (a bookish introvert's paradise!). Cluster 4(Orange) is also big on restaurants and coffee shops, but seems more focused on the eating part with many ethnic and specialty restaurants. Since I have never been to Toronto I hope this is not too wide of the mark!