# Capstone (Week3): Segmenting and Clustering Neighborhoods in Toronto

View map to use this link:
https://nbviewer.jupyter.org/github/phyhuhu/Coursera_Capstone/blob/master/Capstone_week3.ipynb

### Import Libary

In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests
import csv

import folium

import json
from pandas.io.json import json_normalize

from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

### Use _BeautifulSoup_ to get the neighborhood table of Toronto

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
table=soup.find('table')

In [3]:
col=[]
for tt in table.find_all('th'):
    col.append(tt.text)
col[-1]=col[-1][:-1]
col

['Postcode', 'Borough', 'Neighbourhood']

In [4]:
pretable=[]
temp=[]
check=-1
for po in table.find_all('td'):
    if check<=2:
        temp.append(po.text)
        check+=1
    if check==2:
        temp[-1]=temp[-1][:-1]
        
    if check==2:
        pretable.append(temp)
        temp=[]
        check=-1
originaltable=pd.DataFrame(pretable, columns=col)
originaltable.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Store the neighborhood table of Toronto

In [5]:
originaltable.to_csv('toronto_neigh.csv', index=False)

### Clean up the table:
1. create new table without the rows with 'Borough' as 'Not assigned'
2. find the rows with 'Neighbourhood' as 'Not assigned', which are replaced by the correspoding 'Borough'
3. group the 'Neighbourhood' by 'Postcode' and 'Borough' to get df

In [6]:
newtable=originaltable[originaltable['Borough']!='Not assigned']
dd=newtable[newtable['Neighbourhood']=='Not assigned'].index
newtable.loc[dd,'Neighbourhood']=newtable.loc[dd,'Borough']
newtable.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [7]:
neigh=newtable.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda Neighbourhood: ','.join(Neighbourhood))
df1=neigh.index.tolist()
df1=pd.DataFrame(df1)
df2=neigh.tolist()
df2=pd.DataFrame([neigh.index,neigh]).T
df=pd.concat([df1[0],df1[1],df2[1]], axis=1)
df.columns=col

## <font color='red'> For Question 1: view the DataFrame _df_ and use _.shape_ to get the number of rows. </font>

In [8]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
df.shape

(103, 3)

### Get the latitude and longitude

In [10]:
# postal_code=df['Postcode']

# import geocoder # import geocoder

# # initialize your variable to None
# lat_lng_coords = None

# # loop until you get the coordinates
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format(postal_code[0]))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

In [11]:
coor=pd.read_csv('Geospatial_Coordinates.csv')
coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### merge two DataFrame _df_ and _coor_ with the same 'Postcode'

In [12]:
newdf=pd.merge(df, coor, left_on='Postcode', right_on='Postal Code')
newdf=newdf.drop(columns=['Postal Code'])

## <font color='red'> For Question 2: view the DataFrame _newdf_.

In [13]:
newdf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Find the unique 'Borough' in DataFrame _newdf_

In [14]:
print('The dataframe has {} boroughs.'.format(
        len(newdf['Borough'].unique())
    ), ': ',
    newdf['Borough'].unique()
)

The dataframe has 11 boroughs. :  ['Scarborough' 'North York' 'East York' 'East Toronto' 'Central Toronto'
 'Downtown Toronto' 'York' 'West Toronto' 'Mississauga' 'Etobicoke'
 "Queen's Park"]


### Setup ID and secret of Foursquare

In [15]:
CLIENT_ID = 'D4VOKODCIB5TZSFRE0YCLETXJ3AGU2C1ASMDKRYDGS0XNSSO'
CLIENT_SECRET = 'FLXWWPLI4YNEJ5TVIX34PWK3MSNQZACEFOVVRXVZD4SYXZJ3'
VERSION = '20200122'

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

My credentails:
CLIENT_ID: D4VOKODCIB5TZSFRE0YCLETXJ3AGU2C1ASMDKRYDGS0XNSSO
CLIENT_SECRET:FLXWWPLI4YNEJ5TVIX34PWK3MSNQZACEFOVVRXVZD4SYXZJ3


** get the latitude and longitude of Toronto from Foursquare **

In [16]:
city = 'Toronto, Canada'
geolocator = Nominatim(user_agent="foursquare_agent")
loc = geolocator.geocode(city)
lat_Toronto = loc.latitude
lon_Toronto = loc.longitude
print(city, lat_Toronto, lon_Toronto)

Toronto, Canada 43.653963 -79.387207


** build the map of Toronto with markers of 'Neighbourhood' **

In [17]:
map_Toronto = folium.Map(location=[lat_Toronto, lon_Toronto], zoom_start=10)

# add markers to map
for lat, lon, borough, neighbourhood in zip(newdf['Latitude'], newdf['Longitude'], newdf['Borough'], newdf['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

## <font color='red'> For Question 3: Explore Neighborhoods in Downtown Toronto </font>

In [18]:
downtown_data=newdf[newdf['Borough']=='Downtown Toronto']
downtown_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
51,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675
52,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
53,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
54,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [19]:
city = 'Toronto, Canada'
geolocator = Nominatim(user_agent="foursquare_agent")
loc = geolocator.geocode('Downtown Toronto')
lat_downtown = loc.latitude
lon_downtown = loc.longitude
print('Downtown Toronto', lat_downtown, lon_downtown)

Downtown Toronto 43.6541737 -79.38081164513409


** build the map of downtown Toronto with markers of 'Neighbourhood' **

In [20]:
map_downtown = folium.Map(location=[lat_downtown, lon_downtown], zoom_start=13)

# add markers to map
for lat, lon, borough, neighbourhood in zip(downtown_data['Latitude'], downtown_data['Longitude'], downtown_data['Borough'], downtown_data['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

In [21]:
# city = 'Toronto, Canada'
# geolocator = Nominatim(user_agent="foursquare_agent")
# for i in newdf['Borough'].unique():
#     address = i + ', ' + city
#     loc = geolocator.geocode(address)
#     lat = loc.latitude
#     lng = loc.longitude
#     print(address, lat, lng)

### Get venues of downtown Toronto

In [22]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat_downtown, 
    lon_downtown, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=D4VOKODCIB5TZSFRE0YCLETXJ3AGU2C1ASMDKRYDGS0XNSSO&client_secret=FLXWWPLI4YNEJ5TVIX34PWK3MSNQZACEFOVVRXVZD4SYXZJ3&v=20200122&ll=43.6541737,-79.38081164513409&radius=500&limit=100'

In [23]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e29e854b4b684001b2bb65b'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 138,
  'suggestedBounds': {'ne': {'lat': 43.6586737045, 'lng': -79.37460365419369},
   'sw': {'lat': 43.6496736955, 'lng': -79.38701963607448}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '57eda381498ebe0e6ef40972',
       'name': 'UNIQLO ユニクロ',
       'location': {'address': '220 Yonge St',
        'crossStreet': 'at Dundas St W',
        'lat': 43.65591027779457,
        'lng': -79.38064099181345,
        'labeledLatLngs': [

In [24]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [25]:
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues)

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,UNIQLO ユニクロ,Clothing Store,43.65591,-79.380641
1,Elgin And Winter Garden Theatres,Theater,43.653394,-79.378507
2,Ed Mirvish Theatre,Theater,43.655102,-79.379768
3,Indigo,Bookstore,43.653515,-79.380696
4,CF Toronto Eaton Centre,Shopping Mall,43.653594,-79.380611


In [26]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

100 venues were returned by Foursquare.


In [27]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lon in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lon, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lon, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [28]:
new_venues = getNearbyVenues(names=downtown_data['Neighbourhood'],
                                   latitudes=newdf['Latitude'],
                                   longitudes=newdf['Longitude']
                                  )

Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Queen's Park


In [29]:
print(new_venues.shape)
new_venues.head()

(157, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Cabbagetown,St. James Town",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Cabbagetown,St. James Town",43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
3,Church and Wellesley,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,Church and Wellesley,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [30]:
new_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",3,3,3,3,3,3
Berczy Park,5,5,5,5,5,5
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",2,2,2,2,2,2
"Cabbagetown,St. James Town",2,2,2,2,2,2
Central Bay Street,10,10,10,10,10,10
"Chinatown,Grange Park,Kensington Market",13,13,13,13,13,13
Christie,4,4,4,4,4,4
Church and Wellesley,7,7,7,7,7,7
"Commerce Court,Victoria Hotel",8,8,8,8,8,8
"Design Exchange,Toronto Dominion Centre",5,5,5,5,5,5


In [31]:
print('There are {} uniques categories.'.format(len(new_venues['Venue Category'].unique())))

There are 83 uniques categories.


In [32]:
# one hot encoding
downtown_onehot = pd.get_dummies(new_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_onehot['Neighborhood'] = new_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
downtown_onehot = downtown_onehot[fixed_columns]

downtown_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Baseball Field,Boutique,...,Supplement Shop,Tailor Shop,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Cabbagetown,St. James Town",0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Cabbagetown,St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Church and Wellesley,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Church and Wellesley,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
downtown_onehot.shape

(157, 84)

In [34]:
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
downtown_grouped.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Baseball Field,Boutique,...,Supplement Shop,Tailor Shop,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,"Adelaide,King,Richmond",0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
downtown_grouped.shape

(18, 84)

In [36]:
num_top_venues = 10

for hood in downtown_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = downtown_grouped[downtown_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
                       venue  freq
0        American Restaurant  0.33
1              Movie Theater  0.33
2                      Motel  0.33
3                Pizza Place  0.00
4                  Pet Store  0.00
5                       Park  0.00
6               Noodle House  0.00
7              Moving Target  0.00
8  Middle Eastern Restaurant  0.00
9         Mexican Restaurant  0.00


----Berczy Park----
                 venue  freq
0       Discount Store   0.4
1    Convenience Store   0.2
2          Coffee Shop   0.2
3     Department Store   0.2
4  American Restaurant   0.0
5             Pharmacy   0.0
6            Pet Store   0.0
7                 Park   0.0
8         Noodle House   0.0
9        Moving Target   0.0


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
                       venue  freq
0                 Playground   0.5
1                       Park   0.5
2                Pizza Plac

In [37]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [38]:
num_top_venues = 20

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_grouped['Neighborhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,...,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,"Adelaide,King,Richmond",American Restaurant,Movie Theater,Motel,Electronics Store,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Discount Store,...,Fast Food Restaurant,Ice Cream Shop,Food Court,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store
1,Berczy Park,Discount Store,Coffee Shop,Convenience Store,Department Store,Women's Store,Cosmetics Shop,Deli / Bodega,Dog Run,Electronics Store,...,Food Court,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Hakka Restaurant,College Stadium
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Park,Playground,Electronics Store,College Stadium,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Discount Store,...,Women's Store,Coffee Shop,Food Court,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store
3,"Cabbagetown,St. James Town",Moving Target,Bar,Women's Store,Food Court,Cosmetics Shop,Deli / Bodega,Department Store,Discount Store,Dog Run,...,Fast Food Restaurant,Fried Chicken Joint,College Stadium,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Hakka Restaurant
4,Central Bay Street,Bus Line,Bakery,Intersection,Ice Cream Shop,Metro Station,Bus Station,Soccer Field,Park,Golf Course,...,Grocery Store,Deli / Bodega,Department Store,Discount Store,Greek Restaurant,Gas Station,Frozen Yogurt Shop,Dog Run,Electronics Store,Convenience Store


In [39]:
# set number of clusters
kclusters = 10

downtown_grouped_clustering = downtown_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([6, 8, 4, 2, 1, 1, 7, 1, 1, 9], dtype=int32)

In [40]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

downtown_merged = downtown_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
downtown_merged = downtown_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

downtown_merged=downtown_merged.dropna()

downtown_merged # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,...,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,0.0,Fast Food Restaurant,Women's Store,Coffee Shop,Convenience Store,...,Food Court,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Hakka Restaurant,College Stadium
51,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675,2.0,Moving Target,Bar,Women's Store,Food Court,...,Fast Food Restaurant,Fried Chicken Joint,College Stadium,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Hakka Restaurant
52,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,1.0,Intersection,Breakfast Spot,Pizza Place,Rental Car Location,...,Frozen Yogurt Shop,Coffee Shop,Gas Station,Dog Run,Discount Store,General Entertainment,Department Store,Golf Course,Deli / Bodega,Cosmetics Shop
53,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,5.0,Coffee Shop,Korean Restaurant,Food Court,Convenience Store,...,Fast Food Restaurant,Fried Chicken Joint,Indian Restaurant,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Hakka Restaurant
54,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937,1.0,Gas Station,Hakka Restaurant,Athletics & Sports,Bakery,...,Department Store,Discount Store,Dog Run,Electronics Store,Women's Store,Food Court,Frozen Yogurt Shop,Convenience Store,General Entertainment,Golf Course
55,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3.0,Playground,Women's Store,Fast Food Restaurant,Convenience Store,...,Food Court,Coffee Shop,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Hakka Restaurant
56,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,8.0,Discount Store,Coffee Shop,Convenience Store,Department Store,...,Food Court,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store,Hakka Restaurant,College Stadium
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1.0,Bus Line,Bakery,Intersection,Ice Cream Shop,...,Grocery Store,Deli / Bodega,Department Store,Discount Store,Greek Restaurant,Gas Station,Frozen Yogurt Shop,Dog Run,Electronics Store,Convenience Store
58,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568,6.0,American Restaurant,Movie Theater,Motel,Electronics Store,...,Fast Food Restaurant,Ice Cream Shop,Food Court,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,General Entertainment,Golf Course,Greek Restaurant,Grocery Store
59,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752,1.0,College Stadium,Café,General Entertainment,Skating Rink,...,Dog Run,Women's Store,Coffee Shop,Food Court,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,Golf Course,Greek Restaurant,Grocery Store


In [41]:
# create map
map_clusters = folium.Map(location=[lat_downtown, lon_downtown], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighbourhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters