## INSTALL MODULES

In [2]:
! pip install lxml html5lib beautifulsoup4



### Get Webpage Content

In [3]:
import pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url)
df = dfs[0]

### Remove calls with borough that is not assigned

In [4]:
df = df.dropna(axis=0, how='any')

### Reset the index

In [5]:
df = df.reset_index(drop=True)
df.head(11)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
df.shape

(103, 3)

### Get the Latitude and longitude

In [7]:
#download the csv containing the geographical coordinates of each postal code
geo = pd.read_csv('http://cocl.us/Geospatial_data')
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Since both the 'df' and 'geo' have a column 'Postal Code' in common, we merge the two dataframes using inner join

In [10]:
merged_inner = pd.merge(left=df, right=geo, left_on='Postal Code', right_on='Postal Code')

merged_inner.shape
merged_inner.head(11)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Get the coordinates of Toronto

In [19]:
! pip install geopy

from geopy.geocoders import Nominatim
print('done')

Collecting geopy
  Downloading geopy-1.22.0-py2.py3-none-any.whl (113 kB)
Collecting geographiclib<2,>=1.49
  Downloading geographiclib-1.50-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.22.0
done


In [20]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


## Install the necessary libraries for plotting maps and K-means clustering

In [15]:
! pip install folium 





Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
Collecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [21]:
from sklearn.cluster import KMeans
print('done')

done


In [22]:
import matplotlib.cm as cm
import matplotlib.colors as colors
print('done')

done


In [23]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
print('done')

done


In [24]:
import json # library to handle JSON files
print('done')

done


In [25]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print('done')


done


## Get the location data of all Burough assiciated with Toronto

In [36]:
toronto_data = merged_inner[merged_inner['Borough'] == 'East Toronto'].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558


In [37]:
toronto_data1 = merged_inner[merged_inner['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
toronto_data1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [39]:
toronto_data2 = merged_inner[merged_inner['Borough'] == 'West Toronto'].reset_index(drop=True)
toronto_data2.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
1,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975
2,M6K,West Toronto,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191
3,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
4,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325


In [40]:
toronto_data3 = merged_inner[merged_inner['Borough'] == 'Central Toronto'].reset_index(drop=True)
toronto_data3.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M5N,Central Toronto,Roselawn,43.711695,-79.416936
2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
3,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307
4,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678


## Merge the dataframes of the various Toronto coordinates with concat function

In [41]:
toronto_data4 = pd.concat([toronto_data,toronto_data1,toronto_data2,toronto_data3], ignore_index=True)

In [42]:
toronto_data4

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
5,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
7,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
8,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
9,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [43]:
import folium
# create map of Toronto using latitude and longitude values
TORONTO = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data3['Latitude'], toronto_data3['Longitude'], toronto_data3['Borough'], toronto_data3['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(TORONTO)  
    
TORONTO

## Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

In [44]:
CLIENT_ID = 'LLWE21UDOZKODGWB3I0GEKHNC1Q5RHMGDDL4QNTZ3RQLGUQU' # your Foursquare ID
CLIENT_SECRET = 'ECAMQAX42VBGNZRZA4WUKO1M5IDSLGWC4T1I0H0XFP1MOCS3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: LLWE21UDOZKODGWB3I0GEKHNC1Q5RHMGDDL4QNTZ3RQLGUQU
CLIENT_SECRET:ECAMQAX42VBGNZRZA4WUKO1M5IDSLGWC4T1I0H0XFP1MOCS3


## Explore Neighborhoods in Toronto

In [49]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Now write the code to run the above function on each neighborhood and create a new dataframe called Toronto_venues.


In [50]:
toronto_venues = getNearbyVenues(names=toronto_data3['Neighborhood'],
                                   latitudes=toronto_data3['Latitude'],
                                   longitudes=toronto_data3['Longitude']
                                  )


Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park


In [51]:
print(toronto_venues.shape)
toronto_venues.head()

(113, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Roselawn,43.711695,-79.416936,Rosalind's Garden Oasis,43.712189,-79.411978,Garden
4,Roselawn,43.711695,-79.416936,Havergal College,43.712108,-79.41168,Music Venue


#### Let's check how many venues were returned for each neighborhood

In [53]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,32,32,32,32,32,32
Davisville North,8,8,8,8,8,8
"Forest Hill North & West, Forest Hill Road Park",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
"Moore Park, Summerhill East",4,4,4,4,4,4
"North Toronto West, Lawrence Park",19,19,19,19,19,19
Roselawn,2,2,2,2,2,2
"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",17,17,17,17,17,17
"The Annex, North Midtown, Yorkville",24,24,24,24,24,24


#### Let's find out how many unique categories can be curated from all the returned venues

In [55]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 64 uniques categories.


### Analyze Each Neighborhood

In [56]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Bar,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Department Store,Dessert Shop,Diner,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,History Museum,Hotel,Indian Restaurant,Italian Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,Music Venue,Park,Pharmacy,Pizza Place,Playground,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Spa,Sporting Goods Shop,Sports Bar,Summer Camp,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Roselawn,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Roselawn,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [58]:
toronto_onehot.shape

(113, 65)

### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [59]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Bar,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Department Store,Dessert Shop,Diner,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,History Museum,Hotel,Indian Restaurant,Italian Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,Music Venue,Park,Pharmacy,Pizza Place,Playground,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Spa,Sporting Goods Shop,Sports Bar,Summer Camp,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.03125,0.0,0.03125,0.0,0.0,0.0625,0.0,0.0,0.0625,0.0,0.0,0.09375,0.03125,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.03125,0.0,0.03125,0.03125,0.0,0.0625,0.0,0.0,0.03125,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.03125,0.0625,0.0,0.0,0.0,0.03125,0.0,0.09375,0.03125,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.125,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Forest Hill North & West, Forest Hill Road Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
3,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
5,"North Toronto West, Lawrence Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.052632,0.210526,0.105263,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.052632,0.052632,0.052632,0.0,0.0,0.052632,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
6,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Summerhill West, Rathnelly, South Hill, Forest...",0.058824,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.117647,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0
8,"The Annex, North Midtown, Yorkville",0.041667,0.041667,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.125,0.0,0.0,0.125,0.041667,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.041667,0.0,0.041667,0.0,0.0,0.0,0.041667,0.0,0.041667,0.0,0.041667,0.041667,0.041667,0.0,0.041667,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0


In [60]:
toronto_grouped.shape

(9, 65)

### Let's print each neighborhood along with the top 5 most common venues

In [61]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
                venue  freq
0      Sandwich Place  0.09
1        Dessert Shop  0.09
2         Pizza Place  0.06
3  Italian Restaurant  0.06
4                 Gym  0.06


----Davisville North----
               venue  freq
0            Dog Run  0.12
1  Food & Drink Shop  0.12
2              Hotel  0.12
3     Breakfast Spot  0.12
4   Department Store  0.12


----Forest Hill North & West, Forest Hill Road Park----
              venue  freq
0     Jewelry Store  0.25
1             Trail  0.25
2  Sushi Restaurant  0.25
3          Bus Line  0.25
4      Liquor Store  0.00


----Lawrence Park----
                 venue  freq
0             Bus Line  0.33
1          Swim School  0.33
2                 Park  0.33
3           Restaurant  0.00
4  Rental Car Location  0.00


----Moore Park, Summerhill East----
                 venue  freq
0                 Park  0.25
1           Playground  0.25
2          Summer Camp  0.25
3         Tennis Court  0.25
4  American Restaurant  0.00


### Let's put that into a pandas dataframe

First, let's write a function to sort the venues in descending order.

In [86]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [87]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Dessert Shop,Sandwich Place,Coffee Shop,Sushi Restaurant,Pizza Place,Italian Restaurant,Gym,Café,Bar,Toy / Game Store
1,Davisville North,Hotel,Breakfast Spot,Food & Drink Shop,Park,Dog Run,Sandwich Place,Department Store,Gym,Burger Joint,Grocery Store
2,"Forest Hill North & West, Forest Hill Road Park",Bus Line,Trail,Jewelry Store,Sushi Restaurant,Yoga Studio,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop
3,Lawrence Park,Park,Swim School,Bus Line,Yoga Studio,Fried Chicken Joint,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Garden
4,"Moore Park, Summerhill East",Summer Camp,Playground,Park,Tennis Court,Yoga Studio,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop


### Cluster Neighborhoods

In [88]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:9] 

array([1, 1, 2, 0, 4, 1, 3, 1, 1])

In [89]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data3

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Park,Swim School,Bus Line,Yoga Studio,Fried Chicken Joint,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Garden
1,M5N,Central Toronto,Roselawn,43.711695,-79.416936,3,Garden,Music Venue,Yoga Studio,History Museum,Grocery Store,Greek Restaurant,Gourmet Shop,Gift Shop,Gas Station,Fried Chicken Joint
2,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Hotel,Breakfast Spot,Food & Drink Shop,Park,Dog Run,Sandwich Place,Department Store,Gym,Burger Joint,Grocery Store
3,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307,2,Bus Line,Trail,Jewelry Store,Sushi Restaurant,Yoga Studio,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop
4,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,1,Clothing Store,Coffee Shop,Yoga Studio,Sporting Goods Shop,Gift Shop,Fast Food Restaurant,Mexican Restaurant,Park,Diner,Rental Car Location


Finally, let's visualize the resulting clusters

In [90]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        #color=rainbow[cluster-1],
        fill=True,
        #fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Explore Clusters

CLUSTER 1

In [91]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,0,Park,Swim School,Bus Line,Yoga Studio,Fried Chicken Joint,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Garden


CLUSTER 2

In [92]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Central Toronto,1,Hotel,Breakfast Spot,Food & Drink Shop,Park,Dog Run,Sandwich Place,Department Store,Gym,Burger Joint,Grocery Store
4,Central Toronto,1,Clothing Store,Coffee Shop,Yoga Studio,Sporting Goods Shop,Gift Shop,Fast Food Restaurant,Mexican Restaurant,Park,Diner,Rental Car Location
5,Central Toronto,1,Café,Sandwich Place,Coffee Shop,American Restaurant,Pharmacy,Pizza Place,Middle Eastern Restaurant,Pub,Donut Shop,Liquor Store
6,Central Toronto,1,Dessert Shop,Sandwich Place,Coffee Shop,Sushi Restaurant,Pizza Place,Italian Restaurant,Gym,Café,Bar,Toy / Game Store
8,Central Toronto,1,Coffee Shop,Pub,American Restaurant,Sushi Restaurant,Restaurant,Liquor Store,Light Rail Station,Supermarket,Sports Bar,Fried Chicken Joint


CLUSTER 3

In [93]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Central Toronto,2,Bus Line,Trail,Jewelry Store,Sushi Restaurant,Yoga Studio,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop


CLUSTER 4

In [94]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Central Toronto,3,Garden,Music Venue,Yoga Studio,History Museum,Grocery Store,Greek Restaurant,Gourmet Shop,Gift Shop,Gas Station,Fried Chicken Joint


CLUSTER 5

In [95]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Central Toronto,4,Summer Camp,Playground,Park,Tennis Court,Yoga Studio,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop
