# Segmenting and Clustering Neighborhoods in New York City
## Applied Data Science Capstone - Week 3

--------

## Part 3: Explore and cluster the neighborhoods in Toronto. 

In [1]:
import pandas as pd
import folium

In [3]:
# Load data

data = pd.read_csv('Data/data.csv')
data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park\n, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West\n",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West\n",43.692657,-79.264848


In [4]:
# 11 Boroughs in the data frame

print(len(data['Borough'].unique()))
print(data['Borough'].unique())

11
['Scarborough' 'North York' 'East York' 'East Toronto' 'Central Toronto'
 'Downtown Toronto' 'York' 'West Toronto' "Queen's Park" 'Mississauga'
 'Etobicoke']


### Map of Toronto with the boroughs

In [5]:
# Create a simple map of toronto

latitude = 43.6529
longitude = -79.3849
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers for B
for lat, lng, label in zip(data.Latitude, data.Longitude, data.Borough):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], radius=5, popup=label, color='purple', fill=True, parse_html=False).add_to(toronto_map)

toronto_map


#### Segment and cluster only boroughs that contain the word Toronto

In [6]:
# slice and create a new data frame with the boroughs of interet

toronto_df = data[data['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West\n, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West\n, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District\n,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North\n,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West\n,43.715383,-79.405678
7,M4S,Central Toronto,Davisville\n,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East\n",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE\n, Rathnelly, South ...",43.686412,-79.400049


#### Foursquare Credentials

In [8]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '' # Foursquare API version
LIMIT = 100

In [9]:
import requests 
import numpy as np

### Explore Neighborhoods

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):    #radius -> meters
    
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
toronto_venues = getNearbyVenues(names=toronto_df['Neighbourhood'],
                                 latitudes=toronto_df['Latitude'],
                                 longitudes=toronto_df['Longitude']
                                )

The Beaches
The Danforth West
, Riverdale
The Beaches West
, India Bazaar
Studio District

Lawrence Park
Davisville North

North Toronto West

Davisville

Moore Park, Summerhill East

Deer Park, Forest Hill SE
, Rathnelly, South Hill, Summerhill West

Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson
, Garden District

St. James Town
Berczy Park
Central Bay Street

Adelaide
, King
, Richmond

Harbourfront East
, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel

Roselawn

Forest Hill North, Forest Hill West

The Annex, North Midtown
, Yorkville
Harbord
, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay
, Island airport
, Harbourfront West
, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade

First Canadian Place, Underground city
Christie

Dovercourt Village, Dufferin

Little Portugal, Trinity
Brockton
, Exhibition Place, Park

In [12]:
print(toronto_venues.shape)
toronto_venues.head(20)

(1706, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
5,"The Danforth West\n, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
6,"The Danforth West\n, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop
7,"The Danforth West\n, Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop
8,"The Danforth West\n, Riverdale",43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant
9,"The Danforth West\n, Riverdale",43.679557,-79.352188,La Diperie,43.67753,-79.352295,Ice Cream Shop


In [13]:
print(len(toronto_venues['Venue Category'].unique()), 'unique venues')

234 unique venues


#### Analyzing each neighbourhood

In [14]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
toronto_onehot.shape

(1706, 234)

In [16]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint
0,"Adelaide\n, King\n, Richmond\n",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0
2,"Brockton\n, Exhibition Place, Parkdale Village",0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 East...,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay\n, Island airport\n, H...",0.0,0.0,0.071429,0.071429,0.071429,0.071429,0.142857,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street\n,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,...,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.01,0.0,0.0,0.06,0.0,0.04,0.01,0.0
8,Christie\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011364,0.011364,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,...,0.011364,0.0,0.0,0.0,0.0,0.0,0.011364,0.011364,0.0,0.011364


In [26]:
toronto_grouped.shape

(38, 234)

#### 5 top venues in each neighborhood

In [18]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide
, King
, Richmond
----
                 venue  freq
0          Coffee Shop  0.08
1                 Café  0.05
2  American Restaurant  0.04
3                  Bar  0.04
4      Thai Restaurant  0.04


----Berczy Park----
          venue  freq
0   Coffee Shop  0.07
1  Cocktail Bar  0.05
2    Steakhouse  0.04
3        Bakery  0.04
4          Café  0.04


----Brockton
, Exhibition Place, Parkdale Village----
                   venue  freq
0         Breakfast Spot  0.08
1                   Café  0.08
2  Performing Arts Venue  0.08
3            Coffee Shop  0.08
4            Yoga Studio  0.04


----Business Reply Mail Processing Centre 969 Eastern
----
                  venue  freq
0           Yoga Studio  0.06
1                   Spa  0.06
2  Gym / Fitness Center  0.06
3         Garden Center  0.06
4                Garden  0.06


----CN Tower, Bathurst Quay
, Island airport
, Harbourfront West
, King and Spadina, Railway Lands, South Niagara----
                venue  freq
0    

In [19]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [66]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide\n, King\n, Richmond\n",Coffee Shop,Café,Bar,Thai Restaurant,American Restaurant,Steakhouse,Hotel,Asian Restaurant,Restaurant,Burger Joint
1,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Beer Bar,Bakery,Farmers Market,Steakhouse,Seafood Restaurant,Café,Park
2,"Brockton\n, Exhibition Place, Parkdale Village",Breakfast Spot,Performing Arts Venue,Coffee Shop,Café,Yoga Studio,Caribbean Restaurant,Stadium,Sandwich Place,Burrito Place,Restaurant
3,Business Reply Mail Processing Centre 969 East...,Yoga Studio,Spa,Garden Center,Garden,Fast Food Restaurant,Farmers Market,Light Rail Station,Comic Shop,Park,Pizza Place
4,"CN Tower, Bathurst Quay\n, Island airport\n, H...",Airport Service,Airport Terminal,Harbor / Marina,Sculpture Garden,Airport,Airport Food Court,Airport Gate,Airport Lounge,Boutique,Bar



### Cluster neighborhoods

In [67]:
neighborhoods_venues_sorted.shape

(38, 11)

In [68]:
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors


In [74]:
#toronto_df.rename(columns={'Neighbourhood':'Neighborhood'}, inplace=True)
#toronto_df.head()


In [69]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [70]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head(10) 


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Other Great Outdoors,Trail,Pub,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
1,M4K,East Toronto,"The Danforth West\n, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Bubble Tea Shop,Indian Restaurant,Sports Bar,Spa,Juice Bar
2,M4L,East Toronto,"The Beaches West\n, India Bazaar",43.668999,-79.315572,0,Park,Sandwich Place,Sushi Restaurant,Fast Food Restaurant,Steakhouse,Food & Drink Shop,Intersection,Brewery,Pub,Ice Cream Shop
3,M4M,East Toronto,Studio District\n,43.659526,-79.340923,0,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,Clothing Store,Seafood Restaurant,Bar,Stationery Store,Comfort Food Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Park,Lawyer,Bus Line,Swim School,Wings Joint,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
5,M4P,Central Toronto,Davisville North\n,43.712751,-79.390197,0,Gym,Convenience Store,Sandwich Place,Food & Drink Shop,Clothing Store,Hotel,Breakfast Spot,Pizza Place,Park,Electronics Store
6,M4R,Central Toronto,North Toronto West\n,43.715383,-79.405678,0,Clothing Store,Coffee Shop,Sporting Goods Shop,Italian Restaurant,Diner,Mexican Restaurant,Dessert Shop,Park,Chinese Restaurant,Café
7,M4S,Central Toronto,Davisville\n,43.704324,-79.38879,0,Sandwich Place,Dessert Shop,Pizza Place,Sushi Restaurant,Gym,Coffee Shop,Italian Restaurant,Café,Fried Chicken Joint,Brewery
8,M4T,Central Toronto,"Moore Park, Summerhill East\n",43.689574,-79.38316,1,Gym,Playground,Summer Camp,Wings Joint,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
9,M4V,Central Toronto,"Deer Park, Forest Hill SE\n, Rathnelly, South ...",43.686412,-79.400049,0,Pub,Coffee Shop,Sports Bar,Vietnamese Restaurant,Liquor Store,Supermarket,Sushi Restaurant,Light Rail Station,Pizza Place,Fried Chicken Joint


In [75]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters