## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np

In [2]:
import requests
from bs4 import BeautifulSoup
url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(url,'lxml')
toronto_dat = soup.find('table',{'class':'wikitable sortable'})
PostalCode = []
Borough = []
Neighbor = []

### 1. Create a dataframe from the postal code table

In [3]:
for row in toronto_dat.findAll('tr'):
    col = row.findAll('td')
    if (len(col) > 1):
        PostalCode.append(col[0].text.strip())
        Borough.append(col[1].text.strip())
        Neighbor.append(col[2].text.strip())
        
torontodf = pd.DataFrame()
torontodf['PostalCode'] = PostalCode
torontodf['Borough'] = Borough
torontodf['Neighborhood'] = Neighbor

##### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [4]:
torontodf.replace('Not assigned', np.nan, inplace=True)
torontodf = torontodf[pd.notnull(torontodf['Borough'])]

##### Not assigned neighborhood is replaced with corresponding borough

In [5]:
torontodf.Neighborhood.fillna(torontodf.Borough, inplace=True)

##### Combine Neighborhoods with same postalcode

In [6]:
toronto_grouped = torontodf.groupby(['PostalCode', 'Borough']).Neighborhood.unique().reset_index()
toronto_grouped['Neighborhood'] =  toronto_grouped['Neighborhood'].apply(lambda x: ", ".join(x))
toronto_grouped.shape

(103, 3)

In [7]:
toronto_grouped[30:40]

Unnamed: 0,PostalCode,Borough,Neighborhood
30,M3K,North York,"CFB Toronto, Downsview East"
31,M3L,North York,Downsview West
32,M3M,North York,Downsview Central
33,M3N,North York,Downsview Northwest
34,M4A,North York,Victoria Village
35,M4B,East York,"Woodbine Gardens, Parkview Hill"
36,M4C,East York,Woodbine Heights
37,M4E,East Toronto,The Beaches
38,M4G,East York,Leaside
39,M4H,East York,Thorncliffe Park


### 2. Get geographical coordinates of postal codes

In [8]:
#!conda install -c conda-forge geopy --yes
!conda install -c conda-forge geocoder --yes
import geocoder # import geocoder

Latitude = []
Longitude = []

for pcode in toronto_grouped['PostalCode']:
    # initialize lat_lng to None
    lat_lng_coords = None

    # loop until getting the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(pcode))
        lat_lng_coords = g.latlng
    Latitude.append(lat_lng_coords[0])
    Longitude.append(lat_lng_coords[1])

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [9]:
toronto_grouped['Latitude'] = Latitude
toronto_grouped['Longitude'] = Longitude

In [10]:
toronto_grouped[30:40]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
30,M3K,North York,"CFB Toronto, Downsview East",43.738685,-79.46732
31,M3L,North York,Downsview West,43.72014,-79.51698
32,M3M,North York,Downsview Central,43.73369,-79.49674
33,M3N,North York,Downsview Northwest,43.755371,-79.51959
34,M4A,North York,Victoria Village,43.7306,-79.313265
35,M4B,East York,"Woodbine Gardens, Parkview Hill",43.707535,-79.311773
36,M4C,East York,Woodbine Heights,43.68964,-79.306874
37,M4E,East Toronto,The Beaches,43.676531,-79.295425
38,M4G,East York,Leaside,43.709495,-79.363989
39,M4H,East York,Thorncliffe Park,43.70124,-79.349825


### 3. Explore and cluster the neighborhoods in Toronto

In [11]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


##### Expand Neighborhood

In [12]:
toronto_neigh = pd.DataFrame()
Borough = []
Neighborhood = []
for ind in np.arange(toronto_grouped.shape[0]):
    for neigh in toronto_grouped.iloc[ind]['Neighborhood'].split(','):
        Borough.append(toronto_grouped.iloc[ind]['Borough'])
        Neighborhood.append(neigh)

In [13]:
toronto_neigh['Borough'] = Borough
toronto_neigh['Neighborhood'] = Neighborhood

In [14]:
toronto_neigh.shape

(210, 2)

##### Get geographical coordinates for all the neighborhoods

In [15]:
Latitude = []
Longitude = []

for ind in np.arange(toronto_neigh.shape[0]):
    # initialize lat_lng to None
    lat_lng_coords = None

    # loop until getting the coordinates
    while(lat_lng_coords is None):
        borough = toronto_neigh.iloc[ind]['Borough']
        neighbor = toronto_neigh.iloc[ind]['Neighborhood']
        g = geocoder.arcgis('{}, {}, Toronto, Ontario'.format(neighbor, borough))
        lat_lng_coords = g.latlng
    Latitude.append(lat_lng_coords[0])
    Longitude.append(lat_lng_coords[1])

In [16]:
toronto_neigh['Latitude'] = Latitude
toronto_neigh['Longitude'] = Longitude

In [17]:
toronto_neigh[0:10]

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Scarborough,Rouge,43.80766,-79.17405
1,Scarborough,Malvern,43.81024,-79.22034
2,Scarborough,Highland Creek,43.78948,-79.17614
3,Scarborough,Rouge Hill,43.78716,-79.13252
4,Scarborough,Port Union,43.77897,-79.13109
5,Scarborough,Guildwood,43.74953,-79.18992
6,Scarborough,Morningside,43.78255,-79.20523
7,Scarborough,West Hill,43.76343,-79.1782
8,Scarborough,Woburn,43.76748,-79.22829
9,Scarborough,Cedarbrae,43.747728,-79.235174


##### Create a map of Toronto with neighborhoods superimposed on top

In [18]:
address = 'Toronto, Ontario'

g = geocoder.arcgis(address)
lat_lng_coords = g.latlng
latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.648690000000045, -79.38543999999996.


In [19]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_neigh['Latitude'], toronto_neigh['Longitude'], toronto_neigh['Borough'], toronto_neigh['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

##### Now, we want to segment and cluster only neighborhoods in the boroughs that contain the word "Toronto". Let's slice the original dataframe and create a new dataframe of the "\*Toronto\*" data. 

In [20]:
torontob_data = toronto_neigh[toronto_neigh['Borough'].str.contains('Toronto')].reset_index(drop=True)

In [21]:
torontob_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,East Toronto,The Beaches,43.67413,-79.29644
1,East Toronto,The Danforth West,43.690666,-79.290799
2,East Toronto,Riverdale,43.732151,-79.559983
3,East Toronto,The Beaches West,43.67413,-79.29644
4,East Toronto,India Bazaar,43.73835,-79.56551


##### Now create a new map of Toronto with neighborhoods in the "\*Toronto\*" boroughs only

In [22]:
# create map of Toronto using latitude and longitude values
map_torontob = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(torontob_data['Latitude'], torontob_data['Longitude'], torontob_data['Borough'], torontob_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_torontob)  
    
map_torontob

Next, utilizing the Foursquare API to explore the neighborhoods and segment them

##### Foursquare Credentials and Version

In [23]:
CLIENT_ID = 'GYH5OUBS4E0EHHXEDYVVE41XJQ4BK4T1ZNL1Q51SCGPJPR4O' # your Foursquare ID
CLIENT_SECRET = 'KEQ5OCJ2PCWVH1PWKTC1ANWUS5VCT4SQEP2BQJALJZLISCC2' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: GYH5OUBS4E0EHHXEDYVVE41XJQ4BK4T1ZNL1Q51SCGPJPR4O
CLIENT_SECRET:KEQ5OCJ2PCWVH1PWKTC1ANWUS5VCT4SQEP2BQJALJZLISCC2


##### Define function to get nearby venues of all neighborhoods

In [24]:
LIMIT = 100
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

##### Get nearby venues of all neighborhoods and create new dataframe torontob_venues

In [25]:
torontob_venues = getNearbyVenues(names=torontob_data['Neighborhood'],
                                   latitudes=torontob_data['Latitude'],
                                   longitudes=torontob_data['Longitude']
                                  )

The Beaches
The Danforth West
 Riverdale
The Beaches West
 India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park
 Summerhill East
Deer Park
 Forest Hill SE
 Rathnelly
 South Hill
 Summerhill West
Rosedale
Cabbagetown
 St. James Town
Church and Wellesley
Harbourfront
Ryerson
 Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide
 King
 Richmond
Harbourfront East
 Toronto Islands
 Union Station
Design Exchange
 Toronto Dominion Centre
Commerce Court
 Victoria Hotel
Roselawn
Forest Hill North
 Forest Hill West
The Annex
 North Midtown
 Yorkville
Harbord
 University of Toronto
Chinatown
 Grange Park
 Kensington Market
CN Tower
 Bathurst Quay
 Island airport
 Harbourfront West
 King and Spadina
 Railway Lands
 South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place
 Underground city
Christie
Dovercourt Village
 Dufferin
Little Portugal
 Trinity
Brockton
 Exhibition Place
 Parkdale Village
High Park
 The Junction Sout

##### Check the size of the resulting dataframe

In [26]:
print(torontob_venues.shape)
torontob_venues.head()

(3877, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.67413,-79.29644,Mastermind Toys,43.671453,-79.293971,Toy / Game Store
1,The Beaches,43.67413,-79.29644,Glen Manor Ravine,43.676821,-79.293942,Trail
2,The Beaches,43.67413,-79.29644,Sanna's Farmacia,43.670929,-79.295969,Juice Bar
3,The Beaches,43.67413,-79.29644,The Ten Spot,43.67034,-79.299363,Nail Salon
4,The Beaches,43.67413,-79.29644,Green Eggplant,43.670517,-79.29866,Mediterranean Restaurant


##### Find out how many unique categories can be curated from all the returned venues

In [27]:
print('There are {} uniques categories.'.format(len(torontob_venues['Venue Category'].unique())))

There are 275 uniques categories.


##### Analyze each neighborhood

In [28]:
# one hot encoding
torontob_onehot = pd.get_dummies(torontob_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
torontob_onehot['Neighborhood'] = torontob_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [torontob_onehot.columns[-1]] + list(torontob_onehot.columns[:-1])
torontob_onehot = torontob_onehot[fixed_columns]

torontob_onehot.shape

(3877, 275)

Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [29]:
torontob_grouped = torontob_onehot.groupby('Neighborhood').mean().reset_index()
print(torontob_grouped.shape)
torontob_grouped.head()

(74, 275)


Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bagel Shop,Bakery,Bank,Bar,Baseball Stadium,Basketball Stadium,Beach,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Bike Shop,Bistro,Board Shop,Boat or Ferry,Bookstore,Boutique,Brazilian Restaurant,Breakfast Spot,Brewery,Bridal Shop,Bubble Tea Shop,Building,Burger Joint,Burrito Place,Bus Line,Bus Stop,Butcher,Café,Cajun / Creole Restaurant,Candy Store,Caribbean Restaurant,Castle,Cheese Shop,Chinese Restaurant,Chocolate Shop,Church,Clothing Store,Cocktail Bar,Coffee Shop,College Arts Building,College Auditorium,College Gym,College Theater,Colombian Restaurant,Comedy Club,Comfort Food Restaurant,Comic Shop,Concert Hall,Convenience Store,Convention Center,Cosmetics Shop,Creperie,Cuban Restaurant,Cupcake Shop,Curling Ice,Cycle Studio,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Exhibit,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food & Drink Shop,Food Court,Food Truck,Fountain,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,General Entertainment,General Travel,Gift Shop,Gluten-free Restaurant,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Harbor / Marina,Hardware Store,Hawaiian Restaurant,Health & Beauty Service,Health Food Store,Historic Site,History Museum,Hobby Shop,Hockey Arena,Hong Kong Restaurant,Hospital,Hostel,Hot Dog Joint,Hotel,Hotel Bar,Hotpot Restaurant,IT Services,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Kids Store,Korean Restaurant,Lake,Latin American Restaurant,Library,Light Rail Station,Liquor Store,Lounge,Malay Restaurant,Market,Massage Studio,Mattress Store,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Movie Theater,Moving Target,Museum,Music School,Music Store,Music Venue,Nail Salon,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Nightlife,Paintball Field,Paper / Office Supplies Store,Park,Pastry Shop,Performing Arts Venue,Persian Restaurant,Peruvian Restaurant,Pet Store,Pharmacy,Pie Shop,Pizza Place,Playground,Plaza,Poke Place,Pool,Pool Hall,Portuguese Restaurant,Poutine Place,Pub,Racetrack,Ramen Restaurant,Record Shop,Rental Car Location,Restaurant,Rock Club,Roof Deck,Sake Bar,Salad Place,Salon / Barbershop,Sandwich Place,Scenic Lookout,Sculpture Garden,Seafood Restaurant,Shoe Repair,Shoe Store,Shopping Mall,Skating Rink,Smoke Shop,Soccer Stadium,Social Club,Soup Place,South American Restaurant,Souvlaki Shop,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Sri Lankan Restaurant,Steakhouse,Street Art,Strip Club,Supermarket,Sushi Restaurant,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tapas Restaurant,Tea Room,Thai Restaurant,Theater,Theme Park,Theme Park Ride / Attraction,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Tunnel,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Bathurst Quay,0.0,0.0,0.0,0.0,0.041667,0.0,0.041667,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Dufferin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Exhibition Place,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.066667,0.0,0.0,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.033333,0.0,0.033333,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.1,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Forest Hill SE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Forest Hill West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Print each neighborhood along with the top 5 most common venues

In [30]:
num_top_venues = 5

for hood in torontob_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = torontob_grouped[torontob_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Bathurst Quay----
                 venue  freq
0          Coffee Shop  0.21
1                 Park  0.08
2                 Café  0.08
3         Dance Studio  0.04
4  Japanese Restaurant  0.04


---- Dufferin----
                    venue  freq
0         Paintball Field  0.11
1    Fast Food Restaurant  0.11
2          Clothing Store  0.11
3             Coffee Shop  0.11
4  Argentinian Restaurant  0.11


---- Exhibition Place----
                 venue  freq
0           Theme Park  0.10
1       Soccer Stadium  0.10
2          Coffee Shop  0.07
3   Athletics & Sports  0.07
4  Arts & Crafts Store  0.07


---- Forest Hill SE----
               venue  freq
0  Convenience Store  0.25
1              Field  0.25
2     Farmers Market  0.25
3              Trail  0.25
4             Museum  0.00


---- Forest Hill West----
               venue  freq
0  Convenience Store  0.25
1              Field  0.25
2     Farmers Market  0.25
3              Trail  0.25
4             Museum  0.00


---- Gard

##### Define a function to sort the venues in descending order

In [31]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

##### Create a new dataframe and display the top 10 venues for each neighborhood

In [32]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = torontob_grouped['Neighborhood']

for ind in np.arange(torontob_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(torontob_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bathurst Quay,Coffee Shop,Park,Café,Dance Studio,Sculpture Garden,Light Rail Station,Caribbean Restaurant,Sushi Restaurant,Bank,Japanese Restaurant
1,Dufferin,Fast Food Restaurant,Bakery,Sandwich Place,Bike Shop,Paintball Field,Argentinian Restaurant,Coffee Shop,Clothing Store,Trail,Electronics Store
2,Exhibition Place,Theme Park,Soccer Stadium,Athletics & Sports,Arts & Crafts Store,Coffee Shop,Burger Joint,Café,Flea Market,Restaurant,Racetrack
3,Forest Hill SE,Field,Convenience Store,Farmers Market,Trail,Electronics Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
4,Forest Hill West,Field,Convenience Store,Farmers Market,Trail,Electronics Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant


##### Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters

In [33]:
# set number of clusters
kclusters = 5

torontob_grouped_clustering = torontob_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(torontob_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 4, 4, 1, 1, 1, 1, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [34]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

torontob_merged = torontob_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
torontob_merged = torontob_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

torontob_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,The Beaches,43.67413,-79.29644,3,Bar,Japanese Restaurant,Pharmacy,Toy / Game Store,Burger Joint,Sandwich Place,Café,Juice Bar,Bank,Breakfast Spot
1,East Toronto,The Danforth West,43.690666,-79.290799,1,Grocery Store,Coffee Shop,Ice Cream Shop,Pharmacy,Sandwich Place,Supermarket,Food & Drink Shop,Flower Shop,Beer Store,Fast Food Restaurant
2,East Toronto,Riverdale,43.732151,-79.559983,3,Asian Restaurant,Dance Studio,Pharmacy,Thai Restaurant,Supermarket,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
3,East Toronto,The Beaches West,43.67413,-79.29644,3,Bar,Japanese Restaurant,Pharmacy,Toy / Game Store,Burger Joint,Sandwich Place,Café,Juice Bar,Bank,Breakfast Spot
4,East Toronto,India Bazaar,43.73835,-79.56551,1,Indian Restaurant,Caribbean Restaurant,Pizza Place,American Restaurant,Dance Studio,Coffee Shop,Bank,Ice Cream Shop,Spa,Grocery Store


##### Visualize the resulting clusters

In [35]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(torontob_merged['Latitude'], torontob_merged['Longitude'], torontob_merged['Neighborhood'], torontob_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

##### We can see that cluster 2 (index 1) is the dominant. Let's examine this cluster

##### Cluster 2

In [36]:
torontob_merged.loc[torontob_merged['Cluster Labels'] == 1, torontob_merged.columns[[1] + list(range(5, torontob_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,The Danforth West,Grocery Store,Coffee Shop,Ice Cream Shop,Pharmacy,Sandwich Place,Supermarket,Food & Drink Shop,Flower Shop,Beer Store,Fast Food Restaurant
4,India Bazaar,Indian Restaurant,Caribbean Restaurant,Pizza Place,American Restaurant,Dance Studio,Coffee Shop,Bank,Ice Cream Shop,Spa,Grocery Store
5,Studio District,Coffee Shop,Bar,Hotel,Restaurant,Café,Pizza Place,Steakhouse,Sushi Restaurant,Seafood Restaurant,Thai Restaurant
6,Lawrence Park,Coffee Shop,Italian Restaurant,Hobby Shop,Ice Cream Shop,Bubble Tea Shop,Seafood Restaurant,Metro Station,Spa,Bus Line,Mobile Phone Shop
7,Davisville North,Sandwich Place,Thai Restaurant,Coffee Shop,Restaurant,Café,Gas Station,Italian Restaurant,Dessert Shop,Chinese Restaurant,Indian Restaurant
8,North Toronto West,Pharmacy,Hotel,Garden,Gastropub,Italian Restaurant,Intersection,Coffee Shop,Ice Cream Shop,Bus Line,Bank
9,Davisville,Sandwich Place,Thai Restaurant,Coffee Shop,Restaurant,Café,Gas Station,Italian Restaurant,Dessert Shop,Chinese Restaurant,Indian Restaurant
10,Moore Park,Grocery Store,Park,Discount Store,Food Court,Café,Coffee Shop,Tea Room,Hardware Store,Event Space,Ethiopian Restaurant
12,Deer Park,Sandwich Place,History Museum,Café,Coffee Shop,Park,Pizza Place,Steakhouse,Museum,Pub,BBQ Joint
14,Rathnelly,Park,Café,Mexican Restaurant,French Restaurant,American Restaurant,Coffee Shop,Sandwich Place,Pub,Shoe Repair,BBQ Joint


##### Coffee Shops are the most common venues in neighborhoods of \*Toronto\* boroughs