# Using K-Means algorithm to cluster neighborhoods in Toronto

### Objective

#### Use K-Means machine learning algorithm to determine the which cluster of neighborhoods have the most frequently visited venues.

## Part 1: Preprocessing the datasets

### Import libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
from pandas.io.json import json_normalize
import numpy as np
from bs4 import BeautifulSoup
import requests

from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium 

### Scrape Wikipedia page

In [2]:
wiki_page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(wiki_page,'html.parser')

table = soup.find('table')
colvalues = table.find_all('td')

length = len(colvalues)

PostalCode = []
Borough = []
Neighborhood = []

for i in range(0, length, 3):
    PostalCode.append(colvalues[i].text.strip())
    Borough.append(colvalues[i+1].text.strip())
    Neighborhood.append(colvalues[i+2].text.strip())

### Build dataframe from wikipedia table

In [3]:
df_postalcodes = pd.DataFrame(data=[PostalCode, Borough, Neighborhood]).transpose()
df_postalcodes.columns = ['PostalCode', 'Borough', 'Neighborhood']

### Cleanse the data in dataframe by reassigning a "Not assigned" neighborhood to the corresponding borough

In [4]:
df_postalcodes.drop(df_postalcodes[df_postalcodes['Borough'] == 'Not assigned'].index, inplace=True)
df_postalcodes.loc[df_postalcodes.Neighborhood == 'Not assigned', "Neighborhood"] = df_postalcodes.Borough

### Group the data by Postal Code and Borough

In [5]:
df_postalcodes_grouped = df_postalcodes.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df_postalcodes_grouped.columns = ['PostalCode', 'Borough', 'Neighborhood']
df_postalcodes_grouped

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Read population data of 2016

In [6]:
pop=pd.read_csv('population.CSV',encoding='latin-1')
to_keep=['Geographic code','Population, 2016']#dropping irrelevant columns
pop=pop[to_keep]
#combining both Data Frames
combined=pd.merge(df_postalcodes_grouped,pop,how='inner',left_on='PostalCode',right_on='Geographic code')
combined.drop('Geographic code',axis=1,inplace=True)
combined
#pop[895:997]

Unnamed: 0,PostalCode,Borough,Neighborhood,"Population, 2016"
0,M1B,Scarborough,"Rouge, Malvern",66108.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",35626.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",46943.0
3,M1G,Scarborough,Woburn,29690.0
4,M1H,Scarborough,Cedarbrae,24383.0
5,M1J,Scarborough,Scarborough Village,36699.0
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",48434.0
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",35081.0
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",22913.0
9,M1N,Scarborough,"Birch Cliff, Cliffside West",22136.0


### Read in geospatial data from csv file and join dataframes based on "PostalCodes" column; this will create a single dataframe providing the geographical coordinates of each postal code

In [7]:
df_lat_long = pd.read_csv('http://cocl.us/Geospatial_data')
df_lat_long.columns = ['PostalCode', 'Latitude', 'Longitude']

df_lat_long_join = pd.merge(combined, df_lat_long, on=['PostalCode'], how='inner')

df_lat_long_join

Unnamed: 0,PostalCode,Borough,Neighborhood,"Population, 2016",Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",66108.0,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",35626.0,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",46943.0,43.763573,-79.188711
3,M1G,Scarborough,Woburn,29690.0,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,24383.0,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,36699.0,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",48434.0,43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",35081.0,43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",22913.0,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",22136.0,43.692657,-79.264848


### Create a map of Toronto with the different neighborhoods

In [8]:
address = 'Toronto, Ontario'

geolocator = Nominatim()
location = geolocator.geocode(address)
T_latitude = location.latitude
T_longitude = location.longitude
print('The geograpical coordinate of Toronto, ON, Canada are {}, {}.'.format(T_latitude, T_longitude))

T_map = folium.Map(location=[T_latitude, T_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_lat_long_join['Latitude'], df_lat_long_join['Longitude'], df_lat_long_join['Borough'], df_lat_long_join['Neighborhood']):
    label = '{}'.format(borough)
    #label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(T_map)  
    
T_map

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto, ON, Canada are 43.653963, -79.387207.


### Connecting to Four Square API

In [9]:
CLIENT_ID = 'FL4X1EERBNVDNVLZVAIBJYZCELP3SKJQA3AGNBABZZKF5MJE' # your Foursquare ID
CLIENT_SECRET = 'XWRYSR5O0OWP3Y5O0ZNU1DLQLVXQQHI4VWFIX5EUVWGGN5PK' # your Foursquare Secret
VERSION = '20190317' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: FL4X1EERBNVDNVLZVAIBJYZCELP3SKJQA3AGNBABZZKF5MJE
CLIENT_SECRET:XWRYSR5O0OWP3Y5O0ZNU1DLQLVXQQHI4VWFIX5EUVWGGN5PK


### Explore West Toronto region

In [10]:
westt = df_lat_long_join[df_lat_long_join['Borough'] == 'West Toronto']
westt.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighborhood,"Population, 2016",Latitude,Longitude
0,M6H,West Toronto,"Dovercourt Village, Dufferin",44950.0,43.669005,-79.442259
1,M6J,West Toronto,"Little Portugal, Trinity",32684.0,43.647927,-79.41975
2,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",40957.0,43.636847,-79.428191
3,M6P,West Toronto,"High Park, The Junction South",40035.0,43.661608,-79.464763
4,M6R,West Toronto,"Parkdale, Roncesvalles",19857.0,43.64896,-79.456325
5,M6S,West Toronto,"Runnymede, Swansea",34299.0,43.651571,-79.48445


In [11]:
wt_lat = westt['Latitude'].values[0] # neighborhood latitude value
print(type(wt_lat))
wt_lon = westt['Longitude'].values[0] # neighborhood longitude value

wt_name = westt['Neighborhood'].values[0] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(wt_name, 
                                                               wt_lat, 
                                                               wt_lon))

<class 'numpy.float64'>
Latitude and longitude values of Dovercourt Village, Dufferin are 43.66900510000001, -79.4422593.


### Use Foursquare API to get most visited venues in the West Toronto borough

In [12]:
radius = 1000
LIMIT = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    wt_lat, 
    wt_lon, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=FL4X1EERBNVDNVLZVAIBJYZCELP3SKJQA3AGNBABZZKF5MJE&client_secret=XWRYSR5O0OWP3Y5O0ZNU1DLQLVXQQHI4VWFIX5EUVWGGN5PK&v=20190317&ll=43.66900510000001,-79.4422593&radius=1000&limit=500'

In [13]:
results = requests.get(url).json()
# results
if results != None:
    print("Request successfully processed")

Request successfully processed


In [14]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [15]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Greater Good Bar,Bar,43.669409,-79.439267
1,Parallel,Middle Eastern Restaurant,43.669516,-79.438728
2,Blood Brothers Brewing,Brewery,43.669944,-79.436533
3,Happy Bakery & Pastries,Bakery,43.66705,-79.441791
4,FreshCo,Supermarket,43.667918,-79.440754


In [16]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

67 venues were returned by Foursquare.


In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
                # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [18]:
wt_venues = getNearbyVenues(names=westt['Neighborhood'],
                                   latitudes=westt['Latitude'],
                                   longitudes=westt['Longitude']
                                  )

Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The Junction South
Parkdale, Roncesvalles
Runnymede, Swansea


In [30]:
print(wt_venues.shape)
wt_venues.head()

(175, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Dovercourt Village, Dufferin",43.669005,-79.442259,The Greater Good Bar,43.669409,-79.439267,Bar
1,"Dovercourt Village, Dufferin",43.669005,-79.442259,Parallel,43.669516,-79.438728,Middle Eastern Restaurant
2,"Dovercourt Village, Dufferin",43.669005,-79.442259,Happy Bakery & Pastries,43.66705,-79.441791,Bakery
3,"Dovercourt Village, Dufferin",43.669005,-79.442259,FreshCo,43.667918,-79.440754,Supermarket
4,"Dovercourt Village, Dufferin",43.669005,-79.442259,Planet Fitness Toronto Galleria,43.667588,-79.442574,Gym / Fitness Center


In [31]:
print('There are {} uniques categories.'.format(len(wt_venues['Venue Category'].unique())))

There are 87 uniques categories.


### One hot encode categorical data into numerical data for machine learning algorithm

In [32]:
# one hot encoding
wt_onehot = pd.get_dummies(wt_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
wt_onehot['Neighborhood'] = wt_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [wt_onehot.columns[-1]] + list(wt_onehot.columns[:-1])
wt_onehot = wt_onehot[fixed_columns]

wt_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Bistro,Bookstore,Boutique,Breakfast Spot,Brewery,Burger Joint,Burrito Place,Café,Cajun / Creole Restaurant,Caribbean Restaurant,Climbing Gym,Cocktail Bar,Coffee Shop,Convenience Store,Cuban Restaurant,Cupcake Shop,Dessert Shop,Diner,Discount Store,Dog Run,Eastern European Restaurant,Falafel Restaurant,Fast Food Restaurant,Fish & Chips Shop,Flea Market,Food,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Gastropub,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Ice Cream Shop,Indie Movie Theater,Italian Restaurant,Juice Bar,Korean Restaurant,Latin American Restaurant,Liquor Store,Mac & Cheese Joint,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Movie Theater,Music Venue,New American Restaurant,Park,Performing Arts Venue,Pet Store,Pharmacy,Pizza Place,Pool,Pub,Record Shop,Restaurant,Salon / Barbershop,Sandwich Place,School,Smoothie Shop,Southern / Soul Food Restaurant,Speakeasy,Sports Bar,Stadium,Supermarket,Supplement Shop,Sushi Restaurant,Tapas Restaurant,Tea Room,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Dovercourt Village, Dufferin",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,"Dovercourt Village, Dufferin",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
wt_grouped = wt_onehot.groupby('Neighborhood').mean().reset_index()
wt_grouped

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Bistro,Bookstore,Boutique,Breakfast Spot,Brewery,Burger Joint,Burrito Place,Café,Cajun / Creole Restaurant,Caribbean Restaurant,Climbing Gym,Cocktail Bar,Coffee Shop,Convenience Store,Cuban Restaurant,Cupcake Shop,Dessert Shop,Diner,Discount Store,Dog Run,Eastern European Restaurant,Falafel Restaurant,Fast Food Restaurant,Fish & Chips Shop,Flea Market,Food,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Gastropub,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Ice Cream Shop,Indie Movie Theater,Italian Restaurant,Juice Bar,Korean Restaurant,Latin American Restaurant,Liquor Store,Mac & Cheese Joint,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Movie Theater,Music Venue,New American Restaurant,Park,Performing Arts Venue,Pet Store,Pharmacy,Pizza Place,Pool,Pub,Record Shop,Restaurant,Salon / Barbershop,Sandwich Place,School,Smoothie Shop,Southern / Soul Food Restaurant,Speakeasy,Sports Bar,Stadium,Supermarket,Supplement Shop,Sushi Restaurant,Tapas Restaurant,Tea Room,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.047619,0.0,0.0,0.0,0.095238,0.0,0.0,0.047619,0.095238,0.0,0.047619,0.047619,0.0,0.095238,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.047619,0.047619,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Dovercourt Village, Dufferin",0.0,0.0,0.0,0.0,0.0,0.1,0.05,0.05,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0.1,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"High Park, The Junction South",0.0,0.043478,0.0,0.043478,0.0,0.043478,0.0,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.086957,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.043478,0.043478,0.043478,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.086957,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0
3,"Little Portugal, Trinity",0.016393,0.0,0.016393,0.0,0.04918,0.032787,0.0,0.131148,0.016393,0.0,0.032787,0.0,0.016393,0.0,0.0,0.032787,0.0,0.0,0.0,0.032787,0.04918,0.0,0.016393,0.016393,0.0,0.016393,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.0,0.016393,0.0,0.016393,0.0,0.0,0.016393,0.016393,0.0,0.016393,0.016393,0.016393,0.0,0.0,0.016393,0.032787,0.0,0.0,0.016393,0.0,0.016393,0.016393,0.016393,0.0,0.0,0.0,0.032787,0.0,0.016393,0.016393,0.032787,0.016393,0.0,0.0,0.0,0.016393,0.0,0.016393,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.016393,0.016393,0.032787,0.016393,0.016393
4,"Parkdale, Roncesvalles",0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0625,0.0,0.125,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0625,0.0,0.0625,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Runnymede, Swansea",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.029412,0.088235,0.0,0.0,0.0,0.0,0.088235,0.0,0.0,0.0,0.029412,0.029412,0.0,0.0,0.0,0.029412,0.0,0.029412,0.0,0.029412,0.029412,0.0,0.0,0.029412,0.0,0.029412,0.0,0.0,0.029412,0.0,0.0,0.029412,0.058824,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,0.058824,0.0,0.029412,0.0,0.029412,0.0,0.029412,0.029412,0.029412,0.0,0.0,0.0,0.0,0.0,0.029412,0.058824,0.0,0.029412,0.0,0.0,0.029412,0.0,0.0,0.0


### Display top 5 venues for each West Toronto neighborhood

In [24]:
num_top_venues = 5

for hood in wt_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = wt_grouped[wt_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Brockton, Exhibition Place, Parkdale Village----
                venue  freq
0                Café  0.10
1         Coffee Shop  0.10
2      Breakfast Spot  0.10
3       Burrito Place  0.05
4  Falafel Restaurant  0.05


----Dovercourt Village, Dufferin----
            venue  freq
0  Discount Store  0.10
1          Bakery  0.10
2        Pharmacy  0.10
3     Supermarket  0.10
4         Brewery  0.05


----High Park, The Junction South----
                 venue  freq
0                 Café  0.09
1   Mexican Restaurant  0.09
2         Antique Shop  0.04
3          Flea Market  0.04
4  Fried Chicken Joint  0.04


----Little Portugal, Trinity----
              venue  freq
0               Bar  0.13
1       Coffee Shop  0.05
2  Asian Restaurant  0.05
3       Men's Store  0.03
4      Cocktail Bar  0.03


----Parkdale, Roncesvalles----
            venue  freq
0       Gift Shop  0.12
1  Breakfast Spot  0.12
2    Dessert Shop  0.06
3   Movie Theater  0.06
4         Dog Run  0.06


----Runnymed

In [25]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [26]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = wt_grouped['Neighborhood']

for ind in np.arange(wt_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(wt_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Breakfast Spot
1,"Dovercourt Village, Dufferin",Discount Store,Bakery,Pharmacy
2,"High Park, The Junction South",Mexican Restaurant,Café,Gastropub
3,"Little Portugal, Trinity",Bar,Asian Restaurant,Coffee Shop
4,"Parkdale, Roncesvalles",Gift Shop,Breakfast Spot,Coffee Shop
5,"Runnymede, Swansea",Café,Coffee Shop,Italian Restaurant


## Part 2: Data Analysis using K-Means

### Apply K-Means cluster algorithm to neighborhood venue data

In [27]:
# set number of clusters
kclusters = 3

wt_grouped_clustering = wt_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(wt_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 2, 1, 1, 0, 1])

In [28]:
wt_merged = westt

# add clustering labels
wt_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
wt_merged = wt_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# wt_merged.head() # check the last columns!
wt_merged

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,PostalCode,Borough,Neighborhood,"Population, 2016",Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
76,M6H,West Toronto,"Dovercourt Village, Dufferin",44950.0,43.669005,-79.442259,1,Discount Store,Bakery,Pharmacy
77,M6J,West Toronto,"Little Portugal, Trinity",32684.0,43.647927,-79.41975,2,Bar,Asian Restaurant,Coffee Shop
78,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",40957.0,43.636847,-79.428191,1,Coffee Shop,Café,Breakfast Spot
82,M6P,West Toronto,"High Park, The Junction South",40035.0,43.661608,-79.464763,1,Mexican Restaurant,Café,Gastropub
83,M6R,West Toronto,"Parkdale, Roncesvalles",19857.0,43.64896,-79.456325,0,Gift Shop,Breakfast Spot,Coffee Shop
84,M6S,West Toronto,"Runnymede, Swansea",34299.0,43.651571,-79.48445,1,Café,Coffee Shop,Italian Restaurant


## Part 3: Data visualization

### Create map of clusters

In [29]:
map_cluster = folium.Map(location=[T_latitude, T_longitude], zoom_start=12)

color_code=['red','green','blue']
for lat, lng, cluster, neighborhood in zip(wt_merged['Latitude'],wt_merged['Longitude'],wt_merged['Cluster Labels'], wt_merged['Neighborhood']):
    
    label = folium.Popup(str(neighborhood) + ' Cluster' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=color_code[cluster],
        fill=True,
        fill_color=color_code[cluster],
        fill_opacity=0.7).add_to(map_cluster) 
map_cluster