This notebook will be used for the IBM Data Science Certificate capstone project, "Battle of the Neighbourhoods".

In [1]:
import pandas as pd
import numpy as np

In [2]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Week 3 Assignment

Load the config dict stored in a local pickle file (it contains the private credentials for the Foursquare and Bing Maps APIs).

In [3]:
import pickle

# Read the credentials dict from a local pickle; later cells look up
# config['bing']['api_key'] and config['four_square'][...] from it.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# ever load a config.p that you created yourself.
with open('config.p', 'rb') as pfile:
    config = pickle.load(pfile)

### Sections 1 - 3

In [4]:
from bs4 import BeautifulSoup
import requests

In [5]:
# Wikipedia page whose first table lists the "M" postal codes with borough and neighbourhood.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast on network problems: without a timeout the request can hang indefinitely,
# and without raise_for_status() an HTTP error page would be parsed as if it were data.
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()
html = response.content

soup = BeautifulSoup(html, 'html.parser')

# soup.find returns None when nothing matches; stop here with a clear message rather
# than letting a later cell try to parse the string 'None'.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(wiki_url))

In [6]:
from io import StringIO

# pandas >= 2.1 deprecates passing literal HTML text to read_html; wrapping the markup
# in StringIO is accepted by old and new versions alike. [0] selects the only table parsed.
df_toronto = pd.read_html(StringIO(str(table)))[0]

In [7]:
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


**The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood**

In [8]:
# Replace the scraped headers ('Postal Code', 'Borough', 'Neighbourhood') with the
# identifier-style names used by every later cell.
df_toronto.columns = ['postalCode','borough','neighborhood']

**Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.**

In [9]:
# Keep only postal codes that have an assigned borough. The explicit .copy() makes the
# result an independent frame, so adding the lat/lng columns later does not trigger
# SettingWithCopyWarning or depend on whether pandas returned a view.
df_toronto = df_toronto[df_toronto['borough'] != 'Not assigned'].copy()

**More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.**

In [10]:
# Check whether any postal code appears on more than one row; if it did, those rows
# would have to be merged into one. Series.is_unique answers this directly without
# building an intermediate list and set.
if df_toronto['postalCode'].is_unique:
    print('No duplicates in postalCode column')
else:
    print('Possible duplicates in postalCode column')

No duplicates in postalCode column


**If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.**

In [11]:
# List rows whose neighbourhood is still 'Not assigned'. An empty result means no
# borough-for-neighbourhood substitution is needed.
df_toronto[df_toronto['neighborhood'] == 'Not assigned']

Unnamed: 0,postalCode,borough,neighborhood


**Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.**

**In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.**

In [12]:
df_toronto.shape

(103, 3)

### Section 4

In [13]:
# !pip install geocoder - uncomment to install

In [14]:
import geocoder

def get_coords_from_postalcode(row):
    """Geocode one postal code in Toronto with the Bing geocoder.

    Parameters
    ----------
    row : pandas.Series or mapping
        Record that provides the postal code under the key 'postalCode'.

    Returns
    -------
    tuple
        ``(lat, lng)`` taken from the first feature of the geocoder's GeoJSON result.
        When the geocoder returns no features, a message is printed and
        ``(nan, nan)`` is returned.

    Notes
    -----
    The result is always a 2-tuple. The caller unpacks all results with
    ``zip(*...)``; a bare string such as 'error' would be truncated by that zip into
    single characters and silently written into the lat/lng columns.
    """
    postal_code = row['postalCode']
    g = geocoder.bing('{}, Toronto, Ontario'.format(postal_code), key=config['bing']['api_key'])

    # Read the GeoJSON once instead of re-evaluating the attribute for every lookup.
    features = g.geojson['features']
    if not features:
        print('Error on {}'.format(postal_code))
        return float('nan'), float('nan')

    properties = features[0]['properties']
    return properties['lat'], properties['lng']

In [15]:
# Geocode every row, then transpose the per-row results into one sequence of
# latitudes and one of longitudes, stored as two new columns.
coord_pairs = df_toronto.apply(get_coords_from_postalcode, axis=1)
df_toronto['lat'], df_toronto['lng'] = zip(*coord_pairs)

In [16]:
df_toronto.head()

Unnamed: 0,postalCode,borough,neighborhood,lat,lng
2,M3A,North York,Parkwoods,43.756123,-79.329636
3,M4A,North York,Victoria Village,43.72678,-79.310738
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.721996,-79.445915
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66391,-79.388733


Get the coordinates of Toronto's centre to initialise the Folium map

In [17]:
# Geocode the city itself; the result supplies the centre point for the maps below.
toronto_centre = geocoder.bing(
    'Toronto, Ontario',
    key=config['bing']['api_key'],
)

In [18]:
# Keep only the latitude and longitude of the first geocoding feature.
toronto_centre_coords = {
    axis: toronto_centre.geojson['features'][0]['properties'][axis]
    for axis in ('lat', 'lng')
}

Draw base map of Toronto

In [19]:
import folium

# Base map centred on the geocoded city centre; markers are added in the next cell.
tmap = folium.Map(
    location=[toronto_centre_coords['lat'], toronto_centre_coords['lng']],
    zoom_start=11,
)

Add markers for each borough / post code

In [20]:
# One blue circle per postal-code row; the popup text is "<neighbourhood(s)>, <borough>".
marker_columns = ['lat', 'lng', 'borough', 'neighborhood']
for lat, lng, borough, neighborhood in df_toronto[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(tmap)

In [21]:
tmap

## 5. Get data about venues in Toronto

In [22]:
# Foursquare request settings. The credentials are read from the pickled config so
# they are not hard-coded in the notebook.
CLIENT_ID = config['four_square']['client_id'] # your Foursquare ID
CLIENT_SECRET = config['four_square']['client_secret'] # your Foursquare Secret
VERSION = '20201208' # Foursquare API version, sent as the `v` query parameter
LIMIT = 100 # default maximum number of venues requested per neighbourhood
RADIUS = 500 # default search radius sent with each request (presumably metres - confirm against the Foursquare docs)

In [32]:
def get_venues(neighborhoods, radius=RADIUS, limit=LIMIT):
    """Fetch nearby venues from the Foursquare ``venues/explore`` endpoint.

    Parameters
    ----------
    neighborhoods : iterable of (name, lat, lng)
        One entry per location to query.
    radius : int, optional
        Value sent as the ``radius`` query parameter (defaults to ``RADIUS``).
    limit : int, optional
        Value sent as the ``limit`` query parameter (defaults to ``LIMIT``).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``n_name, n_lat, n_lng, v_name, v_lat,
        v_lng, v_cat``. When no venue is found at all, the frame is empty but still
        has these columns.

    Notes
    -----
    The function is best-effort. A location whose request fails, or whose response
    does not have the expected shape, is reported with ``print`` and skipped, so one
    bad location does not abort the whole run.
    """
    columns = ['n_name', 'n_lat', 'n_lng', 'v_name', 'v_lat', 'v_lng', 'v_cat']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'
    rows = []

    for name, lat, lng in neighborhoods:
        # Let requests build the query string. The previous hand-formatted URL used a
        # backslash continuation inside the literal, which embedded a run of spaces
        # between the client_id and client_secret parameters.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        try:
            # A timeout keeps one stalled request from hanging the whole loop.
            response = requests.get(endpoint, params=params, timeout=30)
            results = response.json()['response']['groups'][0]['items']

            if len(results) == 0:
                print('WARNING! Neighborhood {} returned 0 results'.format(name))

            # Keep only the relevant fields of each venue. Build the rows for this
            # location completely before storing any, so that a malformed venue
            # skips the location as a whole rather than leaving partial data.
            location_rows = [(
                name,
                lat,
                lng,
                v['venue']['name'],
                v['venue']['location']['lat'],
                v['venue']['location']['lng'],
                v['venue']['categories'][0]['name']) for v in results]

        except (KeyError, IndexError, ValueError, requests.RequestException) as error:
            # KeyError/IndexError: unexpected payload (e.g. an API error body or a venue
            # without categories); ValueError: body is not JSON; RequestException:
            # network failure or timeout.
            print(name)
            print(error)
            continue

        rows.extend(location_rows)

    # Passing the column names to the constructor also works when `rows` is empty,
    # where assigning `.columns` afterwards would raise a length-mismatch error.
    return pd.DataFrame(rows, columns=columns)

In [37]:
pd.set_option('display.max_rows', 500)

In [34]:
# Turn each row into a (neighborhood, lat, lng) tuple - the shape get_venues iterates over.
data = df_toronto[['neighborhood', 'lat', 'lng']].to_dict(orient='records')
data_as_tuples = [tuple(record.values()) for record in data]

venues = get_venues(data_as_tuples)



In [35]:
# Report how many distinct venue categories were returned across all locations.
print('There are {} uniques categories.'.format(len(pd.unique(venues['v_cat']))))

There are 264 uniques categories.


In [38]:
venues

Unnamed: 0,n_name,n_lat,n_lng,v_name,v_lat,v_lng,v_cat
0,Parkwoods,43.756123,-79.329636,TTC Stop #09083,43.759655,-79.332223,Bus Stop
1,Parkwoods,43.756123,-79.329636,DVP at York Mills,43.758899,-79.334099,Intersection
2,Parkwoods,43.756123,-79.329636,Chick-N-Joy,43.759900,-79.326520,Fried Chicken Joint
3,Parkwoods,43.756123,-79.329636,TTC Stop 9083,43.759251,-79.334000,Bus Stop
4,Victoria Village,43.726780,-79.310738,Portugril,43.725819,-79.312785,Portuguese Restaurant
...,...,...,...,...,...,...,...
2562,"Mimico NW, The Queensway West, South of Bloor,...",43.629711,-79.517479,McDonald's,43.630007,-79.518041,Fast Food Restaurant
2563,"Mimico NW, The Queensway West, South of Bloor,...",43.629711,-79.517479,RONA,43.629393,-79.518320,Hardware Store
2564,"Mimico NW, The Queensway West, South of Bloor,...",43.629711,-79.517479,Value Village,43.631269,-79.518238,Thrift / Vintage Store
2565,"Mimico NW, The Queensway West, South of Bloor,...",43.629711,-79.517479,Royal Canadian Legion #210,43.628855,-79.518903,Social Club


In [40]:
pd.set_option('display.max_columns', 265)

In [41]:
# Build one indicator column per venue category (column names carry no prefix)
onehot = pd.get_dummies(venues[['v_cat']], prefix='', prefix_sep='')

# Attach the location name each venue row belongs to
onehot['neighborhood'] = venues['n_name']

# Rotate the column order so that the last column comes first
fixed_columns = onehot.columns[-1:].tolist() + onehot.columns[:-1].tolist()
onehot = onehot.loc[:, fixed_columns]

onehot.head()

Unnamed: 0,neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Automotive Shop,BBQ Joint,Baby Store,Badminton Court,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beach Bar,Beer Bar,Beer Store,Belgian Restaurant,Bike Shop,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Business Service,Butcher,Café,Cajun / Creole Restaurant,Candy Store,Cantonese Restaurant,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chiropractor,Chocolate Shop,Church,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Arts Building,College Cafeteria,College Gym,College Rec Center,College Stadium,College Theater,Colombian Restaurant,Comedy Club,Comfort Food Restaurant,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Creperie,Cuban Restaurant,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Elementary School,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Field,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food & Drink Shop,Food Court,Food Truck,Fountain,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,General Entertainment,General Travel,German Restaurant,Gift Shop,Gluten-free Restaurant,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hardware Store,Hawaiian Restaurant,Health & Beauty Service,Health Food Store,Historic Site,History Museum,Hobby Shop,Hockey Arena,Home Service,Hookah Bar,Hotel,Hotel Bar,Housing Development,IT Services,Ice Cream Shop,Indian Chinese 
Restaurant,Indian Restaurant,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Korean Restaurant,Lake,Latin American Restaurant,Laundromat,Library,Light Rail Station,Lingerie Store,Liquor Store,Locksmith,Lounge,Marijuana Dispensary,Market,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Moroccan Restaurant,Movie Theater,Museum,Music School,Music Store,Music Venue,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Park,Performing Arts Venue,Peruvian Restaurant,Pet Store,Pharmacy,Pie Shop,Pizza Place,Platform,Playground,Plaza,Poke Place,Pool,Portuguese Restaurant,Poutine Place,Pub,Ramen Restaurant,Record Shop,Rental Car Location,Rental Service,Residential Building (Apartment / Condo),Restaurant,Road,Rock Club,Sake Bar,Salad Place,Salon / Barbershop,Sandwich Place,Sculpture Garden,Seafood Restaurant,Shoe Store,Shop & Service,Shopping Mall,Shopping Plaza,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Soccer Stadium,Social Club,Soup Place,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Street Art,Strip Club,Supermarket,Supplement Shop,Sushi Restaurant,Syrian Restaurant,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train Station,Turkish Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
# Mean of each indicator column per neighbourhood, i.e. the share of that
# neighbourhood's venues falling in each category. 'neighborhood' stays a regular column.
df_grouped = onehot.groupby('neighborhood', as_index=False).mean()

## Extract Most Common Venue Types

In [47]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (in this notebook it holds the
    neighbourhood name); the remaining entries are ordered by value, largest first,
    and the index labels of the first ``num_top_venues`` of them are returned as an
    array. Fewer labels are returned when the row has fewer entries.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [48]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column labels: 'neighborhood' followed by '1st Most Common Venue', '2nd ...', and so on.
# The suffix is chosen with an explicit bounds check; the earlier bare `except:` would
# also have hidden unrelated errors (including KeyboardInterrupt).
columns = ['neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Rank the categories of every neighbourhood, then build the frame in one step
# rather than filling an empty frame row by row.
top_venues = [
    return_most_common_venues(df_grouped.iloc[pos, :], num_top_venues)
    for pos in range(df_grouped.shape[0])
]
df_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=df_grouped.index)
df_venues_sorted.insert(0, 'neighborhood', df_grouped['neighborhood'])

df_venues_sorted.head()


Unnamed: 0,neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Pool,Supermarket,Bank,Shopping Mall,Badminton Court,Noodle House,Skating Rink,Dim Sum Restaurant,Park,Chinese Restaurant
1,"Bathurst Manor, Wilson Heights, Downsview North",Middle Eastern Restaurant,Mediterranean Restaurant,Pizza Place,Park,Fast Food Restaurant,Elementary School,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market
2,Bayview Village,Tennis Court,Yoga Studio,Eastern European Restaurant,Food & Drink Shop,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Field,Fast Food Restaurant
3,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Sandwich Place,Comfort Food Restaurant,Bagel Shop,Pet Store,Pub,Restaurant,Cosmetics Shop,Locksmith
4,Berczy Park,Coffee Shop,Café,Italian Restaurant,Japanese Restaurant,Seafood Restaurant,Breakfast Spot,Cocktail Bar,Hotel,Beer Bar,Restaurant


In [57]:
# Category frequencies for Parkwoods as a single 'Freq' column: select its row,
# transpose so the categories become the index, and discard the name entry.
tmp = (
    df_grouped[df_grouped['neighborhood'] == 'Parkwoods']
    .T
    .drop('neighborhood')
)
tmp.columns = ['Freq']

In [59]:
tmp[tmp['Freq'] > 0]

Unnamed: 0,Freq
Bus Stop,0.5
Fried Chicken Joint,0.25
Intersection,0.25


In [49]:
df_venues_sorted[df_venues_sorted['neighborhood'] == 'Parkwoods']

Unnamed: 0,neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
61,Parkwoods,Bus Stop,Fried Chicken Joint,Intersection,Yoga Studio,Field,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant


## Cluster Neighborhoods

In [63]:
from sklearn.cluster import KMeans

In [64]:
# Feature matrix for clustering: the per-neighbourhood category frequencies without the
# name column. It does not depend on k, so it is built once outside the loop.
# drop(columns=...) replaces the positional axis argument, which pandas 2.0 no longer accepts.
df_clustering = df_grouped.drop(columns='neighborhood')

for k in range(3, 8):
    # set number of clusters
    kclusters = k

    # run k-means clustering; n_init is pinned because newer scikit-learn releases
    # changed its default, which would alter the cluster assignments
    kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df_clustering)

    # replace the cluster labels left over from the previous value of k
    if 'cluster' in df_venues_sorted.columns:
        df_venues_sorted.drop(columns=['cluster'], inplace=True)

    df_venues_sorted.insert(0, 'cluster', kmeans.labels_)

    # attach the cluster label and top venues to each postal-code row; rows whose
    # neighbourhood has no entry in df_venues_sorted get NaN
    df_merged = df_toronto.join(df_venues_sorted.set_index('neighborhood'), on='neighborhood')

    print('######')
    print('K = {}'.format(kclusters))
    print(df_merged['cluster'].value_counts())
    print('######')

######
K = 3
1.0    96
2.0     3
0.0     2
Name: cluster, dtype: int64
######
######
K = 4
0.0    74
1.0    22
2.0     3
3.0     2
Name: cluster, dtype: int64
######
######
K = 5
3.0    77
1.0    19
0.0     3
4.0     1
2.0     1
Name: cluster, dtype: int64
######
######
K = 6
4.0    86
1.0     9
3.0     3
0.0     1
2.0     1
5.0     1
Name: cluster, dtype: int64
######
######
K = 7
1.0    92
4.0     3
2.0     2
5.0     1
3.0     1
0.0     1
6.0     1
Name: cluster, dtype: int64
######


In [65]:
# set number of clusters
kclusters = 4

# Feature matrix: per-neighbourhood category frequencies without the name column.
# drop(columns=...) replaces the positional axis argument, which pandas 2.0 no longer accepts.
df_clustering = df_grouped.drop(columns='neighborhood')

# run k-means clustering; n_init is pinned because newer scikit-learn releases changed
# its default, which would alter the cluster assignments
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df_clustering)

# replace any cluster labels left over from an earlier run of this or a previous cell
if 'cluster' in df_venues_sorted.columns:
    df_venues_sorted.drop(columns=['cluster'], inplace=True)

df_venues_sorted.insert(0, 'cluster', kmeans.labels_)

# attach the cluster label and top venues to each postal-code row; rows whose
# neighbourhood has no entry in df_venues_sorted get NaN
df_merged = df_toronto.join(df_venues_sorted.set_index('neighborhood'), on='neighborhood')

print('######')
print('K = {}'.format(kclusters))
print(df_merged['cluster'].value_counts())
print('######')

######
K = 4
0.0    74
1.0    22
2.0     3
3.0     2
Name: cluster, dtype: int64
######


In [66]:
df_clustering.shape

(97, 264)

In [67]:
df_merged[df_merged['cluster'].isna()]

Unnamed: 0,postalCode,borough,neighborhood,lat,lng,cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
108,M1R,Scarborough,"Wexford, Maryvale",43.749367,-79.302498,,,,,,,,,,,
151,M8W,Etobicoke,"Alderwood, Long Branch",43.599304,-79.536049,,,,,,,,,,,


In [68]:
# Remove every row that contains a missing value. These are the postal codes whose
# neighbourhood found no match in the join (presumably because no venues were returned
# for them - confirm), so they have no cluster label.
df_merged.dropna(axis=0, inplace=True)
# Sanity check: this selection should now be empty.
df_merged[df_merged['cluster'].isna()]

Unnamed: 0,postalCode,borough,neighborhood,lat,lng,cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [69]:
# The join introduced NaN and thereby turned the labels into floats; with the
# incomplete rows removed they can be whole numbers again.
df_merged = df_merged.astype({'cluster': int})

In [70]:
# create map
map_clusters = folium.Map(
    location=[toronto_centre_coords['lat'], toronto_centre_coords['lng']],
    zoom_start=11,
)

# set colour scheme for clusters - NB, I have used a custom set rather than those from the Matplotlib library
# as I have deuteranomaly (colourblindness). Each line of the file becomes one entry of the list.
with open('colours.txt', 'r') as colour_file:
    colours_array = colour_file.read().splitlines()

In [71]:
# add markers to the map, one circle per postal-code row, filled with its cluster's colour
for lat, lon, poi, cluster in zip(df_merged['lat'], df_merged['lng'], df_merged['neighborhood'], df_merged['cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='#000000',
        weight=1,  # stroke width is numeric; it was previously passed as the string '1'
        fill=True,
        # Cluster labels start at 0, so index the palette with the label itself. The old
        # `cluster-1` sent cluster 0 to index -1 (the last colour) by accident. The modulo
        # keeps the lookup valid if the file lists fewer colours than there are clusters.
        fill_color=colours_array[cluster % len(colours_array)],
        fill_opacity=1).add_to(map_clusters)

In [72]:
map_clusters