## Coursera - Applied Data Science Capstone - Week 4 assignment

### First part: create a Toronto dataframe from the web page

In [1]:
#import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
#import json # library to handle JSON files
import requests # library to handle requests
#from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
#import matplotlib.cm as cm
#import matplotlib.colors as colors
#from sklearn.cluster import KMeans
from bs4 import BeautifulSoup

#### Input data source from the web page

In [2]:
#input data source
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')

In [3]:
#grab the table in the web page
table = soup.find('tbody')
#print(table.prettify())

#### Transform the data into a pandas dataframe

In [4]:
#grab the column name of the table
header = table.tr.text.split('\n')
header

['', 'Postcode', 'Borough', 'Neighbourhood', '']

In [5]:
"""
Write a function that get rid of all the empty strings in the list, 
this will be useful in the following operations
"""
def no_emtpy_str(ls):
    ret = []
    for ele in ls:
        if ele != '':
            ret.append(ele)
    return ret

In [6]:
#remove qll empty strings
header = no_emtpy_str(header)
header

['Postcode', 'Borough', 'Neighbourhood']

In [7]:
# instantiate the dataframe
toronto_df = pd.DataFrame(columns=header)
toronto_df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [8]:
#get the body of the table
body = table.find_all('tr')
#print(body)

In [9]:
#pop off the column name, which is the first row
body.pop(0)
#print(body)

<tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>

In [10]:
#Take a look at the structure of a single row
print(body[0].text)
print(body[0].text.split('\n'))


M1A
Not assigned
Not assigned

['', 'M1A', 'Not assigned', 'Not assigned', '']


In [11]:
#insert each row into the datframe
for data in body:
    row = data.text.split('\n')
    if row[2] == 'Not assigned':
        pass
    else:
        if row[3] == 'Not assigned':
            #use row[2] to replace row[3]
            toronto_df = toronto_df.append({'Postcode': row[1],
                              'Borough': row[2],
                              'Neighbourhood': row[2]}, ignore_index=True)
        else:
            toronto_df = toronto_df.append({'Postcode': row[1],
                                          'Borough': row[2],
                                          'Neighbourhood': row[3]}, ignore_index=True)
    
toronto_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [12]:
toronto_df.shape

(211, 3)

### Second Part: Add geographical coordinates into the Toronto dataframe

#### Import the geographical coordinates for Toronto

In [13]:
coordinates_df = pd.read_csv('Geospatial_Coordinates.csv')
coordinates_df.head(100)
#check if matches both dataframe total postcode number matches
print(coordinates_df.shape)
print(toronto_df['Postcode'].unique().shape)

(103, 3)
(103,)


In [14]:
#create a Latitude and Longitude column for toronto_df
latitude = pd.Series([])
longitude = pd.Series([])
for i in range(len(toronto_df)):
    latitude[i] = float(coordinates_df.loc[coordinates_df['Postal Code'] == toronto_df['Postcode'][i]].Latitude)
    longitude[i] = float(coordinates_df.loc[coordinates_df['Postal Code'] == toronto_df['Postcode'][i]].Longitude)

In [15]:
#Add a new column into the toronto dataframe
toronto_df['Latitude'] = latitude
toronto_df['Longitude'] = longitude
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


In [16]:
toronto_df.shape

(211, 5)

### Third Part: Clustering by Neighboorhoods and Visualization

#### Visualizing the distribution of the the neighborhoods

In [17]:
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

In [18]:
# create map of Toronto latitude and longitude values
map_toronto = folium.Map(location=[43.753259, -79.329656], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

#### Add more information from Foursquare to help analizing neighborhoods (Actually defined by 'Postcode')
Each Postcode has its unique cooridinates, but different neighborhoods could share the same Postcode. Since we pass cooridinates to get venues through Foursquare API to, we are actually trying to analyze and cluster Postcode instead of neighborhoods.

In [19]:
CLIENT_ID = 'S1JC3LXUQRZHM5103DDRL1BWK44PFATYYDOQOY4R5WQ2NIJL' 
CLIENT_SECRET = 'FZMEIQQKP1M2OKSXMMWTERU2CUOVIQRPAHOH2VWPN3X4KP2L' 
VERSION = '20180605'

#### Foursquare API call test (Neighborhood: Lawrence Park, Central Toronto)

In [20]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


In [21]:
#Define the url for foursquare API
neighborhood_latitude = toronto_df.loc[0, 'Latitude']#Postcode M3A (Parkwoods)
neighborhood_longitude = toronto_df.loc[0, 'Longitude']#Postcode M3A (Parkwoods)
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=S1JC3LXUQRZHM5103DDRL1BWK44PFATYYDOQOY4R5WQ2NIJL&client_secret=FZMEIQQKP1M2OKSXMMWTERU2CUOVIQRPAHOH2VWPN3X4KP2L&v=20180605&ll=43.7532586,-79.3296565&radius=500&limit=100'

In [22]:
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import numpy as np # library to handle data in a vectorized manner
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ca7dbb6dd57977cccb94e02'},
 'response': {'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA'

In [23]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [24]:
#clean the json and structure it into a pandas dataframe
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,KFC,Fast Food Restaurant,43.754387,-79.333021
2,Variety Store,Food & Drink Shop,43.751974,-79.333114
3,TTC stop - 44 Valley Woods,Bus Stop,43.755402,-79.333741


#### Explore Neighborhoods (Postcode) in Toronto

In [25]:
#create a function to repeat the same process to all the neighborhoods in Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [26]:
#run the above function on each neighborhood and create a new dataframe
toronto_venues = getNearbyVenues(names=toronto_df['Postcode'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )

M3A
M4A
M5A
M5A
M6A
M6A
M7A
M9A
M1B
M1B
M3B
M4B
M4B
M5B
M5B
M6B
M9B
M9B
M9B
M9B
M9B
M1C
M1C
M1C
M3C
M3C
M4C
M5C
M6C
M9C
M9C
M9C
M9C
M1E
M1E
M1E
M4E
M5E
M6E
M1G
M4G
M5G
M6G
M1H
M2H
M3H
M3H
M3H
M4H
M5H
M5H
M5H
M6H
M6H
M1J
M2J
M2J
M2J
M3J
M3J
M4J
M5J
M5J
M5J
M6J
M6J
M1K
M1K
M1K
M2K
M3K
M3K
M4K
M4K
M5K
M5K
M6K
M6K
M6K
M1L
M1L
M1L
M2L
M2L
M3L
M4L
M4L
M5L
M5L
M6L
M6L
M6L
M9L
M1M
M1M
M1M
M2M
M2M
M3M
M4M
M5M
M5M
M6M
M6M
M6M
M6M
M9M
M9M
M1N
M1N
M2N
M3N
M4N
M5N
M6N
M6N
M9N
M1P
M1P
M1P
M2P
M4P
M5P
M5P
M6P
M6P
M9P
M1R
M1R
M2R
M4R
M5R
M5R
M5R
M6R
M6R
M7R
M9R
M9R
M9R
M9R
M1S
M4S
M5S
M5S
M6S
M6S
M1T
M1T
M1T
M4T
M4T
M5T
M5T
M5T
M1V
M1V
M1V
M1V
M4V
M4V
M4V
M4V
M4V
M5V
M5V
M5V
M5V
M5V
M5V
M5V
M8V
M8V
M8V
M9V
M9V
M9V
M9V
M9V
M9V
M9V
M9V
M1W
M4W
M5W
M8W
M8W
M9W
M1X
M4X
M4X
M5X
M5X
M8X
M8X
M8X
M4Y
M7Y
M8Y
M8Y
M8Y
M8Y
M8Y
M8Y
M8Y
M8Y
M8Z
M8Z
M8Z
M8Z
M8Z


In [45]:
#Check the size of the resulting dataframe
print(toronto_venues.shape)
toronto_venues.head()

(4444, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,M3A,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M3A,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
4,M4A,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [46]:
#check how many venues were returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,2,2,2,2,2,2
M1C,6,6,6,6,6,6
M1E,24,24,24,24,24,24
M1G,3,3,3,3,3,3
M1H,7,7,7,7,7,7
M1J,1,1,1,1,1,1
M1K,21,21,21,21,21,21
M1L,27,27,27,27,27,27
M1M,6,6,6,6,6,6
M1N,8,8,8,8,8,8


#### Analyze each neighborhood (Postcode)

In [27]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
#examine the new dataframe size
toronto_onehot.shape

(4444, 275)

In [29]:
#group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,M1B,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
1,M1C,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
2,M1E,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
3,M1G,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
4,M1H,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
5,M1J,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
6,M1K,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
7,M1L,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
8,M1M,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
9,M1N,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


In [30]:
#confirm the new size
toronto_grouped.shape

(100, 275)

In [31]:
#put that into a pandas dataframe
#a function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [32]:
#create the new dataframe and display the top 10 venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Women's Store,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
1,M1C,Bar,History Museum,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Women's Store
2,M1E,Mexican Restaurant,Medical Center,Electronics Store,Rental Car Location,Spa,Pizza Place,Intersection,Breakfast Spot,Department Store,Dessert Shop
3,M1G,Coffee Shop,Korean Restaurant,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
4,M1H,Athletics & Sports,Caribbean Restaurant,Hakka Restaurant,Bakery,Thai Restaurant,Bank,Fried Chicken Joint,Department Store,Dessert Shop,Dim Sum Restaurant


#### Cluster the neighborhoods

In [34]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [35]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster_Labels', kmeans.labels_)

In [44]:
#create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood
toronto_merged = toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Postcode')

toronto_merged.head() 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Fast Food Restaurant,Food & Drink Shop,Bus Stop,Women's Store,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Coffee Shop,Intersection,Hockey Arena,Portuguese Restaurant,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636,0.0,Coffee Shop,Café,Bakery,Pub,Park,Breakfast Spot,Theater,Mexican Restaurant,Ice Cream Shop,Historic Site
3,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636,0.0,Coffee Shop,Café,Bakery,Pub,Park,Breakfast Spot,Theater,Mexican Restaurant,Ice Cream Shop,Historic Site
4,M6A,North York,Lawrence Heights,43.718518,-79.464763,0.0,Clothing Store,Shoe Store,Furniture / Home Store,Event Space,Gift Shop,Miscellaneous Shop,Boutique,Vietnamese Restaurant,Coffee Shop,Accessories Store
5,M6A,North York,Lawrence Manor,43.718518,-79.464763,0.0,Clothing Store,Shoe Store,Furniture / Home Store,Event Space,Gift Shop,Miscellaneous Shop,Boutique,Vietnamese Restaurant,Coffee Shop,Accessories Store
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,0.0,Coffee Shop,Diner,Gym,Burger Joint,Japanese Restaurant,Yoga Studio,Fast Food Restaurant,Creperie,Portuguese Restaurant,Café
7,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242,,,,,,,,,,,
8,M1B,Scarborough,Rouge,43.806686,-79.194353,4.0,Fast Food Restaurant,Women's Store,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
9,M1B,Scarborough,Malvern,43.806686,-79.194353,4.0,Fast Food Restaurant,Women's Store,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop


In [77]:
#Drop the rows that Class_Label = "nan", due to failing to get data from Foursquare API
toronto_merged = toronto_merged.dropna()
#Cast Class_Label into int
toronto_merged['Cluster_Labels'] = toronto_merged['Cluster_Labels'].astype(int)
toronto_merged.head() 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1,Park,Fast Food Restaurant,Food & Drink Shop,Bus Stop,Women's Store,Diner,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Coffee Shop,Intersection,Hockey Arena,Portuguese Restaurant,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0,Coffee Shop,Café,Bakery,Pub,Park,Breakfast Spot,Theater,Mexican Restaurant,Ice Cream Shop,Historic Site
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,0,Coffee Shop,Café,Bakery,Pub,Park,Breakfast Spot,Theater,Mexican Restaurant,Ice Cream Shop,Historic Site
4,M6A,North York,Lawrence Heights,43.718518,-79.464763,0,Clothing Store,Shoe Store,Furniture / Home Store,Event Space,Gift Shop,Miscellaneous Shop,Boutique,Vietnamese Restaurant,Coffee Shop,Accessories Store


In [78]:
#visualize the resulting clusters
# create map
map_clusters = folium.Map(location=[43.753259, -79.329656], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters