# PART 1 - DATA SCRAPING AND CLEANING

### Import the necessary libraries

In [60]:
import json
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests
import folium
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

### Acquire the data from the wikipedia page using pandas

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

urllib.request.urlretrieve(url, filename='toronto')

toronto_html = pd.read_html('toronto')
toronto_html

[    Postal code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 5           M6A        North York   
 6           M7A  Downtown Toronto   
 7           M8A      Not assigned   
 8           M9A         Etobicoke   
 9           M1B       Scarborough   
 10          M2B      Not assigned   
 11          M3B        North York   
 12          M4B         East York   
 13          M5B  Downtown Toronto   
 14          M6B        North York   
 15          M7B      Not assigned   
 16          M8B      Not assigned   
 17          M9B         Etobicoke   
 18          M1C       Scarborough   
 19          M2C      Not assigned   
 20          M3C        North York   
 21          M4C         East York   
 22          M5C  Downtown Toronto   
 23          M6C              York   
 24          M7C      Not assigned   
 25         

### Get the required data from the list

In [3]:
len(toronto_html)

3

In [4]:
toronto_html[0].head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [5]:
toronto_html[0].shape

(180, 3)

In [6]:
toronto_df = toronto_html[0]

### Clean the data by removing the Boroughs with a 'Not assigned' value

In [11]:
toronto_df = toronto_df[toronto_df['Borough']!='Not assigned']
toronto_df.reset_index(drop=True, inplace=True)
toronto_df.columns = ['Postal Code', 'Borough', 'Neighborhood']
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### Check for blank/NaN cells in the 'Neighborhood' column

In [13]:
nan_nei = toronto_df[toronto_df['Neighborhood'].isnull()]
nan_nei.shape #no blank or Nan cells found

(0, 3)

### Check for any duplicates in postal code

In [14]:
duplicate_pc=toronto_df[toronto_df['Postal Code'].duplicated()==True]
duplicate_pc.shape #no duplicates found

(0, 3)

### Replace the '/' with a ','

In [18]:
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].str.replace(r'(/)',',')
toronto_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


### Print the total number of rows for the toronto_df dataframe

In [19]:
print('The dataframe has {} rows.'.format(toronto_df.shape[0]))

The dataframe has 103 rows.


# PART 2 - DATA ANALYSIS

### Install and import geocoder

In [20]:
!conda install -c conda-forge geocoder --yes
import geocoder # import geocoder

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



### Create a function for getting the latitude and longitude for each postal code

In [21]:
def getCoordinates(postalcodes, boroughs, neighborhoods):
    
    places=[]
    for postalcode, borough, neighborhood in zip(postalcodes, boroughs, neighborhoods):
        
        g = geocoder.arcgis('{}, Toronto, Ontario, Canada'.format(postalcode)) #used arcgis instead since google was not working
        lat_lng_coords = g.latlng

        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]

        places.append([(
            postalcode, 
            borough, 
            neighborhood, 
            latitude,
            longitude)]
            )

    newdf = pd.DataFrame([item for places in places for item in places])
    newdf.columns = ['Postal Code', 
                  'Borough', 
                  'Neighborhood', 
                  'Latitude', 
                  'Longitude']

    return(newdf)

In [22]:
toronto_newdf = getCoordinates(postalcodes=toronto_df['Postal Code'],
                       boroughs=toronto_df['Borough'],
                       neighborhoods=toronto_df['Neighborhood'])

In [23]:
toronto_newdf.head() #used the geocoder to acquire the latitude and longitude

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.66179,-79.38939


### Used provided coordinates in the course instead since values were different with what was collected using arcgis

In [24]:
toronto_csv = pd.read_csv('Geospatial_Coordinates.csv')
toronto_csv.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the two dataframes by Postal Code

In [25]:
merged_df = pd.merge(toronto_df, toronto_csv, how='left')
merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


In [26]:
merged_df.shape

(103, 5)

In [27]:
toronto_wcoor_df = merged_df.copy()

# PART 3 - DATA CLUSTERING

### Create a function for making a map of Toronto, Ontario

In [28]:
def torontoMap(postalcodes=None, boroughs=None, neighborhoods=None, latitudes=None, longitudes=None):
    address = 'Toronto, Ontario, Canada'

    geolocator = Nominatim(user_agent="tor_explorer")
    location = geolocator.geocode(address)
    tor_latitude = location.latitude
    tor_longitude = location.longitude
    
    map_toronto = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=10)

    if postalcodes.empty:
        pass
    else:
        for latitude, longitude, postalcode, borough, neighborhood in zip(latitudes, longitudes, postalcodes, boroughs, neighborhoods):
            label = '{}, {}, {}'.format(neighborhood, borough, postalcode)
            label = folium.Popup(label, parse_html=True)
            folium.CircleMarker(
            [latitude, longitude],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(map_toronto)

    
    return(map_toronto)

### Create a map using the coordinates of all the boroughs

In [29]:
torontoMap(postalcodes=toronto_wcoor_df['Postal Code'],
           boroughs=toronto_wcoor_df['Borough'], 
           neighborhoods=toronto_wcoor_df['Neighborhood'], 
           latitudes=toronto_wcoor_df['Latitude'],
           longitudes=toronto_wcoor_df['Longitude'])

### Limit the data set to boroughs with Toronto in its name

In [30]:
toronto_word_df = toronto_wcoor_df.copy()
toronto_word_df['Borough'] = np.where(toronto_word_df['Borough'].str.contains('Toronto'), toronto_word_df['Borough'], np.NaN)
toronto_word_df.dropna(inplace=True)
toronto_word_df.reset_index(drop=True, inplace=True)
toronto_word_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [32]:
toronto_word_df.shape #size of the new dataframe

(39, 5)

### Create the map of the new data set with only the boroughs with the word Toronto are included

In [83]:
torontoMap(postalcodes=toronto_word_df['Postal Code'],
           boroughs=toronto_word_df['Borough'], 
           neighborhoods=toronto_word_df['Neighborhood'], 
           latitudes=toronto_word_df['Latitude'],
           longitudes=toronto_word_df['Longitude'])

In [87]:
CLIENT_ID = 'PNY5K0LYMDJKJCCUXV1GW0KO2H42PGGG3H1MJJ34NKYHKD0Z'
CLIENT_SECRET = 'WY3IAP5MTHYN2PLAOZ22DVNR4RJFCD2SHWQIAHIMXAU225XP'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PNY5K0LYMDJKJCCUXV1GW0KO2H42PGGG3H1MJJ34NKYHKD0Z
CLIENT_SECRET:WY3IAP5MTHYN2PLAOZ22DVNR4RJFCD2SHWQIAHIMXAU225XP


### Create a function to get the nearby venues for each coordinates

In [112]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
  
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [113]:
toronto_venues_df = getNearbyVenues(names=toronto_word_df['Neighborhood'],
                                    latitudes=toronto_word_df['Latitude'],
                                    longitudes=toronto_word_df['Longitude'])

In [103]:
toronto_venues_df.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park , Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park , Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park , Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park , Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park , Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


In [104]:
toronto_venues_df.shape

(1635, 7)

### Group the dataframe by Neighborhood and use count() to see the number of venues in each neaighborhood 

In [114]:
toronto_venues_df.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,55,55,55,55,55,55
"Brockton , Parkdale Village , Exhibition Place",24,24,24,24,24,24
Business reply mail Processing CentrE,19,19,19,19,19,19
"CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport",17,17,17,17,17,17
Central Bay Street,65,65,65,65,65,65
Christie,17,17,17,17,17,17
Church and Wellesley,74,74,74,74,74,74
"Commerce Court , Victoria Hotel",100,100,100,100,100,100
Davisville,34,34,34,34,34,34
Davisville North,11,11,11,11,11,11


### Get the total number of unique categories or type of venue

In [106]:
print('There are {} uniques categories.'.format(len(toronto_venues_df['Venue Category'].unique())))

There are 226 uniques categories.


### Perform one hot encoding to create columns of each unique categories

In [118]:
# one hot encoding
toronto_onehot_df = pd.get_dummies(toronto_venues_df[['Venue Category']], prefix="", prefix_sep=" ")

# add neighborhood column back to dataframe
toronto_onehot_df['Neighborhood'] = toronto_venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot_df.columns[-1]] + list(toronto_onehot_df.columns[:-1])
toronto_onehot_df = toronto_onehot_df[fixed_columns]

toronto_onehot_df.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park , Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
toronto_onehot_df.shape

(1635, 227)

### Again, group the new data frame by its Neighborhood but this time use mean() to show the frequency of each category for each neighborhood

In [140]:
toronto_grouped_df = toronto_onehot_df.groupby('Neighborhood').mean().reset_index()
toronto_grouped_df

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0
1,"Brockton , Parkdale Village , Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
3,"CN Tower , King and Spadina , Railway Lands , ...",0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.015385,0.0,0.015385
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.013514,0.0,0.0,...,0.013514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027
7,"Commerce Court , Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
toronto_grouped_df.shape

(39, 227)

### Get the top 5 venues for each neighborhood

In [178]:
num_top_venues = 5

for hood in toronto_grouped_df['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped_df[toronto_grouped_df['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
             venue  freq
0      Coffee Shop  0.05
1       Restaurant  0.04
2   Farmers Market  0.04
3             Café  0.04
4      Cheese Shop  0.04


----Brockton , Parkdale Village , Exhibition Place----
             venue  freq
0             Café  0.12
1   Breakfast Spot  0.08
2      Coffee Shop  0.08
3        Nightclub  0.08
4              Gym  0.04


----Business reply mail Processing CentrE----
                 venue  freq
0   Light Rail Station  0.11
1          Yoga Studio  0.05
2        Garden Center  0.05
3          Pizza Place  0.05
4           Comic Shop  0.05


----CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst  Quay , South Niagara , Island airport----
                  venue  freq
0       Airport Service  0.18
1        Airport Lounge  0.12
2      Airport Terminal  0.12
3               Airport  0.06
4   Rental Car Location  0.06


----Central Bay Street----
                 venue  freq
0          Coffee Shop  0.18
1   Italia

### Put the information in a pandas dataframe

In [214]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [215]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped_df['Neighborhood']

for ind in np.arange(toronto_grouped_df.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped_df.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Italian Restaurant,Restaurant,Beer Bar,Café,Cheese Shop,Cocktail Bar,Bakery,Seafood Restaurant,Farmers Market
1,"Brockton , Parkdale Village , Exhibition Place",Café,Nightclub,Breakfast Spot,Coffee Shop,Pet Store,Burrito Place,Restaurant,Climbing Gym,Yoga Studio,Bar
2,Business reply mail Processing CentrE,Light Rail Station,Yoga Studio,Auto Workshop,Skate Park,Smoke Shop,Spa,Burrito Place,Farmers Market,Fast Food Restaurant,Restaurant
3,"CN Tower , King and Spadina , Railway Lands , ...",Airport Service,Airport Lounge,Airport Terminal,Airport,Harbor / Marina,Coffee Shop,Rental Car Location,Sculpture Garden,Boat or Ferry,Bar
4,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Spa,Sushi Restaurant,Burger Joint,Middle Eastern Restaurant,Bubble Tea Shop,Salad Place


### Cluster the Neighborhoods: fit the data set to the KMeans function

In [216]:
kclusters = 3 #from trial and error, 3 clusters seems to be the logical value

toronto_grouped_clustering = toronto_grouped_df.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

labels = kmeans.labels_
labels[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Add the cluster label to the dataframe

In [217]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged_df = toronto_word_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged_df = toronto_merged_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged_df.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,0,Coffee Shop,Park,Pub,Bakery,Mexican Restaurant,Breakfast Spot,Restaurant,Café,Theater,Shoe Store
1,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Diner,Hobby Shop,Burrito Place,Sandwich Place,Juice Bar,Italian Restaurant,Café,Beer Bar,Mexican Restaurant
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Japanese Restaurant,Bubble Tea Shop,Italian Restaurant,Bookstore,Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Cocktail Bar,Hotel,Beer Bar,Restaurant,American Restaurant,Breakfast Spot,Art Gallery,Farmers Market
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Asian Restaurant,Neighborhood,Trail,Pub,Dog Run,Diner,Discount Store,Distribution Center,Yoga Studio


### Group the dataset by Cluster labels to show how the division of each clusters

In [218]:
toronto_merged_df.groupby(['Cluster Labels']).count()

Unnamed: 0_level_0,Postal Code,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,34,34,34,34,34,34,34,34,34,34,34,34,34,34,34
1,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


### Get the latitude and longitude values for 'Toronto, Ontario'

In [219]:
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
tor_latitude = location.latitude
tor_longitude = location.longitude

### Create the map of Toronto with the cluster labels

In [225]:
map_clusters = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged_df['Latitude'], toronto_merged_df['Longitude'], toronto_merged_df['Neighborhood'], toronto_merged_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Show the list of Boroughs per cluster

In [221]:
toronto_merged_df.loc[toronto_merged_df['Cluster Labels'] == 0, toronto_merged_df.columns[[1] + list(range(5, toronto_merged_df.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Park,Pub,Bakery,Mexican Restaurant,Breakfast Spot,Restaurant,Café,Theater,Shoe Store
1,Downtown Toronto,0,Coffee Shop,Diner,Hobby Shop,Burrito Place,Sandwich Place,Juice Bar,Italian Restaurant,Café,Beer Bar,Mexican Restaurant
2,Downtown Toronto,0,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Japanese Restaurant,Bubble Tea Shop,Italian Restaurant,Bookstore,Restaurant
3,Downtown Toronto,0,Café,Coffee Shop,Cocktail Bar,Hotel,Beer Bar,Restaurant,American Restaurant,Breakfast Spot,Art Gallery,Farmers Market
4,East Toronto,0,Health Food Store,Asian Restaurant,Neighborhood,Trail,Pub,Dog Run,Diner,Discount Store,Distribution Center,Yoga Studio
5,Downtown Toronto,0,Coffee Shop,Italian Restaurant,Restaurant,Beer Bar,Café,Cheese Shop,Cocktail Bar,Bakery,Seafood Restaurant,Farmers Market
6,Downtown Toronto,0,Coffee Shop,Italian Restaurant,Sandwich Place,Café,Spa,Sushi Restaurant,Burger Joint,Middle Eastern Restaurant,Bubble Tea Shop,Salad Place
7,Downtown Toronto,0,Grocery Store,Café,Park,Coffee Shop,Baby Store,Diner,Italian Restaurant,Candy Store,Nightclub,Gas Station
8,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Gym,Deli / Bodega,Hotel,Thai Restaurant,Salad Place,Sushi Restaurant,Cosmetics Shop
9,West Toronto,0,Bakery,Pharmacy,Supermarket,Bar,Bank,Brewery,Café,Pizza Place,Music Venue,Gym / Fitness Center


In [222]:
toronto_merged_df.loc[toronto_merged_df['Cluster Labels'] == 1, toronto_merged_df.columns[[1] + list(range(5, toronto_merged_df.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto,1,Park,Bus Line,Swim School,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
21,Central Toronto,1,Park,Jewelry Store,Trail,Sushi Restaurant,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
29,Central Toronto,1,Park,Playground,Summer Camp,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
33,Downtown Toronto,1,Park,Playground,Trail,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run


In [223]:
toronto_merged_df.loc[toronto_merged_df['Cluster Labels'] == 2, toronto_merged_df.columns[[1] + list(range(5, toronto_merged_df.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Central Toronto,2,Garden,Music Venue,Yoga Studio,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


### I tried 2 clusters but the index 19 (Central Toronto) was being clustered with Cluster 0 (with the cafes) instead of Cluster 1 (with the parks).