In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

import json # library to handle JSON files

#!conda install -c conda-forge BeautifulSoup4 --yes # uncomment this line to install Beautiful Soup package if not previously installed
from bs4 import BeautifulSoup # library to handle HTML files

#!conda install -c conda-forge geopy --yes # uncomment this line to install Geopy package if not previously installed
from geopy.geocoders import ArcGIS # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
!pip install folium

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line to install Folium package if not previously installed
import folium # map rendering library

print('Libraries imported.')

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 6.9MB/s ta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Libraries imported.


In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
website_html = requests.get(url).text
soup = BeautifulSoup(website_html,'html.parser')

In [5]:
table = soup.find('table',{'class':'wikitable sortable'})


In [6]:
postal_codes = []
boroughs = []
neighborhoods = []

# Extract rows from the table
rows = table.find_all('tr')

# Iterate over rows
for row in rows[1:]:
    # Extract all cells from the row
    cells = row.find_all('td')
    # Check if all three columns are available
    if len(cells) == 3:
        # Append 'postal_codes' with the new data
        postal_code = cells[0]
        postal_codes.append(postal_code.text.strip())
        # Append 'boroughs' with the new data
        borough = cells[1]
        boroughs.append(borough.text.strip())
        # Append 'neighborhoods' with the new data
        neighborhood = cells[2]
        neighborhoods.append(neighborhood.text.strip())

In [7]:
df = pd.DataFrame({'PostalCode':postal_codes, 'Borough':boroughs, 'Neighborhood':neighborhoods})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
df.tail()


Unnamed: 0,PostalCode,Borough,Neighborhood
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
179,M9Z,Not assigned,Not assigned


In [9]:
df.shape


(180, 3)

In [10]:
len(df['PostalCode'].unique())


180

In [11]:
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
sum(df['Neighborhood']=='Not assigned')


0

In [13]:
df.shape

(103, 3)

In [14]:

# Initialize ArcGIS instance
geolocator = ArcGIS()

# Toronto Latitude and Longitude
tor_location = geolocator.geocode('Toronto, Canada')
tor_lat = tor_location[1][0]
tor_lng = tor_location[1][1]
print('Toronto: Latitude %.4f, Longitude %.4f'%(tor_lat, tor_lng))

Toronto: Latitude 43.6487, Longitude -79.3854


In [16]:
latitudes = []
longitudes = []

for postal_code in df['PostalCode'] :
    # Query the location address for each Postal Code
    location = geolocator.geocode('{}, Toronto, Ontario'.format(postal_code))
    # Extract the Latitude and append to the list
    latitude = location[1][0]
    latitudes.append(latitude)
    # Extract the Longitude and append to the list
    longitude = location[1][1]
    longitudes.append(longitude)

In [17]:
df['Latitude'] = latitudes
df['Longitude'] = longitudes
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939


In [18]:
df.isna().values.any()


False

In [19]:
df.shape


(103, 5)

In [20]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[tor_lat, tor_lng], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [21]:

CLIENT_ID = 'ESQATSEBEN2WZEGZDHXBGATAZ22UF30AWU3EUMDS151VYDVR' 
CLIENT_SECRET = 'CUO3NSX2ZOD0HQEOLMOOVKW0W5MCN2XWE5HIO5RN1MVPB3ZC'
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ESQATSEBEN2WZEGZDHXBGATAZ22UF30AWU3EUMDS151VYDVR
CLIENT_SECRET:CUO3NSX2ZOD0HQEOLMOOVKW0W5MCN2XWE5HIO5RN1MVPB3ZC


In [22]:
def getNearbyVenues(postal_codes, boroughs, names, latitudes, longitudes, radius=500, LIMIT=50):
    
    venues_list=[]
    number_of_venues=[]
    for postal_code, borough, name, lat, lng in zip(postal_codes, boroughs, names, latitudes, longitudes):
        print('Postal Code: {}; Borough: {}; Neighbourhood: {}'.format(postal_code, borough, name))
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(postal_code,
                             borough,
                             name,
                             lat,
                             lng,
                             v['venue']['name'],
                             v['venue']['location']['lat'],
                             v['venue']['location']['lng'],
                             v['venue']['categories'][0]['name']) for v in results])
        
        # Add the number of venues returned for the location to the list
        number_of_venues.append(len(results))
        print('Number of venues returned:', len(results))

    # Transform the venues data into a pandas dataframe
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code',
                             'Borough',
                             'Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    # Return the venue data
    return(nearby_venues, number_of_venues)

In [23]:
toronto_venues, number_of_venues = getNearbyVenues(postal_codes=df['PostalCode'],
                                                   boroughs=df['Borough'],
                                                   names=df['Neighborhood'],
                                                   latitudes=df['Latitude'],
                                                   longitudes=df['Longitude'])


Postal Code: M3A; Borough: North York; Neighbourhood: Parkwoods
Number of venues returned: 3
Postal Code: M4A; Borough: North York; Neighbourhood: Victoria Village
Number of venues returned: 6
Postal Code: M5A; Borough: Downtown Toronto; Neighbourhood: Regent Park, Harbourfront
Number of venues returned: 27
Postal Code: M6A; Borough: North York; Neighbourhood: Lawrence Manor, Lawrence Heights
Number of venues returned: 50
Postal Code: M7A; Borough: Downtown Toronto; Neighbourhood: Queen's Park, Ontario Provincial Government
Number of venues returned: 37
Postal Code: M9A; Borough: Etobicoke; Neighbourhood: Islington Avenue, Humber Valley Village
Number of venues returned: 4
Postal Code: M1B; Borough: Scarborough; Neighbourhood: Malvern, Rouge
Number of venues returned: 1
Postal Code: M3B; Borough: North York; Neighbourhood: Don Mills
Number of venues returned: 4
Postal Code: M4B; Borough: East York; Neighbourhood: Parkview Hill, Woodbine Gardens
Number of venues returned: 14
Postal Code

Number of venues returned: 23
Postal Code: M6R; Borough: West Toronto; Neighbourhood: Parkdale, Roncesvalles
Number of venues returned: 50
Postal Code: M7R; Borough: Mississauga; Neighbourhood: Canada Post Gateway Processing Centre
Number of venues returned: 50
Postal Code: M9R; Borough: Etobicoke; Neighbourhood: Kingsview Village, St. Phillips, Martin Grove Gardens, Richview Gardens
Number of venues returned: 8
Postal Code: M1S; Borough: Scarborough; Neighbourhood: Agincourt
Number of venues returned: 5
Postal Code: M4S; Borough: Central Toronto; Neighbourhood: Davisville
Number of venues returned: 26
Postal Code: M5S; Borough: Downtown Toronto; Neighbourhood: University of Toronto, Harbord
Number of venues returned: 33
Postal Code: M6S; Borough: West Toronto; Neighbourhood: Runnymede, Swansea
Number of venues returned: 50
Postal Code: M1T; Borough: Scarborough; Neighbourhood: Clarks Corners, Tam O'Shanter, Sullivan
Number of venues returned: 7
Postal Code: M4T; Borough: Central Toron

In [24]:
toronto_venues.head()


Unnamed: 0,Postal Code,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,North York,Parkwoods,43.752935,-79.335641,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,North York,Parkwoods,43.752935,-79.335641,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,M3A,North York,Parkwoods,43.752935,-79.335641,649 Variety,43.754513,-79.331942,Convenience Store
3,M4A,North York,Victoria Village,43.728102,-79.31189,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,M4A,North York,Victoria Village,43.728102,-79.31189,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [25]:
toronto_venues.shape


(1759, 9)

In [26]:
df['NumberOfVenues'] = number_of_venues
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,NumberOfVenues
0,M3A,North York,Parkwoods,43.752935,-79.335641,3
1,M4A,North York,Victoria Village,43.728102,-79.31189,6
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041,27
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211,50
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939,37


In [27]:
df.shape


(103, 6)

In [28]:
df['NumberOfVenues'].max()


50

In [29]:
df['NumberOfVenues'].min()


0

In [30]:
df[df['NumberOfVenues']==df['NumberOfVenues'].min()]


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,NumberOfVenues
53,M3M,North York,Downsview,43.73322,-79.4977,0
95,M1X,Scarborough,Upper Rouge,43.834768,-79.204101,0


In [31]:
set(df['PostalCode']) - set(toronto_venues['Postal Code'])


{'M1X', 'M3M'}

In [32]:
df_mod = df[df['NumberOfVenues']!=df['NumberOfVenues'].min()]


In [33]:
df_mod.shape


(101, 6)

In [34]:
len(toronto_venues['Postal Code'].unique())


101

In [35]:
len(toronto_venues['Neighborhood'].unique())


98

In [36]:
df[df['Neighborhood'].duplicated(keep=False)]


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,NumberOfVenues
7,M3B,North York,Don Mills,43.7489,-79.35722,4
13,M3C,North York,Don Mills,43.722143,-79.352023,5
40,M3K,North York,Downsview,43.739101,-79.467631,5
46,M3L,North York,Downsview,43.729992,-79.512027,4
53,M3M,North York,Downsview,43.73322,-79.4977,0
60,M3N,North York,Downsview,43.755819,-79.519973,21


In [37]:
toronto_venues.shape


(1759, 9)

In [38]:
toronto_venues[toronto_venues['Venue Category'] == 'Neighborhood']


Unnamed: 0,Postal Code,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
320,M4E,East Toronto,The Beaches,43.678148,-79.295349,Upper Beaches,43.680563,-79.292869,Neighborhood
426,M5G,Downtown Toronto,Central Bay Street,43.656072,-79.385653,Downtown Toronto,43.653232,-79.385296,Neighborhood
533,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650542,-79.384116,Downtown Toronto,43.653232,-79.385296,Neighborhood
778,M6K,West Toronto,"Brockton, Parkdale Village, Exhibition Place",43.639922,-79.43124,Parkdale,43.640524,-79.4322,Neighborhood


In [39]:
toronto_venues = toronto_venues[toronto_venues['Venue Category'] != 'Neighborhood'].reset_index(drop=True)
toronto_venues.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,North York,Parkwoods,43.752935,-79.335641,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,North York,Parkwoods,43.752935,-79.335641,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,M3A,North York,Parkwoods,43.752935,-79.335641,649 Variety,43.754513,-79.331942,Convenience Store
3,M4A,North York,Victoria Village,43.728102,-79.31189,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,M4A,North York,Victoria Village,43.728102,-79.31189,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [40]:
toronto_venues.shape


(1755, 9)

In [41]:
len(toronto_venues['Venue Category'].unique())


254

In [42]:

# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add various columns back to dataframe
toronto_onehot.insert(loc=0, column='Postal Code', value=toronto_venues['Postal Code'])
toronto_onehot.insert(loc=1, column='Borough', value=toronto_venues['Borough'])
toronto_onehot.insert(loc=2, column='Neighborhood', value=toronto_venues['Neighborhood'])

toronto_onehot.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,ATM,Accessories Store,Airport,American Restaurant,Antique Shop,Art Gallery,Art Museum,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,North York,Victoria Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,North York,Victoria Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
toronto_onehot.shape


(1755, 257)

In [44]:
toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postal Code,ATM,Accessories Store,Airport,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
toronto_grouped.shape


(101, 255)

In [46]:
num_top_venues = 3

for postal_code in toronto_grouped['Postal Code']:
    print("----- Postal Code: "+postal_code+" -----")
    temp = toronto_grouped[toronto_grouped['Postal Code'] == postal_code].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----- Postal Code: M1B -----
                     venue  freq
0                    Trail   1.0
1                      ATM   0.0
2  New American Restaurant   0.0


----- Postal Code: M1C -----
            venue  freq
0             Bar   1.0
1             ATM   0.0
2  Mattress Store   0.0


----- Postal Code: M1E -----
                  venue  freq
0            Restaurant   0.1
1  Fast Food Restaurant   0.1
2           Coffee Shop   0.1


----- Postal Code: M1G -----
               venue  freq
0        Coffee Shop   0.4
1       Soccer Field   0.2
2  Indian Restaurant   0.2


----- Postal Code: M1H -----
                        venue  freq
0  Construction & Landscaping   0.5
1                 Gaming Cafe   0.5
2                         ATM   0.0


----- Postal Code: M1J -----
                  venue  freq
0        Sandwich Place  0.25
1  Fast Food Restaurant  0.25
2         Big Box Store  0.25


----- Postal Code: M1K -----
                venue  freq
0   Convenience Store   0.2
1      Di

            venue  freq
0    Home Service   0.5
1  Ice Cream Shop   0.5
2             ATM   0.0


----- Postal Code: M5P -----
           venue  freq
0           Park   0.5
1  Event Service   0.5
2            ATM   0.0


----- Postal Code: M5R -----
            venue  freq
0            Café  0.09
1  Sandwich Place  0.09
2     Pizza Place  0.04


----- Postal Code: M5S -----
                venue  freq
0                Café  0.15
1           Bookstore  0.06
2  Italian Restaurant  0.06


----- Postal Code: M5T -----
                           venue  freq
0                           Café  0.09
1             Mexican Restaurant  0.07
2  Vegetarian / Vegan Restaurant  0.07


----- Postal Code: M5V -----
         venue  freq
0  Coffee Shop  0.08
1         Café  0.06
2       Lounge  0.04


----- Postal Code: M5W -----
          venue  freq
0   Coffee Shop  0.10
1         Hotel  0.06
2  Concert Hall  0.04


----- Postal Code: M5X -----
         venue  freq
0         Café  0.12
1  Coffee Shop  0

In [47]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [48]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

for ind in np.arange(toronto_grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Trail,Yoga Studio,Diner,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Service,Ethiopian Restaurant,Electronics Store
1,M1C,Bar,Yoga Studio,Fish Market,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Service,Ethiopian Restaurant
2,M1E,Pizza Place,Fast Food Restaurant,Restaurant,Coffee Shop,Thrift / Vintage Store,Breakfast Spot,Sports Bar,Mexican Restaurant,Beer Store,Supermarket
3,M1G,Coffee Shop,Indian Restaurant,Soccer Field,Korean Restaurant,Yoga Studio,Doctor's Office,Dog Run,Donut Shop,Eastern European Restaurant,Ethiopian Restaurant
4,M1H,Construction & Landscaping,Gaming Cafe,Yoga Studio,Distribution Center,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Service


In [49]:
venues_sorted.shape


(101, 11)

In [50]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Postal Code', axis=1)

# run k-means clustering
kmeans = KMeans(init="k-means++", n_clusters=kclusters, n_init=15, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 4, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [51]:
# initilize the dataset
toronto_merged = df_mod.copy()

# add clustering labels
toronto_merged.insert(len(toronto_merged.columns), 'Cluster Labels', kmeans.labels_)

# merge toronto_merged with venues_sorted to add the venues data for each neighborhood
toronto_merged = toronto_merged.join(venues_sorted.set_index('Postal Code'), on='PostalCode')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,NumberOfVenues,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.752935,-79.335641,3,3,Food & Drink Shop,Convenience Store,Park,Yoga Studio,Ethiopian Restaurant,Doctor's Office,Dog Run,Donut Shop,Eastern European Restaurant,Electronics Store
1,M4A,North York,Victoria Village,43.728102,-79.31189,6,4,Pizza Place,Intersection,Park,Portuguese Restaurant,Coffee Shop,French Restaurant,Donut Shop,Discount Store,Distribution Center,Doctor's Office
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041,27,0,Pub,Café,Athletics & Sports,French Restaurant,Distribution Center,Food Truck,Chocolate Shop,Bank,Bakery,Seafood Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211,50,0,Clothing Store,Food Court,Toy / Game Store,Restaurant,Men's Store,American Restaurant,Furniture / Home Store,Bookstore,Cosmetics Shop,Movie Theater
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939,37,0,Coffee Shop,Sushi Restaurant,Café,Yoga Studio,Pharmacy,Bookstore,Smoothie Shop,Burrito Place,Sandwich Place,Restaurant


In [52]:
# create map
map_clusters = folium.Map(location=[tor_lat, tor_lng], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, pos, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(pos) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [53]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[0] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,NumberOfVenues,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,27,0,Pub,Café,Athletics & Sports,French Restaurant,Distribution Center,Food Truck,Chocolate Shop,Bank,Bakery,Seafood Restaurant
3,M6A,50,0,Clothing Store,Food Court,Toy / Game Store,Restaurant,Men's Store,American Restaurant,Furniture / Home Store,Bookstore,Cosmetics Shop,Movie Theater
4,M7A,37,0,Coffee Shop,Sushi Restaurant,Café,Yoga Studio,Pharmacy,Bookstore,Smoothie Shop,Burrito Place,Sandwich Place,Restaurant
5,M9A,4,0,Park,Skating Rink,Baseball Field,Ethiopian Restaurant,Doctor's Office,Dog Run,Donut Shop,Eastern European Restaurant,Electronics Store,Yoga Studio
6,M1B,1,0,Trail,Yoga Studio,Diner,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Service,Ethiopian Restaurant,Electronics Store
7,M3B,4,0,Restaurant,Bank,Burger Joint,Athletics & Sports,Electronics Store,Doctor's Office,Dog Run,Donut Shop,Eastern European Restaurant,Ethiopian Restaurant
8,M4B,14,0,Pizza Place,Fast Food Restaurant,Breakfast Spot,Athletics & Sports,Rock Climbing Spot,Café,Pet Store,Bank,Gastropub,Intersection
9,M5B,50,0,Clothing Store,Café,Coffee Shop,Japanese Restaurant,Theater,Cosmetics Shop,Middle Eastern Restaurant,Tea Room,Ramen Restaurant,Sandwich Place
10,M6B,10,0,Pizza Place,Asian Restaurant,Mediterranean Restaurant,Sushi Restaurant,Gas Station,Pub,Japanese Restaurant,Fast Food Restaurant,Grocery Store,Discount Store
11,M9B,5,0,Pizza Place,Tea Room,Chinese Restaurant,Sandwich Place,Dog Run,Discount Store,Distribution Center,Doctor's Office,Donut Shop,Dessert Shop


In [54]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[0] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,NumberOfVenues,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,M6C,6,1,Field,Hockey Arena,Business Service,Grocery Store,Park,Trail,Yoga Studio,Eastern European Restaurant,Doctor's Office,Dog Run
22,M1G,5,1,Coffee Shop,Indian Restaurant,Soccer Field,Korean Restaurant,Yoga Studio,Doctor's Office,Dog Run,Donut Shop,Eastern European Restaurant,Ethiopian Restaurant
24,M5G,50,1,Coffee Shop,Middle Eastern Restaurant,Japanese Restaurant,Clothing Store,Plaza,Sandwich Place,Bubble Tea Shop,Breakfast Spot,Restaurant,Spa
26,M1H,2,1,Construction & Landscaping,Gaming Cafe,Yoga Studio,Distribution Center,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Service
39,M2K,2,1,Construction & Landscaping,Trail,Yoga Studio,Electronics Store,Distribution Center,Doctor's Office,Dog Run,Donut Shop,Eastern European Restaurant,Ethiopian Restaurant
43,M6K,43,1,Café,Coffee Shop,Pizza Place,Gift Shop,Thrift / Vintage Store,Brewery,Cocktail Bar,Mexican Restaurant,Indian Restaurant,Supermarket
44,M1L,9,1,Bus Line,Intersection,Soccer Field,Metro Station,Coffee Shop,Bakery,Bus Station,Electronics Store,Dog Run,Donut Shop
48,M5L,50,1,Café,Coffee Shop,Gym,Hotel,Restaurant,American Restaurant,Gastropub,Japanese Restaurant,Deli / Bodega,Steakhouse
58,M1N,5,1,College Stadium,Construction & Landscaping,Skating Rink,General Entertainment,Pizza Place,Distribution Center,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant
63,M6N,7,1,Brewery,Gas Station,Storage Facility,Park,Coffee Shop,Electronics Store,Doctor's Office,Dog Run,Donut Shop,Eastern European Restaurant


In [55]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[0] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,NumberOfVenues,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
38,M1K,5,2,Discount Store,Department Store,Convenience Store,Coffee Shop,Chinese Restaurant,Ethiopian Restaurant,Dog Run,Donut Shop,Eastern European Restaurant,Electronics Store
89,M9V,12,2,Pizza Place,Beer Store,Grocery Store,Japanese Restaurant,Fried Chicken Joint,Liquor Store,Fast Food Restaurant,Park,Pharmacy,Caribbean Restaurant


In [56]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[0] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,NumberOfVenues,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,3,3,Food & Drink Shop,Convenience Store,Park,Yoga Studio,Ethiopian Restaurant,Doctor's Office,Dog Run,Donut Shop,Eastern European Restaurant,Electronics Store
18,M1E,20,3,Pizza Place,Fast Food Restaurant,Restaurant,Coffee Shop,Thrift / Vintage Store,Breakfast Spot,Sports Bar,Mexican Restaurant,Beer Store,Supermarket


In [57]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[0] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,NumberOfVenues,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4A,6,4,Pizza Place,Intersection,Park,Portuguese Restaurant,Coffee Shop,French Restaurant,Donut Shop,Discount Store,Distribution Center,Doctor's Office
