Importing Pandas, BeautifulSoup and requests

In [9]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

Setting the variable for the Wiki webpage and parsing it with LXML

In [10]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')

Data Wrangling

In [11]:
toronto_ps = []

for td in soup.find_all('td'):
    postalCode = td.b.text
    bor_nei = td.span.text
    nhoodvalue = 0
    if bor_nei != "Not assigned":
        bor_nei2 = bor_nei.split('(')
        borough = bor_nei2[0]
        # Removing special conditions of addresses in Boroughs
        borough = borough.replace("Business reply mail Processing Centre969 Eastern","")
        borough = borough.replace("Stn A PO Boxes25 The Esplanade","")
        borough = borough.replace("Canada Post Gateway Processing Centre","")
        borough = borough.replace("East YorkEast Toronto","East York")
        
        try:
            nhood = bor_nei2[1].replace(' / ',', ')
            nhood = nhood.split(')')[0]
            nhoodvalue = 1
        except Exception as e:
            nhood = borough
    else:
        borough = "Not assigned"

    if nhoodvalue == 1:
        if borough != "Not assigned":
            toronto_ps.append([postalCode,borough,nhood])
    else:
        if borough != "Not assigned":
            toronto_ps.append([postalCode,borough,borough])
        
    # breaks after it gets loops to the last postal code
    if postalCode == "M9Z":
        break

Create Pandas Dataframe from list

In [12]:
df = pd.DataFrame(toronto_ps,columns=['PostalCode','Borough','Neighborhood'])

df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park / Ontario Provincial Government,Queen's Park / Ontario Provincial Government


Get the shape of the dataframe

In [13]:
df.shape

(103, 3)

Reading from Geospatial_Coodinates.csv file

In [14]:
df_geospatial = pd.read_csv("http://cocl.us/Geospatial_data")
df_geospatial.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df_geospatial

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Merge geospatial dataframe with scraped dataframe

In [15]:
df_all = pd.merge(df, df_geospatial[['PostalCode','Latitude','Longitude']], on = 'PostalCode')
df_all.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park / Ontario Provincial Government,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


Get latitude and longitude of Toronto

In [17]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.653963, -79.387207.


Folium map of Toronto

In [18]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_all['Latitude'], df_all['Longitude'], df_all['Borough'], df_all['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Define FourSquare credential and version

In [19]:
CLIENT_ID = '54K3S1GZMKIQNXJMHKLPE4X3RGW15YISTFJ102HMXOC2X3GP' # your Foursquare ID
CLIENT_SECRET = 'RQKZENVAAOZXU2HHP2SKZKUUD2TOL4RPQ3A3QAJCKZKALBME' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 54K3S1GZMKIQNXJMHKLPE4X3RGW15YISTFJ102HMXOC2X3GP
CLIENT_SECRET:RQKZENVAAOZXU2HHP2SKZKUUD2TOL4RPQ3A3QAJCKZKALBME


Exploring first borough in dataframe

In [20]:
df_all.loc[0, 'Borough']

'North York'

Get the borough's latitude and longitude values.

In [21]:
borough_latitude = df_all.loc[0, 'Latitude'] # borough latitude value
borough_longitude = df_all.loc[0, 'Longitude'] # boroughlongitude value

borough_name = df_all.loc[0, 'Borough'] # boroughname

print('Latitude and longitude values of {} are {}, {}.'.format(borough_name, 
                                                               borough_latitude, 
                                                               borough_longitude))

Latitude and longitude values of North York are 43.7532586, -79.3296565.


Now, let's get the top 100 venues that are in North York within a radius of 500 meters.

In [24]:
LIMIT = 100

radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    borough_latitude, 
    borough_longitude, 
    radius, 
    LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=54K3S1GZMKIQNXJMHKLPE4X3RGW15YISTFJ102HMXOC2X3GP&client_secret=RQKZENVAAOZXU2HHP2SKZKUUD2TOL4RPQ3A3QAJCKZKALBME&v=20180605&ll=43.7532586,-79.3296565&radius=500&limit=100'

Send GET request

In [25]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e7a9255b9a389001b3a0511'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c

Get category type

In [26]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

JSON to dataframe

In [31]:
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114


Venues returned by Foursquare

In [32]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


Repeat function for all boroughs

In [33]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Borough Latitude', 
                  'Borough Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Calling function to get venues for all boroughs

In [35]:
toronto_venues = getNearbyVenues(names=df_all['Borough'],
                                   latitudes=df_all['Latitude'],
                                   longitudes=df_all['Longitude']
                                  )

North York
North York
Downtown Toronto
North York
Queen's Park / Ontario Provincial Government
Etobicoke
Scarborough
North York
East York
Downtown Toronto
North York
Etobicoke
Scarborough
North York
East York
Downtown Toronto
York
Etobicoke
Scarborough
East Toronto
Downtown Toronto
York
Scarborough
East York
Downtown Toronto
Downtown Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
North York
North York
Scarborough
North York
North York
East Toronto
North York
York
North York
Scarborough
North York
North York
Central Toronto
Central Toronto
York
York
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Etobicoke
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Mississauga
Etobicoke
Scarborough
Central Toronto

Checking on size of dataframe

In [36]:
print(toronto_venues.shape)
toronto_venues.head()

(2233, 7)


Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,North York,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,North York,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,North York,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,North York,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,North York,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


Let's check how many venues were returned for each borough

In [37]:
toronto_venues.groupby('Borough').count()

Unnamed: 0_level_0,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,110,110,110,110,110,110
Downtown Toronto,1274,1274,1274,1274,1274,1274
East Toronto,119,119,119,119,119,119
East York,76,76,76,76,76,76
Etobicoke,71,71,71,71,71,71
EtobicokeNorthwest,3,3,3,3,3,3
Mississauga,13,13,13,13,13,13
North York,246,246,246,246,246,246
Queen's Park / Ontario Provincial Government,42,42,42,42,42,42
Scarborough,92,92,92,92,92,92


Let's find out how many unique categories can be curated from all the returned venues

In [39]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 271 uniques categories.


Analyze Each Neighborhood

In [40]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Borough'] = toronto_venues['Borough'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Borough,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Size of dataframe

In [41]:
toronto_onehot.shape

(2233, 272)

Grouping rows by boroughs

In [42]:
toronto_grouped = toronto_onehot.groupby('Borough').mean().reset_index()
toronto_grouped

Unnamed: 0,Borough,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,...,0.009091,0.0,0.0,0.009091,0.0,0.0,0.0,0.0,0.0,0.009091
1,Downtown Toronto,0.0,0.000785,0.000785,0.000785,0.000785,0.00157,0.002355,0.00157,0.014129,...,0.011774,0.00157,0.0,0.005495,0.0,0.007064,0.000785,0.000785,0.000785,0.003925
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02521,...,0.0,0.0,0.0,0.0,0.0,0.008403,0.0,0.0,0.0,0.02521
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.013158,0.0,0.013158,0.0,0.0,0.0,0.0,0.013158
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014085,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014085,0.0,0.0
5,EtobicokeNorthwest,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Mississauga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,North York,0.004065,0.0,0.004065,0.0,0.0,0.0,0.0,0.0,0.00813,...,0.0,0.004065,0.0,0.00813,0.0,0.0,0.0,0.0,0.01626,0.0
8,Queen's Park / Ontario Provincial Government,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,0.02381
9,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01087,...,0.0,0.0,0.0,0.01087,0.0,0.0,0.0,0.0,0.0,0.0


Confirming size

In [43]:
toronto_grouped.shape

(12, 272)

Let's print each neighborhood along with the top 5 most common venues

In [44]:
num_top_venues = 5

for hood in toronto_grouped['Borough']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Borough'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Toronto----
            venue  freq
0     Coffee Shop  0.07
1  Sandwich Place  0.06
2            Park  0.06
3            Café  0.05
4     Pizza Place  0.05


----Downtown Toronto----
                 venue  freq
0          Coffee Shop  0.10
1                 Café  0.05
2           Restaurant  0.04
3                Hotel  0.03
4  Japanese Restaurant  0.03


----East Toronto----
                venue  freq
0    Greek Restaurant  0.07
1         Coffee Shop  0.06
2  Italian Restaurant  0.04
3                Café  0.04
4             Brewery  0.04


----East York----
          venue  freq
0          Bank  0.05
1   Coffee Shop  0.05
2      Pharmacy  0.04
3  Burger Joint  0.04
4          Park  0.04


----Etobicoke----
                  venue  freq
0           Pizza Place  0.13
1        Sandwich Place  0.07
2              Pharmacy  0.04
3  Fast Food Restaurant  0.04
4         Grocery Store  0.04


----EtobicokeNorthwest----
                 venue  freq
0                  Bar  0.33
1

First, let's write a function to sort the venues in descending order.

In [45]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each borough.

In [55]:
import numpy as np

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
boroughs_venues_sorted = pd.DataFrame(columns=columns)
boroughs_venues_sorted['Borough'] = toronto_grouped['Borough']

for ind in np.arange(toronto_grouped.shape[0]):
    boroughs_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

boroughs_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Coffee Shop,Park,Sandwich Place,Café,Pizza Place,Sushi Restaurant,Dessert Shop,Restaurant,Pub,Gym
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Bakery,Seafood Restaurant,Bar,Park
2,East Toronto,Greek Restaurant,Coffee Shop,Café,Brewery,Italian Restaurant,Ice Cream Shop,Park,Yoga Studio,Bookstore,Pub
3,East York,Coffee Shop,Bank,Park,Sporting Goods Shop,Pizza Place,Burger Joint,Pharmacy,Beer Store,Supermarket,Restaurant
4,Etobicoke,Pizza Place,Sandwich Place,Fast Food Restaurant,Pharmacy,Coffee Shop,Gym,Grocery Store,Liquor Store,Beer Store,Pet Store


## Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [56]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 1, 1, 0, 3, 2, 2, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each borough.

In [57]:
# add clustering labels
boroughs_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_all

toronto_merged = toronto_merged.join(boroughs_venues_sorted.set_index('Borough'), on='Borough')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,Coffee Shop,Clothing Store,Bank,Restaurant,Japanese Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Park,Grocery Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,2,Coffee Shop,Clothing Store,Bank,Restaurant,Japanese Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Park,Grocery Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Bakery,Seafood Restaurant,Bar,Park
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,2,Coffee Shop,Clothing Store,Bank,Restaurant,Japanese Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Park,Grocery Store
4,M7A,Queen's Park / Ontario Provincial Government,Queen's Park / Ontario Provincial Government,43.662301,-79.389494,2,Coffee Shop,Park,Yoga Studio,Nightclub,Beer Bar,Boutique,Seafood Restaurant,Sandwich Place,Burger Joint,Burrito Place


Finally, let's visualize the resulting clusters

In [60]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

### Cluster 1


In [61]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
94,EtobicokeNorthwest,0,Rental Car Location,Bar,Drugstore,Yoga Studio,Doner Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop


Cluster 2

In [62]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Etobicoke,1,Pizza Place,Sandwich Place,Fast Food Restaurant,Pharmacy,Coffee Shop,Gym,Grocery Store,Liquor Store,Beer Store,Pet Store
6,Scarborough,1,Coffee Shop,Fast Food Restaurant,Bank,Bakery,Chinese Restaurant,Breakfast Spot,Pizza Place,Fried Chicken Joint,Park,Gas Station
8,East York,1,Coffee Shop,Bank,Park,Sporting Goods Shop,Pizza Place,Burger Joint,Pharmacy,Beer Store,Supermarket,Restaurant
11,Etobicoke,1,Pizza Place,Sandwich Place,Fast Food Restaurant,Pharmacy,Coffee Shop,Gym,Grocery Store,Liquor Store,Beer Store,Pet Store
12,Scarborough,1,Coffee Shop,Fast Food Restaurant,Bank,Bakery,Chinese Restaurant,Breakfast Spot,Pizza Place,Fried Chicken Joint,Park,Gas Station
14,East York,1,Coffee Shop,Bank,Park,Sporting Goods Shop,Pizza Place,Burger Joint,Pharmacy,Beer Store,Supermarket,Restaurant
17,Etobicoke,1,Pizza Place,Sandwich Place,Fast Food Restaurant,Pharmacy,Coffee Shop,Gym,Grocery Store,Liquor Store,Beer Store,Pet Store
18,Scarborough,1,Coffee Shop,Fast Food Restaurant,Bank,Bakery,Chinese Restaurant,Breakfast Spot,Pizza Place,Fried Chicken Joint,Park,Gas Station
22,Scarborough,1,Coffee Shop,Fast Food Restaurant,Bank,Bakery,Chinese Restaurant,Breakfast Spot,Pizza Place,Fried Chicken Joint,Park,Gas Station
23,East York,1,Coffee Shop,Bank,Park,Sporting Goods Shop,Pizza Place,Burger Joint,Pharmacy,Beer Store,Supermarket,Restaurant


Cluster 3

In [63]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,2,Coffee Shop,Clothing Store,Bank,Restaurant,Japanese Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Park,Grocery Store
1,North York,2,Coffee Shop,Clothing Store,Bank,Restaurant,Japanese Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Park,Grocery Store
2,Downtown Toronto,2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Bakery,Seafood Restaurant,Bar,Park
3,North York,2,Coffee Shop,Clothing Store,Bank,Restaurant,Japanese Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Park,Grocery Store
4,Queen's Park / Ontario Provincial Government,2,Coffee Shop,Park,Yoga Studio,Nightclub,Beer Bar,Boutique,Seafood Restaurant,Sandwich Place,Burger Joint,Burrito Place
7,North York,2,Coffee Shop,Clothing Store,Bank,Restaurant,Japanese Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Park,Grocery Store
9,Downtown Toronto,2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Bakery,Seafood Restaurant,Bar,Park
10,North York,2,Coffee Shop,Clothing Store,Bank,Restaurant,Japanese Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Park,Grocery Store
13,North York,2,Coffee Shop,Clothing Store,Bank,Restaurant,Japanese Restaurant,Fast Food Restaurant,Sandwich Place,Pizza Place,Park,Grocery Store
15,Downtown Toronto,2,Coffee Shop,Café,Restaurant,Hotel,Japanese Restaurant,Italian Restaurant,Bakery,Seafood Restaurant,Bar,Park


Cluster 4

In [64]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
76,Mississauga,3,Intersection,Coffee Shop,Hotel,Gym,Mediterranean Restaurant,American Restaurant,Fried Chicken Joint,Burrito Place,Middle Eastern Restaurant,Sandwich Place


Cluster 5

In [65]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,York,4,Park,Trail,Coffee Shop,Convenience Store,Restaurant,Bus Line,Sandwich Place,Market,Breakfast Spot,Skating Rink
21,York,4,Park,Trail,Coffee Shop,Convenience Store,Restaurant,Bus Line,Sandwich Place,Market,Breakfast Spot,Skating Rink
56,York,4,Park,Trail,Coffee Shop,Convenience Store,Restaurant,Bus Line,Sandwich Place,Market,Breakfast Spot,Skating Rink
63,York,4,Park,Trail,Coffee Shop,Convenience Store,Restaurant,Bus Line,Sandwich Place,Market,Breakfast Spot,Skating Rink
64,York,4,Park,Trail,Coffee Shop,Convenience Store,Restaurant,Bus Line,Sandwich Place,Market,Breakfast Spot,Skating Rink
