In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np 
import json 

import pgeocode #https://pypi.org/project/pgeocode/
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library



print('Libraries imported.')

Libraries imported.


In [2]:
#URL for scraping the data
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# set number of top most common venues
num_top_venues = 5
# set number of clusters
kclusters = 10


## 1. Download and Explore Dataset

In [3]:

table = pd.DataFrame(columns=["postcode","borough","neighborhood"])
res=requests.get(url).text
soup=BeautifulSoup(res,'lxml')
for items in soup.find('table',class_='wikitable').find_all('tr')[1::1]:
    data=items.find_all(['th','td'])
    try:
        postcode = data[0].text.rstrip()
        borough = data[1].text.rstrip()
        neighborhood = data[2].text.rstrip()
    except IndexError:pass
    table = table.append({"postcode":postcode,"borough":borough,"neighborhood":neighborhood},ignore_index=True)


In [4]:
# Drop rows with 'Not assigned' borough
table = table[table.borough!='Not assigned']
table = table.reset_index(drop=True)

In [5]:
table.head()

Unnamed: 0,postcode,borough,neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [6]:
#Check there's no "Not assigned" value in borough
table.borough.value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
York                 5
East York            5
Mississauga          1
Name: borough, dtype: int64

In [7]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
dftemp=table[(table.borough!="Not assigned") & (table.neighborhood=="Not assigned")]
dftemp

Unnamed: 0,postcode,borough,neighborhood


In [8]:
# Replace 'Not assigned' neighborhood with corresponding borough
table.neighborhood = table.borough.where((table.borough!='Not assigned')&(table.neighborhood=='Not assigned'),
                                         table.neighborhood)

In [9]:
#Combine neigborhoods having same postcodes in one row separated by a comma.
f_merge_comma = lambda x: " , ".join(x)
table = table.groupby(['postcode','borough']).agg({'neighborhood':f_merge_comma}).reset_index()

In [10]:
table.head(10)

Unnamed: 0,postcode,borough,neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


In [11]:
table.shape

(103, 3)

In [12]:
postcodes_df=table.copy()

In [13]:
nomi = pgeocode.Nominatim('ca')
def get_geocode(post_code):
    loc=nomi.query_postal_code(post_code)
    return loc.latitude, loc.longitude

In [14]:
postcodes_df['latitude'], postcodes_df['longitude'] = zip(*postcodes_df['postcode'].apply(get_geocode))

In [15]:
postcodes_df[postcodes_df.longitude.isnull()]

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude
86,M7R,Mississauga,Canada Post Gateway Processing Centre,,


In [16]:
postcodes_df.dropna(inplace=True)

In [17]:
postcodes_df[postcodes_df.longitude.isnull()]

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude


In [18]:
postcodes_df.head(12)

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude
0,M1B,Scarborough,Malvern / Rouge,43.8113,-79.193
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.7878,-79.1564
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.7712,-79.2144
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389
5,M1J,Scarborough,Scarborough Village,43.7464,-79.2323
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.7298,-79.2639
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.7122,-79.2843
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.7247,-79.2312
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.6952,-79.2646


In [19]:
neighborhoods = postcodes_df.copy()

In [20]:
neighborhoods.borough.unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Etobicoke'], dtype=object)

Get the number of the neighborhoods and boroughs in the dataframe.

In [21]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 9 boroughs and 102 neighborhoods.


#### Use geopy library to get the latitude and longitude values of Toronto.

In [22]:
address = 'Toronto'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [23]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['latitude'], neighborhoods['longitude'], 
                                           neighborhoods['borough'], neighborhoods['neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [24]:
borough_data = neighborhoods.copy()
borough_data

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude
0,M1B,Scarborough,Malvern / Rouge,43.8113,-79.1930
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.7878,-79.1564
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.7712,-79.2144
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389
5,M1J,Scarborough,Scarborough Village,43.7464,-79.2323
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.7298,-79.2639
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.7122,-79.2843
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.7247,-79.2312
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.6952,-79.2646


Now I'm going to utilize the Foursquare API to explore the neighborhoods and segment them.

In [25]:
#### Define Foursquare Credentials and Version
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 200EPI4MFVFK1R3Q0K0RQ1MFUX1AF33GSU5YCSDG5S3AN1J1
CLIENT_SECRET:C2QF0IPD4XUCFMIBIOBRWU5MUPSBQD41J2EOXSEGFTKC1NGC


Explore the first neighborhood in Scarborough

In [26]:
neighborhood_explore = borough_data.loc[0, 'neighborhood']
neighborhood_explore

'Malvern / Rouge'

In [27]:
neighborhood_latitude = borough_data[borough_data['neighborhood']==neighborhood_explore].loc[0, 'latitude'] # neighborhood latitude value
neighborhood_longitude = borough_data[borough_data['neighborhood']==neighborhood_explore].loc[0, 'longitude'] # neighborhood longitude value

neighborhood_name = borough_data[borough_data['neighborhood']==neighborhood_explore].loc[0, 'neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Malvern / Rouge are 43.8113, -79.193.


#### Getting the top 100 venues that are in Malvern / Rouge within a radius of 500 meters.

In [28]:
search_query = neighborhood_explore
radius = 500
limit = 100

url =\
'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'\
.format(CLIENT_ID,CLIENT_SECRET,neighborhood_latitude,neighborhood_longitude,VERSION,search_query,radius,limit)

url

'https://api.foursquare.com/v2/venues/search?client_id=200EPI4MFVFK1R3Q0K0RQ1MFUX1AF33GSU5YCSDG5S3AN1J1&client_secret=C2QF0IPD4XUCFMIBIOBRWU5MUPSBQD41J2EOXSEGFTKC1NGC&ll=43.8113,-79.193&v=20180605&query=Malvern / Rouge&radius=500&limit=100'

In [29]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ea8376b7828ae001ba9ff16'},
 'response': {'venues': [{'id': '5561eacb498e09ff53b397ad',
    'name': 'Upper Rouge Trail',
    'location': {'lat': 43.809988,
     'lng': -79.186147,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.809988,
       'lng': -79.186147}],
     'distance': 569,
     'cc': 'CA',
     'country': 'Canada',
     'formattedAddress': ['Canada']},
    'categories': [{'id': '4bf58dd8d48988d159941735',
      'name': 'Trail',
      'pluralName': 'Trails',
      'shortName': 'Trail',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/hikingtrail_',
       'suffix': '.png'},
      'primary': True}],
    'referralId': 'v-1588082609',
    'hasPerk': False}]}}

In [30]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [31]:
results['response']

{'venues': [{'id': '5561eacb498e09ff53b397ad',
   'name': 'Upper Rouge Trail',
   'location': {'lat': 43.809988,
    'lng': -79.186147,
    'labeledLatLngs': [{'label': 'display',
      'lat': 43.809988,
      'lng': -79.186147}],
    'distance': 569,
    'cc': 'CA',
    'country': 'Canada',
    'formattedAddress': ['Canada']},
   'categories': [{'id': '4bf58dd8d48988d159941735',
     'name': 'Trail',
     'pluralName': 'Trails',
     'shortName': 'Trail',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/hikingtrail_',
      'suffix': '.png'},
     'primary': True}],
   'referralId': 'v-1588082609',
   'hasPerk': False}]}

In [32]:
venues = results['response']['venues']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
#filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
filtered_columns = ['name', 'categories', 'location.lat', 'location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng,categories.1
0,Upper Rouge Trail,"[{'id': '4bf58dd8d48988d159941735', 'name': 'T...",43.809988,-79.186147,Trail


Looks like we only have one venue here

## 2. Explore Neighborhoods

In [33]:
#function to repeat the same process to all the neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        except:
            pass
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['neighborhood', 
                  'neighborhood_latitude', 
                  'neighborhood_longitude', 
                  'venue', 
                  'venue_latitude', 
                  'venue_longitude', 
                  'venue_category']
    
    return(nearby_venues)

In [34]:
borough_venues = getNearbyVenues(names=borough_data['neighborhood'],
                                   latitudes=borough_data['latitude'],
                                   longitudes=borough_data['longitude']
                                  )

In [35]:
print(borough_venues.shape)
borough_venues.head()

(2185, 7)


Unnamed: 0,neighborhood,neighborhood_latitude,neighborhood_longitude,venue,venue_latitude,venue_longitude,venue_category
0,Rouge Hill / Port Union / Highland Creek,43.7878,-79.1564,Scarborough Historical Society,43.788755,-79.162438,History Museum
1,Guildwood / Morningside / West Hill,43.7678,-79.1866,Chick-N-Joy,43.768752,-79.187982,Fried Chicken Joint
2,Guildwood / Morningside / West Hill,43.7678,-79.1866,Little Caesars Pizza,43.769046,-79.184386,Pizza Place
3,Guildwood / Morningside / West Hill,43.7678,-79.1866,LCBO,43.771462,-79.184384,Liquor Store
4,Guildwood / Morningside / West Hill,43.7678,-79.1866,Bulk Barn,43.771342,-79.184341,Food & Drink Shop


In [36]:
#Number of venues for each neighborhood
borough_venues.groupby('neighborhood').count()

Unnamed: 0_level_0,neighborhood_latitude,neighborhood_longitude,venue,venue_latitude,venue_longitude,venue_category
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
Alderwood / Long Branch,9,9,9,9,9,9
Bathurst Manor / Wilson Heights / Downsview North,6,6,6,6,6,6
Bayview Village,4,4,4,4,4,4
Bedford Park / Lawrence Manor East,23,23,23,23,23,23
Berczy Park,93,93,93,93,93,93
Birch Cliff / Cliffside West,4,4,4,4,4,4
Brockton / Parkdale Village / Exhibition Place,39,39,39,39,39,39
Business reply mail Processing CentrE,16,16,16,16,16,16
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport,58,58,58,58,58,58


In [37]:
print('There are {} uniques categories.'.format(len(borough_venues['venue_category'].unique())))

There are 256 uniques categories.


## 3. Analyze Each Neighborhood

In [38]:
# one hot encoding
borough_onehot = pd.get_dummies(borough_venues[['venue_category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
borough_onehot['neighborhood'] = borough_venues['neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [borough_onehot.columns[-1]] + list(borough_onehot.columns[:-1])
borough_onehot = borough_onehot[fixed_columns]

borough_onehot.head()

Unnamed: 0,neighborhood,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Rouge Hill / Port Union / Highland Creek,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Guildwood / Morningside / West Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
borough_onehot.shape

(2185, 257)

#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [40]:
borough_grouped = borough_onehot.groupby('neighborhood').mean().reset_index()
borough_grouped.head()

Unnamed: 0,neighborhood,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alderwood / Long Branch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bathurst Manor / Wilson Heights / Downsview North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Bedford Park / Lawrence Manor East,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
borough_grouped.shape

(93, 257)

In [42]:

for hood in borough_grouped['neighborhood']:
    print("----"+hood+"----")
    temp = borough_grouped[borough_grouped['neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0  Latin American Restaurant  0.25
1         Chinese Restaurant  0.25
2            Badminton Court  0.25
3             Breakfast Spot  0.25
4          Accessories Store  0.00


----Alderwood / Long Branch----
                venue  freq
0         Coffee Shop  0.11
1            Pharmacy  0.11
2                 Gym  0.11
3                 Pub  0.11
4  Athletics & Sports  0.11


----Bathurst Manor / Wilson Heights / Downsview North----
                      venue  freq
0               Pizza Place  0.17
1             Deli / Bodega  0.17
2  Mediterranean Restaurant  0.17
3       Fried Chicken Joint  0.17
4               Coffee Shop  0.17


----Bayview Village----
               venue  freq
0        Gas Station  0.25
1        Flower Shop  0.25
2              Trail  0.25
3               Park  0.25
4  Accessories Store  0.00


----Bedford Park / Lawrence Manor East----
                venue  freq
0         Pizza Place  0.09
1         Coffee 

            venue  freq
0       Nightclub  0.17
1            Café  0.17
2   Grocery Store  0.17
3     Coffee Shop  0.17
4  Discount Store  0.17


----Humewood-Cedarvale----
           venue  freq
0   Hockey Arena  0.14
1    Bridal Shop  0.14
2  Deli / Bodega  0.14
3  Grocery Store  0.14
4           Park  0.14


----India Bazaar / The Beaches West----
                  venue  freq
0  Fast Food Restaurant  0.08
1        Sandwich Place  0.08
2           Pizza Place  0.08
3      Sushi Restaurant  0.04
4               Brewery  0.04


----Islington Avenue----
           venue  freq
0       Pharmacy  0.33
1           Bank  0.17
2           Park  0.17
3  Grocery Store  0.17
4   Skating Rink  0.17


----Kennedy Park / Ionview / East Birchmount Park----
              venue  freq
0       Coffee Shop  0.19
1    Discount Store  0.12
2     Metro Station  0.06
3  Department Store  0.06
4          Pharmacy  0.06


----Kensington Market / Chinatown / Grange Park----
                venue  freq
0       

                        venue  freq
0                 Pizza Place  0.29
1  Construction & Landscaping  0.14
2                 Coffee Shop  0.14
3          Chinese Restaurant  0.14
4              Sandwich Place  0.14


----Westmount----
                venue  freq
0         Pizza Place  0.22
1      Discount Store  0.11
2      Ice Cream Shop  0.11
3  Chinese Restaurant  0.11
4         Coffee Shop  0.11


----Weston----
                       venue  freq
0          Convenience Store   0.5
1                       Park   0.5
2               Neighborhood   0.0
3         Mexican Restaurant   0.0
4  Middle Eastern Restaurant   0.0


----Wexford / Maryvale----
                 venue  freq
0    Convenience Store   0.5
1          Auto Garage   0.5
2    Accessories Store   0.0
3  Monument / Landmark   0.0
4          Music Store   0.0


----Willowdale----
              venue  freq
0       Coffee Shop  0.06
1       Pizza Place  0.06
2  Ramen Restaurant  0.06
3    Sandwich Place  0.04
4              

#### Putting that into a pandas dataframe

In [43]:
# function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

 #### Create the new dataframe and display the top venues for each neighborhood.

In [44]:
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['neighborhood'] = borough_grouped['neighborhood']

for ind in np.arange(borough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(borough_grouped.iloc[ind, :], 
                                                                          num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Badminton Court,Breakfast Spot,Latin American Restaurant,Chinese Restaurant,Yoga Studio
1,Alderwood / Long Branch,Pharmacy,Athletics & Sports,Sandwich Place,Coffee Shop,Pub
2,Bathurst Manor / Wilson Heights / Downsview North,Pizza Place,Deli / Bodega,Middle Eastern Restaurant,Fried Chicken Joint,Coffee Shop
3,Bayview Village,Flower Shop,Gas Station,Trail,Park,Falafel Restaurant
4,Bedford Park / Lawrence Manor East,Italian Restaurant,Sandwich Place,Restaurant,Coffee Shop,Pizza Place


In [45]:
neighborhoods_venues_sorted

Unnamed: 0,neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Badminton Court,Breakfast Spot,Latin American Restaurant,Chinese Restaurant,Yoga Studio
1,Alderwood / Long Branch,Pharmacy,Athletics & Sports,Sandwich Place,Coffee Shop,Pub
2,Bathurst Manor / Wilson Heights / Downsview North,Pizza Place,Deli / Bodega,Middle Eastern Restaurant,Fried Chicken Joint,Coffee Shop
3,Bayview Village,Flower Shop,Gas Station,Trail,Park,Falafel Restaurant
4,Bedford Park / Lawrence Manor East,Italian Restaurant,Sandwich Place,Restaurant,Coffee Shop,Pizza Place
5,Berczy Park,Coffee Shop,Café,Hotel,Seafood Restaurant,Italian Restaurant
6,Birch Cliff / Cliffside West,Café,Skating Rink,College Stadium,General Entertainment,Yoga Studio
7,Brockton / Parkdale Village / Exhibition Place,Café,Coffee Shop,Thrift / Vintage Store,Gift Shop,Breakfast Spot
8,Business reply mail Processing CentrE,Coffee Shop,Restaurant,Yoga Studio,Breakfast Spot,Bookstore
9,CN Tower / King and Spadina / Railway Lands / ...,Coffee Shop,Restaurant,Café,Bar,Italian Restaurant


In [46]:
neighborhoods_venues_sorted.shape

(93, 6)

In [47]:
neighborhoods_venues_sorted[neighborhoods_venues_sorted.isna()].sum()

neighborhood             0.0
1st Most Common Venue    0.0
2nd Most Common Venue    0.0
3rd Most Common Venue    0.0
4th Most Common Venue    0.0
5th Most Common Venue    0.0
dtype: float64

## 4. Cluster Neighborhoods

Running *k*-means to cluster the neighborhoods

In [48]:
borough_grouped_clustering = borough_grouped.drop('neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(borough_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([6, 1, 1, 7, 1, 6, 6, 6, 6, 6])

In [49]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'cluster_labels', kmeans.labels_)

borough_merged = borough_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
borough_merged = borough_merged.join(neighborhoods_venues_sorted.set_index('neighborhood'), on='neighborhood',how='right')

borough_merged.head() # check the last columns!

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude,cluster_labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.7878,-79.1564,2,History Museum,Yoga Studio,Donut Shop,Flower Shop,Flea Market
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.7678,-79.1866,1,Pizza Place,Bank,Fast Food Restaurant,Coffee Shop,Beer Store
3,M1G,Scarborough,Woburn,43.7712,-79.2144,9,Korean Restaurant,Insurance Office,Yoga Studio,Farmers Market,Electronics Store
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389,6,Construction & Landscaping,Lounge,Trail,Gaming Cafe,Farmers Market
5,M1J,Scarborough,Scarborough Village,43.7464,-79.2323,7,Grocery Store,Park,Yoga Studio,Falafel Restaurant,Dumpling Restaurant


In [50]:
borough_merged.shape

(98, 11)

#### Visualizing Clusters

In [51]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(borough_merged['latitude'], borough_merged['longitude'], 
                                  borough_merged['neighborhood'], borough_merged['cluster_labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

In [52]:
borough_merged[['neighborhood','cluster_labels']].groupby(['cluster_labels']).count().sort_values('neighborhood', ascending = False).reset_index()

Unnamed: 0,cluster_labels,neighborhood
0,6,52
1,1,21
2,7,12
3,3,4
4,0,2
5,5,2
6,8,2
7,2,1
8,4,1
9,9,1


In [53]:
100*12/borough_merged['neighborhood'].count()

12.244897959183673

### Cluster 6

In [54]:
Cluster_index = 6
cluster_6=borough_merged.loc[borough_merged['cluster_labels'] == Cluster_index, 
                     borough_merged.columns[[1] + list(range(5, borough_merged.shape[1]))]]
cluster_6

Unnamed: 0,borough,cluster_labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
4,Scarborough,6,Construction & Landscaping,Lounge,Trail,Gaming Cafe,Farmers Market
6,Scarborough,6,Coffee Shop,Discount Store,Hobby Shop,Grocery Store,Bus Station
7,Scarborough,6,Bus Line,Bakery,Intersection,Soccer Field,Park
9,Scarborough,6,Café,Skating Rink,College Stadium,General Entertainment,Yoga Studio
12,Scarborough,6,Badminton Court,Breakfast Spot,Latin American Restaurant,Chinese Restaurant,Yoga Studio
18,North York,6,Clothing Store,Fast Food Restaurant,Coffee Shop,Restaurant,Women's Store
20,North York,6,Martial Arts Dojo,Pool,Cafeteria,Yoga Studio,Farmers Market
21,North York,6,Home Service,Playground,Business Service,Donut Shop,Flea Market
22,North York,6,Pizza Place,Coffee Shop,Ramen Restaurant,Restaurant,Café
24,North York,6,Pizza Place,Coffee Shop,Ramen Restaurant,Restaurant,Café


In [55]:
cluster_6[['1st Most Common Venue','borough']].groupby(['1st Most Common Venue']).count()\
.sort_values(['borough'],ascending = False).reset_index().head(10)

Unnamed: 0,1st Most Common Venue,borough
0,Coffee Shop,10
1,Café,9
2,Sushi Restaurant,3
3,Pizza Place,2
4,Construction & Landscaping,2
5,Clothing Store,2
6,Park,1
7,Restaurant,1
8,Home Service,1
9,Sandwich Place,1


### Cluster 1

In [56]:
Cluster_index = 1
borough_merged.loc[borough_merged['cluster_labels'] == Cluster_index, 
                     borough_merged.columns[[1] + list(range(5, borough_merged.shape[1]))]]

Unnamed: 0,borough,cluster_labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,Scarborough,1,Pizza Place,Bank,Fast Food Restaurant,Coffee Shop,Beer Store
8,Scarborough,1,Ice Cream Shop,Bistro,Pharmacy,Pizza Place,Sandwich Place
13,Scarborough,1,Pizza Place,Bank,Intersection,Pharmacy,Bus Stop
15,Scarborough,1,Fast Food Restaurant,Chinese Restaurant,Grocery Store,Furniture / Home Store,Bank
28,North York,1,Pizza Place,Deli / Bodega,Middle Eastern Restaurant,Fried Chicken Joint,Coffee Shop
29,North York,1,Middle Eastern Restaurant,Sports Bar,Sandwich Place,Massage Studio,Pizza Place
30,North York,1,Shopping Mall,Discount Store,Grocery Store,Coffee Shop,Pizza Place
31,North York,1,Shopping Mall,Discount Store,Grocery Store,Coffee Shop,Pizza Place
32,North York,1,Shopping Mall,Discount Store,Grocery Store,Coffee Shop,Pizza Place
33,North York,1,Shopping Mall,Discount Store,Grocery Store,Coffee Shop,Pizza Place


### Cluster 7

In [57]:
Cluster_index = 7
borough_merged.loc[borough_merged['cluster_labels'] == Cluster_index, 
                     borough_merged.columns[[1] + list(range(5, borough_merged.shape[1]))]]

Unnamed: 0,borough,cluster_labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
5,Scarborough,7,Grocery Store,Park,Yoga Studio,Falafel Restaurant,Dumpling Restaurant
19,North York,7,Flower Shop,Gas Station,Trail,Park,Falafel Restaurant
25,North York,7,Food & Drink Shop,Park,Bus Stop,Dumpling Restaurant,Flower Shop
26,North York,7,Park,Yoga Studio,Gym,Pool,River
27,North York,7,Park,Yoga Studio,Gym,Pool,River
44,Central Toronto,7,Photography Studio,Park,Donut Shop,Flower Shop,Flea Market
46,Central Toronto,7,Playground,Garden,Park,Gym Pool,Yoga Studio
48,Central Toronto,7,Thai Restaurant,Park,Grocery Store,Gym,Playground
50,Downtown Toronto,7,Playground,Candy Store,Park,Grocery Store,Yoga Studio
59,Downtown Toronto,7,Music Venue,Café,Park,Harbor / Marina,Falafel Restaurant


In [58]:
cluster_7 = borough_merged.loc[borough_merged['cluster_labels'] == Cluster_index, 
                     borough_merged.columns[[1] + list(range(5, borough_merged.shape[1]))]]

In [59]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40, 
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

for i in cluster_6[cluster_6.columns[2:8]]:
    plt.title(i)
    show_wordcloud(cluster_6[i])
    
    

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

### Conclusion

In [60]:
borough_merged.groupby(['borough','cluster_labels'])['neighborhood'].count()\
.reset_index().sort_values('neighborhood',ascending = False)

Unnamed: 0,borough,cluster_labels,neighborhood
3,Downtown Toronto,6,17
17,North York,6,8
14,North York,1,8
10,Etobicoke,1,7
25,Scarborough,6,5
28,West Toronto,6,5
21,Scarborough,1,4
18,North York,7,4
6,East Toronto,6,4
1,Central Toronto,6,4


In [61]:
borough_merged.groupby(['cluster_labels','1st Most Common Venue'])['postcode'].count().reset_index()\
                                    .sort_values(['cluster_labels','postcode'],ascending=[True,False])

Unnamed: 0,cluster_labels,1st Most Common Venue,postcode
0,0,Bakery,2
8,1,Pizza Place,7
10,1,Shopping Mall,4
1,1,Fast Food Restaurant,2
7,1,Pharmacy,2
2,1,Grocery Store,1
3,1,Ice Cream Shop,1
4,1,Italian Restaurant,1
5,1,Middle Eastern Restaurant,1
6,1,Pet Store,1


In [62]:
borough_venues[borough_venues.venue_category=='Hotel'].groupby(['neighborhood','venue_category']).count()\
.sort_values(['venue'],ascending = False ).reset_index().filter(['neighborhood','venue_category','venue'])\
.rename(columns={'venue':'count'})

Unnamed: 0,neighborhood,venue_category,count
0,Toronto Dominion Centre / Design Exchange,Hotel,9
1,Commerce Court / Victoria Hotel,Hotel,7
2,First Canadian Place / Underground city,Hotel,7
3,Berczy Park,Hotel,5
4,Stn A PO Boxes,Hotel,4
5,Richmond / Adelaide / King,Hotel,3
6,Church and Wellesley,Hotel,2
7,"Garden District, Ryerson",Hotel,2
8,Northwest,Hotel,2
9,St. James Town,Hotel,2


In [63]:
borough_merged.head()

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude,cluster_labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.7878,-79.1564,2,History Museum,Yoga Studio,Donut Shop,Flower Shop,Flea Market
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.7678,-79.1866,1,Pizza Place,Bank,Fast Food Restaurant,Coffee Shop,Beer Store
3,M1G,Scarborough,Woburn,43.7712,-79.2144,9,Korean Restaurant,Insurance Office,Yoga Studio,Farmers Market,Electronics Store
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389,6,Construction & Landscaping,Lounge,Trail,Gaming Cafe,Farmers Market
5,M1J,Scarborough,Scarborough Village,43.7464,-79.2323,7,Grocery Store,Park,Yoga Studio,Falafel Restaurant,Dumpling Restaurant


In [64]:
df = borough_venues.join(borough_merged.set_index('neighborhood'),on='neighborhood')

In [65]:
x = df.columns.tolist()
x.insert(1,x.pop(x.index('cluster_labels')))
df=df[x]

In [66]:
z=df[df.venue_category=='Hotel'].groupby(['neighborhood','venue_category','cluster_labels']).count()\
.sort_values(['venue']).reset_index().filter(['neighborhood','venue_category','cluster_labels','venue'])
z

Unnamed: 0,neighborhood,venue_category,cluster_labels,venue
0,CN Tower / King and Spadina / Railway Lands / ...,Hotel,6,1
1,Central Bay Street,Hotel,6,1
2,Church and Wellesley,Hotel,6,2
3,"Garden District, Ryerson",Hotel,6,2
4,Northwest,Hotel,6,2
5,St. James Town,Hotel,6,2
6,Willowdale,Hotel,6,2
7,Richmond / Adelaide / King,Hotel,6,3
8,Stn A PO Boxes,Hotel,6,4
9,Berczy Park,Hotel,6,5


In [67]:
borough_merged[borough_merged.neighborhood=='Central Bay Street']

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude,cluster_labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
57,M5G,Downtown Toronto,Central Bay Street,43.6564,-79.386,6,Coffee Shop,Italian Restaurant,Café,Restaurant,Bubble Tea Shop


In [68]:
borough_venues[borough_venues.neighborhood=='Central Bay Street'].groupby(['neighborhood','venue_category']).count()\
.sort_values(['venue'],ascending = False ).reset_index().filter(['neighborhood','venue_category','cluster_labels','venue'])\
.rename(columns = {'venue':'count'}).head(10)

Unnamed: 0,neighborhood,venue_category,count
0,Central Bay Street,Coffee Shop,14
1,Central Bay Street,Italian Restaurant,3
2,Central Bay Street,Café,3
3,Central Bay Street,Sandwich Place,2
4,Central Bay Street,Breakfast Spot,2
5,Central Bay Street,Bubble Tea Shop,2
6,Central Bay Street,Middle Eastern Restaurant,2
7,Central Bay Street,Restaurant,2
8,Central Bay Street,Neighborhood,1
9,Central Bay Street,Park,1


In [69]:
z[z.neighborhood=='Central Bay Street']

Unnamed: 0,neighborhood,venue_category,cluster_labels,venue
1,Central Bay Street,Hotel,6,1


In [70]:
highlight_location = borough_data[borough_data.neighborhood == 'Central Bay Street']
highlight_location

Unnamed: 0,postcode,borough,neighborhood,latitude,longitude
57,M5G,Downtown Toronto,Central Bay Street,43.6564,-79.386


In [71]:
highlight_location=[highlight_location.iloc[0]['latitude'],highlight_location.iloc[0]['longitude']]

In [72]:
# create map
map_clusters = folium.Map(location=[highlight_location[0], highlight_location[1]], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 10, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(borough_merged['latitude'], borough_merged['longitude'], 
                                  borough_merged['neighborhood'], borough_merged['cluster_labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
#map_clusters
folium.Circle(highlight_location, 500, fill=True,popup='Central Bay Street').add_to(map_clusters)
map_clusters