In [64]:
pip install BeautifulSoup4

Note: you may need to restart the kernel to use updated packages.


In [65]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [66]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


# 1. Scraping wikipedia page and creating dataframe

In [67]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [68]:
#send request

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

page = requests.get(url) #from here page.text gives the html text. We need to parse the html using BeautifulSoup

soup = BeautifulSoup(page.text, 'html')


In [69]:
#read table text

table = soup.find('table', {'class':'wikitable sortable'}).tbody

rows = table.find_all('tr')
columns = [v.text.replace('\n', '') for v in rows[0].find_all('th')]  # use replace to remove \n
print(columns)

['Postcode', 'Borough', 'Neighbourhood']


In [70]:

df = pd.DataFrame(columns=columns)

for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
      
    if len(tds) ==4:
        values = [tds[0].text, '', ''.replace('\n', '')]  #use replace to remove '\n'
    else:
        values = [td.text.replace('\n', '') for td in tds]          #use .replace to remove '\n'    
        
    df = df.append(pd.Series(values, index=columns), ignore_index=True)

df
    

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


In [71]:
# rename Postcode column
df.rename(columns={'Postcode': 'PostalCode'}, inplace=True)


In [72]:
#dropping cells with Borough=Not assigned
df = df[df.Borough != 'Not assigned']
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


In [73]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 11 boroughs and 210 neighborhoods.


In [74]:

df['Neighbourhood'] = df['Neighbourhood'].astype(str)
neighborhoods1 = df.groupby(['PostalCode'], sort=False).agg( ','.join)

neighborhoods1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,"North York,North York","Lawrence Heights,Lawrence Manor"
M7A,Queen's Park,Not assigned
...,...,...
M8X,"Etobicoke,Etobicoke,Etobicoke","The Kingsway,Montgomery Road,Old Mill North"
M4Y,Downtown Toronto,Church and Wellesley
M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
M8Y,"Etobicoke,Etobicoke,Etobicoke,Etobicoke,Etobic...","Humber Bay,King's Mill Park,Kingsway Park Sout..."


In [75]:
#group Neighbourhoods with same postalcodes
df['Neighbourhood'] = df['Neighbourhood'].astype(str)
neighborhoods1 = df.groupby(['PostalCode', 'Borough'], sort=False).agg( ','.join)

neighborhoods1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
PostalCode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,"Lawrence Heights,Lawrence Manor"
M7A,Queen's Park,Not assigned
...,...,...
M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
M4Y,Downtown Toronto,Church and Wellesley
M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."


In [76]:
#reset the levels from multi to single level
result = neighborhoods1.reset_index(level=['PostalCode', 'Borough'])
result

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned
...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."


In [77]:
#replace the not assigned values in result
result.Neighbourhood = result.Neighbourhood.replace('Not assigned', result.Borough)
result

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."


In [78]:
#display the shape
result.shape

(103, 3)

# 2. Getting latitude and longitude coordinates and creating a new merged dataframe

In [79]:
# upload csv file containing the geographical coordinates of each postalcode

!wget -q -O 'Coordinates.csv' http://cocl.us/Geospatial_data
print('data downloaded')

data downloaded


In [80]:
# read it into a pandas dataframe
Coordinates_df = pd.read_csv('Coordinates.csv')
Coordinates_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [81]:
Coordinates_df.columns = ['PostalCode', 'Latitude', 'Longitude']
Coordinates_df

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [82]:
# merge the 2 dataframes
resultcoordinates_df = pd.merge(result, Coordinates_df, on='PostalCode', how='inner')
resultcoordinates_df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
101,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout...",43.636258,-79.498509


# 3. Explore and cluster : using FourSquare API to explore the neighbourhoods and segment them

Define FourSquare credentials and version

In [83]:
CLIENT_ID = '4MZ4U1YKCDSQCP5CGALCWZYFTEU0GLKTJ1UBLYDD15CQZ1VI' # your Foursquare ID
CLIENT_SECRET = 'AI505REPCBBLDK4VR3MQASWG2LKOAL2Z3LZ4JO1RJUIEQMCK' # your Foursquare Secret
VERSION = '20191213' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 4MZ4U1YKCDSQCP5CGALCWZYFTEU0GLKTJ1UBLYDD15CQZ1VI
CLIENT_SECRET:AI505REPCBBLDK4VR3MQASWG2LKOAL2Z3LZ4JO1RJUIEQMCK


### Exploring the first borough in our dataframe

In [84]:
# get the bough's name
resultcoordinates_df.loc[0, 'Borough']

'North York'

In [85]:
# get the borough's latitude and longitude values
neighborhood_latitude = resultcoordinates_df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = resultcoordinates_df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = resultcoordinates_df.loc[0, 'Borough'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of North York are 43.7532586, -79.3296565.


##### Getting the top 100 venues that are in North York within a radius of 500 meters.

In [86]:
# get the url
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id=4MZ4U1YKCDSQCP5CGALCWZYFTEU0GLKTJ1UBLYDD15CQZ1VI&client_secret=AI505REPCBBLDK4VR3MQASWG2LKOAL2Z3LZ4JO1RJUIEQMCK&v=20191213&ll=43.7532586,-79.3296565&radius=500&limit=100'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=4MZ4U1YKCDSQCP5CGALCWZYFTEU0GLKTJ1UBLYDD15CQZ1VI&client_secret=AI505REPCBBLDK4VR3MQASWG2LKOAL2Z3LZ4JO1RJUIEQMCK&v=20191213&ll=43.7532586,-79.3296565&radius=500&limit=100'

Send the GET request and examine the resutls

In [87]:
results1 = requests.get(url).json()
results1

{'meta': {'code': 200, 'requestId': '5df97736edbcad001ba01b10'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c

All the information is in the *items* key. Before we proceed, let's borrow the **get_category_type** function from the Foursquare lab.

In [88]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [89]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [90]:

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [91]:
# clean the json and create a pandas dataframe

venues = results1['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114


In [92]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


### Exploring all the neighborhoods in Toronto

In [93]:
# defining a function to repeat thesame process to all neiborhoods in Toronto

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [94]:
# Running the above function on each neighborhood

toronto_venues = getNearbyVenues(names=resultcoordinates_df['Borough'],
                                   latitudes=resultcoordinates_df['Latitude'],
                                   longitudes=resultcoordinates_df['Longitude']
                                  )

North York
North York
Downtown Toronto
North York
Queen's Park
Downtown Toronto
Scarborough
North York
East York
Downtown Toronto
North York
Etobicoke
Scarborough
North York
East York
Downtown Toronto
York
Etobicoke
Scarborough
East Toronto
Downtown Toronto
York
Scarborough
East York
Downtown Toronto
Downtown Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
North York
North York
Scarborough
North York
North York
East Toronto
North York
York
North York
Scarborough
North York
North York
Central Toronto
Central Toronto
York
York
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Etobicoke
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Mississauga
Etobicoke
Scarborough
Central Toronto
Downtown Toronto
West To

In [95]:
# check the size of the resulting dataframe

print(toronto_venues.shape)
toronto_venues.head()

(2241, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,North York,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,North York,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,North York,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,North York,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,North York,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [96]:
#check how many venues were returned for each neighborhood

toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,114,114,114,114,114,114
Downtown Toronto,1279,1279,1279,1279,1279,1279
East Toronto,121,121,121,121,121,121
East York,76,76,76,76,76,76
Etobicoke,72,72,72,72,72,72
Mississauga,10,10,10,10,10,10
North York,246,246,246,246,246,246
Queen's Park,38,38,38,38,38,38
Scarborough,96,96,96,96,96,96
West Toronto,171,171,171,171,171,171


In [97]:
# to find out how many unique categories can be curated from all the returned values

print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 272 uniques categories.


### Analyse each borough (instead of neighborhood)

In [98]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Borough'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Borough,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
# the new dataframe size:
toronto_onehot.shape

(2241, 273)

#### Group rows by borough, by taking the mean of the frequency of occurrence of each category

In [100]:
toronto_grouped = toronto_onehot.groupby('Borough').mean().reset_index()
toronto_grouped

Unnamed: 0,Borough,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,...,0.0,0.008772,0.0,0.0,0.008772,0.0,0.0,0.0,0.0,0.008772
1,Downtown Toronto,0.0,0.000782,0.000782,0.000782,0.000782,0.001564,0.001564,0.001564,0.014855,...,0.002346,0.011728,0.001564,0.0,0.004691,0.0,0.007037,0.000782,0.0,0.001564
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024793,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016529
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.013158,0.0,0.013158,0.0,0.0,0.0,0.013158
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013889,...,0.0,0.0,0.0,0.013889,0.0,0.0,0.0,0.013889,0.0,0.0
5,Mississauga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.00813,0.0,0.004065,0.0,0.0,0.0,0.0,0.0,0.00813,...,0.0,0.0,0.004065,0.0,0.00813,0.0,0.0,0.004065,0.012195,0.0
7,Queen's Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.026316
8,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010417,...,0.0,0.0,0.0,0.0,0.010417,0.0,0.0,0.0,0.0,0.0
9,West Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.011696,0.0,0.0,0.011696,0.0,0.005848,0.0,0.0,0.011696


In [101]:
# see the size
toronto_grouped.shape

(11, 273)

In [102]:
#print each borough along with the top 5 most common venues

num_top_venues = 5

for hood in toronto_grouped['Borough']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Borough'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Toronto----
            venue  freq
0     Coffee Shop  0.07
1  Sandwich Place  0.06
2     Pizza Place  0.05
3            Park  0.05
4  Clothing Store  0.04


----Downtown Toronto----
         venue  freq
0  Coffee Shop  0.09
1         Café  0.06
2        Hotel  0.03
3   Restaurant  0.03
4       Bakery  0.03


----East Toronto----
                venue  freq
0    Greek Restaurant  0.07
1         Coffee Shop  0.06
2  Italian Restaurant  0.05
3      Ice Cream Shop  0.04
4             Brewery  0.03


----East York----
          venue  freq
0   Coffee Shop  0.07
1   Pizza Place  0.05
2  Burger Joint  0.05
3          Park  0.05
4          Bank  0.04


----Etobicoke----
            venue  freq
0     Pizza Place  0.11
1  Sandwich Place  0.07
2        Pharmacy  0.06
3     Coffee Shop  0.06
4             Gym  0.04


----Mississauga----
                 venue  freq
0          Coffee Shop   0.2
1                Hotel   0.2
2                  Gym   0.1
3  Fried Chicken Joint   0.1
4    

#### Putting that into a *pandas* dataframe

In [103]:
# writing a function to sort the venues in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [104]:
# create the new dataframe and display the top 10 venues for each borough

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
boroughs_venues_sorted = pd.DataFrame(columns=columns)
boroughs_venues_sorted['Borough'] = toronto_grouped['Borough']

for ind in np.arange(toronto_grouped.shape[0]):
    boroughs_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

boroughs_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Coffee Shop,Sandwich Place,Pizza Place,Park,Café,Clothing Store,Restaurant,Dessert Shop,Sushi Restaurant,Gym
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Bakery,Italian Restaurant,Japanese Restaurant,Bar,Park,Seafood Restaurant
2,East Toronto,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Brewery,Café,Pub,Pizza Place,Sandwich Place,Bookstore
3,East York,Coffee Shop,Pizza Place,Park,Burger Joint,Pharmacy,Bank,Sporting Goods Shop,Grocery Store,Fast Food Restaurant,Sandwich Place
4,Etobicoke,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Fast Food Restaurant,Gym,Pool,Bakery,Café,Beer Store


## Cluster Neighborhoods (Boroughs, in this case)

Run K-means to cluster the boroughs into 5 clusters

In [105]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 4, 4, 1, 0, 3, 4, 0], dtype=int32)

Create a new dataframe that includes the cluster as well as the top 10 venues for each borough.

In [106]:
# add clustering labels
boroughs_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = resultcoordinates_df

# merge toronto_grouped with toronto_data(resultcoordinates_df) to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(boroughs_venues_sorted.set_index('Borough'), on='Borough')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Coffee Shop,Clothing Store,Fast Food Restaurant,Park,Japanese Restaurant,Furniture / Home Store,Restaurant,Pizza Place,Café,Sandwich Place
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Coffee Shop,Clothing Store,Fast Food Restaurant,Park,Japanese Restaurant,Furniture / Home Store,Restaurant,Pizza Place,Café,Sandwich Place
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,Italian Restaurant,Japanese Restaurant,Bar,Park,Seafood Restaurant
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763,0,Coffee Shop,Clothing Store,Fast Food Restaurant,Park,Japanese Restaurant,Furniture / Home Store,Restaurant,Pizza Place,Café,Sandwich Place
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,3,Coffee Shop,Diner,Park,Gym,Sushi Restaurant,Yoga Studio,Beer Bar,Smoothie Shop,Seafood Restaurant,Sandwich Place


### Visualize the resulting clusters

In [109]:
# creating a map of toronto centered around the north york borough:

toronto_latitude = 43.7532586  #centered on North York
toronto_longitude = -79.3296565
map_clusters = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine clusters

Examine each cluster and determine the discriminating venue categories that distinguish each cluster. A name can then be assigned to each cluster based on the defining categories.

#### Cluster 1

In [110]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,0,Coffee Shop,Clothing Store,Fast Food Restaurant,Park,Japanese Restaurant,Furniture / Home Store,Restaurant,Pizza Place,Café,Sandwich Place
1,North York,0,Coffee Shop,Clothing Store,Fast Food Restaurant,Park,Japanese Restaurant,Furniture / Home Store,Restaurant,Pizza Place,Café,Sandwich Place
2,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,Italian Restaurant,Japanese Restaurant,Bar,Park,Seafood Restaurant
3,North York,0,Coffee Shop,Clothing Store,Fast Food Restaurant,Park,Japanese Restaurant,Furniture / Home Store,Restaurant,Pizza Place,Café,Sandwich Place
5,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,Italian Restaurant,Japanese Restaurant,Bar,Park,Seafood Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...
92,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,Italian Restaurant,Japanese Restaurant,Bar,Park,Seafood Restaurant
96,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,Italian Restaurant,Japanese Restaurant,Bar,Park,Seafood Restaurant
97,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,Italian Restaurant,Japanese Restaurant,Bar,Park,Seafood Restaurant
99,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Hotel,Bakery,Italian Restaurant,Japanese Restaurant,Bar,Park,Seafood Restaurant


#### Cluster 2

In [111]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
76,Mississauga,1,Hotel,Coffee Shop,Burrito Place,Mediterranean Restaurant,American Restaurant,Fried Chicken Joint,Gym,Sandwich Place,Dumpling Restaurant,Drugstore


#### Cluster 3

In [112]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,York,2,Park,Convenience Store,Grocery Store,Trail,Restaurant,Bus Line,Caribbean Restaurant,Hockey Arena,Field,Fast Food Restaurant
21,York,2,Park,Convenience Store,Grocery Store,Trail,Restaurant,Bus Line,Caribbean Restaurant,Hockey Arena,Field,Fast Food Restaurant
56,York,2,Park,Convenience Store,Grocery Store,Trail,Restaurant,Bus Line,Caribbean Restaurant,Hockey Arena,Field,Fast Food Restaurant
63,York,2,Park,Convenience Store,Grocery Store,Trail,Restaurant,Bus Line,Caribbean Restaurant,Hockey Arena,Field,Fast Food Restaurant
64,York,2,Park,Convenience Store,Grocery Store,Trail,Restaurant,Bus Line,Caribbean Restaurant,Hockey Arena,Field,Fast Food Restaurant


#### Cluster 4

In [113]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Queen's Park,3,Coffee Shop,Diner,Park,Gym,Sushi Restaurant,Yoga Studio,Beer Bar,Smoothie Shop,Seafood Restaurant,Sandwich Place


#### Cluster 5

In [114]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Scarborough,4,Chinese Restaurant,Fast Food Restaurant,Coffee Shop,Bakery,Breakfast Spot,Pizza Place,Pharmacy,Playground,Bank,Park
8,East York,4,Coffee Shop,Pizza Place,Park,Burger Joint,Pharmacy,Bank,Sporting Goods Shop,Grocery Store,Fast Food Restaurant,Sandwich Place
11,Etobicoke,4,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Fast Food Restaurant,Gym,Pool,Bakery,Café,Beer Store
12,Scarborough,4,Chinese Restaurant,Fast Food Restaurant,Coffee Shop,Bakery,Breakfast Spot,Pizza Place,Pharmacy,Playground,Bank,Park
14,East York,4,Coffee Shop,Pizza Place,Park,Burger Joint,Pharmacy,Bank,Sporting Goods Shop,Grocery Store,Fast Food Restaurant,Sandwich Place
17,Etobicoke,4,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Fast Food Restaurant,Gym,Pool,Bakery,Café,Beer Store
18,Scarborough,4,Chinese Restaurant,Fast Food Restaurant,Coffee Shop,Bakery,Breakfast Spot,Pizza Place,Pharmacy,Playground,Bank,Park
22,Scarborough,4,Chinese Restaurant,Fast Food Restaurant,Coffee Shop,Bakery,Breakfast Spot,Pizza Place,Pharmacy,Playground,Bank,Park
23,East York,4,Coffee Shop,Pizza Place,Park,Burger Joint,Pharmacy,Bank,Sporting Goods Shop,Grocery Store,Fast Food Restaurant,Sandwich Place
26,Scarborough,4,Chinese Restaurant,Fast Food Restaurant,Coffee Shop,Bakery,Breakfast Spot,Pizza Place,Pharmacy,Playground,Bank,Park
