Import necessary libraries e.g. BeautifulSoup, Numpy, Pandas, and Requests.

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
import geocoder

Create Requests and BeautifulSoup objects, find the desired table, and convert the table into a dataframe.

In [2]:
req = requests.get("https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1012118802")
soup = BeautifulSoup(req.content, 'lxml')
table = soup.find_all("table")[0]
torontodf = pd.read_html(str(table))
toronto_neighbourhoods = pd.DataFrame(torontodf[0])

In [3]:
toronto_neighbourhoods.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


Check the shape of the toronto_neighbourhoods dataframe.

In [4]:
print(toronto_neighbourhoods.shape)

(180, 3)


List indexes wherein the column "Borough" or "Neighbourhood" have the values of "Not assigned" and wherein the column "Borough" has the value of "Mississauga". These columns are about to be dropped since they don't contain the desired values. Also Mississauga is not part of Toronto, so we'll drop it.

In [5]:
indexes = toronto_neighbourhoods[(toronto_neighbourhoods["Borough"] == "Not assigned") \
                                | (toronto_neighbourhoods["Neighbourhood"] == "Not assigned") \
                                | (toronto_neighbourhoods["Borough"] == "Mississauga")].index
print(indexes)

Int64Index([  0,   1,   7,  10,  15,  16,  19,  24,  25,  28,  29,  33,  34,
             35,  37,  38,  42,  43,  44,  51,  52,  53,  60,  61,  62,  69,
             70,  71,  78,  79,  87,  88,  96,  97, 101, 105, 106, 110, 114,
            115, 118, 119, 123, 124, 125, 127, 128, 131, 132, 133, 134, 136,
            137, 140, 141, 145, 146, 149, 150, 154, 155, 158, 159, 161, 162,
            163, 164, 166, 167, 170, 171, 172, 173, 174, 175, 176, 177, 179],
           dtype='int64')


Drop rows where either "Borough" or "Neighbourhood" equals "Not assigned".

In [6]:
toronto_neighbourhoods.drop(indexes , inplace=True)
toronto_neighbourhoods

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Check again the shape of the new toronto_neighbourhoods dataframe. The value of the rows should have changed.

In [7]:
print(toronto_neighbourhoods.shape)

(102, 3)


Let's check how many boroughs and neighbourhoods there are in the dataframe.

In [8]:
print(toronto_neighbourhoods["Borough"].unique())

['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Toronto/York']


In [9]:
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(
        len(toronto_neighbourhoods['Borough'].unique()),
        toronto_neighbourhoods.shape[0]
    )
)

The dataframe has 10 boroughs and 102 neighbourhoods.


Let's find out the geographical coordinate of the City of Toronto.

In [10]:
import geocoder
from geopy.geocoders import Nominatim

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Load the dataframe containing geographical coordinates of Toronto's neighbourhoods.

In [12]:
toronto_neighbourhood_lat_lon = pd.read_csv("Geospatial_Coordinates.csv")
toronto_neighbourhood_lat_lon.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Let's merge the toronto_neighbourhood_lat_lon dataframe with toronto_neighbourhoods to see which boroughs the above latitudes and longitudes belong to.

In [20]:
# Create the duplicate of toronto_neighbourhoods
toronto_neighbourhoods_dup = toronto_neighbourhoods

# Merge toronto_neighbourhoods_dup
toronto_neighbourhoods_dup = toronto_neighbourhoods_dup.join(toronto_neighbourhood_lat_lon.set_index('Postal Code'), \
                                                             on='Postal Code')
toronto_neighbourhoods_dup.reset_index(inplace = True)
toronto_neighbourhoods_dup.drop(["index"], axis = 1, inplace = True)
toronto_neighbourhoods_dup.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


Create a map indicating each neighbourhood of Toronto using Folium.

In [21]:
import folium

# create map of Toronto using latitude and longitude values
map_toronto_neighbourhoods = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_neighbourhoods_dup['Latitude'], toronto_neighbourhoods_dup['Longitude'], \
                           toronto_neighbourhoods_dup['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='purple',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_neighbourhoods)  
    
map_toronto_neighbourhoods

Define Foursquare credentials and version.

In [15]:
CLIENT_ID = 'M13N0HS0Z5WTCFNYALI0CNL4ER421HJWOHT4WCR1TIP2NLPM' # Foursquare ID
CLIENT_SECRET = '1JEB5SJDPRQKGJ0JDSCOAZVD0BZES4TKIF13IS0HIHFF24GU' # Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: M13N0HS0Z5WTCFNYALI0CNL4ER421HJWOHT4WCR1TIP2NLPM
CLIENT_SECRET:1JEB5SJDPRQKGJ0JDSCOAZVD0BZES4TKIF13IS0HIHFF24GU


Let's explore the first neighbourhood in toronto_neighbourhoods_dup dataframe and its geographical coordinate.

In [24]:
neighbourhood_latitude = toronto_neighbourhoods_dup.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = toronto_neighbourhoods_dup.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = toronto_neighbourhoods_dup.loc[0, 'Neighbourhood'] # neighborhood name

print(toronto_neighbourhoods_dup.loc[0, 'Neighbourhood'])
print('Latitude and longitude values of {} are {}, {}.'.format(toronto_neighbourhoods_dup.loc[0, 'Neighbourhood'], 
                                                               toronto_neighbourhoods_dup.loc[0, 'Latitude'], 
                                                               toronto_neighbourhoods_dup.loc[0, 'Longitude']))

Parkwoods
Latitude and longitude values of Parkwoods are 43.7532586, -79.3296565.


Let's get the top 100 venues that are located in Parkwoods within a radius of 500 metres.

In [25]:
# type your answer here

radius = 500
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighbourhood_latitude, neighbourhood_longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/search?client_id=M13N0HS0Z5WTCFNYALI0CNL4ER421HJWOHT4WCR1TIP2NLPM&client_secret=1JEB5SJDPRQKGJ0JDSCOAZVD0BZES4TKIF13IS0HIHFF24GU&ll=43.7532586,-79.3296565&v=20180605&radius=500&limit=100'

Import data from the above URL.

In [26]:
results_data = requests.get(url).json()
results_data

{'meta': {'code': 200, 'requestId': '60bc899fea5de36500156cc4'},
 'response': {'venues': [{'id': '4e8d9dcdd5fbbbb6b3003c7b',
    'name': 'Brookbanks Park',
    'location': {'address': 'Toronto',
     'lat': 43.751976046055574,
     'lng': -79.33214044722958,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.751976046055574,
       'lng': -79.33214044722958}],
     'distance': 245,
     'cc': 'CA',
     'city': 'Toronto',
     'state': 'ON',
     'country': 'Canada',
     'formattedAddress': ['Toronto', 'Toronto ON', 'Canada']},
    'categories': [{'id': '4bf58dd8d48988d163941735',
      'name': 'Park',
      'pluralName': 'Parks',
      'shortName': 'Park',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/park_',
       'suffix': '.png'},
      'primary': True}],
    'venuePage': {'id': '600917367'},
    'referralId': 'v-1622968735',
    'hasPerk': False},
   {'id': '4dcc586845dd853165f01864',
    'name': 'Tailor Made',
    'location': {'lat':

Create a function that returns the types of categories of venues.

In [29]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Clean the json file and restructure it into a dataframe.

In [33]:
import json

venues = results_data['response']['venues']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON
# nearby_venues.head(10)

# # filter columns
# filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
filtered_columns = ['name', 'categories', 'location.lat', 'location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# # filter the category for each row
nearby_venues['categories'] = nearby_venues.apply(get_category_type, axis=1)
# nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Tailor Made,Laundry Service,43.741513,-79.319707
2,17 Brookbanks Drive,Residential Building (Apartment / Condo),43.752266,-79.332322
3,Subway,Sandwich Place,43.760334,-79.326906
4,Shoppers Drug Mart,Pharmacy,43.754171,-79.358057
5,Allwyn's Bakery,Caribbean Restaurant,43.75984,-79.324719
6,Pheasant Run Golf Course,Golf Course,43.758386,-79.337191
7,Tommy,,43.746229,-79.328201
8,Mrs. Claus' Sweatshop,Public Art,43.7538,-79.319582
9,Broadlands Skating Rink,Skating Rink,43.746689,-79.322678


How many venues returned by Foursquare?

In [34]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

94 venues were returned by Foursquare.


Let's repeat the above process for all neighbourhoods in Toronto.

In [36]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # results = requests.get(url).json()['response']['venues']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Let's create a dataframe containing venues data in all neighbourhoods of Toronto using the above function.

In [37]:
toronto_venues = getNearbyVenues(names = toronto_neighbourhoods_dup['Neighbourhood'], \
                                 latitudes = toronto_neighbourhoods_dup['Latitude'], \
                                 longitudes = toronto_neighbourhoods_dup['Longitude'])
toronto_venues.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


Check the size of the returned dataframe toronto_venues.

In [38]:
print(toronto_venues.shape)

(1972, 7)


How many venues returned for each neighbourhood?

In [49]:
toronto_venues[["Neighborhood", "Venue"]].groupby('Neighborhood').count()

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
Agincourt,4
"Alderwood, Long Branch",6
"Bathurst Manor, Wilson Heights, Downsview North",15
Bayview Village,4
"Bedford Park, Lawrence Manor East",25
...,...
"Willowdale, Willowdale East",29
"Willowdale, Willowdale West",6
Woburn,4
Woodbine Heights,6


Let's start analyzing each neighbourhood.

We would like to perform one-hot encoding to help in categorization.

In [54]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
columnslist = list(toronto_onehot.columns.values)
columnslist.pop(columnslist.index('Neighborhood'))

toronto_onehot = toronto_onehot[['Neighborhood'] + columnslist]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Check the shape of toronto_onehot.

In [55]:
print(toronto_onehot.shape)

(1972, 257)


Let's group rows by neighbourhood and see the mean of the frequency of occurence of each category to measure occurrence density.

In [56]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,"Willowdale, Willowdale East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.000000,0.034483,0.0,0.0,0.0,0.0,0.0,0.0
91,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
92,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
93,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.0,0.0,0.166667,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


Let's see the shape of the above grouped dataframe.

In [57]:
print(toronto_grouped.shape)

(95, 257)


Let's see the top 5 most common venues in each neighbourhood.

In [58]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0             Clothing Store  0.25
1  Latin American Restaurant  0.25
2             Breakfast Spot  0.25
3                     Lounge  0.25
4        Moroccan Restaurant  0.00


----Alderwood, Long Branch----
            venue  freq
0     Pizza Place  0.33
1    Skating Rink  0.17
2             Pub  0.17
3     Coffee Shop  0.17
4  Sandwich Place  0.17


----Bathurst Manor, Wilson Heights, Downsview North----
              venue  freq
0              Bank  0.13
1       Coffee Shop  0.13
2     Shopping Mall  0.07
3  Sushi Restaurant  0.07
4       Gas Station  0.07


----Bayview Village----
                 venue  freq
0                 Café  0.25
1                 Bank  0.25
2  Japanese Restaurant  0.25
3   Chinese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park, Lawrence Manor East----
                     venue  freq
0              Coffee Shop  0.08
1               Restaurant  0.08
2           Sandwich Place  0.08
3  Co


----Malvern, Rouge----
                      venue  freq
0      Fast Food Restaurant   1.0
1            History Museum   0.0
2       Martial Arts School   0.0
3            Medical Center   0.0
4  Mediterranean Restaurant   0.0


----Milliken, Agincourt North, Steeles East, L'Amoreaux East----
               venue  freq
0         Playground  0.25
1       Intersection  0.25
2               Park  0.25
3   Asian Restaurant  0.25
4  Accessories Store  0.00


----Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West----
             venue  freq
0    Grocery Store  0.08
1              Gym  0.08
2    Tanning Salon  0.08
3           Bakery  0.08
4  Supplement Shop  0.08


----Moore Park, Summerhill East----
                venue  freq
0          Playground  0.33
1                Park  0.33
2          Restaurant  0.33
3   Accessories Store  0.00
4  Miscellaneous Shop  0.00


----New Toronto, Mimico South, Humber Bay Shores----
                  venue  fr

4                Pizza Place  0.07


----Willowdale, Willowdale West----
           venue  freq
0    Pizza Place  0.17
1    Coffee Shop  0.17
2    Supermarket  0.17
3        Butcher  0.17
4  Grocery Store  0.17


----Woburn----
                   venue  freq
0            Coffee Shop  0.50
1           Soccer Field  0.25
2  Korean BBQ Restaurant  0.25
3      Accessories Store  0.00
4      Mobile Phone Shop  0.00


----Woodbine Heights----
                venue  freq
0        Skating Rink  0.33
1                Park  0.17
2          Beer Store  0.17
3  Athletics & Sports  0.17
4         Video Store  0.17


----York Mills West----
                 venue  freq
0                 Park   0.5
1    Convenience Store   0.5
2    Accessories Store   0.0
3   Miscellaneous Shop   0.0
4  Moroccan Restaurant   0.0




Return the above data into a dataframe.

In [59]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [91]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(10)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Clothing Store,Latin American Restaurant,Breakfast Spot,Lounge,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Mobile Phone Shop,Accessories Store
1,"Alderwood, Long Branch",Pizza Place,Skating Rink,Pub,Coffee Shop,Sandwich Place,Mexican Restaurant,Metro Station,Men's Store,Motel,Miscellaneous Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Shopping Mall,Sushi Restaurant,Gas Station,Sandwich Place,Diner,Grocery Store,Pharmacy,Ice Cream Shop
3,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Accessories Store,Museum,Motel,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Restaurant,Sandwich Place,Comfort Food Restaurant,Toy / Game Store,Pharmacy,Pizza Place,Pub,Café,Butcher
5,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Sandwich Place,Seafood Restaurant,Beer Bar,Farmers Market,Vegetarian / Vegan Restaurant,Butcher,Beach
6,"Birch Cliff, Cliffside West",College Stadium,Café,General Entertainment,Skating Rink,Mobile Phone Shop,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop
7,"Brockton, Parkdale Village, Exhibition Place",Bakery,Breakfast Spot,Café,Sandwich Place,Coffee Shop,Japanese Restaurant,Stadium,Furniture / Home Store,Climbing Gym,Bar
8,"Business reply mail Processing Centre, South C...",Yoga Studio,Auto Workshop,Gym / Fitness Center,Garden Center,Garden,Fast Food Restaurant,Farmers Market,Park,Comic Shop,Restaurant
9,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Plane,Harbor / Marina,Bar,Sculpture Garden,Coffee Shop,Rental Car Location,Boat or Ferry


Now, let's try clustering the neighbourhoods into nine clusters using K-Means clustering.

In [92]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 9

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Create a new dataframe that includes the clusters and the top 10 venues of each neighbourhood.

In [93]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_neighbourhoods_dup

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.merge(neighborhoods_venues_sorted.set_index('Neighborhood'), left_on='Neighbourhood', \
                                     right_on = 'Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,7,Fast Food Restaurant,Park,Food & Drink Shop,Moroccan Restaurant,Martial Arts School,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Hockey Arena,Financial or Legal Service,Coffee Shop,Portuguese Restaurant,Accessories Store,Miscellaneous Shop,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Bakery,Park,Pub,Café,Historic Site,Chocolate Shop,Beer Store,Breakfast Spot,Spa
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Clothing Store,Furniture / Home Store,Coffee Shop,Boutique,Vietnamese Restaurant,Miscellaneous Shop,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Burrito Place,Yoga Studio,Theater,Park,Diner,College Auditorium,Salad Place,Sandwich Place


Let's visualize the resulting clusters.

In [94]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], \
                                  toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters