## Week 3 Assignment - Toronto Neighborhood Segmentation

#### Step 1: Webscraping wikipedia page table

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import folium

In [4]:
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [5]:
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable'}).tbody
rows = table.find_all('tr')
columns = [v.text.replace('\n','') for v in rows[0].find_all('th')]
print(columns)

['Postal Code', 'Borough', 'Neighborhood']


#### Prepare Dataframe to store wikidata

In [6]:
df = pd.DataFrame(columns=columns)

In [7]:
print(df)

Empty DataFrame
Columns: [Postal Code, Borough, Neighborhood]
Index: []


In [8]:
for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) == 3:
        values = [tds[0].text.replace('\n',""),tds[1].text.replace('\n',""),tds[2].text.replace('\n',"")]
    else:
        values = [td]
        
    print(values)
    
    df = df.append(pd.Series(values,index =columns), ignore_index = True)
    print(df)

['M1A', 'Not assigned', 'Not assigned']
  Postal Code       Borough  Neighborhood
0         M1A  Not assigned  Not assigned
['M2A', 'Not assigned', 'Not assigned']
  Postal Code       Borough  Neighborhood
0         M1A  Not assigned  Not assigned
1         M2A  Not assigned  Not assigned
['M3A', 'North York', 'Parkwoods']
  Postal Code       Borough  Neighborhood
0         M1A  Not assigned  Not assigned
1         M2A  Not assigned  Not assigned
2         M3A    North York     Parkwoods
['M4A', 'North York', 'Victoria Village']
  Postal Code       Borough      Neighborhood
0         M1A  Not assigned      Not assigned
1         M2A  Not assigned      Not assigned
2         M3A    North York         Parkwoods
3         M4A    North York  Victoria Village
['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront']
  Postal Code           Borough               Neighborhood
0         M1A      Not assigned               Not assigned
1         M2A      Not assigned               Not assigned
2

In [9]:
df.shape

(180, 3)

#### Remove data from dataframe where Borough is 'Not assigned"

In [10]:
df_borough = df[df['Borough'] != 'Not assigned']

In [11]:
df_borough.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### combine neighbors as mentioned "If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough." 

In [12]:
df1 = df_borough.groupby(['Postal Code','Borough'])['Neighborhood'].apply(lambda x: ', '.join(x.astype(str))).reset_index()

In [13]:
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

for i in range(len(df1.index)):
    if df1.iloc[i,1] == 'Not assigned':
        df1.iloc[i,1] = df1.iloc[i,2]
    else:
        df.iloc[i,1] = df1.iloc[i,1]

In [15]:
df1['Borough'].unique()     

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

In [16]:
df1.shape

(103, 3)

### ---------------------------------------------------------------------------------------------------------------------------

#### Get location attributes for each Postal code and update Dataframe

In [17]:
from geopy import geocoders
from geopy import Nominatim
import pgeocode

In [18]:
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [19]:
locator = pgeocode.Nominatim('CA')
df1['latitude'] = ""
df1['longitude'] = ""

In [20]:
for j in range(0, len(df1.index)):
    df1['latitude'][j] = locator.query_postal_code(df1.iloc[j,0]).latitude
    df1['longitude'][j] = locator.query_postal_code(df1.iloc[j,0]).longitude

In [21]:
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,latitude,longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7878,-79.1564
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.7712,-79.2144
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389


### Part 3 of the Assignment 

#### Select dataframe where Borough contains Toronto

In [22]:
df_toronto = df1[df1['Borough'].str.contains('Toronto')].reset_index()

In [23]:
df_toronto.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood,latitude,longitude
0,37,M4E,East Toronto,The Beaches,43.6784,-79.2941
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.6803,-79.3538
2,42,M4L,East Toronto,"India Bazaar, The Beaches West",43.6693,-79.3155
3,43,M4M,East Toronto,Studio District,43.6561,-79.3406
4,44,M4N,Central Toronto,Lawrence Park,43.7301,-79.3935


In [25]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Canada, CA {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Canada, CA 43.6534817, -79.3839347.


In [26]:
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['latitude'], df_toronto['longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.features.Marker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

#### Define Foursquare Credentials and Version

In [27]:
CLIENT_ID = 'TXCQZQRIL5U3LDKYRB41PTWUFEV3MS01P0ALR2W0SJ0GMFKG' 
CLIENT_SECRET = 'UPL1VIK2AXNXUQJZNEPXSPHVWM2MQ4TDGY33C4QB1S0FAOT0' 
VERSION = '20200615' 

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TXCQZQRIL5U3LDKYRB41PTWUFEV3MS01P0ALR2W0SJ0GMFKG
CLIENT_SECRET:UPL1VIK2AXNXUQJZNEPXSPHVWM2MQ4TDGY33C4QB1S0FAOT0


In [28]:
df_toronto.loc[0, 'Neighborhood']

'The Beaches'

In [29]:
neighborhood_latitude = df_toronto.loc[0, 'latitude'] # neighborhood latitude value
neighborhood_longitude = df_toronto.loc[0, 'longitude'] # neighborhood longitude value

neighborhood_name = df_toronto.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.6784, -79.2941.


In [30]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 3000 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

url 

'https://api.foursquare.com/v2/venues/explore?&client_id=TXCQZQRIL5U3LDKYRB41PTWUFEV3MS01P0ALR2W0SJ0GMFKG&client_secret=UPL1VIK2AXNXUQJZNEPXSPHVWM2MQ4TDGY33C4QB1S0FAOT0&v=20200615&ll=43.6784,-79.2941&radius=3000&limit=100'

In [31]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ee75194c1ce500e4e666d64'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 143,
  'suggestedBounds': {'ne': {'lat': 43.70540002700003,
    'lng': -79.25683701849361},
   'sw': {'lat': 43.651399972999975, 'lng': -79.33136298150639}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
        

In [32]:
from pandas.io.json import _json_normalize

In [33]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [34]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = _json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Beech Tree,Gastropub,43.680493,-79.288846
2,Beaches Bake Shop,Bakery,43.680363,-79.289692
3,The Fox Theatre,Indie Movie Theater,43.672801,-79.287272
4,Tori's Bakeshop,Vegetarian / Vegan Restaurant,43.672114,-79.290331


In [35]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

100 venues were returned by Foursquare.


## Explore Neighborhoods in Toronto

#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [36]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [37]:
Toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['latitude'],
                                   longitudes=df_toronto['longitude']
                                  )

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West,  Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High

In [38]:
print(Toronto_venues.shape)
Toronto_venues.head()

(1529, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.6784,-79.2941,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.6784,-79.2941,Beaches Bake Shop,43.680363,-79.289692,Bakery
2,The Beaches,43.6784,-79.2941,The Beech Tree,43.680493,-79.288846,Gastropub
3,The Beaches,43.6784,-79.2941,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
4,The Beaches,43.6784,-79.2941,Grover Pub and Grub,43.679181,-79.297215,Pub


In [39]:
Toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,89,89,89,89,89,89
"Brockton, Parkdale Village, Exhibition Place",39,39,39,39,39,39
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",14,14,14,14,14,14
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",57,57,57,57,57,57
Central Bay Street,63,63,63,63,63,63
Christie,11,11,11,11,11,11
Church and Wellesley,76,76,76,76,76,76
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,22,22,22,22,22,22
Davisville North,7,7,7,7,7,7


In [40]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 218 uniques categories.


## Analyze Each Neighborhood

In [41]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

In [42]:
Toronto_onehot.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# add neighborhood column back to dataframe
Toronto_onehot['Aa_Neighborhood'] = Toronto_venues['Neighborhood']

In [44]:
Toronto_onehot

Unnamed: 0,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Aa_Neighborhood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,The Beaches
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,The Beaches
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,The Beaches
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,The Beaches
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,The Beaches
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"Business reply mail Processing Centre, South C..."
1525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Business reply mail Processing Centre, South C..."
1526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Business reply mail Processing Centre, South C..."
1527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Business reply mail Processing Centre, South C..."


In [45]:
# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Aa_Neighborhood,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
Toronto_onehot.rename(columns={"Neighborhood": "Aa_Neighborhood"})

Unnamed: 0,Aa_Neighborhood,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,"Business reply mail Processing Centre, South C...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1525,"Business reply mail Processing Centre, South C...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1526,"Business reply mail Processing Centre, South C...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1527,"Business reply mail Processing Centre, South C...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
Toronto_onehot.shape

(1529, 219)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [48]:
Toronto_grouped = Toronto_onehot.groupby(['Aa_Neighborhood']).mean().reset_index()
Toronto_grouped.shape

(39, 219)

In [49]:
num_top_venues = 10

for hood in Toronto_grouped['Aa_Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Aa_Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                 venue  freq
0          Coffee Shop  0.09
1                 Café  0.06
2                Hotel  0.04
3   Seafood Restaurant  0.03
4         Cocktail Bar  0.03
5  Japanese Restaurant  0.03
6           Restaurant  0.03
7             Beer Bar  0.03
8               Bakery  0.03
9          Cheese Shop  0.02


----Brockton, Parkdale Village, Exhibition Place----
                    venue  freq
0                    Café  0.08
1             Coffee Shop  0.08
2  Thrift / Vintage Store  0.05
3               Gift Shop  0.05
4          Breakfast Spot  0.05
5       Accessories Store  0.03
6                   Diner  0.03
7             Supermarket  0.03
8            Chiropractor  0.03
9            Cocktail Bar  0.03


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                venue  freq
0          Restaurant  0.14
1         Coffee Shop  0.14
2  Italian Restaurant  0.14
3         Yoga Studio  0.07
4                Ba

In [50]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [51]:
import numpy as np

In [52]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Aa_Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Aa_Neighborhood'] = Toronto_grouped['Aa_Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Aa_Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Café,Hotel,Cocktail Bar,Restaurant,Japanese Restaurant,Bakery,Beer Bar,Seafood Restaurant,Breakfast Spot
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Thrift / Vintage Store,Gift Shop,Brewery,Sandwich Place,Chiropractor,Restaurant,Cocktail Bar
2,"Business reply mail Processing Centre, South C...",Coffee Shop,Italian Restaurant,Restaurant,Yoga Studio,Bank,Martial Arts Dojo,Breakfast Spot,Bookstore,Sushi Restaurant,Japanese Restaurant
3,"CN Tower, King and Spadina, Railway Lands, Har...",Coffee Shop,Italian Restaurant,Café,Bar,Park,French Restaurant,Restaurant,Bank,Bakery,Speakeasy
4,Central Bay Street,Coffee Shop,Café,Japanese Restaurant,Middle Eastern Restaurant,Bubble Tea Shop,Italian Restaurant,Sandwich Place,Poke Place,Breakfast Spot,Shopping Mall


In [53]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans



## Cluster Neighborhoods

In [54]:
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Aa_Neighborhood', 1)
np.random.seed(32)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [55]:
df_toronto.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood,latitude,longitude
0,37,M4E,East Toronto,The Beaches,43.6784,-79.2941
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.6803,-79.3538
2,42,M4L,East Toronto,"India Bazaar, The Beaches West",43.6693,-79.3155
3,43,M4M,East Toronto,Studio District,43.6561,-79.3406
4,44,M4N,Central Toronto,Lawrence Park,43.7301,-79.3935


In [56]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Aa_Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Café,Hotel,Cocktail Bar,Restaurant,Japanese Restaurant,Bakery,Beer Bar,Seafood Restaurant,Breakfast Spot
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Thrift / Vintage Store,Gift Shop,Brewery,Sandwich Place,Chiropractor,Restaurant,Cocktail Bar
2,"Business reply mail Processing Centre, South C...",Coffee Shop,Italian Restaurant,Restaurant,Yoga Studio,Bank,Martial Arts Dojo,Breakfast Spot,Bookstore,Sushi Restaurant,Japanese Restaurant
3,"CN Tower, King and Spadina, Railway Lands, Har...",Coffee Shop,Italian Restaurant,Café,Bar,Park,French Restaurant,Restaurant,Bank,Bakery,Speakeasy
4,Central Bay Street,Coffee Shop,Café,Japanese Restaurant,Middle Eastern Restaurant,Bubble Tea Shop,Italian Restaurant,Sandwich Place,Poke Place,Breakfast Spot,Shopping Mall


### Based on the above exploratory analysis, it can be seen that in most of the neighborhoods, coffee shops and Restaurants are the most common venues, however in some of the neighbours parks, studios and home services are also seen. Based on these insights, I selected 5 clusters to get a more clear observation

In [59]:
#neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop('Cluster Labels',axis = 1)
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Aa_Neighborhood'), on='Neighborhood')

Toronto_merged.head() # check the last columns!

Unnamed: 0,index,Postal Code,Borough,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,37,M4E,East Toronto,The Beaches,43.6784,-79.2941,1,Pub,Trail,Health Food Store,Gastropub,Bakery,Neighborhood,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Yoga Studio
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.6803,-79.3538,1,Greek Restaurant,Ice Cream Shop,Italian Restaurant,Café,Restaurant,Yoga Studio,Dessert Shop,Bubble Tea Shop,Spa,Cocktail Bar
2,42,M4L,East Toronto,"India Bazaar, The Beaches West",43.6693,-79.3155,1,Fast Food Restaurant,Restaurant,Sandwich Place,Italian Restaurant,Park,Gym,Pizza Place,Movie Theater,Pub,Liquor Store
3,43,M4M,East Toronto,Studio District,43.6561,-79.3406,1,Performing Arts Venue,Gym,Garden Center,Baseball Field,Diner,Coffee Shop,Coworking Space,Park,Dance Studio,Fast Food Restaurant
4,44,M4N,Central Toronto,Lawrence Park,43.7301,-79.3935,3,Photography Studio,Park,Dog Run,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


In [60]:
Toronto_merged.shape

(39, 17)

### Map Clusters

In [61]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
map_clusters

In [62]:
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['latitude'], Toronto_merged['longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Investigate Clusters

#### Cluster 1 - Showing neighbourhoods which are more inclined towards parks, gym, playgrounds

In [63]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Postal Code,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,M4R,-79.4065,0,Playground,Gym Pool,Park,Garden,Yoga Studio,Doner Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space
8,M4T,-79.3853,0,Park,Gym,Grocery Store,Thai Restaurant,Yoga Studio,Doner Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant
10,M4W,-79.373,0,Park,Playground,Grocery Store,Candy Store,Yoga Studio,Doner Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


#### Cluster 2 - More of the leisure neighborhood with common venues as Pubs, Cafe, Restaurants

In [64]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Postal Code,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,-79.2941,1,Pub,Trail,Health Food Store,Gastropub,Bakery,Neighborhood,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Yoga Studio
1,M4K,-79.3538,1,Greek Restaurant,Ice Cream Shop,Italian Restaurant,Café,Restaurant,Yoga Studio,Dessert Shop,Bubble Tea Shop,Spa,Cocktail Bar
2,M4L,-79.3155,1,Fast Food Restaurant,Restaurant,Sandwich Place,Italian Restaurant,Park,Gym,Pizza Place,Movie Theater,Pub,Liquor Store
3,M4M,-79.3406,1,Performing Arts Venue,Gym,Garden Center,Baseball Field,Diner,Coffee Shop,Coworking Space,Park,Dance Studio,Fast Food Restaurant
5,M4P,-79.3887,1,Food & Drink Shop,Dog Run,Park,Gym / Fitness Center,Gym,Breakfast Spot,Department Store,Event Space,Eastern European Restaurant,Electronics Store
7,M4S,-79.3853,1,Coffee Shop,Dessert Shop,Sandwich Place,Italian Restaurant,Café,Gas Station,Indian Restaurant,Sushi Restaurant,Park,Gym
9,M4V,-79.4025,1,Light Rail Station,Coffee Shop,Liquor Store,Supermarket,Yoga Studio,Donut Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant
11,M4X,-79.3689,1,Coffee Shop,Café,Restaurant,Pizza Place,Italian Restaurant,Bakery,Park,Chinese Restaurant,Pub,Liquor Store
12,M4Y,-79.383,1,Japanese Restaurant,Coffee Shop,Sushi Restaurant,Restaurant,Gay Bar,Yoga Studio,Bubble Tea Shop,Café,Men's Store,Mediterranean Restaurant
13,M5A,-79.3626,1,Coffee Shop,Restaurant,Breakfast Spot,Yoga Studio,Theater,Distribution Center,Pub,Electronics Store,Event Space,Food Truck


#### Cluster 3 - Professions or offices visited most in the neighborhood

In [285]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Central Toronto,2,Lawyer,Trail,Yoga Studio,Donut Shop,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space


#### Cluster 4 neighborhoods with most common interests towards hobbies and health

In [286]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 3, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,3,Photography Studio,Park,Dog Run,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
34,West Toronto,3,Park,Yoga Studio,Doner Restaurant,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


#### Cluster 5 Neighborhood with Home Services and Yoga 

In [287]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 4, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,4,Home Service,Yoga Studio,Doner Restaurant,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant
