## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# The timeout stops the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP failure here rather than as a parsing error in a later cell.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()

In [3]:
# Parse the raw response bytes into a BeautifulSoup tree using the lxml parser.
soup = BeautifulSoup(res.content,'lxml')

In [4]:
# Keep only the first <table> element found on the page.
table = soup.find_all('table')[0]

In [5]:
# read_html returns a list of DataFrames; only one table is passed in, so take element 0.
# The markup is wrapped in StringIO because newer pandas releases deprecate
# passing a literal HTML string to read_html.
df = pd.read_html(StringIO(str(table)))[0]

In [6]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [7]:
# Keep only the rows whose borough has actually been assigned.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]

In [8]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### If a cell has a borough but a "Not assigned" neighborhood, the neighborhood is set to the borough name.

In [9]:
# Vectorised form of the row-by-row loop: wherever a row has an assigned borough
# but its neighbourhood is still 'Not assigned', copy the borough name over.
missing_neighbourhood = df['Borough'].ne('Not assigned') & df['Neighbourhood'].eq('Not assigned')
df.loc[missing_neighbourhood, 'Neighbourhood'] = df.loc[missing_neighbourhood, 'Borough']

#### More than one neighborhood can exist in one postal code area. Rows that share a postal code will be combined into one row, with the neighborhoods separated by commas.

In [10]:
# Collect every neighbourhood that shares a (Postcode, Borough) pair into one list.
# Series.apply(list) builds the lists directly, so the tuple round trip and the
# deprecated DataFrame.applymap call are no longer needed.
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
      .apply(list)
      .reset_index()
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [11]:
# Join each row's list of neighbourhoods into a single comma-separated string.
# str.join replaces the manual concatenate-then-trim loop and also creates the
# column when the frame happens to be empty.
df['Neighbourhood2'] = df['Neighbourhood'].apply(', '.join)

In [12]:
# Replace the list-valued column with its comma-joined string version,
# giving the string column the original 'Neighbourhood' name.
df = df.drop(columns='Neighbourhood').rename(columns={'Neighbourhood2': 'Neighbourhood'})
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [13]:
df.shape

(103, 3)

### Use Geocoder to get the latitude and longitude of all the postal codes

In [14]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 15.2MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [15]:
import time

import geocoder

MAX_GEOCODE_ATTEMPTS = 5  # give up on a postal code after this many failed lookups


def _geocode_postcode(postcode, max_attempts=MAX_GEOCODE_ATTEMPTS):
    """Return the ArcGIS geocoder's lat/lng pair for a Toronto postal code.

    The lookup is retried up to ``max_attempts`` times with a growing pause
    between attempts. RuntimeError is raised when no attempt yields
    coordinates, so a persistent failure cannot loop forever.
    """
    for attempt in range(1, max_attempts + 1):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postcode))
        lat_lng_coords = g.latlng
        # Treat None and an empty result alike as a failed lookup.
        if lat_lng_coords:
            return lat_lng_coords
        if attempt < max_attempts:
            time.sleep(attempt)
    raise RuntimeError(
        'Could not geocode postal code {} after {} attempts'.format(postcode, max_attempts)
    )


# Look up every postal code once, then attach the coordinates as two new columns.
coordinates = [_geocode_postcode(postcode) for postcode in df['Postcode']]
df['Latitude'] = [coords[0] for coords in coordinates]
df['Longitude'] = [coords[1] for coords in coordinates]

In [16]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.78573,-79.15875
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


### Get the data for boroughs with names containing "Toronto"

In [23]:
# Select the boroughs whose name contains the literal substring "Toronto".
is_toronto_borough = df['Borough'].str.contains('Toronto', regex=False)
df_Toronto = df[is_toronto_borough].reset_index(drop=True)
df_Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676845,-79.295225
1,M4K,East Toronto,"The Danforth West, Riverdale",43.683262,-79.35512
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.667965,-79.314673
3,M4M,East Toronto,Studio District,43.662766,-79.33483
4,M4N,Central Toronto,Lawrence Park,43.72816,-79.387085


In [24]:
df_Toronto.shape

(38, 5)

### Get the geographical coordinates of Toronto and create a map with the neighborhoods shown.

In [35]:
!pip install geopy
from geopy.geocoders import Nominatim

import requests

import matplotlib.cm as cm
import matplotlib.colors as colors

!pip install folium
import folium

from sklearn.cluster import KMeans
print('Libraries imported!')

Libraries imported!


#### Get the geographical coordinates of Toronto

In [36]:
address = "Toronto, Canada"
geolocator = Nominatim(user_agent = "toronto_explorer")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise RuntimeError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto is {}, {}.'.format(latitude, longitude))

The geographical coordinate of Toronto is 43.653963, -79.387207.


#### Create a map with neighborhoods of Toronto shown on the map.

In [56]:
# Base map centred on the Toronto coordinates obtained from the geocoder.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per postal-code area, with a popup reading "(neighbourhoods), borough".
for area in df_Toronto.itertuples(index=False):
    popup_text = "({}), {}".format(area.Neighbourhood, area.Borough)
    marker = folium.CircleMarker(
        [area.Latitude, area.Longitude],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='red',
        fill_opacity=0.5,
        parse_html=True,
    )
    marker.add_to(map_toronto)
map_toronto

### Define Foursquare Credentials and Version

In [57]:
# Foursquare credentials are read from the environment so that the secrets are
# neither stored in the notebook source nor echoed into its saved output.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment '
        'variables before running this notebook.'
    )

# Confirm that the credentials were found without revealing their values.
print('Foursquare credentials loaded from the environment.')

Your credentials:
CLIENT_ID: [redacted]
CLIENT_SECRET: [redacted]


In [58]:
# Build a sample "explore" request for the first Toronto postal-code area
# (radius 500, at most 10 venues).
url = "https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        df_Toronto['Latitude'].iloc[0],
        df_Toronto['Longitude'].iloc[0],
        500,
        10)

# make the GET request; the timeout prevents an indefinite hang and
# raise_for_status() reports an HTTP error before the body is parsed
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5d8809b9a306190039144462'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.68134518750007,
    'lng': -79.28901466400717},
   'sw': {'lat': 43.672345178500066, 'lng': -79.30143533599275}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

### Define a function that fetches up to `limit` venues (default 100) within `radius` meters (default 500) of each neighborhood.

In [83]:
def getNearbyVenues(Borough_Name, Neighbourhood_Name, Lat, Lng, radius = 500, limit = 100, timeout = 30):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    Borough_Name, Neighbourhood_Name, Lat, Lng : iterables
        Borough name, neighbourhood name, latitude and longitude of each
        location. They are walked in parallel with ``zip``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    limit : int, default 100
        Value sent as the ``limit`` query parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns Borough, Neighbourhood,
        Latitude, Longitude, Venue, Venue Latitude, Venue Longitude and
        Venue Category. When no venues are returned the frame is empty but
        still carries these columns. A venue without any category gets
        ``None`` as its Venue Category.

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises when the API answers with
    an HTTP error status.
    """
    columns = [
        'Borough',
        'Neighbourhood',
        'Latitude',
        'Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category'
    ]

    rows = []
    for nn, bn, lat, lng in zip(Neighbourhood_Name, Borough_Name, Lat, Lng):
        print('({})'.format(nn), bn)

        # Let requests build and encode the query string from a dict.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=timeout,
        )
        response.raise_for_status()

        # A successful response without any groups is treated as "no venues".
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                bn,
                nn,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names here keeps the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

#### Get the top 100 venues in each neighborhood within a radius of 500 meters.

In [84]:
# Fetch venues for every Toronto postal-code area, using the function's default radius and limit.
Toronto_venues = getNearbyVenues(
    Borough_Name=df_Toronto['Borough'],
    Neighbourhood_Name=df_Toronto['Neighbourhood'],
    Lat=df_Toronto['Latitude'],
    Lng=df_Toronto['Longitude'],
)

(The Beaches) East Toronto
(The Danforth West, Riverdale) East Toronto
(The Beaches West, India Bazaar) East Toronto
(Studio District) East Toronto
(Lawrence Park) Central Toronto
(Davisville North) Central Toronto
(North Toronto West) Central Toronto
(Davisville) Central Toronto
(Moore Park, Summerhill East) Central Toronto
(Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West) Central Toronto
(Rosedale) Downtown Toronto
(Cabbagetown, St. James Town) Downtown Toronto
(Church and Wellesley) Downtown Toronto
(Harbourfront, Regent Park) Downtown Toronto
(Ryerson, Garden District) Downtown Toronto
(St. James Town) Downtown Toronto
(Berczy Park) Downtown Toronto
(Central Bay Street) Downtown Toronto
(Adelaide, King, Richmond) Downtown Toronto
(Harbourfront East, Toronto Islands, Union Station) Downtown Toronto
(Design Exchange, Toronto Dominion Centre) Downtown Toronto
(Commerce Court, Victoria Hotel) Downtown Toronto
(Roselawn) Central Toronto
(Forest Hill North, Forest Hill 

In [85]:
Toronto_venues.shape

(1744, 8)

In [86]:
Toronto_venues.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,East Toronto,The Beaches,43.676845,-79.295225,Glen Manor Ravine,43.676821,-79.293942,Trail
1,East Toronto,The Beaches,43.676845,-79.295225,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,East Toronto,The Beaches,43.676845,-79.295225,Grover Pub and Grub,43.679181,-79.297215,Pub
3,East Toronto,The Beaches,43.676845,-79.295225,Upper Beaches,43.680563,-79.292869,Neighborhood
4,East Toronto,"The Danforth West, Riverdale",43.683262,-79.35512,Dollarama,43.686197,-79.355989,Discount Store


In [87]:
Toronto_venues.groupby(['Borough','Neighbourhood']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Borough,Neighbourhood,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Central Toronto,Davisville,26,26,26,26,26,26
Central Toronto,Davisville North,8,8,8,8,8,8
Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West",6,6,6,6,6,6
Central Toronto,"Forest Hill North, Forest Hill West",1,1,1,1,1,1
Central Toronto,Lawrence Park,2,2,2,2,2,2
Central Toronto,"Moore Park, Summerhill East",4,4,4,4,4,4
Central Toronto,North Toronto West,4,4,4,4,4,4
Central Toronto,"The Annex, North Midtown, Yorkville",27,27,27,27,27,27
Downtown Toronto,"Adelaide, King, Richmond",100,100,100,100,100,100
Downtown Toronto,Berczy Park,63,63,63,63,63,63


### Find out how many unique categories can be curated from all the returned venues

In [92]:
# Count the distinct venue categories (missing values, if any, count as one category).
category_count = Toronto_venues['Venue Category'].nunique(dropna=False)
print("There are {} unique categories.".format(category_count))

There are 207 unique categories.


### Analyze the neighborhoods

In [90]:
# One-hot encode the venue categories: one indicator column per category.
# The integer dtype is pinned so the indicators stay 0/1 on pandas versions
# whose get_dummies default is boolean.
Toronto_onehot = pd.get_dummies(Toronto_venues['Venue Category'], dtype=np.uint8)

In [91]:
Toronto_onehot.head()

Unnamed: 0,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
# Attach the identifying Borough and Neighbourhood columns to the one-hot frame.
Toronto_onehot = Toronto_onehot.assign(
    Borough=Toronto_venues['Borough'],
    Neighbourhood=Toronto_venues['Neighbourhood'],
)

In [94]:
Toronto_onehot.head()

Unnamed: 0,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio,Borough,Neighbourhood
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,East Toronto,The Beaches
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,East Toronto,The Beaches
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,East Toronto,The Beaches
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,East Toronto,The Beaches
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,East Toronto,"The Danforth West, Riverdale"


In [104]:
# Put the two identifier columns first, followed by every category column in its
# existing order. Selecting them by name (rather than as "the last two columns")
# keeps this cell correct if it is re-run after the reordering has been applied.
id_columns = ['Borough', 'Neighbourhood']
columns = id_columns + [col for col in Toronto_onehot.columns if col not in id_columns]

In [106]:
Toronto_onehot = Toronto_onehot[columns]
Toronto_onehot.head()

Unnamed: 0,Borough,Neighbourhood,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,East Toronto,The Beaches,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,East Toronto,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,East Toronto,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,East Toronto,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,East Toronto,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Group by (Borough, Neighbourhood) and analyze the frequency of occurrence of each category within each (Borough, Neighbourhood)

In [110]:
# Average the indicator columns per (Borough, Neighbourhood); each mean is the
# share of that pair's venues that fall into the category.
Toronto_grouped = Toronto_onehot.groupby(['Borough', 'Neighbourhood'], as_index=False).mean()
Toronto_grouped.head()

Unnamed: 0,Borough,Neighbourhood,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Central Toronto,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Central Toronto,Davisville North,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Toronto,"Forest Hill North, Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
Toronto_grouped.shape

(37, 209)

### Print each neighbourhood along with its top 5 categories

In [151]:
num_top_categories = 5

# Walk the grouped frame one (Borough, Neighbourhood) row at a time instead of
# re-filtering the whole frame for every pair. Building a fresh two-column frame
# per row also avoids assigning into a slice of another frame.
category_freqs = Toronto_grouped.set_index(['Borough', 'Neighbourhood'])
for (bn, nn), freqs in category_freqs.iterrows():
    print("----{}----".format(bn))
    print("----{}----".format(nn))
    temp = pd.DataFrame({
        'categories': freqs.index,
        'freq': freqs.astype(float).round(2).values,
    })
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_categories))
    print("\n")

----Central Toronto----
----Davisville----
           categories  freq
0        Dessert Shop  0.12
1                Café  0.08
2      Sandwich Place  0.08
3         Pizza Place  0.08
4  Italian Restaurant  0.08


----Central Toronto----
----Davisville North----
       categories  freq
0           Hotel  0.12
1            Park  0.12
2  Breakfast Spot  0.12
3  Clothing Store  0.12
4             Gym  0.12


----Central Toronto----
----Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West----
           categories  freq
0  Light Rail Station  0.33
1         Coffee Shop  0.33
2        Liquor Store  0.17
3         Supermarket  0.17
4   Afghan Restaurant  0.00


----Central Toronto----
----Forest Hill North, Forest Hill West----
                  categories  freq
0               Home Service   1.0
1                Pizza Place   0.0
2                Men's Store   0.0
3         Mexican Restaurant   0.0
4  Middle Eastern Restaurant   0.0


----Central Toronto----
----Lawrence Park---

In [143]:
# Scratch walk-through for a single pair: select the Central Toronto / Davisville
# row, transpose it so each original column becomes a row, and move the old
# column labels into a regular column via reset_index().
temp = Toronto_grouped[(Toronto_grouped['Borough']=='Central Toronto') & (Toronto_grouped['Neighbourhood']=='Davisville')].T.reset_index()

In [146]:
temp = temp.iloc[2:]

In [147]:
# Name the two columns: the category label and its frequency.
temp.columns = ['categories', 'freq']

In [148]:
temp.head()

Unnamed: 0,categories,freq
2,Afghan Restaurant,0
3,American Restaurant,0
4,Antique Shop,0
5,Art Gallery,0
6,Art Museum,0


In [None]:
# Show the categories ordered by ascending frequency.
temp.sort_values(by='freq')