# Clustering Neighborhoods - Toronto

## Table of Contents

<div class="alert alert-block alert-info" style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Download and Explore Dataset</a>

2. <a href="#item2">Explore Neighborhoods in Toronto</a>

3. <a href="#item3">Analyze Each Neighborhood</a>

4. <a href="#item4">Cluster Neighborhoods</a>

5. <a href="#item5">Examine Clusters</a>    
</font>
</div>

In [348]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

from bs4 import BeautifulSoup ## to parse wikipage

import geocoder # import geocoder

print('Libraries imported.')

Libraries imported.


## 1. Download and Explore Dataset

In [349]:
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
content = response.content

In [350]:
# Initialize the parser
parser = BeautifulSoup(content, 'html.parser')

# test
body = parser.body

# Get the p tag from the body.
p = body.p

# Print the text inside the p tag.
print(p.text)
title_text = parser.title.text

This is a list of postal codes in Canada where the first letter is M. Postal codes beginning with M are located within the city of Toronto in the province of Ontario. Only the first three characters are listed, corresponding to the Forward Sortation Area.



In [351]:
tables = parser.find_all("table")[0]

In [352]:
table_columns = parser.find_all("tr")[0]
columns= []
for item in table_columns.findAll('th'):
    column = item.text
    columns.append(column)
columns

['Postcode', 'Borough', 'Neighbourhood\n']

In [353]:
# test for one row
row_one = tables.findAll("td")[0:3]
all_rows = []
resp_row = []
for item in row_one:
    curr_item = item.text
    resp_row.append(curr_item)
resp_row

['M1A', 'Not assigned', 'Not assigned\n']

In [354]:
all_rows = []
for i in range(0,10000,3):
    row_one = parser.findAll("td")[i:i+3]
    resp_row = []
    for item in row_one:
        curr_item = item.text
        resp_row.append(curr_item)
    if resp_row[0].startswith('M'):
        all_rows.append(resp_row)
    else:
        break
len(all_rows[0])

3

In [355]:
df = pd.DataFrame(all_rows, columns=['PostalCode', 'Borough', 'Neighborhood'])
df["Neighborhood"] = df["Neighborhood"].str.replace("\n","")
for i, row in df.iterrows():
    if row["Neighborhood"]=="Not assigned":
        df.iloc[i]["Borough"] == row["Borough"]
        
df = df[~(df["Borough"]== "Not assigned")]    

In [356]:
df[df["PostalCode"]=="M5V"]

Unnamed: 0,PostalCode,Borough,Neighborhood
216,M5V,Downtown Toronto,CN Tower
217,M5V,Downtown Toronto,Bathurst Quay
218,M5V,Downtown Toronto,Island airport
219,M5V,Downtown Toronto,Harbourfront West
220,M5V,Downtown Toronto,King and Spadina
221,M5V,Downtown Toronto,Railway Lands
222,M5V,Downtown Toronto,South Niagara


In [357]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [358]:
neighborhoods = df.groupby('PostalCode')["Neighborhood"].apply(lambda x: "{%s}" % ', '.join(x))

In [359]:
grouped_df = pd.concat([df.groupby('PostalCode').agg(sum), neighborhoods], axis=1)
len(grouped_df)

103

In [360]:
new_df = grouped_df.merge(df,how="inner", on="PostalCode").drop_duplicates(subset='PostalCode', keep='first', inplace=False).drop(["Borough_x","Neighborhood_y"], axis=1)

In [361]:
new_df.columns = ['PostalCode', 'Neighborhood_weg', 'Neighborhood', 'Borough']
new_df.columns

Index(['PostalCode', 'Neighborhood_weg', 'Neighborhood', 'Borough'], dtype='object')

In [362]:
new_df.drop("Neighborhood_weg",axis=1, inplace=True)

In [363]:
new_df["Neighborhood"] = new_df["Neighborhood"].str.replace("\{","")
new_df["Neighborhood"] = new_df["Neighborhood"].str.replace("\}","")

In [411]:
new_df.reset_index(inplace=True, drop=True)
new_df.head()

Unnamed: 0,PostalCode,Neighborhood,Borough
0,M1B,"Rouge, Malvern",Scarborough
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
2,M1E,"Guildwood, Morningside, West Hill",Scarborough
3,M1G,Woburn,Scarborough
4,M1H,Cedarbrae,Scarborough


## for the sake of being consistent with the example df I am switching the column order

In [420]:
cols = new_df.columns.tolist()
new_cols = ['PostalCode', 'Borough', 'Neighborhood']
new_df1 = new_df[new_cols]
new_df1.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [365]:
postalcode_df = new_df.copy()

## Now adding latitudes and longitudes for each Postal Code area

In [366]:
toronto_geo = pd.read_csv("Geospatial_Coordinates.csv")
postalcode_df = pd.concat([postalcode_df, toronto_geo], axis=1)
toronto_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [367]:
postalcode_df.head()

Unnamed: 0,PostalCode,Neighborhood,Borough,Postal Code,Latitude,Longitude
0,M1B,"Rouge, Malvern",Scarborough,M1B,43.806686,-79.194353
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,M1C,43.784535,-79.160497
2,M1E,"Guildwood, Morningside, West Hill",Scarborough,M1E,43.763573,-79.188711
3,M1G,Woburn,Scarborough,M1G,43.770992,-79.216917
4,M1H,Cedarbrae,Scarborough,M1H,43.773136,-79.239476


## Now exploring

In [368]:
address = 'Toronto, Ontario'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [369]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(postalcode_df['Latitude'], postalcode_df['Longitude'], postalcode_df['PostalCode']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,).add_to(map_toronto)  
    
map_toronto

### Start gathering cluster features

In [370]:
CLIENT_ID = 'BMMZILCYQCZJE1KR2SJYMHMFLDBQBMRZOXFRN3C3AARRX113'
CLIENT_SECRET= 'C1GY15IRDEV4W0KO2U1FZXMLSE2PH33O50FB1FEVMW0G4BPI'
VERSION = '20180604'
LIMIT = 100
radius = 500
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BMMZILCYQCZJE1KR2SJYMHMFLDBQBMRZOXFRN3C3AARRX113
CLIENT_SECRET:C1GY15IRDEV4W0KO2U1FZXMLSE2PH33O50FB1FEVMW0G4BPI


In [388]:
postalcode_df.head(16)

Unnamed: 0,PostalCode,Neighborhood,Borough,Postal Code,Latitude,Longitude
0,M1B,"Rouge, Malvern",Scarborough,M1B,43.806686,-79.194353
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,M1C,43.784535,-79.160497
2,M1E,"Guildwood, Morningside, West Hill",Scarborough,M1E,43.763573,-79.188711
3,M1G,Woburn,Scarborough,M1G,43.770992,-79.216917
4,M1H,Cedarbrae,Scarborough,M1H,43.773136,-79.239476
5,M1J,Scarborough Village,Scarborough,M1J,43.744734,-79.239476
6,M1K,"East Birchmount Park, Ionview, Kennedy Park",Scarborough,M1K,43.727929,-79.262029
7,M1L,"Clairlea, Golden Mile, Oakridge",Scarborough,M1L,43.711112,-79.284577
8,M1M,"Cliffcrest, Cliffside, Scarborough Village West",Scarborough,M1M,43.716316,-79.239476
9,M1N,"Birch Cliff, Cliffside West",Scarborough,M1N,43.692657,-79.264848


## test for one postalcode area

In [394]:
postalcode_latitude = postalcode_df.loc[14, 'Latitude'] # neighborhood latitude value
postalcode_longitude = postalcode_df.loc[14, 'Longitude'] # neighborhood longitude value

postalcode_name = postalcode_df.loc[14, 'PostalCode'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(postalcode_name, 
                                                               postalcode_latitude, 
                                                               postalcode_longitude))

Latitude and longitude values of M1V are 43.8152522, -79.2845772.


In [395]:
temp_lim = 100
radius=500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    postalcode_latitude, 
    postalcode_longitude, 
    radius, 
    temp_lim)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=BMMZILCYQCZJE1KR2SJYMHMFLDBQBMRZOXFRN3C3AARRX113&client_secret=C1GY15IRDEV4W0KO2U1FZXMLSE2PH33O50FB1FEVMW0G4BPI&v=20180604&ll=43.8152522,-79.2845772&radius=500&limit=100'

In [396]:
results = requests.get(url).json()

In [397]:
results

{'meta': {'code': 200, 'requestId': '5c1f9c19dd579735d528d6dc'},
  'headerLocation': 'Scarborough',
  'headerFullLocation': 'Scarborough',
  'headerLocationGranularity': 'city',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.819752204500006,
    'lng': -79.2783524880255},
   'sw': {'lat': 43.8107521955, 'lng': -79.2908019119745}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4cff35ff574d60fce993466c',
       'name': 'Port Royal Park',
       'location': {'address': '130 Port Royal Trl',
        'crossStreet': 'Bramblebrook Ave',
        'lat': 43.815477,
        'lng': -79.289773,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.815477,
          'lng': -79.289773}],
        'distance': 418,
        'cc': 'CA',
        'city': 'Scarboro

In [393]:
results["response"]["groups"][0]["items"][0]["venue"]

{'id': '4df28ea0e4cda09e6da0129a',
 'name': 'Mr Congee Chinese Cuisine 龍粥記',
 'location': {'address': '2900 Warden Ave',
  'crossStreet': 'at Finch Ave. E',
  'lat': 43.798878792587615,
  'lng': -79.3183345011537,
  'labeledLatLngs': [{'label': 'display',
    'lat': 43.798878792587615,
    'lng': -79.3183345011537}],
  'distance': 72,
  'postalCode': 'M1W 2S8',
  'cc': 'CA',
  'neighborhood': "L'Amoreaux",
  'city': 'Toronto',
  'state': 'ON',
  'country': 'Canada',
  'formattedAddress': ['2900 Warden Ave (at Finch Ave. E)',
   'Toronto ON M1W 2S8',
   'Canada']},
 'categories': [{'id': '4bf58dd8d48988d145941735',
   'name': 'Chinese Restaurant',
   'pluralName': 'Chinese Restaurants',
   'shortName': 'Chinese',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/asian_',
    'suffix': '.png'},
   'primary': True}],
 'photos': {'count': 0, 'groups': []}}

In [398]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        try:    
            # make the GET request
            results = requests.get(url).json()["response"]['groups'][0]['items']

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
      
        except:
            venues_list.append([(
            name, 
            lat, 
            lng, 
            0, 
            0, 
            0,  
            0) for v in results])
        

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  'PostalCode Latitude', 
                  'PostalCode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [399]:
toronto_venues = getNearbyVenues(names=postalcode_df['PostalCode'],
                                   latitudes=postalcode_df['Latitude'],
                                   longitudes=postalcode_df['Longitude']
                                  )

M1B
M1C
M1E
M1G
M1H
M1J
M1K
M1L
M1M
M1N
M1P
M1R
M1S
M1T
M1V
M1W
M1X
M2H
M2J
M2K
M2L
M2M
M2N
M2P
M2R
M3A
M3B
M3C
M3H
M3J
M3K
M3L
M3M
M3N
M4A
M4B
M4C
M4E
M4G
M4H
M4J
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5M
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6A
M6B
M6C
M6E
M6G
M6H
M6J
M6K
M6L
M6M
M6N
M6P
M6R
M6S
M7A
M7R
M7Y
M8V
M8W
M8X
M8Y
M8Z
M9A
M9B
M9C
M9L
M9M
M9N
M9P
M9R
M9V
M9W


In [402]:
toronto_venues.head()

Unnamed: 0,PostalCode,PostalCode Latitude,PostalCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course
2,M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,M1E,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,M1E,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [404]:
len(toronto_venues["PostalCode"].unique())

99

In [405]:
len(postalcode_df["PostalCode"].unique())

103

In [408]:
set(postalcode_df["PostalCode"]) - set(toronto_venues["PostalCode"])

{'M1X', 'M2M', 'M9A', 'M9N'}

## For 4 postalcode areas the API didnt return any venues, I will drop them therefore ('M1X', 'M2M', 'M9A', 'M9N')