## Segmenting and Clustering Neighborhoods in Toronto | Part-1

Importing libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

Using BeautifulSoup to Scrape list of Postal Codes on given Wikipeida page

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_data = requests.get(url)
soup = BeautifulSoup(html_data.text, "html5lib")
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en"><head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"12305fa9-d6fd-4721-a66e-a3fa730c7ffb","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":1032600019,"wgRevisionId":1032600019,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communications in Ontario","

Creating table with three columns and adding data to it from the BeautifulSoup object.
* Only Adding rows which has assigned Borough

In [3]:
column_names = ['Postal Code', 'Borough', 'Neighbourhood']
data = pd.DataFrame(columns = column_names)

for row in soup.find('tbody').find_all('td'):
    if(row.span.text!="Not assigned"):
        pcode = row.p.text[0:3]
        bor = row.span.text.split('(')[0]
        neigh = row.span.text.split("(")[1].strip(")").replace(" /",',').replace(")"," ").strip(" ")
        data = data.append({"Postal Code":pcode,"Borough":bor,"Neighbourhood":neigh}, ignore_index = True)
        

Cleaning the table:
* Assuming that any row which has empty Neighbourhood will only have ''(empty string) value
* Replacing all empty Neighbourhood with value of their Borough

In [4]:
data['Neighbourhood'] = data.apply(lambda x: x['Borough'] if(x['Neighbourhood']=='') else x['Neighbourhood'], axis = 1)

Printing the final table obtained from scraping the webpage

In [5]:
data

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Shape of Obtained Table:

In [6]:
data.shape

(103, 3)

## Segmenting and Clustering Neighborhoods in Toronto | Part-2

Importing geocoder

In [7]:
!pip install geocoder
import geocoder



Defining function for geocoder

In [8]:
def get_latlong(postal_code):
    lat_long_coords = None
    while(lat_long_coords is None):
        g = geocoder.arcgis("{}, Toronto, Ontario".format(postal_code))
        lat_long_coords = g.latlng
    return lat_long_coords

get_latlong("M5A")

[43.65512000000007, -79.36263999999994]

Getting latitude longitude for each postal code

In [9]:
postal_codes = data['Postal Code']
coords = []
for postal_code in postal_codes.tolist():
    coords.append(get_latlong(postal_code))

Coverting into dataframe

In [10]:
data_coords = pd.DataFrame(coords,columns = ['Latitude', 'Longitude'])
data_coords.head()

Unnamed: 0,Latitude,Longitude
0,43.75245,-79.32991
1,43.73057,-79.31306
2,43.65512,-79.36264
3,43.72327,-79.45042
4,43.66253,-79.39188


Merging dataframes

In [11]:
data['Latitude'] = data_coords['Latitude']
data['Longitude'] = data_coords['Longitude']

In [12]:
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188


## Segmenting and Clustering Neighborhoods in Toronto | Part-3

Installing libraries

In [13]:
import folium
import json
import matplotlib.cm as cm
import matplotlib.colors as colors

from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim


In [14]:
address = 'Toronto, Ontario Canada'
geolocator = Nominatim(user_agent="my-app")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [15]:
map_toronto = folium.Map(location = [latitude, longitude], zoom_start=11)
for pincode, lat, long, borough, neighbourhood in zip(data['Postal Code'],data['Latitude'],data['Longitude'],data['Borough'],data['Neighbourhood']):
    label = '{}, {}, {}'.format(neighbourhood,borough,pincode)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
    [lat,long],
    radius=4,
    popup=label,
    color = 'blue',
    fill=True,
    fill_color= "#87cefa",
    fill_opacity=0.5,
    parse_html=False).add_to(map_toronto)
map_toronto

In [16]:

toronto_data = data[data['Borough'].str.contains("Toronto")].reset_index(drop=True)
print(toronto_data.shape)
toronto_data.head()

(39, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
2,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
3,M4E,East Toronto,The Beaches,43.67709,-79.29547
4,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306


In [17]:
map_toronto = folium.Map(location = [latitude, longitude], zoom_start=11)
for pincode, lat, long, borough, neighbourhood in zip(toronto_data['Postal Code'],toronto_data['Latitude'],toronto_data['Longitude'],toronto_data['Borough'],toronto_data['Neighbourhood']):
    label = '{}, {}, {}'.format(neighbourhood,borough,pincode)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
    [lat,long],
    radius=4,
    popup=label,
    color = 'blue',
    fill=True,
    fill_color= "#3186cc",
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
map_toronto

In [18]:
CLIENT_ID = 'DMMBKR5YEYMEYULXZHBF4SOTBIZKA3NGR5ZQ54AIS1OAZYH4' # Put Your Client Id
CLIENT_SECRET = 'IS0NIGDRWTYIKW5VCFTDIROYQAJJI4STFWWVGRUDCI3BBYN2' # Put You Client Secret 
VERSION = '20210719'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: Hidden')
print('CLIENT_SECRET: Hidden')

Your credentails:
CLIENT_ID: Hidden
CLIENT_SECRET: Hidden


#### 1. Exploring Neighbourhoods in Toronto

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
        
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([( name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

In [20]:
toronto_venues = getNearbyVenues(names = toronto_data['Neighbourhood'], latitudes = toronto_data['Latitude'], longitudes = toronto_data['Longitude'])


Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Enclave of M5E
St. James Town, Cabbagetown
First Canadi

In [21]:
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65512,-79.36264,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65512,-79.36264,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65512,-79.36264,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
3,"Regent Park, Harbourfront",43.65512,-79.36264,The Yoga Lounge,43.655515,-79.364955,Yoga Studio
4,"Regent Park, Harbourfront",43.65512,-79.36264,Body Blitz Spa East,43.654735,-79.359874,Spa
...,...,...,...,...,...,...,...
793,Enclave of M4L,43.64869,-79.38544,Canoe,43.647452,-79.381320,Restaurant
794,Enclave of M4L,43.64869,-79.38544,Civello Salon & Spa,43.650020,-79.389400,Salon / Barbershop
795,Enclave of M4L,43.64869,-79.38544,DAVIDsTEA,43.650547,-79.383385,Tea Room
796,Enclave of M4L,43.64869,-79.38544,Ninki Izakaya,43.650228,-79.384863,Japanese Restaurant


In [22]:
toronto_venues.groupby("Neighborhood").count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,30,30,30,30,30,30
"Brockton, Parkdale Village, Exhibition Place",30,30,30,30,30,30
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",30,30,30,30,30,30
Central Bay Street,30,30,30,30,30,30
Christie,10,10,10,10,10,10
Church and Wellesley,30,30,30,30,30,30
"Commerce Court, Victoria Hotel",30,30,30,30,30,30
Davisville,28,28,28,28,28,28
Davisville North,6,6,6,6,6,6
"Dufferin, Dovercourt Village",16,16,16,16,16,16


In [23]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 178 uniques categories.


#### 2. Analyse each Borough Neighbourhood

In [24]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,Berczy Park,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
2,"CN Tower, King and Spadina, Railway Lands, Har...",0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Commerce Court, Victoria Hotel",0.0,0.066667,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Dufferin, Dovercourt Village",0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625


In [26]:
toronto_grouped.shape


(39, 178)

In [27]:
num_top_venues = 5
for neigh in toronto_grouped['Neighborhood']:
    print("----"+neigh+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == neigh].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0        Cocktail Bar  0.07
1  Seafood Restaurant  0.07
2              Bakery  0.07
3            Beer Bar  0.07
4      Farmers Market  0.07


----Brockton, Parkdale Village, Exhibition Place----
                    venue  freq
0                  Bakery  0.07
1               Gift Shop  0.07
2              Restaurant  0.07
3  Furniture / Home Store  0.07
4             Coffee Shop  0.07


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                  venue  freq
0    Italian Restaurant  0.10
1                  Café  0.07
2                  Park  0.07
3  Gym / Fitness Center  0.07
4            Restaurant  0.07


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.10
1               Hotel  0.07
2      Clothing Store  0.07
3                 Spa  0.03
4  Miscellaneous Shop  0.03


----Christie----
           venue  freq
0           Café   0.3
1  Gro

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

In [29]:
import numpy as np
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.shape

(39, 11)

#### 3. Clustering Neighbourhoods

Importing Libraries

In [30]:
from sklearn.cluster import KMeans
import sklearn.cluster.k_means_
km = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1, 
  verbose=True)



Performing KMeans Clustering

In [31]:
kclusters = 10
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=2).fit(toronto_grouped_clustering)
print(kmeans.labels_[0:10])
print(len(kmeans.labels_))

[1 1 1 1 9 1 9 1 1 1]
39


In [32]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = toronto_data
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264,1,Coffee Shop,Breakfast Spot,Yoga Studio,Bakery,Electronics Store,Restaurant,Italian Restaurant,Spa,Pub,Distribution Center
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804,9,Café,Coffee Shop,Clothing Store,Ramen Restaurant,Theater,College Rec Center,Music Venue,Lounge,Pizza Place,Sporting Goods Shop
2,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587,1,Café,Cosmetics Shop,Gastropub,Japanese Restaurant,Coffee Shop,Restaurant,Farmers Market,Italian Restaurant,Bookstore,Diner
3,M4E,East Toronto,The Beaches,43.67709,-79.29547,1,Trail,Health Food Store,Asian Restaurant,Pub,Wine Shop,Discount Store,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306,1,Seafood Restaurant,Farmers Market,Bakery,Cocktail Bar,Beer Bar,Café,Bistro,Breakfast Spot,Jazz Club,Restaurant


Visualizing data

In [33]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'],kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker([lat, lon], radius=5, popup=label, color=rainbow[cluster-1], fill=True, fill_color=rainbow[cluster-1], fill_opacity=0.7).add_to(map_clusters)
map_clusters