# Segmenting and Clustering Neighborhoods in Toronto

### Scrape information from the wikipedia page on Toronto neighborhoods

- use BeautifulSoup for webscrapping the table

import libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

website for scrapping:

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

getting the entire website as lxml:

In [3]:
html_content = requests.get(url)

get text from the website

In [4]:
content = html_content.text
soup = BeautifulSoup(content, 'lxml')

find table:

In [5]:
# find table on the website
table = soup.find("table", attrs = {"class": "wikitable sortable"})

get table headers:

In [6]:
t_headers = []
for th in table.find_all("th"):
    t_headers.append(th.text.replace("\n",' ').strip())

t_headers

['Postal Code', 'Borough', 'Neighbourhood']

get data from table into a dictionary:

In [7]:
table_data = []
for tr in table.tbody.find_all("tr"): # each row in tbody of table is tr
    t_row = {}
    for td, th in zip(tr.find_all("td"), t_headers): # each cell in row is td
        t_row[th] = td.text.replace('\n','').strip()
    table_data.append(t_row)

### Wrangle the data

In [8]:
table_data = table_data[1:]  # remove first row (it is empty)

Convert dictionary into a dataframe:

In [9]:
df_table = pd.DataFrame(table_data)
df_table.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
df_table.shape

(180, 3)

Remove rows where Borough is Not assigned:

In [11]:
df_table = df_table[df_table['Borough']!='Not assigned']
df_table.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
df_table.shape

(103, 3)

Combine rows with same Postal Code/Borough:

In [13]:
df_table = pd.DataFrame(df_table.groupby(['Postal Code','Borough'], sort=False)['Neighbourhood'].apply(', '.join))
df_table.reset_index(inplace=True)
df_table.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [14]:
df_table.shape

(103, 3)

\* seems that there was no repeating Postal Code

For the rows with Not assigned Neighbourhood, apply Borough as the Neighbourhood name:

In [15]:
df_table.loc[df_table['Neighbourhood']=='Not assigned','Neighbourhood'] = df_table.loc[df_table['Neighbourhood']=='Not assigned','Borough']

Number of rows obtained after wrangling of the data:

In [16]:
print('There is {} rows in the dataframe'.format(df_table.shape[0]))

There is 103 rows in the dataframe


## Get spacial coordinates

- obtain spatial coordinates for each neighbourhood

In [17]:
import geocoder

loop through all the rows and get coordinates based on post code:

In [18]:
for idx, address in enumerate(df_table['Postal Code']):
    address_full = address + ', Toronto, Ontario, Canada'
    
    coord = None
    i = 0
    while coord == None or i==20:
        coord = geocoder.arcgis(address_full).latlng
        i+=1
    df_table.loc[idx, 'Latitude'] = coord[0]
    df_table.loc[idx, 'Longitude'] = coord[1]

In [19]:
df_table.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75188,-79.33036
1,M4A,North York,Victoria Village,43.73042,-79.31282
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72321,-79.45141
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66449,-79.39302


### Explore and cluster the neighborhoods in Toronto

Filter the neighbourhoods to get only those located in Toronto city

In [20]:
df_toronto = df_table[df_table['Borough'].str.contains('Toronto')]
df_toronto.reset_index(inplace=True)
df_toronto.drop('index', axis=1, inplace=True)
df_toronto.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66449,-79.39302
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65736,-79.37818
3,M5C,Downtown Toronto,St. James Town,43.65143,-79.37557
4,M4E,East Toronto,The Beaches,43.67703,-79.29542


In [21]:
df_toronto.shape

(39, 5)

Foursquare creditentials

In [22]:
# @hidden_cell
CLIENT_ID = '1M5PQXOKTSFCFWJSNHRQI1X1GBPD1GM0O05QVPOSGMXGGC00' # your Foursquare ID
CLIENT_SECRET = '1GFHFAJFE05LEJ1K3DXRFNHXER3FXPUJ3U2CQOICNBAWPUPU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

Get top 100 venues located within 500m using foursquare's API

In [81]:
LIMIT = 100
RADIUS = 500

toronto_venues = []

for i in range(df_toronto.shape[0]):
    # url request to the foursquare api
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit{}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        df_toronto.loc[i,'Latitude'], 
        df_toronto.loc[i,'Longitude'], 
        RADIUS, 
        LIMIT)
    
    results = requests.get(url).json()['response']['groups'][0]['items']
    
    toronto_venues.append([(
        df_toronto.loc[i,'Postal Code'],
        df_toronto.loc[i,'Borough'],
        df_toronto.loc[i,'Neighbourhood'],
        df_toronto.loc[i,'Latitude'],
        df_toronto.loc[i,'Longitude'], 
        result['venue']['categories'][0]['name']) for result in results])

Convert to dataframe

In [38]:
df_toronto_venues = pd.DataFrame(data=[item for venue_list in toronto_venues for item in venue_list],
                                columns=['Postal Code', 'Borough', 'Neighbourhood','Latitude','Longitude','Venue'])
df_toronto_venues.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265,Bakery
1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265,Coffee Shop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265,Breakfast Spot
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265,Yoga Studio
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265,Spa


In [71]:
df_toronto_venues['Neighbourhood'].value_counts().shape

(38,)

perform one-hot-encoding

In [39]:
# one-hot-encoding for the venues
toronto_onehot = pd.get_dummies(df_toronto_venues[['Venue']], prefix="", prefix_sep="")

# move neighbourhood column to the front
toronto_onehot.insert(0, 'Neighbourhood', df_toronto_venues['Neighbourhood'])

# sum up venue types for each neighbourhood
toronto_group = toronto_onehot.groupby(['Neighbourhood']).mean().reset_index()

toronto_group.head()

Unnamed: 0,Neighbourhood,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,"Brockton, Parkdale Village, Exhibition Place",0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,"Business reply mail Processing Centre, South C...",1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Central Bay Street,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Cluster neighbourhoods based on number of venue types

we use k-means to group neighbourhoods in 4 groups

In [40]:
from sklearn.cluster import KMeans

In [41]:
# get features based on which groups will be made
features = toronto_group.drop(['Neighbourhood'], axis=1)

# create k-means model and fit our data
kclusters = 4
kmeans = KMeans(n_clusters=kclusters, init='k-means++', n_init=20, max_iter=500, random_state=0).fit(features)

create new data frame with assigned cluster and top 3 venue types for each neighbourhood

In [42]:
columns=['Cluster', 'Neighbourhood', 'top_1_venue', 'top_2_venue', 'top_3_venue']
toronto_clustered = pd.DataFrame(columns=columns)
for i in range(features.shape[0]):
    toronto_clustered.loc[i,columns[0]] = kmeans.labels_[i]
    toronto_clustered.loc[i,columns[1]] = toronto_group.loc[i, 'Neighbourhood']
    toronto_clustered.loc[i,columns[2:5]] = features.iloc[i,:].sort_values(ascending=False).index[0:3]

In [43]:
toronto_clustered.head()

Unnamed: 0,Cluster,Neighbourhood,top_1_venue,top_2_venue,top_3_venue
0,0,Berczy Park,Seafood Restaurant,Farmers Market,Cocktail Bar
1,0,"Brockton, Parkdale Village, Exhibition Place",Café,Furniture / Home Store,Gift Shop
2,2,"Business reply mail Processing Centre, South C...",Steakhouse,Hotel,Restaurant
3,0,"CN Tower, King and Spadina, Railway Lands, Har...",Italian Restaurant,Gym / Fitness Center,Restaurant
4,3,Central Bay Street,Coffee Shop,Pizza Place,Clothing Store


add borough and location data

In [83]:
toronto = df_toronto[['Borough','Neighbourhood','Latitude','Longitude']]
toronto = toronto.join(toronto_clustered.set_index('Neighbourhood'), on='Neighbourhood', how='right')
toronto.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster,top_1_venue,top_2_venue,top_3_venue
0,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265,0,Coffee Shop,Breakfast Spot,Bakery
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66449,-79.39302,3,Coffee Shop,Sandwich Place,Park
2,Downtown Toronto,"Garden District, Ryerson",43.65736,-79.37818,3,Café,Coffee Shop,Theater
3,Downtown Toronto,St. James Town,43.65143,-79.37557,0,Café,Coffee Shop,Gastropub
4,East Toronto,The Beaches,43.67703,-79.29542,1,Neighborhood,Trail,Health Food Store


group by clusters

In [84]:
# move cluster column to front
fixed_columns = [toronto.columns[4]] + list(toronto.columns[0:4]) + list(toronto.columns[5:])
toronto_cluster = toronto[fixed_columns]

# group by cluster
toronto_cluster.sort_values(by='Cluster', inplace=True)
toronto_cluster.reset_index(drop=True, inplace=True)
toronto_cluster.head()

Unnamed: 0,Cluster,Borough,Neighbourhood,Latitude,Longitude,top_1_venue,top_2_venue,top_3_venue
0,0,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265,Coffee Shop,Breakfast Spot,Bakery
1,0,West Toronto,"Brockton, Parkdale Village, Exhibition Place",43.63941,-79.42676,Café,Furniture / Home Store,Gift Shop
2,0,Central Toronto,"The Annex, North Midtown, Yorkville",43.67484,-79.40185,Italian Restaurant,Park,Sandwich Place
3,0,West Toronto,"Parkdale, Roncesvalles",43.64785,-79.4502,American Restaurant,Gift Shop,Eastern European Restaurant
4,0,Downtown Toronto,"University of Toronto, Harbord",43.66311,-79.4018,Bakery,Japanese Restaurant,Bookstore


#### Create map showing different clusters

In [85]:
import folium

In [91]:
toronto_map = folium.Map([toronto_cluster['Latitude'].median(), toronto_cluster['Longitude'].median()], zoom_start=12)

colors = ['red', 'blue', 'green', 'magenta']

for i in range(toronto_cluster.shape[0]):
        folium.CircleMarker(
        [toronto_cluster.loc[i, 'Latitude'], toronto_cluster.loc[i, 'Longitude']],
        color = colors[int(toronto_cluster.loc[i, 'Cluster'])],
        radius = 5,
        fill = True,
        fill_color = colors[int(toronto_cluster.loc[i, 'Cluster'])],
        fill_opacity = 0.7,
        popup = toronto_cluster.loc[i, 'Neighbourhood']
    ).add_to(toronto_map)

toronto_map