## Segmenting and Clustering Neighborhoods in Toronto

### Step 1: importing and cleaning the neighborhoods data in Toronto

In [None]:
!pip install lxml
!pip install bs4

import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

#### Scraping the web page by using the BeautifulSoup package

In [271]:
# Download the Wikipedia page listing the Toronto ("M") postal codes and pull
# out its sortable wikitable.
# (pd.read_html on the same URL is an alternative to the manual scrape below.)
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()  # stop here instead of parsing an error page
website_url = response.text  # NOTE: despite its name this holds the page HTML, not a URL

soup = BeautifulSoup(website_url, 'lxml')
My_table = soup.find('table', {'class': 'wikitable sortable'})
if My_table is None:
    # find() returns None when nothing matches; without this check the next
    # cell fails with an opaque AttributeError on My_table.findAll.
    raise ValueError("No 'wikitable sortable' table found on the page; its layout may have changed")

#### Loading the HTML table into a pandas DataFrame (`canada`)

In [272]:
# Read the three cells of every data row in a single pass over the table
# (the first <tr> is the header and is skipped).
PostalCode, Borough, Neighbourhood = [], [], []
for row in My_table.findAll('tr')[1:]:
    cells = row.findAll('td')
    # Clean the text while reading it: surrounding whitespace is trimmed and
    # line breaks are removed from the neighbourhood cell, so no regex-based
    # clean-up of the finished frame is needed afterwards.
    PostalCode.append(cells[0].text.strip())
    Borough.append(cells[1].text.strip())
    Neighbourhood.append(cells[2].text.replace('\n', '').strip())

canada = pd.DataFrame({'PostalCode': PostalCode, 'Borough': Borough, 'Neighborhood': Neighbourhood})

print(canada.head())
print('\n')
print(canada.describe())
print('\n')
print(canada['Borough'].value_counts())
print('\n')
print("Dataset Shape: ", canada.shape)

  PostalCode           Borough      Neighborhood
0        M1A      Not assigned      Not assigned
1        M2A      Not assigned      Not assigned
2        M3A        North York         Parkwoods
3        M4A        North York  Victoria Village
4        M5A  Downtown Toronto      Harbourfront


       PostalCode       Borough  Neighborhood
count         288           288           288
unique        180            12           209
top           M9V  Not assigned  Not assigned
freq            8            77            78


Not assigned        77
Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64


Dataset Shape:  (288, 3)


#### Cleaning data:
#####    1) Drop the rows whose borough is "Not assigned".
#####    2) Combine rows that share a postal code into a single row, with the neighborhoods separated by commas.
#####    3) If a row has a borough but its neighborhood is "Not assigned", use the borough name as the neighborhood.

In [273]:
# 1) Drop rows without a borough. .copy() gives an independent frame so the
#    assignment below does not write into a slice of the scraped table.
canada = canada[canada['Borough'] != 'Not assigned'].copy()

# 3) A row that has a borough but no neighbourhood takes the borough's name.
#    This runs before the rows are combined so that a 'Not assigned' entry is
#    also replaced when its postal code spans several rows.
unassigned = canada['Neighborhood'] == 'Not assigned'
canada.loc[unassigned, 'Neighborhood'] = canada.loc[unassigned, 'Borough']

# 2) One row per postal code: keep the borough of the first row and join the
#    neighbourhoods with ', '. sort=False keeps the postal codes in their
#    order of first appearance; the result has a fresh 0..n-1 index.
canada = (
    canada
    .groupby('PostalCode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

print(canada.head())
print('\n')
print(canada.describe())
print('\n')
print(canada['Borough'].value_counts())
print('\n')
print("Dataset Shape: ", canada.shape)

  PostalCode           Borough                      Neighborhood
0        M3A        North York                         Parkwoods
1        M4A        North York                  Victoria Village
2        M5A  Downtown Toronto         Harbourfront, Regent Park
3        M6A        North York  Lawrence Heights, Lawrence Manor
4        M7A      Queen's Park                      Queen's Park


       PostalCode     Borough Neighborhood
count         103         103          103
unique        103          11          103
top           M4G  North York    Northwest
freq            1          24            1


North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64


Dataset Shape:  (103, 3)


### Step 2: getting the latitude and the longitude coordinates of each neighborhood

In [None]:
# Download the postal code -> latitude/longitude table and save it as Geospatial_data.csv.
!wget -O Geospatial_data.csv http://cocl.us/Geospatial_data

In [291]:
pd.set_option('display.width',1000)

neigh_loc = pd.read_csv('Geospatial_data.csv')

# Look the coordinates up by postal code instead of scanning the whole lookup
# table once per row. keep='last' mirrors the old nested loop, where the last
# matching row won; postal codes without a match end up as NaN.
coords = neigh_loc.drop_duplicates(subset='Postal Code', keep='last').set_index('Postal Code')
canada['Latitude'] = canada['PostalCode'].map(coords['Latitude'])
canada['Longitude'] = canada['PostalCode'].map(coords['Longitude'])

print("Dataset Shape: ", canada.shape)
print('\n')
canada.head(10)

Dataset Shape:  (103, 5)




Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


### Step 3: exploring and clustering the neighborhoods in Toronto

In [None]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium 
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Create a map of Toronto with neighborhoods superimposed on top.

In [275]:
address = 'Toronto, ON'

# Resolve the city centre; it is used as the initial centre of the map.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a
    # clear message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of `canada`, labelled "<borough>: <neighborhood>"
for lat, lng, borough, neighborhood in zip(canada['Latitude'], canada['Longitude'], canada['Borough'], canada['Neighborhood']):
    label = '{}: {}'.format(borough, neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], radius=5, popup=label, color='blue', fill=True, fill_color='#3186cc',
                        fill_opacity=0.7, parse_html=False).add_to(map_toronto)

# last expression of the cell: renders the map
map_toronto

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Explore Neighborhoods in Toronto

In [276]:
import os

# Foursquare credentials come from the environment so that they are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError right here.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed and should be revoked / rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

# Explore every row of `canada`. To look at a single borough instead, filter
# first, e.g. canada[canada['Borough'] == 'York'].reset_index(drop=True)
borough_toronto =  canada

In [None]:
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like
        Indexed by key. 'categories' is tried first; when that key is
        missing, 'venue.categories' is used instead. The value is expected to
        be a sequence of dicts that each carry a 'name' entry.

    Returns
    -------
    The 'name' of the first category, or None when the category list is empty.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; anything else (including
        # KeyboardInterrupt) propagates instead of being swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [277]:
def getNearbyVenues(names, latitudes, longitudes):
    """Collect the venues Foursquare returns around each given location.

    One request is sent to the v2 ``venues/explore`` endpoint per
    (name, latitude, longitude) triple, using the module-level CLIENT_ID,
    CLIENT_SECRET, VERSION and LIMIT settings.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to explore.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty (but still has these columns) when nothing was found.
        'Venue Category' is missing for venues that list no category.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'limit': LIMIT,
            },
            timeout=30,  # never hang forever on a stalled connection
        )
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']
        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name, lat, lng,
                venue['name'], venue['location']['lat'], venue['location']['lng'],
                # a venue may come without any category
                categories[0]['name'] if categories else None,
            ))
    # Naming the columns in the constructor also works for zero rows, where
    # assigning .columns afterwards raises a length-mismatch error.
    nearby_venues = pd.DataFrame(rows, columns=columns)
    return(nearby_venues)

In [None]:
# Look up the venues around every row of borough_toronto and stack them into one frame.
toronto_venues = getNearbyVenues(
    borough_toronto['Neighborhood'],
    borough_toronto['Latitude'],
    borough_toronto['Longitude'],
)

In [292]:
# Report the size of the venue table (followed by two blank lines), then preview it.
print("Dataset Shape: ", toronto_venues.shape, end='\n\n\n')
toronto_venues.head()

Dataset Shape:  (10165, 7)




Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Donalda Golf & Country Club,43.752816,-79.342741,Golf Course
2,Parkwoods,43.753259,-79.329656,Graydon Hall Manor,43.763923,-79.342961,Event Space
3,Parkwoods,43.753259,-79.329656,Darband Restaurant,43.755194,-79.348498,Middle Eastern Restaurant
4,Parkwoods,43.753259,-79.329656,LCBO,43.757774,-79.314257,Liquor Store


#### Analyze Each Neighborhood

In [293]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

print("Dataset Shape: ", toronto_grouped.shape)
print('\n')
toronto_grouped.head()

Dataset Shape:  (103, 318)




Unnamed: 0,Neighborhood,Zoo Exhibit,ATM,Afghan Restaurant,African Restaurant,...,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.012987,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.01,0.0


In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in `row`, ignoring its first entry.

    The first element of `row` is skipped, the remaining values are sorted in
    descending order, and the index labels of the first `num_top_venues` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [294]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 have their own suffix, every later rank uses 'th'
    # (an explicit test instead of catching the IndexError with a bare except)
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

# Rank the categories of every neighborhood first, then build the frame in one
# step instead of filling an empty frame row by row.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
toronto_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
toronto_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

print("Dataset Shape: ", toronto_venues_sorted.shape)
print('\n')
toronto_venues_sorted.head()

Dataset Shape:  (103, 11)




Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,...,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Sushi Restaurant,Bar,...,Gym,Steakhouse,American Restaurant,Hotel,Asian Restaurant
1,Agincourt,Chinese Restaurant,Coffee Shop,Indian Restaurant,Supermarket,...,Caribbean Restaurant,Pharmacy,Clothing Store,Bubble Tea Shop,Bookstore
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Chinese Restaurant,Vietnamese Restaurant,Coffee Shop,Bubble Tea Shop,...,Korean Restaurant,Sandwich Place,Supermarket,Indian Restaurant,Dessert Shop
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Coffee Shop,Fast Food Restaurant,Caribbean Restaurant,Pharmacy,...,Grocery Store,Chinese Restaurant,Sandwich Place,Sushi Restaurant,Indian Restaurant
4,"Alderwood, Long Branch",Burger Joint,Coffee Shop,Furniture / Home Store,Breakfast Spot,...,Burrito Place,Grocery Store,Seafood Restaurant,Middle Eastern Restaurant,Bakery


In [295]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 3, 3, 1, 3, 0, 3, 3, 0, 2, 3, 0, 2, 3, 2, 2, 2, 1, 1, 0, 2, 2,
       0, 3, 3, 3, 3, 0, 2, 2, 2, 2, 0, 3, 3, 2, 3, 1, 1, 3, 3, 2, 1, 3,
       0, 3, 2, 3, 1, 2, 0, 0, 2, 1, 3, 2, 2, 1, 2, 1, 1, 2, 3, 3, 2, 2,
       2, 3, 2, 3, 2, 0, 1, 2, 3, 0, 2, 2, 1, 2, 3, 1, 3, 0, 0, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 3, 1, 1, 3, 3, 1, 3, 2, 2], dtype=int32)

In [296]:
# add clustering labels
toronto_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_venues_sorted
toronto_merged = borough_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(toronto_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

print("Dataset Shape: ", toronto_merged.shape)
print('\n')
toronto_merged.head() # check the last columns!

Dataset Shape:  (103, 16)




Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,...,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,...,Gym / Fitness Center,Supermarket,Café,Chinese Restaurant,Ice Cream Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,...,Restaurant,Grocery Store,Movie Theater,Liquor Store,Burrito Place
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,...,Bakery,Breakfast Spot,Farmers Market,Italian Restaurant,Thai Restaurant
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,...,Dessert Shop,Grocery Store,Liquor Store,Toy / Game Store,Cosmetics Shop
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,...,Gastropub,Park,Gym,Burrito Place,Falafel Restaurant


#### Cluster Neighborhoods

In [283]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows without a cluster label (NaN after the join) cannot be colored, so they
# are left off the map.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels']):
    # the label column turns into floats as soon as the join produced a NaN
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ': Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # labels start at 0, so they index the palette directly (no "- 1",
        # which only worked through negative-index wraparound for cluster 0)
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### Examine Clusters (cluster 1)

In [285]:
# Rows labelled as cluster 1, showing the column at position 2 and every column from position 5 on.
in_cluster = toronto_merged['Cluster Labels'] == 1
shown_columns = toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]
toronto_merged.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,...,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Islington Avenue,1,Pharmacy,Coffee Shop,Bakery,...,Grocery Store,Liquor Store,Park,Golf Course,Pub
6,"Rouge, Malvern",1,Zoo Exhibit,Coffee Shop,Breakfast Spot,...,Pizza Place,Sandwich Place,Pharmacy,Burger Joint,Caribbean Restaurant
12,"Highland Creek, Rouge Hill, Port Union",1,Zoo Exhibit,Park,Coffee Shop,...,Pub,Mexican Restaurant,Liquor Store,Grocery Store,Gym
18,"Guildwood, Morningside, West Hill",1,Coffee Shop,Pharmacy,Park,...,Indian Restaurant,Grocery Store,Breakfast Spot,Fried Chicken Joint,Pub
22,Woburn,1,Coffee Shop,Indian Restaurant,Pizza Place,...,Park,Bank,Caribbean Restaurant,Fast Food Restaurant,Gym
26,Cedarbrae,1,Coffee Shop,Chinese Restaurant,Pharmacy,...,Caribbean Restaurant,Supermarket,Bookstore,Breakfast Spot,Clothing Store
32,Scarborough Village,1,Coffee Shop,Pharmacy,Fast Food Restaurant,...,Bank,Gym,Pizza Place,Clothing Store,Park
34,"Northwood Park, York University",1,Coffee Shop,Sandwich Place,Grocery Store,...,Middle Eastern Restaurant,Pharmacy,Restaurant,Flea Market,Tennis Stadium
46,Downsview West,1,Coffee Shop,Pizza Place,Vietnamese Restaurant,...,Grocery Store,Beer Store,Bank,Turkish Restaurant,Athletics & Sports
50,Humber Summit,1,Coffee Shop,Fast Food Restaurant,Sandwich Place,...,Burger Joint,Pharmacy,Indian Restaurant,Grocery Store,Steakhouse
