# Segmenting and clustering neighborhoods in Toronto (Coursera Capstone week 3 project)

## Section 1: scraping data from wikipedia

In [1]:
import pandas as pd
import numpy as np
import urllib.request 
from bs4 import BeautifulSoup as BS

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Parse inside a context manager so the HTTP response is closed as soon as
# the markup has been read.
with urllib.request.urlopen(url) as page:
    soup = BS(page, 'lxml')

# Preview only the beginning of the document; printing the whole page floods
# the notebook output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"0906c2a3-efbb-4789-961b-f63b6aed2219","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":960187814,"wgRevisionId":960187814,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toron

Use Beautiful Soup to scrape the table: each table row is a `tr` element, and each data cell within a row is a `td` element.

After transforming the data into a dataframe, drop all rows where the Borough is 'Not assigned', and replace any Neighborhood that is 'Not assigned' with its respective Borough.

From the table on Wikipedia we can see that all postal codes are unique, hence there is no need to process the data further.

In [3]:
postal_codes = []
boroughs = []
neighborhoods = []

right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")

for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three data cells are kept (header rows use <th>).
    if len(cells) == 3:
        # Strip surrounding whitespace explicitly rather than slicing off the
        # last character, which silently corrupts values without a trailing newline.
        postal_codes.append(cells[0].get_text().strip())
        boroughs.append(cells[1].get_text().strip())
        neighborhoods.append(cells[2].get_text().strip())

df = pd.DataFrame({
    'PostalCode': postal_codes,
    'Borough': boroughs,
    'Neighborhood': neighborhoods,
})

# Drop postal codes that have no borough, then renumber the rows.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

# A neighbourhood that is 'Not assigned' takes the name of its borough.
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
# (rows, columns) of the cleaned postal-code table.
df.shape

(103, 3)

## Section 2: adding geospatial data

In [5]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [6]:
import geocoder

In [7]:

#lat_hold = []
#long_hold = []
#for index in df.index:
#    coord = None
#    while coord == None:
#        g = geocoder.google('{}, Toronto, Ontario'.format(df.loc[index,'PostalCode']))
#        coord = g.latlng
#    lat_hold.append(coord[0])
#    long_hold.append(coord[1])
#df['Latitude'] = lat_hold
#df['Longitude'] = long_hold
#df


The geocoder package takes too long and was unable to return results, so the provided CSV file is used instead.

In [8]:
# Read the coordinate table, replacing the file's header row with the column
# names used throughout this notebook.
df_coord = pd.read_csv(
    'http://cocl.us/Geospatial_data',
    header=0,
    names=['PostalCode', 'Latitude', 'Longitude'],
)

In [9]:
# Build a PostalCode -> coordinates lookup. Keeping only the first row per
# postal code mirrors a first-match search and guarantees a unique index.
coord_lookup = df_coord.drop_duplicates('PostalCode').set_index('PostalCode')

# Vectorised lookup; postal codes missing from the lookup are left as NaN.
df['Latitude'] = df['PostalCode'].map(coord_lookup['Latitude'])
df['Longitude'] = df['PostalCode'].map(coord_lookup['Longitude'])
df


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Section 3: Exploring and clustering neighborhoods in Toronto

In [10]:
pip install folium

Note: you may need to restart the kernel to use updated packages.


In [11]:
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [12]:
import os
import warnings

# Credentials are read from the environment so that secrets are never stored
# in (or committed with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been
# published and should be revoked/rotated.
client_id = os.environ.get('FOURSQUARE_CLIENT_ID', '')
client_secret = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
version = '20200606'

if not (client_id and client_secret):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare API requests will be rejected.'
    )

### Drop any rows that are not in Toronto

In [13]:
# Keep only the boroughs whose name contains the literal text 'Toronto'.
# A boolean mask replaces the row-by-row in-place drop; rows with a missing
# borough are treated as "not Toronto" instead of raising.
in_toronto = df['Borough'].str.contains('Toronto', regex=False, na=False)
df = df[in_toronto].reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [14]:
import requests

limit = 100   # maximum number of venues requested per neighbourhood
radius = 500  # module-level search radius; getNearbyVenues has its own `radius` argument

# Column layout of the frame returned by getNearbyVenues.
VENUE_COLUMNS = ['Neighborhood',
                 'Neighborhood Latitude',
                 'Neighborhood Longitude',
                 'Venue',
                 'Venue Latitude',
                 'Venue Longitude',
                 'Venue Category']


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare `venues/explore` endpoint once per neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighbourhood name and the coordinates to search around.
    radius : int, default 500
        Value sent as the `radius` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns in ``VENUE_COLUMNS``.
        The frame is empty (but still has those columns) when no venue is
        returned for any neighbourhood.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level ``client_id``, ``client_secret``, ``version`` and
    ``limit``.
    """
    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': client_id,
                'client_secret': client_secret,
                'v': version,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # Fail loudly on HTTP errors instead of hitting an opaque KeyError below.
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            # Guard against a venue that carries an empty category list.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact even
    # when no rows were collected.
    return pd.DataFrame(venue_rows, columns=VENUE_COLUMNS)

In [15]:
# Fetch the nearby venues for every neighbourhood currently in `df`.
Toronto_venues = getNearbyVenues(
    names=df['Neighborhood'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West,  Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport


### Perform one hot encoding for all different venue categories

In [16]:
# Indicator columns, one per venue category (prefixed with 'Venue Category_').
category_dummies = pd.get_dummies(Toronto_venues[['Venue Category']])

# Put the neighbourhood name in front of the indicator columns.
Toronto_onehot = pd.concat([Toronto_venues['Neighborhood'], category_dummies], axis=1)
Toronto_onehot.head()

Unnamed: 0,Neighborhood,Venue Category_Afghan Restaurant,Venue Category_Airport,Venue Category_Airport Food Court,Venue Category_Airport Gate,Venue Category_Airport Lounge,Venue Category_Airport Service,Venue Category_Airport Terminal,Venue Category_American Restaurant,Venue Category_Antique Shop,...,Venue Category_Trail,Venue Category_Train Station,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Video Game Store,Venue Category_Vietnamese Restaurant,Venue Category_Wine Bar,Venue Category_Wine Shop,Venue Category_Wings Joint,Venue Category_Women's Store,Venue Category_Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Mean of each indicator column per neighbourhood, i.e. the share of that
# neighbourhood's venues falling in each category.
Toronto_grouped = Toronto_onehot.groupby('Neighborhood', as_index=False).mean()
Toronto_grouped

Unnamed: 0,Neighborhood,Venue Category_Afghan Restaurant,Venue Category_Airport,Venue Category_Airport Food Court,Venue Category_Airport Gate,Venue Category_Airport Lounge,Venue Category_Airport Service,Venue Category_Airport Terminal,Venue Category_American Restaurant,Venue Category_Antique Shop,...,Venue Category_Trail,Venue Category_Train Station,Venue Category_Vegetarian / Vegan Restaurant,Venue Category_Video Game Store,Venue Category_Vietnamese Restaurant,Venue Category_Wine Bar,Venue Category_Wine Shop,Venue Category_Wings Joint,Venue Category_Women's Store,Venue Category_Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.015385,0.0,0.0,0.015385,0.0,0.0,0.0,0.015385
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.012821,0.0,0.025641
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Use KMeans to cluster all different neighborhoods

In [18]:
# Feature matrix for clustering: every column except the neighbourhood name.
Toronto_grouped_clustering = Toronto_grouped.drop(columns='Neighborhood')

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it (with random_state) keeps the labels reproducible.
kmeans = KMeans(n_clusters=5, random_state=0, n_init=10).fit(Toronto_grouped_clustering)

kmeans.labels_

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 3, 0, 2, 0,
       1, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [19]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, skipping its first element.

    The first element of `row` is ignored, the remaining values are ordered
    from largest to smallest, and the index labels of the leading
    `num_top_venues` entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

In [20]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: '1st', '2nd', '3rd', then '<n>th' for every later rank.
# An explicit bounds check replaces the bare `except:`, which would also
# have hidden unrelated errors.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Build every row first and create the frame once, instead of assigning into
# an empty frame one row at a time.
top_venue_rows = [
    [row['Neighborhood'], *return_most_common_venues(row, num_top_venues)]
    for _, row in Toronto_grouped.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Venue Category_Coffee Shop,Venue Category_Cocktail Bar,Venue Category_Bakery,Venue Category_Seafood Restaurant,Venue Category_Restaurant,Venue Category_Café,Venue Category_Cheese Shop,Venue Category_Beer Bar,Venue Category_Lounge,Venue Category_Irish Pub
1,"Brockton, Parkdale Village, Exhibition Place",Venue Category_Café,Venue Category_Bakery,Venue Category_Breakfast Spot,Venue Category_Coffee Shop,Venue Category_Furniture / Home Store,Venue Category_Burrito Place,Venue Category_Convenience Store,Venue Category_Italian Restaurant,Venue Category_Stadium,Venue Category_Intersection
2,"Business reply mail Processing Centre, South C...",Venue Category_Yoga Studio,Venue Category_Skate Park,Venue Category_Auto Workshop,Venue Category_Brewery,Venue Category_Burrito Place,Venue Category_Butcher,Venue Category_Comic Shop,Venue Category_Farmers Market,Venue Category_Fast Food Restaurant,Venue Category_Garden
3,"CN Tower, King and Spadina, Railway Lands, Har...",Venue Category_Airport Service,Venue Category_Airport Lounge,Venue Category_Airport Terminal,Venue Category_Rental Car Location,Venue Category_Sculpture Garden,Venue Category_Plane,Venue Category_Boat or Ferry,Venue Category_Boutique,Venue Category_Bar,Venue Category_Harbor / Marina
4,Central Bay Street,Venue Category_Coffee Shop,Venue Category_Café,Venue Category_Italian Restaurant,Venue Category_Sandwich Place,Venue Category_Japanese Restaurant,Venue Category_Salad Place,Venue Category_Bubble Tea Shop,Venue Category_Burger Joint,Venue Category_Department Store,Venue Category_Modern European Restaurant


In [21]:
# Display the current postal-code frame.
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [22]:
# Attach the cluster label of each neighbourhood. Overwrite the column when it
# already exists so the cell can be re-run without `insert` raising.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Left-join the venue summary onto the postal-code frame. Neighbourhoods with
# no match would get a NaN label (turning the column into floats), so they are
# dropped and the labels are cast back to integers.
Toronto_merged = (
    df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    .dropna(subset=['Cluster Labels'])
    .assign(**{'Cluster Labels': lambda merged: merged['Cluster Labels'].astype(int)})
)

Toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Venue Category_Coffee Shop,Venue Category_Pub,Venue Category_Bakery,Venue Category_Park,Venue Category_Restaurant,Venue Category_Breakfast Spot,Venue Category_Café,Venue Category_Theater,Venue Category_Yoga Studio,Venue Category_Farmers Market
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Venue Category_Coffee Shop,Venue Category_Sushi Restaurant,Venue Category_Yoga Studio,Venue Category_Music Venue,Venue Category_Bar,Venue Category_Beer Bar,Venue Category_Smoothie Shop,Venue Category_Sandwich Place,Venue Category_Burrito Place,Venue Category_Café
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Venue Category_Clothing Store,Venue Category_Coffee Shop,Venue Category_Japanese Restaurant,Venue Category_Café,Venue Category_Bubble Tea Shop,Venue Category_Cosmetics Shop,Venue Category_Italian Restaurant,Venue Category_Middle Eastern Restaurant,Venue Category_Theater,Venue Category_Fast Food Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Venue Category_Coffee Shop,Venue Category_Café,Venue Category_Cocktail Bar,Venue Category_Gastropub,Venue Category_American Restaurant,Venue Category_Gym,Venue Category_Cosmetics Shop,Venue Category_Moroccan Restaurant,Venue Category_Department Store,Venue Category_Lingerie Store
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Venue Category_Trail,Venue Category_Neighborhood,Venue Category_Health Food Store,Venue Category_Pub,Venue Category_Yoga Studio,Venue Category_Diner,Venue Category_Discount Store,Venue Category_Distribution Center,Venue Category_Dog Run,Venue Category_Doner Restaurant


In [25]:
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# Only rows that actually carry a cluster label can be coloured.
labelled = Toronto_merged.dropna(subset=['Cluster Labels'])

# One colour per cluster label, sized from the data rather than hard-coded.
n_clusters = int(labelled['Cluster Labels'].max()) + 1 if len(labelled) else 0
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, n_clusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(labelled['Latitude'], labelled['Longitude'],
                                  labelled['Neighborhood'], labelled['Cluster Labels']):
    cluster = int(cluster)  # labels may arrive as floats after a join
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # Index the palette directly by label.
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### It can be noted that Roselawn is the only neighborhood in cluster 4, meaning no other neighborhood in Toronto is similar to it. Additionally, cluster 0 contains the largest number of similar neighborhoods.

### NOTE: If you are unable to view the maps, [click this link](https://nbviewer.jupyter.org/) and paste the link provided in Coursera to view them.