# <center><u> Clustering Neighbourhoods in Toronto </u><center>

### <center>Coursera Capstone Project - Neeraj Tripathi<center>


## <center><u>Part 1</u></center>

# Part 1
### Scraping html table from Wikipedia into a DataFrame

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bsoup
from urllib.request import urlopen as uReq
import requests
import lxml

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' 
req=requests.get(url)



### Parsing the web html file with BeautifulSoup package

In [23]:
# Parse the html with Soup
page=bsoup(req.text,"html.parser")
page.head()

[<meta charset="utf-8"/>,
 <title>List of postal codes of Canada: M - Wikipedia</title>,
 <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xc6xewpAME8AABFFwk0AAAAN","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":926306543,"wgRevisionId":926306543,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wg

### Extracting table from html

In [4]:
table=page.table

results=table.find_all('tr')
nrows=len(results)
print(nrows)

print(results[-1])
results[0:5]

288
<tr>
<td>M9Z</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>


[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>]

In [5]:
# columns for the dataframe
columns = results[0].text.split()
columns

['Postcode', 'Borough', 'Neighbourhood']

In [6]:
# Filling up the dataframe by extracting cells from html table

df=pd.DataFrame({}, columns=columns)
records =[]

for i in range(1, nrows):
    row = results[i].text.split('\n')
    record = pd.Series({columns[0]: row[1], columns[1]: row[2], columns[2]: row[3]}, name=i)
    
    df = df.append(record)
print(df.shape)
df.head(5)

(287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [7]:
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor
287,M9Z,Not assigned,Not assigned


In [8]:
# drops those rows where 'Not assigned' appears in column '[Borough]'
df = df[~df.Borough.str.contains("Not assigned")]
df = df.reset_index(drop=True)
df.shape

(210, 3)

### Combine rows with same PostCode but different neighborhood

In [9]:
df['Postcode'].nunique()

103

In [10]:
df2 = df.copy()                   # precaution to avoid corrupting the original dataframe

n=0
nrows2 = df.shape[0]-1

# fill using iteration
while n < nrows2 :
    post1=df2.iloc[n,0]
    #post1
    m=n+1
    post2=df2.iloc[m,0]
    #post2
    neigh1=df2.iloc[n,2]
    neigh2=df2.iloc[m,2]
    if post1==post2:
        df2.Neighbourhood[n,2] = neigh1=neigh1+','+neigh2
        #df2 = df2[df2.Neighbourhood != 'neigh2']
        df2=df2.drop(df2.index[m])
        nrows2=nrows2-1
        df2 = df2.reset_index(drop=True)
    else:
        n=n+1
print(df2.shape)
df2.head()

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Kingsway Park South West,Mimico NW,The Queensw..."
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


### Assign same neighbourhood as borough where not assigned

In [11]:
(df2['Neighbourhood']=='Not assigned').sum()

1

In [19]:
df2.loc[df2['Neighbourhood']=='Not assigned', 'Neighbourhood'] = df2['Borough']
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Kingsway Park South West,Mimico NW,The Queensw..."
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [13]:
df2.shape

(103, 3)

# <hr>
## <center>End of Part 1</center>

### Part 1 Ends here

 <hr>
 <hr>

# <center>Part 2</center>

In [16]:
import json
from geopy.geocoders import Nominatim

### Retrieve the Latitude and Longitude coordinates for every Postal Code

In [17]:
url='http://cocl.us/Geospatial_data'
df_pcodes=pd.read_csv(url)
df_pcodes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the Latitude & Longitude data

In [22]:
# rename the column "Postal Code" to "PostalCode" in order to do a proper merger
df_pcodes.columns = ['Postcode', 'Latitude', 'Longitude']

# sort the original dataframe
df2.sort_values(by=['Postcode'], inplace=True)

# merge step
neighborhoods=pd.merge(df2,df_pcodes, how='right', on = 'Postcode')
neighborhoods.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


# <hr>
## <center>End of Part 2</center>

 <hr>
 <hr>

### Part 2 ends here

<hr>

## <center>Part 3</center>
## Part 3

### Analysing the neighbourhoods

In [25]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


## Map using Folium

### Considering only those boroughs that have "Toronto" in their name

In [33]:
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
toronto_df = neighborhoods[neighborhoods['Borough'].isin(toronto_boroughs)].reset_index(drop=True)
print(toronto_df.shape)
toronto_df.head()

(38, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [34]:
import folium
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, long, post, borough, neigh in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Postcode'], toronto_df['Borough'], toronto_df['Neighbourhood']):
    label = "{} ({}): {}".format(borough, post, neigh)
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

### Using FourSquare API to explore the Boroughs

In [37]:
CLIENT_ID = '1XFAKAXQ3VPCE0XGOYQUPLLMVNN4MAD22YAI2AX05TU22CFJ'

CLIENT_SECRET = 'BJY35WO41UMWVR5F00LQPRGCFSYJKHEW5WUDBPHIQPZLVB0M'

VERSION = '20150403'

In [40]:
radius = 400
LIMIT = 80

venues = []

for lat, long, post, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Postcode'], toronto_df['Borough'], toronto_df['Neighbourhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [41]:
places_df = pd.DataFrame(venues)
places_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
print(places_df.shape)
places_df.head()

(1207, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
2,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
3,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop
4,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop


#### no of venues for each post code

In [43]:
(places_df['VenueCategory'].unique()).shape[0]

199

### Analyzing venues in each area

In [45]:
# one hot encoding
toronto_onehot = pd.get_dummies(places_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
toronto_onehot['PostalCode'] = places_df['PostalCode'] 
toronto_onehot['Borough'] = places_df['Borough'] 
toronto_onehot['Neighborhoods'] = places_df['Neighborhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(toronto_onehot.columns[-3:]) + list(toronto_onehot.columns[:-3])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()


(1207, 202)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4K,East Toronto,"The Danforth West,Riverdale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4K,East Toronto,"The Danforth West,Riverdale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4K,East Toronto,"The Danforth West,Riverdale",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
toronto_venues_freq = toronto_onehot.groupby(['PostalCode', 'Borough', 'Neighborhoods']).mean().reset_index()
print(toronto_venues_freq.shape)
toronto_venues_freq.head()

(35, 202)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,East Toronto,"The Danforth West,Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303
2,M4L,East Toronto,"The Beaches West,India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,East Toronto,Studio District,0.0,0.0,0.0,0.0,0.0,0.035714,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4P,Central Toronto,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Clustering

In [67]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_venues_freq['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_venues_freq['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = toronto_venues_freq['Neighborhoods']

for ind in np.arange(toronto_venues_freq.shape[0]):
    row_categories = toronto_venues_freq.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Terminal,Airport,Coffee Shop,Airport Food Court,Airport Gate,Boutique,Bar,Dumpling Restaurant,Eastern European Restaurant
29,M6H,West Toronto,"Dovercourt Village,Dufferin",Bakery,Pharmacy,Bar,Supermarket,Furniture / Home Store,Music Venue,Bank,Middle Eastern Restaurant,Park,Gym / Fitness Center
5,M4R,Central Toronto,North Toronto West,Boutique,Park,Yoga Studio,Discount Store,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
23,M5S,Downtown Toronto,"Harbord,University of Toronto",Café,Bakery,Restaurant,Italian Restaurant,College Gym,Noodle House,Comfort Food Restaurant,College Arts Building,Dessert Shop,Chinese Restaurant
28,M6G,Downtown Toronto,Christie,Café,Grocery Store,Baby Store,Coffee Shop,Italian Restaurant,Nightclub,Candy Store,Diner,Discount Store,Event Space


In [68]:
neighborhoods_venues_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 25 to 34
Data columns (total 13 columns):
PostalCode                35 non-null object
Borough                   35 non-null object
Neighborhoods             35 non-null object
1st Most Common Venue     35 non-null object
2nd Most Common Venue     35 non-null object
3rd Most Common Venue     35 non-null object
4th Most Common Venue     35 non-null object
5th Most Common Venue     35 non-null object
6th Most Common Venue     35 non-null object
7th Most Common Venue     35 non-null object
8th Most Common Venue     35 non-null object
9th Most Common Venue     35 non-null object
10th Most Common Venue    35 non-null object
dtypes: object(13)
memory usage: 3.8+ KB


In [65]:
toronto_clustered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 0 to 34
Data columns (total 6 columns):
Borough          35 non-null object
Neighbourhood    35 non-null object
Latitude         35 non-null float64
Longitude        35 non-null float64
Cluster          35 non-null int32
PostalCode       35 non-null object
dtypes: float64(2), int32(1), object(3)
memory usage: 1.8+ KB


In [69]:
from sklearn.cluster import KMeans
kclusters = 3

toronto_venues_freq_clustering = toronto_venues_freq.drop(['PostalCode', 'Borough', 'Neighborhoods'], 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_venues_freq_clustering)

toronto_clustered_df = toronto_df.drop(index=[35,36,37])
toronto_clustered_df['Cluster'] = kmeans.labels_
toronto_clustered_df['PostalCode'] = toronto_clustered_df['Postcode']
toronto_clustered_df.drop(['Postcode'], 1, inplace=True)

toronto_clustered_df = toronto_clustered_df.merge(neighborhoods_venues_sorted.drop(['Borough', 'Neighborhoods'], 1), on='PostalCode')
toronto_clustered_df.sort_values(['Cluster'] + freqColumns, inplace=True)
toronto_clustered_df.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
30,West Toronto,"Little Portugal,Trinity",43.647927,-79.41975,0,M6J,Coffee Shop,Bar,Asian Restaurant,New American Restaurant,Bakery,Pizza Place,Men's Store,Vietnamese Restaurant,Cocktail Bar,Diner
20,Downtown Toronto,"Commerce Court,Victoria Hotel",43.648198,-79.379817,0,M5L,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Gastropub,Deli / Bodega,Seafood Restaurant,Bakery,Gym
4,Central Toronto,Davisville North,43.712751,-79.390197,0,M4P,Gym,Breakfast Spot,Food & Drink Shop,Clothing Store,Yoga Studio,Discount Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
25,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",43.628947,-79.39442,1,M5V,Airport Lounge,Airport Terminal,Airport,Coffee Shop,Airport Food Court,Airport Gate,Boutique,Bar,Dumpling Restaurant,Eastern European Restaurant
29,West Toronto,"Dovercourt Village,Dufferin",43.669005,-79.442259,1,M6H,Bakery,Pharmacy,Bar,Supermarket,Furniture / Home Store,Music Venue,Bank,Middle Eastern Restaurant,Park,Gym / Fitness Center


In [74]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(toronto_clustered_df['Latitude'], toronto_clustered_df['Longitude'], toronto_clustered_df['PostalCode'], toronto_clustered_df['Borough'], toronto_clustered_df['Neighbourhood'], toronto_clustered_df['Cluster']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Part-3 ends here