<h2> This notebook will be used for the IBM Data Science Capstone Project

In [45]:
import pandas as pd
import numpy as np

<h2
> Segmenting and Clustering Neighborhoods in Toronto

<h3>Part 1: Scrape Wiki

In [46]:
from bs4 import BeautifulSoup
import requests
import csv

In [47]:
url ='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')

In [48]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XfaLwApAADwAAAgv7BUAAAEK","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":930529633,"wgRevisionId":930529633,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communi

In [66]:
#write to csv file
toronto_boroughs = open('toronto_boroughs.csv', 'w')
csv_writer = csv.writer(toronto_boroughs)
for tag in soup.table.tbody:
    try:
        info = tag.text.strip().split('\n')
        csv_writer.writerow(info)
    except:
        pass
toronto_boroughs.close()

In [347]:
#read csv to pandas dataframe
df = pd.read_csv('toronto_boroughs.csv')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Cleaning Data:

In [348]:
df = df[df['Borough'] != 'Not assigned']

In [349]:
#join duplicate postcodes
df = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [350]:
#1 item in Neighbourhood is not assigned
for i, row in df.iterrows():
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = (row['Borough'])
print('\'Not assigned\' Neighbourhoods:', sum(df['Neighbourhood'] == 'Not assigned'))

'Not assigned' Neighbourhoods: 0


In [351]:
df.shape

(103, 3)

<h3> Part 2: add lat/long coordinates

In [352]:
#geocoder didn't work, importing csv file with coords

In [353]:
df_geo = pd.read_csv('Geospatial_Coordinates.csv')
df_geo.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [354]:
df = pd.merge(df, df_geo, on='Postcode')
df.rename(columns={'Neighbourhood': 'Neighborhood'}, inplace=True)
df

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


<h3> Part 3: Cluster neighborhoods in Toronto


In [355]:
from sklearn.cluster import KMeans
import folium

In [356]:
print('There are {} boroughs and {} neighborhoods in Toronto'.format(
    len(df['Borough'].unique()), df.shape[0]))

There are 11 boroughs and 103 neighborhoods in Toronto


In [357]:
df['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

Let's map all of the neighborhoods in Toronto

I looked up the coordinates for Toronto, they are 43.6532° N, 79.3832° W

In [358]:
toronto_map = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#ff8666',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)

toronto_map

Now, I'm going to do an analysis for each borough to cluster the neighborhoods within it

**First, Scarborough**

In [359]:
scarborough_df = df[df['Borough'] == 'Scarborough']
scarborough_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Map the Scarborough:  43.7764° N, 79.2318° W

In [360]:
def map_Borough(borough, blat, blng):
    borough_map = folium.Map(location=[blat,blng], zoom_start=10)
    
    for lat,lng,borough,neighborhood in zip(borough['Latitude'], borough['Longitude'], borough['Borough'], borough['Neighborhood']):
        label = folium.Popup(neighborhood, parse_html=True)
        folium.CircleMarker(
            [lat,lng],
            radius=5,
            popup=label,
            color='purple',
            fill=True,
            fill_color='#ee33ff',
            fill_opacity=0.7,
            parse_html=False).add_to(borough_map)
    return borough_map

#map Scarborough
scarborough_map = map_Borough(scarborough_df, 43.6532, -79.3832)
scarborough_map

Now let's extract the top venues for each neighborhood in Scarborough within 500 meters of each


In [361]:
#Foursquare client data
CLIENT_ID = 'Z0CJRXADEV3O4UPSYV3ODCBGZOJFG0B0KKT4KNBVPUUTX5NT'
CLIENT_SECRET = 'WX1XPYKYOOBLPMUA54NY3RAJZJOVQ1P3H0WC1O3OC3WVP2SU'
VERSION = '20180605'
LIMIT = 100
radius = 500

#to get venues in all neighborhoods 
def get_borough_venues(df):
    venues = []
    for neighborhood, lat, lng in zip(df['Neighborhood'], df['Latitude'], df['Longitude']):
        print(neighborhood)
        
        #api request url
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                    CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
        #make get request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        venues.append([(
            neighborhood, lat, lng, v['venue']['name'], v['venue']['location']['lat'],
            v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])
        
    all_venues = pd.DataFrame([item for venue_list in venues for item in venue_list])
    all_venues.columns = ['Neighborhood', 'Neighborhood Lat', 'Neighborhood Lng',
                         'Venue', 'Venue Lat', 'Venue Lng', 'Venue Category']
    
    return all_venues

#get venues in scarborough
scarborough_venues = get_borough_venues(scarborough_df)
scarborough_venues

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge


Unnamed: 0,Neighborhood,Neighborhood Lat,Neighborhood Lng,Venue,Venue Lat,Venue Lng,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
...,...,...,...,...,...,...,...
85,L'Amoreaux West,43.799525,-79.318389,Eggsmart,43.796375,-79.318681,Breakfast Spot
86,L'Amoreaux West,43.799525,-79.318389,Pizza Pizza,43.797909,-79.318113,Pizza Place
87,L'Amoreaux West,43.799525,-79.318389,Fit4Less,43.798394,-79.318453,Gym
88,L'Amoreaux West,43.799525,-79.318389,A Buck or Two,43.798286,-79.318485,Thrift / Vintage Store


In [362]:
scarborough_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Lat,Neighborhood Lng,Venue,Venue Lat,Venue Lng,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",3,3,3,3,3,3
"Birch Cliff, Cliffside West",4,4,4,4,4,4
Cedarbrae,8,8,8,8,8,8
"Clairlea, Golden Mile, Oakridge",10,10,10,10,10,10
"Clarks Corners, Sullivan, Tam O'Shanter",13,13,13,13,13,13
"Cliffcrest, Cliffside, Scarborough Village West",2,2,2,2,2,2
"Dorset Park, Scarborough Town Centre, Wexford Heights",5,5,5,5,5,5
"East Birchmount Park, Ionview, Kennedy Park",6,6,6,6,6,6
"Guildwood, Morningside, West Hill",8,8,8,8,8,8


In [363]:
print('There are {} unique categories'.format(len(scarborough_venues['Venue Category'].unique())))

There are 53 unique categories


Let's get the one hot encoded dataframe of means to use for clustering now

In [364]:
def get_cluster_df(df):
    onehot = pd.get_dummies(df[['Venue Category']], prefix='', prefix_sep='')
    onehot['Neighborhood'] = df['Neighborhood']
    #move to first column
    fixed_cols = [onehot.columns[-1]] + list(onehot.columns[:-1])
    onehot = onehot[fixed_cols]
    
    return onehot.groupby('Neighborhood').mean().reset_index()

scarborough_onehot = get_cluster_df(scarborough_venues)
scarborough_onehot

Unnamed: 0,Neighborhood,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bubble Tea Shop,Bus Line,...,Playground,Rental Car Location,Sandwich Place,Shopping Mall,Skating Rink,Soccer Field,Spa,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
1,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
3,Cedarbrae,0.0,0.125,0.0,0.125,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0
4,"Clairlea, Golden Mile, Oakridge",0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
5,"Clarks Corners, Sullivan, Tam O'Shanter",0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,...,0.0,0.076923,0.0,0.076923,0.0,0.0,0.0,0.076923,0.0,0.0
6,"Cliffcrest, Cliffside, Scarborough Village West",0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Dorset Park, Scarborough Town Centre, Wexford ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2
8,"East Birchmount Park, Ionview, Kennedy Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0.0,0.125,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0


Now let's get the top 5 categories for each neighborhood, since most don't have that many

In [375]:
def get_top_five(df, num):
    def return_most_common(row):
        row_cats = row.iloc[1:]
        row_cats = row_cats.sort_values(ascending=False)
        return row_cats.index.values[0:num]
    
    indicator = ['st', 'nd', 'rd']
    cols = ['Neighborhood']
    for ind in range(num):
        try:
            cols.append('{}{} Most Common Venue'.format(ind+1, indicator[ind]))
        except:
            cols.append('{}th Most Common Venue'.format(ind+1))
    
    #create new df with those columns
    sorted_df = pd.DataFrame(columns=cols)
    sorted_df['Neighborhood'] = df['Neighborhood']
    
    for i in range(df.shape[0]):
        sorted_df.iloc[i, 1:] = return_most_common(df.iloc[i, :])
    return sorted_df

#get top five for each neighborhood in scarborough
scarborough_top_five = get_top_five(scarborough_onehot, 5)
scarborough_top_five

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Skating Rink,Breakfast Spot,Latin American Restaurant,Lounge,Vietnamese Restaurant
1,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Coffee Shop,Caribbean Restaurant,Grocery Store
2,"Birch Cliff, Cliffside West",General Entertainment,Skating Rink,Café,College Stadium,Vietnamese Restaurant
3,Cedarbrae,Caribbean Restaurant,Bakery,Gas Station,Fried Chicken Joint,Bank
4,"Clairlea, Golden Mile, Oakridge",Bakery,Bus Line,Metro Station,Soccer Field,Intersection
5,"Clarks Corners, Sullivan, Tam O'Shanter",Pizza Place,Noodle House,Thai Restaurant,Fried Chicken Joint,Fast Food Restaurant
6,"Cliffcrest, Cliffside, Scarborough Village West",American Restaurant,Motel,Coffee Shop,Gym,Grocery Store
7,"Dorset Park, Scarborough Town Centre, Wexford ...",Indian Restaurant,Vietnamese Restaurant,Chinese Restaurant,Pet Store,Grocery Store
8,"East Birchmount Park, Ionview, Kennedy Park",Hobby Shop,Bus Station,Department Store,Convenience Store,Coffee Shop
9,"Guildwood, Morningside, West Hill",Rental Car Location,Electronics Store,Medical Center,Pizza Place,Breakfast Spot


Now let's create (3) clusters and add them to our top_five dataframe and visualize!

In [376]:
#create clusters and update top5
def create_clusters(onehot, top_five):
    kclusters=3
    cluster_df = onehot.drop('Neighborhood', axis=1)
    kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cluster_df)
    
    #put clusters into top_five dataframe
    top_five.insert(0, 'Cluster Labels', kmeans.labels_)    
    
create_clusters(scarborough_onehot, scarborough_top_five)
    


In [377]:
scarborough_top_five = scarborough_top_five.merge(scarborough_df, on='Neighborhood')
scarborough_top_five.set_index('Neighborhood', inplace=True)
scarborough_top_five

Unnamed: 0_level_0,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Postcode,Borough,Latitude,Longitude
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Agincourt,0,Skating Rink,Breakfast Spot,Latin American Restaurant,Lounge,Vietnamese Restaurant,M1S,Scarborough,43.7942,-79.262029
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",1,Park,Playground,Coffee Shop,Caribbean Restaurant,Grocery Store,M1V,Scarborough,43.815252,-79.284577
"Birch Cliff, Cliffside West",0,General Entertainment,Skating Rink,Café,College Stadium,Vietnamese Restaurant,M1N,Scarborough,43.692657,-79.264848
Cedarbrae,0,Caribbean Restaurant,Bakery,Gas Station,Fried Chicken Joint,Bank,M1H,Scarborough,43.773136,-79.239476
"Clairlea, Golden Mile, Oakridge",0,Bakery,Bus Line,Metro Station,Soccer Field,Intersection,M1L,Scarborough,43.711112,-79.284577
"Clarks Corners, Sullivan, Tam O'Shanter",0,Pizza Place,Noodle House,Thai Restaurant,Fried Chicken Joint,Fast Food Restaurant,M1T,Scarborough,43.781638,-79.304302
"Cliffcrest, Cliffside, Scarborough Village West",0,American Restaurant,Motel,Coffee Shop,Gym,Grocery Store,M1M,Scarborough,43.716316,-79.239476
"Dorset Park, Scarborough Town Centre, Wexford Heights",0,Indian Restaurant,Vietnamese Restaurant,Chinese Restaurant,Pet Store,Grocery Store,M1P,Scarborough,43.75741,-79.273304
"East Birchmount Park, Ionview, Kennedy Park",0,Hobby Shop,Bus Station,Department Store,Convenience Store,Coffee Shop,M1K,Scarborough,43.727929,-79.262029
"Guildwood, Morningside, West Hill",0,Rental Car Location,Electronics Store,Medical Center,Pizza Place,Breakfast Spot,M1E,Scarborough,43.763573,-79.188711


In [379]:
#map the clusters
def create_cluster_map(df, blat, blng, color):
    cluster_map = folium.Map(location=[blat,blng], zoom_start=10)
    colors = ['#ff0000', '#00ff00', '#0000ff']
    
    for lat, lng, neighborhood, cluster in zip(df['Latitude'], df['Longitude'], df.index, df['Cluster Label']):
        label = folium.Popup(neighborhood + ' Cluster ' + cluster, parse_html=True)
        folium.CircleMarker(
            [lat,lng], 
            radius=5,
            popup=label,
            color=color,
            fill=True,
            fill_color=colors[cluster],
            fill_opacity=0.7).add_to(cluster_map)
        
    return cluster_map

scarborough_clusters = create_cluster_map(scarborough_top_five, 43.7764, -79.2318, 'red')
scarborough_clusters
        


KeyError: 'Cluster Label'