# Segmenting and Clustering
Toronto neighbourhood data is read from a Wikipedia table and transformed into a pandas dataframe.
We then segment the neighbourhoods, cluster them with k-means, and visualize the results on folium maps.

In [312]:
import os

import numpy as np
import pandas as pd

In [313]:
# Read the postal-code table from Wikipedia using pandas read_html
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

# Build the cleaned table in a single chain so that no filtered slice is
# mutated in place (the previous inplace calls on a slice raised
# SettingWithCopyWarning):
#   1. drop rows whose borough is not assigned
#   2. where a neighbourhood is not assigned but the borough is, use the borough name
#   3. list the neighbourhoods that share the same postal code and borough
df2 = (
    df[df['Borough'] != 'Not assigned']
    .assign(
        Neighbourhood=lambda t: t['Neighbourhood'].where(
            t['Neighbourhood'] != 'Not assigned', t['Borough']
        )
    )
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(list)
    .reset_index()
)

print(df2.shape)
df2.head()

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


We have concatenated neighbourhoods that share the same postal code in a borough.

<h2> Downloading coordinates for post codes. </h2>

In [314]:
# Load the latitude/longitude lookup table, one row per postal code
postal_df = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
postal_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


This postal coordinates table will be joined with the borough table.

In [317]:
# Align the key column name of the coordinates table with the borough table
postal_df = postal_df.rename(columns={'Postal Code': 'Postcode'})

# Join the two tables on the postal code, then restore 'Postcode' as a
# regular column and switch to the 'Neighborhood' spelling used from here on
df = (
    df2.set_index('Postcode')
    .join(postal_df.set_index('Postcode'))
    .reset_index()
    .rename(columns={'Neighbourhood': 'Neighborhood'})
)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[Woburn],43.770992,-79.216917
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476


We can now visualize the neighbourhoods in Toronto.

In [318]:
!pip install geopy
!pip install folium
import folium # library for map visualizations
from geopy.geocoders import Nominatim # converts address to coordinates
print('Done.')

Done.


In [319]:
# Geocode the city name to get the coordinates used as the map centre
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
message = 'The coordinates of Toronto, Ontario are {}, {}.'
print(message.format(latitude, longitude))

The coordinates of Toronto, Ontario are 43.653963, -79.387207.


<h3>Create a map of Toronto with neighborhoods superimposed on top.</h3>

In [320]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per postal code; the popup shows its neighbourhoods and borough
for lat, lng, borough, neigh in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    popup = folium.Popup('{}, {}'.format(neigh, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)
map_toronto

Segment the data down to the Downtown Toronto neighbourhoods.

In [321]:
# Keep only the rows of the Downtown Toronto borough, renumbered from 0
is_downtown = df['Borough'].eq('Downtown Toronto')
downtown_df = df.loc[is_downtown].reset_index(drop=True)
downtown_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,[Rosedale],43.679563,-79.377529
1,M4X,Downtown Toronto,"[Cabbagetown, St. James Town]",43.667967,-79.367675
2,M4Y,Downtown Toronto,[Church and Wellesley],43.66586,-79.38316
3,M5A,Downtown Toronto,[Harbourfront],43.65426,-79.360636
4,M5B,Downtown Toronto,"[Ryerson, Garden District]",43.657162,-79.378937


In [322]:
# Geocode the downtown area; latitude/longitude are reused as the centre of the maps below
address = 'Downtown Toronto, Ontario'
geolocator = Nominatim(user_agent="ontario_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
message = 'Coordinates of downtown toronto are {}, {}.'
print(message.format(latitude, longitude))

Coordinates of downtown toronto are 43.6563221, -79.3809161.


In [323]:
# Map centred on the geocoded downtown coordinates
downtown_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per downtown postal code, labelled with its neighbourhoods
for lat, lng, label in zip(downtown_df['Latitude'], downtown_df['Longitude'], downtown_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3187cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(downtown_map)
downtown_map

In [324]:
# Foursquare credentials are read from the environment so that they are not
# stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_S = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not CLIENT_ID or not CLIENT_S:
    print('Foursquare credentials are not set: export FOURSQUARE_CLIENT_ID and '
          'FOURSQUARE_CLIENT_SECRET before running the venue requests.')
VERSION = '20180605' # Foursquare API version
LIMIT=100 # value sent as the `limit` parameter of each explore request

In [335]:
def getNearbyVenues(pcs, names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    pcs, names, latitudes, longitudes : iterables
        Parallel sequences; element ``i`` of each describes one location
        (postcode, neighborhood label, latitude, longitude).
    radius : int, optional
        Value sent as the ``radius`` parameter of the request (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postcode',
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    # Imported locally because no cell of the notebook imports `requests`.
    import requests

    columns = ['Postcode',
               'Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for pc, name, lat, lng in zip(pcs, names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string instead of
        # formatting the credentials into the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_S,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        # Fail loudly on an API error rather than on a confusing KeyError below.
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # Keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # A venue may come back without any category; record None then.
            categories = venue.get('categories') or []
            rows.append((
                pc,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the columns here keeps the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [336]:
# Fetch the venues around the coordinates of every downtown postal code
downtown_venues = getNearbyVenues(
    pcs=downtown_df['Postcode'],
    names=downtown_df['Neighborhood'],
    latitudes=downtown_df['Latitude'],
    longitudes=downtown_df['Longitude'],
)

['Rosedale']
['Cabbagetown', 'St. James Town']
['Church and Wellesley']
['Harbourfront']
['Ryerson', 'Garden District']
['St. James Town']
['Berczy Park']
['Central Bay Street']
['Adelaide', 'King', 'Richmond']
['Harbourfront East', 'Toronto Islands', 'Union Station']
['Design Exchange', 'Toronto Dominion Centre']
['Commerce Court', 'Victoria Hotel']
['Harbord', 'University of Toronto']
['Chinatown', 'Grange Park', 'Kensington Market']
['CN Tower', 'Bathurst Quay', 'Island airport', 'Harbourfront West', 'King and Spadina', 'Railway Lands', 'South Niagara']
['Stn A PO Boxes 25 The Esplanade']
['First Canadian Place', 'Underground city']
['Christie']
["Queen's Park"]


In [337]:
# Preview the first rows of the venue table (one row per returned venue)
downtown_venues.head()

Unnamed: 0,Postcode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4W,[Rosedale],43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,M4W,[Rosedale],43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,M4W,[Rosedale],43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,M4W,[Rosedale],43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,M4X,"[Cabbagetown, St. James Town]",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


In [338]:
# Non-null entries per column for each postcode, i.e. how many venues were returned for it
downtown_venues.groupby('Postcode').count()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M4W,4,4,4,4,4,4,4
M4X,45,45,45,45,45,45,45
M4Y,82,82,82,82,82,82,82
M5A,46,46,46,46,46,46,46
M5B,100,100,100,100,100,100,100
M5C,100,100,100,100,100,100,100
M5E,55,55,55,55,55,55,55
M5G,86,86,86,86,86,86,86
M5H,100,100,100,100,100,100,100
M5J,100,100,100,100,100,100,100


In [339]:
# Report how many distinct venue categories were returned
print('There are {} unique categories.'.format(len(pd.unique(downtown_venues['Venue Category']))))

There are 211 unique categories.


Analyze each neighborhood

In [363]:
# One indicator column per venue category, plus the two identifying columns
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="").assign(
    Neighborhood=downtown_venues['Neighborhood'],
    PC=downtown_venues['Postcode'],
)
# Move the identifying columns to the front, keeping the category columns in their existing order
downtown_onehot = downtown_onehot[
    ['PC', 'Neighborhood'] + [col for col in downtown_onehot.columns if col not in ('Neighborhood', 'PC')]
]

In [364]:
# Preview the one-hot encoded venue table
downtown_onehot.head()

Unnamed: 0,PC,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,M4W,[Rosedale],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4W,[Rosedale],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4W,[Rosedale],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4W,[Rosedale],0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,M4X,"[Cabbagetown, St. James Town]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [362]:
# (rows, columns) of the one-hot encoded table
downtown_onehot.shape

(1312, 211)

In [372]:
# Mean of each category indicator per postcode, i.e. the share of the
# postcode's venues that fall in that category. The list-valued 'Neighborhood'
# column is dropped first: it cannot be averaged, and recent pandas versions
# raise a TypeError instead of silently skipping it.
downtown_grouped = downtown_onehot.drop(columns='Neighborhood').groupby('PC').mean().reset_index()

In [366]:
# (rows, columns) of the per-postcode table
downtown_grouped.shape

(19, 211)

In [374]:
num_top_venues = 5

# Print the most frequent venue categories of every postcode
for hood in downtown_grouped['PC']:
    print("---- Postcode: "+hood+" ----")
    # Transpose the postcode's single row into a (venue, freq) table
    temp = downtown_grouped[downtown_grouped['PC'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # The first row holds the postcode itself, so it is skipped
    ranked = (
        temp.iloc[1:]
        .assign(freq=lambda table: table['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(ranked.head(num_top_venues))
    print('\n')

---- Postcode: M4W ----
               venue  freq
0               Park  0.50
1         Playground  0.25
2              Trail  0.25
3  Afghan Restaurant  0.00
4      Movie Theater  0.00


---- Postcode: M4X ----
         venue  freq
0  Coffee Shop  0.07
1         Park  0.04
2         Café  0.04
3       Market  0.04
4  Pizza Place  0.04


---- Postcode: M4Y ----
                 venue  freq
0          Coffee Shop  0.09
1  Japanese Restaurant  0.05
2     Sushi Restaurant  0.05
3              Gay Bar  0.04
4           Restaurant  0.04


---- Postcode: M5A ----
         venue  freq
0  Coffee Shop  0.17
1       Bakery  0.07
2         Park  0.07
3          Pub  0.07
4   Restaurant  0.04


---- Postcode: M5B ----
                       venue  freq
0                Coffee Shop  0.10
1             Clothing Store  0.05
2             Cosmetics Shop  0.04
3                       Café  0.04
4  Middle Eastern Restaurant  0.03


---- Postcode: M5C ----
          venue  freq
0          Café  0.06
1   

In [375]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    Parameters
    ----------
    row : pandas.Series
        Series whose first element is an identifier (ignored) and whose
        remaining elements are the values to rank.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the remaining entries ordered by descending value,
        truncated to ``num_top_venues`` items.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [377]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create the column names according to the number of top venues. The ordinal
# suffix is chosen with an explicit bounds check rather than a bare `except`,
# which would also hide unrelated errors.
columns = ['Postcode']
for ind in np.arange(num_top_venues):
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

# Create a new dataframe with one row per postcode
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postcode'] = downtown_grouped['PC']

# Fill each row with the postcode's most frequent venue categories
for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Park,Playground,Trail,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
1,M4X,Coffee Shop,Park,Market,Pub,Restaurant,Bakery,Italian Restaurant,Café,Pizza Place,Caribbean Restaurant
2,M4Y,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Gym,Men's Store,Mediterranean Restaurant,Hotel,Gastropub
3,M5A,Coffee Shop,Park,Pub,Bakery,Mexican Restaurant,Café,Breakfast Spot,Restaurant,Performing Arts Venue,Chocolate Shop
4,M5B,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Ramen Restaurant,Pizza Place,Restaurant,Diner,Japanese Restaurant


Cluster the neighborhoods

In [385]:
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Number of clusters to fit
kclusters = 5
# Cluster on the category frequencies only. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
downtown_grouped_cluster = downtown_grouped.drop(columns='PC')
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_grouped_cluster)
# Cluster label of the first ten postcodes
kmeans.labels_[0:10]

array([2, 1, 1, 4, 1, 1, 1, 4, 1, 4])

The new dataframe includes the cluster label and the top 10 venue categories for each neighborhood.

In [383]:
# Attach the cluster labels. DataFrame.insert raises a ValueError when the
# column already exists, so an existing column is overwritten instead and the
# cell can be re-run safely.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the cluster label and the top venues to the downtown table by postcode
downtown_merged = downtown_df.join(neighborhoods_venues_sorted.set_index('Postcode'), on='Postcode')
downtown_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,[Rosedale],43.679563,-79.377529,2,Park,Playground,Trail,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
1,M4X,Downtown Toronto,"[Cabbagetown, St. James Town]",43.667967,-79.367675,1,Coffee Shop,Park,Market,Pub,Restaurant,Bakery,Italian Restaurant,Café,Pizza Place,Caribbean Restaurant
2,M4Y,Downtown Toronto,[Church and Wellesley],43.66586,-79.38316,1,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Gym,Men's Store,Mediterranean Restaurant,Hotel,Gastropub
3,M5A,Downtown Toronto,[Harbourfront],43.65426,-79.360636,4,Coffee Shop,Park,Pub,Bakery,Mexican Restaurant,Café,Breakfast Spot,Restaurant,Performing Arts Venue,Chocolate Shop
4,M5B,Downtown Toronto,"[Ryerson, Garden District]",43.657162,-79.378937,1,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Ramen Restaurant,Pizza Place,Restaurant,Diner,Japanese Restaurant


In [391]:
m_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# One colour per cluster; only the length of `ys` (equal to kclusters) is used below
x=np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_arr = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_arr]

marker_colors= []
for lat, lng, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighborhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi)+ ' Cluster ' + str(cluster), parse_html=True)
    # Labels start at 0, so index the palette directly; `cluster-1` sent
    # cluster 0 to the last colour through negative indexing.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_color,
        # fill must be enabled for fill_opacity to have any effect
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(m_clusters)
m_clusters