# Data load and manipulation

In [None]:
import pandas as pd

Load the table

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
table = pd.read_html(url, header=0)[0]

Ignore cells with a borough that is Not assigned.

In [3]:
df = table.loc[table['Borough']!='Not assigned']

If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [4]:
df.loc[(df['Neighbourhood']=='Not assigned'), 'Neighbourhood'] = df['Borough']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [5]:
df.reset_index(inplace=True, drop=True)

Print the number of rows of the dataframe.

In [6]:
df.shape[0]

103

In [7]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [8]:
url_csv = 'http://cocl.us/Geospatial_data'
csv = pd.read_csv(url_csv)

In [12]:
df = df.merge(csv, how='left', on='Postal Code')

In [25]:
# Choosing neighbourhoods with "York" in the name
dfy = df[df['Borough'].str.contains('York', regex=False)].reset_index(drop=True)
dfy.shape[0]

34

# Getting location of neighbourhoods and showing them on a map

In [17]:
import numpy as np
import pandas as pd
import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

In [19]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [47]:
map_to = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighbourhood in zip(dfy['Latitude'], dfy['Longitude'], dfy['Borough'], dfy['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_to)  
    
map_to

# Exploration and clustering the neighborhoods in Toronto

In [48]:
CLIENT_ID = 'A3RTG2OICPJV1JDY0UO24PM0DSO3VZXSSHM33BNV5NCGWP1X'
CLIENT_SECRET = '3UJK5I5X00K4ZUSJ41WIRYHNMIVBZ0EECLS1RHULT1DWLMIZ'
VERSION = '20180605'

In [49]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [50]:
dfy_venues = getNearbyVenues(names=dfy['Neighbourhood'],
                                   latitudes=dfy['Latitude'],
                                   longitudes=dfy['Longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Parkview Hill, Woodbine Gardens
Glencairn
Don Mills
Woodbine Heights
Humewood-Cedarvale
Caledonia-Fairbanks
Leaside
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Del Ray, Mount Dennis, Keelsdale and Silverthorn
Humberlea, Emery
Willowdale, Willowdale East
Downsview
Runnymede, The Junction North
Weston
York Mills West
Willowdale, Willowdale West


In [66]:
dfy_venues.groupby('Neighbourhood').count()

(27, 6)

In [52]:
dfy_onehot = pd.get_dummies(dfy_venues[['Venue Category']], prefix="", prefix_sep="")

dfy_onehot['Neighbourhood'] = dfy_venues['Neighbourhood'] 

fixed_columns = [dfy_onehot.columns[-1]] + list(dfy_onehot.columns[:-1])
dfy_onehot = dfy_onehot[fixed_columns]

dfy_onehot

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,...,Thai Restaurant,Theater,Toy / Game Store,Trail,Turkish Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
dfy_group = dfy_onehot.groupby('Neighbourhood').mean().reset_index()

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,...,Thai Restaurant,Theater,Toy / Game Store,Trail,Turkish Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038462,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
4,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
5,Don Mills,0.0,0.0,0.0,0.038462,0.0,0.038462,0.038462,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Downsview,0.0,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"East Toronto, Broadview North (Old East York)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Fairview, Henry Farm, Oriole",0.0,0.0,0.014286,0.0,0.0,0.014286,0.0,0.0,0.014286,...,0.0,0.014286,0.014286,0.0,0.0,0.014286,0.0,0.0,0.028571,0.0
9,Glencairn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
num_top_venues = 5

for hood in dfy_group['Neighbourhood']:
    print("----"+hood+"----")
    temp = dfy_group[dfy_group['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Wilson Heights, Downsview North----
           venue  freq
0    Coffee Shop  0.10
1           Bank  0.10
2  Shopping Mall  0.05
3    Gas Station  0.05
4    Bridal Shop  0.05


----Bayview Village----
                 venue  freq
0                 Café  0.25
1                 Bank  0.25
2   Chinese Restaurant  0.25
3  Japanese Restaurant  0.25
4        Metro Station  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.08
1          Restaurant  0.08
2         Coffee Shop  0.08
3  Italian Restaurant  0.08
4                 Pub  0.04


----Caledonia-Fairbanks----
               venue  freq
0               Park  0.50
1      Women's Store  0.25
2               Pool  0.25
3  Accessories Store  0.00
4             Lounge  0.00


----Del Ray, Mount Dennis, Keelsdale and Silverthorn----
                venue  freq
0        Skating Rink  0.25
1  Turkish Restaurant  0.25
2      Sandwich Place  0.25
3      Discount Store  0.25
4  

In [55]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [56]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
dfy_venues_sorted = pd.DataFrame(columns=columns)
dfy_venues_sorted['Neighbourhood'] = dfy_group['Neighbourhood']

for ind in np.arange(dfy_group.shape[0]):
    dfy_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dfy_group.iloc[ind, :], num_top_venues)

dfy_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Park,Shopping Mall,Grocery Store,Ice Cream Shop,Diner,Deli / Bodega,Middle Eastern Restaurant,Mobile Phone Shop
1,Bayview Village,Bank,Japanese Restaurant,Chinese Restaurant,Café,Yoga Studio,Dog Run,Discount Store,Diner,Dim Sum Restaurant,Department Store
2,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Restaurant,Sandwich Place,Fast Food Restaurant,Liquor Store,Pub,Pizza Place,Indian Restaurant,Boutique
3,Caledonia-Fairbanks,Park,Pool,Women's Store,Gastropub,Dog Run,Construction & Landscaping,Convenience Store,Golf Course,Cosmetics Shop,Curling Ice
4,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",Skating Rink,Discount Store,Turkish Restaurant,Sandwich Place,Yoga Studio,Department Store,Diner,Dim Sum Restaurant,Dessert Shop,Deli / Bodega


In [65]:
# set number of clusters
kclusters = 5

dfy_grouped_clustering = dfy_group.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dfy_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 1, 4, 0,
       3, 0, 0, 0, 1])

In [58]:
# add clustering labels
dfy_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [69]:
dfy_merged = dfy

# merge data to add latitude/longitude for each neighborhood
dfy_merged = dfy_merged.join(dfy_venues_sorted.set_index('Neighbourhood'), how='inner', on='Neighbourhood')

dfy_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1,Park,Food & Drink Shop,Yoga Studio,Event Space,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio,Deli / Bodega,Department Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,3,Pizza Place,Coffee Shop,Hockey Arena,French Restaurant,Portuguese Restaurant,Yoga Studio,Deli / Bodega,Diner,Dim Sum Restaurant,Dessert Shop
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Clothing Store,Accessories Store,Boutique,Furniture / Home Store,Event Space,Miscellaneous Shop,Coffee Shop,Vietnamese Restaurant,Asian Restaurant,Cosmetics Shop
3,M3B,North York,Don Mills,43.745906,-79.352188,0,Gym,Coffee Shop,Japanese Restaurant,Beer Store,Clothing Store,Restaurant,Asian Restaurant,Café,Sandwich Place,Italian Restaurant
6,M3C,North York,Don Mills,43.7259,-79.340923,0,Gym,Coffee Shop,Japanese Restaurant,Beer Store,Clothing Store,Restaurant,Asian Restaurant,Café,Sandwich Place,Italian Restaurant


In [70]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dfy_merged['Latitude'], dfy_merged['Longitude'], dfy_merged['Neighbourhood'], dfy_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### One of clusters 

In [71]:
dfy_merged.loc[dfy_merged['Cluster Labels'] == 0, dfy_merged.columns[[1] + list(range(5, dfy_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,North York,0,Clothing Store,Accessories Store,Boutique,Furniture / Home Store,Event Space,Miscellaneous Shop,Coffee Shop,Vietnamese Restaurant,Asian Restaurant,Cosmetics Shop
3,North York,0,Gym,Coffee Shop,Japanese Restaurant,Beer Store,Clothing Store,Restaurant,Asian Restaurant,Café,Sandwich Place,Italian Restaurant
6,North York,0,Gym,Coffee Shop,Japanese Restaurant,Beer Store,Clothing Store,Restaurant,Asian Restaurant,Café,Sandwich Place,Italian Restaurant
4,East York,0,Pizza Place,Pharmacy,Athletics & Sports,Café,Fast Food Restaurant,Intersection,Bank,Breakfast Spot,Gastropub,Gym / Fitness Center
5,North York,0,Bakery,Park,Playground,Japanese Restaurant,Italian Restaurant,Pub,Department Store,Diner,Dim Sum Restaurant,Dessert Shop
7,East York,0,Pharmacy,Athletics & Sports,Curling Ice,Dance Studio,Bus Stop,Skating Rink,Beer Store,Spa,Park,Discount Store
8,York,0,Field,Dog Run,Hockey Arena,Trail,Yoga Studio,Dessert Shop,Discount Store,Diner,Dim Sum Restaurant,Deli / Bodega
10,East York,0,Coffee Shop,Sporting Goods Shop,Furniture / Home Store,Burger Joint,Bank,Dessert Shop,Department Store,Liquor Store,Electronics Store,Mexican Restaurant
11,North York,0,Golf Course,Pool,Mediterranean Restaurant,Dog Run,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Dance Studio
12,North York,0,Coffee Shop,Bank,Park,Shopping Mall,Grocery Store,Ice Cream Shop,Diner,Deli / Bodega,Middle Eastern Restaurant,Mobile Phone Shop
