# Segmenting and Clustering Toronto

#### Foursquare API Parameters

In [1]:
# @hidden_cell
# Foursquare credentials are read from the environment so that the secrets are
# never stored in (or shared together with) the notebook. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): credentials that were previously committed in this cell should be rotated.
import os
import warnings

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
                  'the Foursquare API calls need both.')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # Get only the top 100 venues

#### Google API Key

In [2]:
# @hidden_cell
## Set Google API KEY
# The key is read from the GOOGLE_MAPS_API_KEY environment variable so that the
# secret is never stored in the notebook.
# NOTE(review): a key that was previously committed in this cell should be rotated.
import os
import warnings

gmaps_api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')
if not gmaps_api_key:
    warnings.warn('GOOGLE_MAPS_API_KEY is not set; the Google geocoding client needs it.')

#### Install and import libraries

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

#!pip install -U googlemaps
import googlemaps

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#### Initialize postcode DataFrame

In [4]:
# Pre-allocate an all-NaN frame with the final column names.
# 3 columns, 500 rows should be enough to hold the scraped table.
postcode_df = pd.DataFrame(
    index=range(500),
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

#### Obtain data from Wikipedia

In [5]:
# Read Data from URL; the timeout keeps the cell from hanging and
# raise_for_status() turns an HTTP error status into an exception right here
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
website_url = response.text  # despite the name, this holds the page's HTML text
# Parse the response
soup = BeautifulSoup(website_url,'lxml')
# Find the postcode table
poscode_html_table = soup.find('table',{'class':'wikitable sortable'})
# soup.find returns None when nothing matches; fail with a clear message instead
# of an AttributeError in the extraction cell
if poscode_html_table is None:
    raise ValueError("No table with class 'wikitable sortable' found on the Wikipedia page")

#### Extract data from the html table and fill it to the dataframe

In [6]:
# Copy the text of every <td> cell into postcode_df at the same (row, column)
# position it has in the HTML table. Rows without <td> cells still consume a
# row index, so their row in postcode_df is left untouched.
for row_i, table_row in enumerate(poscode_html_table.find_all('tr')):
    for column_i, cell in enumerate(table_row.find_all('td')):
        postcode_df.iat[row_i, column_i] = cell.get_text()

### Cleanup and Prepare Data

#### Remove empty Rows

In [7]:
# Drop, in place, every row that still contains a missing value
postcode_df.dropna(axis=0, how='any', inplace=True)

#### Remove \n at the end of the neighborhood

In [8]:
# Delete a single whitespace character (such as the trailing "\n") at the end of each value
postcode_df['Neighborhood'] = postcode_df['Neighborhood'].replace(
    to_replace=r'\s$',
    value='',
    regex=True,
)

#### Remove rows with "Not assigned" Boroughs

In [9]:
# Keep only the rows whose borough is assigned. The explicit .copy() makes the
# result an independent frame, so later assignments into postcode_df do not act
# on a slice of the unfiltered data (pandas' SettingWithCopyWarning).
postcode_df = postcode_df.loc[postcode_df['Borough'] != 'Not assigned'].copy()

#### Set the Neighborhood to the value of Borough when Neighborhood is not assigned

In [10]:
# Rows whose neighborhood is 'Not assigned' take their borough name instead
unassigned = postcode_df['Neighborhood'] == 'Not assigned'
postcode_df.loc[unassigned, ['Neighborhood']] = postcode_df['Borough']

### Prepare Output Data

#### Transform / Group DataFrame to concatenate neighborhoods with comma

In [11]:
# One row per (PostalCode, Borough); its neighborhoods are joined into a single
# comma-separated string
neighborhoods_by_code = postcode_df.groupby(['PostalCode', 'Borough'])['Neighborhood']
postcode_grp_df = neighborhoods_by_code.apply(lambda names: ', '.join(names)).reset_index()

#### Display shape

In [12]:
# (rows, columns) of the grouped frame
postcode_grp_df.shape

(103, 3)

#### Display DataFrame

In [13]:
# Preview the first rows of the grouped frame
postcode_grp_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# Same shape check as above; no cell in between modifies postcode_grp_df
postcode_grp_df.shape

(103, 3)

## Geocoding Postcodes

#### Function for geolocation returning lat, lng

In [15]:
def get_latlng(post_code, borough, client=None):
    """Geocode a postal code inside a borough and return its coordinates.

    The address sent to the geocoder is ``'<post_code>, <borough>, Canada'``.

    Parameters
    ----------
    post_code : str
        Postal code used as the first part of the address.
    borough : str
        Borough used as the second part of the address.
    client : optional
        Object exposing ``geocode(address)``, e.g. a ``googlemaps.Client``.
        When omitted, a new ``googlemaps.Client`` is created from
        ``gmaps_api_key`` on every call; pass one in to reuse a single client.

    Returns
    -------
    tuple
        ``(lat, lng)`` taken from the first geocoding result.

    Raises
    ------
    IndexError
        If the geocoder returns no result for the address.
    """
    # Initialize API only when the caller did not supply a client
    if client is None:
        client = googlemaps.Client(key=gmaps_api_key)

    # Build address
    address = '{}, {}, Canada'.format(post_code, borough)

    # Geocoding an address
    geocode_result = client.geocode(address)
    if not geocode_result:
        # Same exception type an empty result produced before, now naming the address
        raise IndexError('No geocoding result for address: {!r}'.format(address))

    # get latitude, longitude of the first match
    location = geocode_result[0]['geometry']['location']
    return location['lat'], location['lng']

#### Copy data to new dataframe

In [16]:
# Take a real copy: columns added to postcode_loc_df later must never touch
# (or be treated as a slice of) postcode_grp_df
postcode_loc_df = postcode_grp_df[['PostalCode','Borough','Neighborhood']].copy()

#### Add coordinates to the postcode dataframe

In [17]:
# Geocode every row; each cell of 'latlng' holds the value returned by get_latlng
postcode_loc_df['latlng'] = postcode_loc_df.apply(
    lambda row: get_latlng(row.PostalCode, row.Borough),
    axis=1,
)

In [18]:
# Split the 'latlng' pairs into separate Latitude / Longitude columns
postcode_loc_df['Latitude'] = postcode_loc_df['latlng'].apply(lambda latlng: latlng[0])
postcode_loc_df['Longitude'] = postcode_loc_df['latlng'].apply(lambda latlng: latlng[1])

# Reorder columns and drop latlng. The .copy() makes the result an independent
# frame, so the column added in the next cell is not set on a slice
# (pandas' SettingWithCopyWarning).
postcode_loc_df = postcode_loc_df[['PostalCode','Borough','Neighborhood','Latitude','Longitude']].copy()


In [19]:
# Display label: "<Neighborhood> (<PostalCode> <Borough>)"
location_suffix = ' (' + postcode_loc_df['PostalCode'] + ' ' + postcode_loc_df['Borough'] + ')'
postcode_loc_df['Display'] = postcode_loc_df['Neighborhood'] + location_suffix

In [20]:
# Preview the first rows, now including coordinates and the Display label
postcode_loc_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Display
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,"Rouge, Malvern (M1B Scarborough)"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,"Highland Creek, Rouge Hill, Port Union (M1C Sc..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,"Guildwood, Morningside, West Hill (M1E Scarbor..."
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Woburn (M1G Scarborough)
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Cedarbrae (M1H Scarborough)


### Use Map and Clustering from our last Session

#### Prepare Data for clustering

#### Get Coordinates of Toronto

In [21]:
# Look up the coordinates used to centre the maps
toronto_locator = Nominatim(user_agent="toronto_explorer")
toronto_location = toronto_locator.geocode('Toronto, Canada')
# geocode() returns None when nothing is found; fail with a clear message
# instead of an AttributeError on the next line
if toronto_location is None:
    raise ValueError("Nominatim returned no result for 'Toronto, Canada'")
toronto_latitude = toronto_location.latitude
toronto_longitude = toronto_location.longitude

#### Show Toronto Neighborhoods in a map

In [22]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

# add one circle marker per postal code; its popup shows the 'Display' text
for lat, lng, display_text in zip(postcode_loc_df['Latitude'], postcode_loc_df['Longitude'], postcode_loc_df['Display']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is a Popup option, so it is set here and not on the marker
        popup=folium.Popup(display_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

## Explore Neighborhoods in Toronto

##### This function gets the top 100 venues for each PostalCode from Foursquare

In [23]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    One request is sent per ``(name, latitude, longitude)`` triple, using the
    module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT``.

    Parameters
    ----------
    names : iterable
        Label of each location; copied into the ``'Display'`` column.
    latitudes, longitudes : iterable
        Coordinates of each location, iterated in step with ``names``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``'Display'``,
        ``'Neighborhood Latitude'``, ``'Neighborhood Longitude'``, ``'Venue'``,
        ``'Venue Latitude'``, ``'Venue Longitude'`` and ``'Venue Category'``.
        The frame has these columns even when no venue is returned.
        ``'Venue Category'`` is ``None`` for a venue without categories.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    column_names = ['Display',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang forever on a stalled connection
        )
        response.raise_for_status()

        # a location without results must not abort the whole run
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # passing the column names here also works for zero rows
    return pd.DataFrame(venue_rows, columns=column_names)

#### Get the Toronto venues from Foursquare

In [24]:
# Fetch the venues around every postal code, labelled with its Display text
toronto_venues = getNearbyVenues(
    names=postcode_loc_df['Display'],
    latitudes=postcode_loc_df['Latitude'],
    longitudes=postcode_loc_df['Longitude'],
)

### Analyzing Toronto Venues

In [25]:
# Preview the first venues returned by getNearbyVenues
toronto_venues.head()

Unnamed: 0,Display,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern (M1B Scarborough)",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union (M1C Sc...",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill (M1E Scarbor...",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,"Guildwood, Morningside, West Hill (M1E Scarbor...",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill (M1E Scarbor...",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


#### Convert categories to matrix

In [26]:
# one indicator column per venue category (column name = category name)
category_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
columns_without_key = category_dummies.columns

# put the neighborhood key ('Display') in front of the indicator columns
toronto_onehot = pd.concat([toronto_venues[['Display']], category_dummies], axis=1)

#### Group rows by neighborhood, taking the mean frequency of occurrence of each category

In [27]:
# The mean of the indicator columns per 'Display' value is the share of that
# neighborhood's venues falling into each category
category_frequency = toronto_onehot.groupby('Display').mean()
toronto_grouped = category_frequency.reset_index()
toronto_grouped.head()

Unnamed: 0,Display,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond (M5H Downtown Toronto)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
1,Agincourt (M1S Scarborough),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch (M8W Etobicoke)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Show the top 10 categories for each Neighborhood

In [34]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``.

    The first entry of ``row`` is skipped (callers pass rows whose first
    entry is the 'Display' key); the remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues``
    entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Display']
for ind in np.arange(num_top_venues):
    # ranks 1-3 have their own suffix, every later rank uses 'th'
    # (explicit condition instead of a bare except around the list lookup)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# collect the top categories of every neighborhood first ...
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in np.arange(toronto_grouped.shape[0])
]

# ... then build the dataframe in one go instead of filling it row by row
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Display', toronto_grouped['Display'])


In [35]:
# Preview the top venue categories of the first neighborhoods
neighborhoods_venues_sorted.head()

Unnamed: 0,Display,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond (M5H Downtown Toronto)",Coffee Shop,Café,Bar,Steakhouse,Cosmetics Shop,Thai Restaurant,Hotel,Restaurant,Burger Joint,American Restaurant
1,Agincourt (M1S Scarborough),Lounge,Sandwich Place,Breakfast Spot,Skating Rink,Chinese Restaurant,Drugstore,Discount Store,Dog Run,Doner Restaurant,Donut Shop
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Playground,Park,Yoga Studio,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pharmacy,Fried Chicken Joint,Pizza Place,Coffee Shop,Sandwich Place,Beer Store,Fast Food Restaurant,Gluten-free Restaurant,Dance Studio
4,"Alderwood, Long Branch (M8W Etobicoke)",Pizza Place,Gym,Coffee Shop,Sandwich Place,Pub,Skating Rink,Pool,Pharmacy,Drugstore,Donut Shop


## Clustering using K-Means

#### Run k-means to cluster the neighborhoods into 3 clusters.

In [36]:
# set number of clusters
kclusters = 3

# k-means works on the numeric category frequencies only, so drop the key column
toronto_grouped_clustering = toronto_grouped.drop('Display', axis=1)

# run k-means clustering; n_init is pinned to 10 because the library default
# differs between scikit-learn versions, which would change the labels
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [37]:
# add clustering labels as the first column; overwrite them when this cell is
# re-run, because DataFrame.insert raises if the column already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# add the cluster label and the top venues to every postal code, matched on 'Display'
toronto_merged = postcode_loc_df.join(neighborhoods_venues_sorted.set_index('Display'), on='Display')

# remove NaN Rows (e.g. postal codes without a match in neighborhoods_venues_sorted)
toronto_merged = toronto_merged.dropna()

In [38]:
# Preview the merged frame: location columns, cluster label and top venue categories
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Display,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,"Rouge, Malvern (M1B Scarborough)",2.0,Fast Food Restaurant,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,College Theater
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,"Highland Creek, Rouge Hill, Port Union (M1C Sc...",0.0,Bar,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Eastern European Restaurant,Dim Sum Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,"Guildwood, Morningside, West Hill (M1E Scarbor...",0.0,Mexican Restaurant,Intersection,Pizza Place,Breakfast Spot,Electronics Store,Medical Center,Rental Car Location,Yoga Studio,Discount Store,Dog Run
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Woburn (M1G Scarborough),0.0,Coffee Shop,Korean Restaurant,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Yoga Studio
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Cedarbrae (M1H Scarborough),0.0,Hakka Restaurant,Bakery,Caribbean Restaurant,Athletics & Sports,Thai Restaurant,Bank,Fried Chicken Joint,Yoga Studio,Donut Shop,Dog Run


### Visualize the Clusters on map

In [33]:
# create map
map_clusters = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Display'], toronto_merged['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster labels start at 0, so they index the color list directly
    # (no reliance on rainbow[-1] for cluster 0)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters