# Clustering
#### By: Lucas Noto

## Importing Libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from geopy.geocoders import Nominatim
import requests

### Import Files Required

In [6]:
filepath = ('Geospatial_Coordinates.csv')
coord = pd.read_csv('Geospatial_Coordinates.csv')
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#reading the link
tables = pd.read_html(link)

### 1. Creating the First DataFrame

In [8]:
my_table = tables[0].iloc[0:]
df = pd.DataFrame(my_table)
df.columns = ['Postal Code', 'Borough', 'Neighborhood']

In [9]:
#removing rows with "not assigned" in the borough column
df1 = df[df.Borough != 'Not assigned']
#grouping the rows by postal code and removing duplicates
df2 = df1.groupby(['Postal Code', 'Borough'], sort = False).agg(lambda x:', '.join(x))
df2.reset_index(level=['Postal Code','Borough'], inplace=True)
#assigning neighborhoods their borough pairings if "not assigned"
na_neigh = df2.Neighborhood == "Not assigned"
df2.loc[na_neigh, 'Neighborhood'] = df2.loc[na_neigh, 'Borough']
df2.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [10]:
df_merge = pd.merge(df2, coord, on='Postal Code')
df_merge.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


### 2. Using Folium, Map of Toronto

In [11]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent = 'Lucas-Toronto')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.653963 -79.387207


In [13]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, postal_code in zip(df_merge['Latitude'], df_merge['Longitude'], df_merge['Borough'], df_merge['Neighborhood'], df_merge['Postal Code']):
    label = '{}, {}'.format(postal_code, neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### "Toronto" Named Boroughs

In [50]:
boroughs1 = ['East Toronto' , 'Central Toronto' , 'Downtown Toronto' , 'West Toronto']
df_fin = df_merge[df_merge['Borough'].isin(boroughs1)].reset_index(drop = True)
print(df_fin.shape)
df_fin.head()

(38, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


### FourSquare API

In [15]:
CLIENT_ID = 'C4PCOYEM3A2BGCRL2VLN3Q0FC5D5BMJON3QQTHC3N1VMJSMD' # your Foursquare ID
CLIENT_SECRET = 'LXWRZADBRGIEVUY2PFRYK5DRUGBP0CV2KIGECR3EU1ZMNIJK' # your Foursquare Secret
VERSION = '20180605'

In [16]:
LIMIT = 100
radius = 500
venues = []
for lat,long,post,borough,neighbourhood in zip(df_merge['Latitude'] , df_merge['Longitude'] , df_merge['Postal Code'] , df_merge['Borough'] , df_merge['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(CLIENT_ID,CLIENT_SECRET,VERSION,lat,long,radius,LIMIT)
    
    results = requests.get(url).json()['response']['groups'][0]['items']

    for venue in results:
        venues.append((post,borough,neighbourhood,lat,long,venue['venue']['name'],venue['venue']['location']['lat'],venue['venue']['location']['lng'],venue['venue']['categories'][0]['name']))
        

In [47]:
venues_df = pd.DataFrame(venues)
venues_df.columns = ['Postal Code' , 'Borough' , 'Neighborhood' , 'B_Latitude' , 'B_Longitude' , 'V_Name' , 'V_Latitude' , 'V_Longitude' , 'V_Category']
print(venues_df.shape)
venues_df.head()

(2244, 9)


Unnamed: 0,Postal Code,Borough,Neighborhood,B_Latitude,B_Longitude,V_Name,V_Latitude,V_Longitude,V_Category
0,M3A,North York,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,North York,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,M3A,North York,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M3A,North York,Parkwoods,43.753259,-79.329656,GreenWin pool,43.756232,-79.333842,Pool
4,M4A,North York,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [48]:
venues_df.groupby(['Postal Code' , 'Borough' , 'Neighborhood'])['V_Name'].count()

Postal Code  Borough           Neighborhood                                                                                                                          
M1B          Scarborough       Rouge, Malvern                                                                                                                             1
M1C          Scarborough       Highland Creek, Rouge Hill, Port Union                                                                                                     1
M1E          Scarborough       Guildwood, Morningside, West Hill                                                                                                          8
M1G          Scarborough       Woburn                                                                                                                                     4
M1H          Scarborough       Cedarbrae                                                                                                          

### Onehot Encoding of Different Categories

In [49]:
venues_onehot = pd.get_dummies(venues_df[['V_Category']] , prefix = "" , prefix_sep = "")

venues_onehot['Postal Code'] = venues_df['Postal Code']
venues_onehot['Borough'] = venues_df['Borough']
venues_onehot['Neighborhoods'] = venues_df['Neighborhood']

column1 = list(venues_onehot.columns[-3:])+ list(venues_onehot.columns[:-3])
venues_onehot = venues_onehot[column1]

print(venues_onehot.shape)
venues_onehot.head()

(2244, 277)


Unnamed: 0,Postal Code,Borough,Neighborhoods,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M3A,North York,Parkwoods,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,North York,Victoria Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Frequency of Categories

In [19]:
df_freq = venues_onehot.groupby(['Postal Code' , 'Borough' , 'Neighborhoods']).mean().reset_index()
print(df_freq.shape)
df_freq.head()

(100, 277)


Unnamed: 0,Postal Code,Borough,Neighborhoods,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,Scarborough,"Rouge, Malvern",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,Scarborough,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,Scarborough,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Top 5 Frequency of Venues in Each Neighborhood

In [65]:
num = 5
indicators = ['st' , 'nd' , 'rd']

area_col = ['Postal Code' , 'Borough' , 'Neighborhoods']
freq_col = []
for gh in np.arange(num):
    try:
        freq_col.append('{}{} Most Common Venue'.format(gh+1 , indicators[ind]))
    except:
        freq_col.append('{}th Most Common Venue'.format(gh+1))
final_col = area_col + freq_col

df_sorted = pd.DataFrame(columns = final_col)
df_sorted['Postal Code'] = df_freq['Postal Code']
df_sorted['Borough'] = df_freq['Borough']
df_sorted['Neighborhoods'] = df_freq['Neighborhoods']

for gh in np.arange(df_freq.shape[0]):
    row_categories = df_freq.iloc[gh,:].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    df_sorted.iloc[gh , 3:] = row_categories_sorted.index.values[0:num]
    
df_sorted.sort_values(freq_col , inplace = True)
df_sorted

Unnamed: 0,Postal Code,Borough,Neighborhoods,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue
66,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Boat or Ferry
25,M3C,North York,"Flemingdon Park, Don Mills South",Asian Restaurant,Gym,Coffee Shop,Beer Store,Grocery Store
4,M1H,Scarborough,Cedarbrae,Athletics & Sports,Fried Chicken Joint,Hakka Restaurant,Thai Restaurant,Bakery
11,M1R,Scarborough,"Maryvale, Wexford",Auto Garage,Middle Eastern Restaurant,Smoke Shop,Breakfast Spot,Shopping Mall
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",Bakery,Bus Line,Park,Intersection,Fast Food Restaurant
18,M2K,North York,Bayview Village,Bank,Café,Japanese Restaurant,Chinese Restaurant,Yoga Studio
91,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",Bank,Golf Course,Yoga Studio,Dim Sum Restaurant,Diner
21,M2P,North York,York Mills West,Bank,Park,Electronics Store,Yoga Studio,Dim Sum Restaurant
75,M6J,West Toronto,"Little Portugal, Trinity",Bar,Asian Restaurant,Coffee Shop,Restaurant,New American Restaurant
80,M6P,West Toronto,"High Park, The Junction South",Bar,Café,Mexican Restaurant,Grocery Store,Italian Restaurant


### K-Means Clustering

In [90]:
k = 5
df_clustering = df_freq.drop(['Neighborhoods' , 'Postal Code' , 'Borough'], 1)

kmeans = KMeans(n_clusters = k, random_state = 0).fit(df_clustering)
kmeans.labels_[0:10]
#df_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_fin
toronto_merged = toronto_merged.join(df_sorted.drop(['Borough', 'Neighborhoods'], 1).set_index('Postal Code'), on = 'Postal Code')
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1th Most Common Venue,2th Most Common Venue,3th Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,0,Coffee Shop,Park,Café,Pub,Bakery,0
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,2,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,2
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Coffee Shop,Hotel,Restaurant,Café,Cosmetics Shop,2
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Health Food Store,Music Venue,Neighborhood,Pub,Yoga Studio,2
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,2,Coffee Shop,Cocktail Bar,Restaurant,Steakhouse,Bakery,2


### Map of Clusters

In [91]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start = 11)
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range (k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters