This is a notebook for the 3rd week's assignment of capstone project.
It is about wrangling,segmenting, and clustering locations in the city
of Toronto in Canada's Ontario province. I start with importing (and
installing if necessary) required libraries throghout the notebook.
Because I use Ubuntu, I included pip commands for installations. Also
please note that because these libraries were installed in earlier versions
of this notebook, installation commands are commented out at the moment.

In [1]:
#Required libraries FOR:

#data processing
import numpy as np
import pandas as pd

#web scraping
import urllib.request
from bs4 import BeautifulSoup

#plotting maps
#!conda install -c conda-forge folium --yes
#I have Ubuntu on my laptop so 
#!pip3 install folium
import folium

#retrieving latitude-longitude of Neighbors
from geopy.geocoders import Nominatim

#clustering
#!pip3 install --upgrade scipy
#!pip3 install --upgrade scikit-learn
from sklearn.cluster import KMeans

#getting data and conversion from JSON
import requests
from pandas.io.json import json_normalize


## Part 1) Download and Explore Dataset

In [2]:
#Create a soup object from given url 
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "lxml")

In [3]:
#Find all tables in the url, then pick the desired one
all_tables=soup.find_all("table")
#all_tables

right_table=soup.find('table', class_='wikitable sortable')
#right_table

In [4]:
#Initiate arrays for storing contents of table
po_codes=[]
boroghs=[]
n_hoods=[]

In [5]:
#Populate arrays with data extracted from wiki table.
#To trim new line marks, use slicing [:-1] 
for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        po_codes.append(cells[0].find(text=True)[:-1])
        boroghs.append(cells[1].find(text=True)[:-1])
        n_hoods.append(cells[2].find(text=True)[:-1])

In [6]:
#Initiate an empty dataframe with desired column names
column_names = ['PostalCode', 'Borough', 'Neighborhood']
ns_df = pd.DataFrame(columns=column_names)

In [7]:
#Populate the dataframe with the extracted data
ns_df['PostalCode']=po_codes
ns_df['Borough']=boroghs
ns_df['Neighborhood']=n_hoods

### Below I will remove data without a Borough name as instructed

In [8]:
#Remove rows without a Borough name
ns_df = ns_df.drop(ns_df[ns_df['Borough'] == 'Not assigned'].index).reset_index(drop=True)
ns_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Below I will check if dataframe has a borough but a Not assigned neighborhood.

In [9]:
#Check if dataframe has a borough but a Not assigned neighborhood.
#This is done by first locating NULL elements in dataframe 'Neighborhood'
#column. Following .isnull test, we create a list from indices of elements
#in test with 'False' value and call it check list. Finally we compare 
#lengths of test list and check list. In case comparison is 'True', we have
#no NULL elements. If comparison gives 'False', we have Not assigned 
#neighborhoods.
test = pd.isnull(ns_df.loc[:, 'Neighborhood'])
check = np.where(test==False)[0]
len(test)==len(check)

True

### Below I will check the shape of dataframe which concludes Part 1 of assignment.

In [10]:
#Check the shape of dataframe.
ns_df.shape

(103, 3)

## Part 2) Data wrangling plots and prep for clustering.
#### Please note that because of some rendering issues, map plots may not be visible. To see them, you need to download the notebook and run the contents!!!

In [11]:
#Get latitude longitude data from csv file.
all_locs=pd.read_csv('Geospatial_Coordinates.csv')

In [12]:
#Create two empty lists for storing latitude/longitude data
latitude=[]
longitude=[]

In [13]:
#Populate lists with the data from csv file.
for i in range(len(ns_df)): 
    mark = ns_df['PostalCode'][i]
    mark_val=all_locs[all_locs['Postal Code']==mark]
    latitude += [mark_val.iloc[0]['Latitude'] ]
    longitude += [mark_val.iloc[0]['Longitude']]

In [14]:
#Add 'Latitude' and 'Longitude' columns/values to dataframe.
#Please note that these are the 'Latitude' and 'Longitude'
#values associated with Postal Codes.
ns_df['Latitude']=latitude
ns_df['Longitude']=longitude
ns_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [15]:
#Here we plot all neighborhoods of Toronto
map_toronto_allNeighs = folium.Map(location=[latitude[0], longitude[0]], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(ns_df['Latitude'], ns_df['Longitude'], ns_df['Borough'], ns_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_allNeighs)  
    
map_toronto_allNeighs

In [16]:
#Here we pick only 'Toronto' including boroghs. There is a regex way of doing it
#but I am lazy.
downtown_toronto = ns_df[ns_df['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
central_toronto = ns_df[ns_df['Borough'] == 'Central Toronto'].reset_index(drop=True)
east_toronto = ns_df[ns_df['Borough'] == 'East Toronto'].reset_index(drop=True)
west_toronto = ns_df[ns_df['Borough'] == 'West Toronto'].reset_index(drop=True)
ton_df=pd.concat([downtown_toronto,central_toronto,east_toronto,west_toronto])
ton_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
9,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576


In [17]:
#ton_df has multiple Neighborhoods per Borogh. This will complicate 
#later clustering work. Therefore, we will explode Neighborhood column
#to individual elements while keeping associated Borogh and Latitude/
#Longitude information. 
ton_df = \
(ton_df.set_index(ton_df.columns.drop('Neighborhood',0).tolist())
   .Neighborhood.str.split(', ', expand=True)
   .stack()
   .reset_index()
   .rename(columns={0:'Neighborhood'})
   .loc[:, ns_df.columns]
)
ton_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
1,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
3,M7A,Downtown Toronto,Ontario Provincial Government,43.662301,-79.389494
4,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
5,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
6,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
7,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
8,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
9,M6G,Downtown Toronto,Christie,43.669542,-79.422564


In [18]:
#I will center the map to Island airport for better visualization.
map_toronto_Toronto = folium.Map(location=[ton_df.loc[31,'Latitude'], ton_df.loc[31,'Longitude']], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(ton_df['Latitude'], ton_df['Longitude'], ton_df['Borough'], ton_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_Toronto)  
    
map_toronto_Toronto

In [19]:
#Foursquare base info
CLIENT_ID = 'RYCX141XUBTJKRVJMLA51KZVARP4KANFZDUVGY4SNPBT0JV0' # your Foursquare ID
CLIENT_SECRET = 'BRS3PNN0IUEIUAJ4LYDWJ0EWFVVA22FKZMCUHC4331QBADIH' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: RYCX141XUBTJKRVJMLA51KZVARP4KANFZDUVGY4SNPBT0JV0
CLIENT_SECRET:BRS3PNN0IUEIUAJ4LYDWJ0EWFVVA22FKZMCUHC4331QBADIH


In [20]:
ton_df.loc[:10, 'Neighborhood']

0                       Regent Park
1                      Harbourfront
2                      Queen's Park
3     Ontario Provincial Government
4                   Garden District
5                           Ryerson
6                    St. James Town
7                       Berczy Park
8                Central Bay Street
9                          Christie
10                         Richmond
Name: Neighborhood, dtype: object

In [21]:
ton_df.shape

(78, 5)

In [22]:
#Change column names Latitude and Longitude to Latitude_B and
#Longitude_B to indicate they are for PO locations.
ton_df=ton_df.rename(columns={"Latitude":"Latitude_PO", "Longitude":"Longitude_PO"})

In [23]:
#Geocode is not able to find locations of some Neighborhoods, so I will drop them from
#the list before analysis.
black_list=['Ontario Provincial Government','Railway Lands','Island airport','Stn A PO Boxes', 
           'Business reply mail Processing Centre','South Central Letter Processing Plant Toronto']
for i in black_list:
    ton_df = ton_df.drop(ton_df[ton_df['Neighborhood'] == i].index).reset_index(drop=True)

In [24]:
#ton_df.loc[59,'Neighborhood']

In [25]:
#Cleaned up list is ready for the use in geocode. Now we extract location data for
#Neighboors and store them in lat and long arrays.
n_latitude=[]
n_longitude=[]
city = 'Toronto, Canada'
boroughs  = ton_df['Neighborhood']
geolocator = Nominatim(user_agent="toron_explorer")
#i=0
for borough in boroughs:
    address = borough + ', ' + city
    location = geolocator.geocode(address)
    n_latitude += [location.latitude]
    n_longitude += [location.longitude]
    #print(i,borough,location.latitude,location.longitude)
    #i+=1

In [26]:
ton_df["Latitude_N"]=n_latitude
ton_df["Longitude_N"]=n_longitude

In [27]:
ton_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude_PO,Longitude_PO,Latitude_N,Longitude_N
0,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,43.660706,-79.360457
1,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,43.64008,-79.38015
2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,43.659659,-79.39034
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,43.6565,-79.377114
4,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,43.658469,-79.378993


In [28]:
#Function to request from Foursquare for top 100 venues and within
#500m range.
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    LIMIT=100
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [29]:
ton_venues = getNearbyVenues(names=ton_df['Neighborhood'],
                                   latitudes=ton_df["Latitude_N"],
                                   longitudes=ton_df['Longitude_N']
                                  )

Regent Park
Harbourfront
Queen's Park
Garden District
Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond
Adelaide
King
Harbourfront East
Union Station
Toronto Islands
Toronto Dominion Centre
Design Exchange
Commerce Court
Victoria Hotel
University of Toronto
Harbord
Kensington Market
Chinatown
Grange Park
CN Tower
King and Spadina
Harbourfront West
Bathurst Quay
South Niagara
Rosedale
St. James Town
Cabbagetown
First Canadian Place
Underground city
Church and Wellesley
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
Forest Hill Road Park
North Toronto West
 Lawrence Park
The Annex
North Midtown
Yorkville
Davisville
Moore Park
Summerhill East
Summerhill West
Rathnelly
South Hill
Forest Hill SE
Deer Park
The Beaches
The Danforth West
Riverdale
India Bazaar
The Beaches West
Studio District
Dufferin
Dovercourt Village
Little Portugal
Trinity
Brockton
Parkdale Village
Exhibition Place
High Park
The Junction South
Parkdale
Roncesvalles
Runnymede
Swansea


In [30]:
print(ton_venues.shape)
ton_venues.head()

(3640, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park,43.660706,-79.360457,Regent Park Aquatic Centre,43.6606,-79.361392,Pool
1,Regent Park,43.660706,-79.360457,Daniels Spectrum,43.660137,-79.361808,Performing Arts Venue
2,Regent Park,43.660706,-79.360457,Thai To Go,43.663418,-79.36071,Thai Restaurant
3,Regent Park,43.660706,-79.360457,Sumach Espresso,43.658135,-79.359515,Coffee Shop
4,Regent Park,43.660706,-79.360457,Paintbox Bistro,43.66005,-79.362855,Restaurant


In [31]:
ton_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Lawrence Park,50,50,50,50,50,50
Adelaide,100,100,100,100,100,100
Bathurst Quay,25,25,25,25,25,25
Berczy Park,100,100,100,100,100,100
Brockton,19,19,19,19,19,19
...,...,...,...,...,...,...
Underground city,15,15,15,15,15,15
Union Station,55,55,55,55,55,55
University of Toronto,30,30,30,30,30,30
Victoria Hotel,38,38,38,38,38,38


In [32]:
print('There are {} uniques categories.'.format(len(ton_venues['Venue Category'].unique())))

There are 290 uniques categories.


## Part 3) Analysis of neighborhoods through clustering 
#### Please note that most of the following code is from the given notebook in lectures and exactly follows the clustering procedure given before!!!

In [33]:
# one hot encoding
ton_onehot = pd.get_dummies(ton_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ton_onehot['Neighborhood'] = ton_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ton_onehot.columns[-1]] + list(ton_onehot.columns[:-1])
ton_onehot = ton_onehot[fixed_columns]

ton_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Service,American Restaurant,Animal Shelter,Antique Shop,Aquarium,...,Tunnel,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
ton_onehot.shape

(3640, 290)

In [35]:
ton_grouped = ton_onehot.groupby('Neighborhood').mean().reset_index()
ton_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Service,American Restaurant,Animal Shelter,Antique Shop,...,Tunnel,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,Lawrence Park,0.020000,0.0,0.00,0.0,0.00,0.00,0.000000,0.0,0.00,...,0.00,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.00,0.0,0.00
1,Adelaide,0.000000,0.0,0.00,0.0,0.00,0.00,0.040000,0.0,0.00,...,0.00,0.0,0.010000,0.000000,0.0,0.000000,0.0,0.01,0.0,0.00
2,Bathurst Quay,0.000000,0.0,0.00,0.0,0.04,0.04,0.000000,0.0,0.00,...,0.04,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.00,0.0,0.00
3,Berczy Park,0.010000,0.0,0.00,0.0,0.00,0.00,0.010000,0.0,0.01,...,0.00,0.0,0.010000,0.000000,0.0,0.000000,0.0,0.00,0.0,0.00
4,Brockton,0.000000,0.0,0.00,0.0,0.00,0.00,0.000000,0.0,0.00,...,0.00,0.0,0.000000,0.000000,0.0,0.105263,0.0,0.00,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,Underground city,0.000000,0.0,0.00,0.0,0.00,0.00,0.000000,0.0,0.00,...,0.00,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.00,0.0,0.00
67,Union Station,0.000000,0.0,0.00,0.0,0.00,0.00,0.018182,0.0,0.00,...,0.00,0.0,0.018182,0.000000,0.0,0.000000,0.0,0.00,0.0,0.00
68,University of Toronto,0.033333,0.0,0.00,0.0,0.00,0.00,0.000000,0.0,0.00,...,0.00,0.0,0.000000,0.033333,0.0,0.000000,0.0,0.00,0.0,0.00
69,Victoria Hotel,0.000000,0.0,0.00,0.0,0.00,0.00,0.000000,0.0,0.00,...,0.00,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.00,0.0,0.00


In [36]:
num_top_venues = 5

for hood in ton_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = ton_grouped[ton_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Lawrence Park----
                venue  freq
0    Sushi Restaurant  0.08
1  Italian Restaurant  0.08
2              Bakery  0.06
3         Coffee Shop  0.06
4                 Pub  0.04


----Adelaide----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.06
2            Gastropub  0.04
3  Japanese Restaurant  0.04
4  American Restaurant  0.04


----Bathurst Quay----
             venue  freq
0      Coffee Shop  0.16
1             Café  0.12
2             Park  0.08
3           Tunnel  0.04
4  Harbor / Marina  0.04


----Berczy Park----
                 venue  freq
0          Coffee Shop  0.08
1                 Café  0.06
2           Restaurant  0.06
3  Japanese Restaurant  0.04
4   Italian Restaurant  0.04


----Brockton----
                   venue  freq
0                    Bar  0.16
1                   Park  0.11
2  Vietnamese Restaurant  0.11
3            Coffee Shop  0.05
4               Dive Bar  0.05


----CN Tower----
         venue  freq
0

            venue  freq
0            Café  0.09
1  Clothing Store  0.07
2     Coffee Shop  0.07
3        Boutique  0.04
4          Museum  0.04


----Underground city----
                  venue  freq
0                  Bank  0.07
1  Outdoor Supply Store  0.07
2       Bubble Tea Shop  0.07
3          Burger Joint  0.07
4   Sporting Goods Shop  0.07


----Union Station----
                 venue  freq
0          Coffee Shop  0.15
1                 Café  0.07
2  Japanese Restaurant  0.05
3           Restaurant  0.04
4        Deli / Bodega  0.04


----University of Toronto----
                 venue  freq
0                 Café  0.17
1  Japanese Restaurant  0.07
2            Bookstore  0.07
3                 Park  0.07
4   Italian Restaurant  0.07


----Victoria Hotel----
           venue  freq
0    Coffee Shop  0.18
1    Pizza Place  0.08
2           Café  0.08
3  Grocery Store  0.08
4       Pharmacy  0.03


----Yorkville----
                venue  freq
0                Café  0.05
1     

In [37]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [38]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ton_grouped['Neighborhood']

for ind in np.arange(ton_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ton_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Lawrence Park,Italian Restaurant,Sushi Restaurant,Coffee Shop,Bakery,Pub,Bank,Ice Cream Shop,Asian Restaurant,Cosmetics Shop,Pool
1,Adelaide,Café,Coffee Shop,Gym,Gastropub,American Restaurant,Restaurant,Japanese Restaurant,Cosmetics Shop,Seafood Restaurant,Breakfast Spot
2,Bathurst Quay,Coffee Shop,Café,Park,Boat or Ferry,Gym,Sculpture Garden,Caribbean Restaurant,Ramen Restaurant,Garden,Sushi Restaurant
3,Berczy Park,Coffee Shop,Restaurant,Café,Italian Restaurant,Japanese Restaurant,Gym,Breakfast Spot,Cocktail Bar,Beer Bar,Bakery
4,Brockton,Bar,Park,Vietnamese Restaurant,Grocery Store,Café,French Restaurant,Dive Bar,Korean Restaurant,Gastropub,Bakery


In [39]:
###CLUSTERIUNG AND ANALAYSIS

In [40]:
# set number of clusters
kclusters = 5

ton_grouped_clustering = ton_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ton_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

In [41]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ton_merged = ton_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ton_merged = ton_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

ton_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude_PO,Longitude_PO,Latitude_N,Longitude_N,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,43.660706,-79.360457,2,Coffee Shop,Thai Restaurant,Pet Store,Indian Restaurant,Performing Arts Venue,Pharmacy,Pool,Pub,Restaurant,Electronics Store
1,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,43.64008,-79.38015,2,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Music Venue,Sports Bar,Sushi Restaurant,Brewery,Steakhouse
2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,43.659659,-79.39034,2,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Japanese Restaurant,Thai Restaurant,Ice Cream Shop,French Restaurant,Chinese Restaurant,Bubble Tea Shop
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,43.6565,-79.377114,2,Clothing Store,Restaurant,Coffee Shop,Hotel,Bookstore,Japanese Restaurant,Tea Room,Theater,Cosmetics Shop,Fast Food Restaurant
4,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,43.658469,-79.378993,2,Coffee Shop,Clothing Store,Italian Restaurant,Japanese Restaurant,Restaurant,Ramen Restaurant,Burger Joint,Diner,Café,Middle Eastern Restaurant


In [42]:
ton_df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude_PO,Longitude_PO,Latitude_N,Longitude_N
0,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,43.660706,-79.360457
1,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,43.64008,-79.38015
2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,43.659659,-79.39034
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,43.6565,-79.377114
4,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,43.658469,-79.378993


In [43]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[ton_df.loc[31,'Latitude_PO'], ton_df.loc[31,'Longitude_PO']], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ton_merged['Latitude_N'], ton_merged['Longitude_N'], ton_merged['Neighborhood'], ton_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters