# part 1
creating a pandas dataframe from the Wikipedia table

In [1]:
#using pandas and numpy
import pandas as pd
import numpy as np

In [2]:
# Fetch the Toronto ("M") postal-code tables from Wikipedia; `link` is a URL, so this cell needs network access.
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
tables = pd.read_html(link,header=0)  # header=0: first row of each table becomes the column names
df = pd.DataFrame(tables[0])  #read_html returns many tables, by looking at them I see that 0 is the correct one
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# Keep only the rows whose Borough has actually been assigned.
has_borough = df["Borough"] != "Not assigned"
df = df.loc[has_borough]
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [4]:
# Collapse the table to one entry per (Postcode, Borough) pair, collecting
# every neighborhood of that pair into a single comma-separated string.
gb = df.groupby(['Postcode', 'Borough'])
postcodes, boroughs, Neighborhoods = [], [], []
for (postcode, borough), members in gb:
    postcodes.append(postcode)
    boroughs.append(borough)
    names = members.Neighborhood.values
    # start from the first name, then append the remaining ones one by one
    joined = names[0]
    for extra in names[1:]:
        joined = joined + ", " + extra
    Neighborhoods.append(joined)


In [5]:
# Assemble the grouped lists into one dataframe (one row per postal code).
daf = pd.DataFrame(dict(Postalcode=postcodes,
                        Borough=boroughs,
                        Neighborhood=Neighborhoods))

In [6]:
# Replace "Not assigned" neighborhood names with the borough name.
# The previous version compared against the literal "'Not assigned'" (with embedded
# quotes), which never matches the unquoted values, and wrote through chained
# indexing. Stray single quotes are stripped before comparing so both spellings
# are recognised, and the write goes through .loc so it always reaches `daf`.
unassigned = daf["Neighborhood"].str.strip("'") == "Not assigned"
for borough in daf.loc[unassigned, "Borough"]:
    print(borough)  # report which boroughs were used as a fallback name
daf.loc[unassigned, "Neighborhood"] = daf.loc[unassigned, "Borough"]

# dataframe for part 1

In [7]:
# Display the grouped postal-code dataframe (Postalcode, Borough, Neighborhood).
daf

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [8]:
# (rows, columns) of the grouped dataframe
daf.shape

(103, 3)

# part 2
## Add Latitude and  Longitude to the dataframe

In [9]:
# import geocoder - a library for geolocation data
!conda install -c conda-forge geocoder --yes 
import geocoder 
import time

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [10]:
#using geocoder to get coordinates for all the neighborhoods by postal code
#this cell takes a few minutes to run due to requests from web services

latitude = []
longitude = []

for i in range(len(daf.Postalcode)):
    # using komoot instead of google because it returned better results
    #if malfunctioning <komoot> can be replaced by <osm> or <google>
    query = str(daf.Postalcode[i]) + " , "+str(daf.Borough[i]) + ', Toronto, Ontario'

    for attempt in range(10): # up to ten attempts per postal code
        g = geocoder.komoot(query)
        time.sleep(1) # 1 second between calls to not get locked out
        if g.ok:
            #print(g.latlng) #uncomment to see in real time that we get coordinates for all
            latitude.append(g.latlng[0])
            longitude.append(g.latlng[1])
            break
    else:
        # all ten attempts failed: keep the row aligned with a None placeholder and notify the user
        print(i)
        latitude.append(None)
        longitude.append(None)
        print("problem with " + str(daf.Postalcode[i]) + " , "+str(daf.Borough[i]))

## dataframe for part 2:

In [11]:
# Extend `daf` with the geocoded coordinates. Building on `daf` rather than on the
# raw postcodes/boroughs/Neighborhoods lists keeps any clean-up applied to `daf`
# (e.g. replaced "Not assigned" names) in the resulting table.
assert len(latitude) == len(daf) == len(longitude), "one coordinate pair is expected per postal code"
toronto_df = daf.assign(Latitude=latitude, Longitude=longitude)
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.819623,-79.184498
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.787521,-79.188785
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.742517,-79.207875
3,M1G,Scarborough,Woburn,43.778504,-79.222183
4,M1H,Scarborough,Cedarbrae,43.785792,-79.22781


In [12]:
# Summarise the table: distinct boroughs and number of rows.
borough_count = len(toronto_df['Borough'].unique())
row_count = toronto_df.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 neighborhoods.


* For part 3 (exploration and analysis), for aesthetic reasons I want each row to have just one neighborhood name, so I keep only the first name for each postal code; all names listed under one postal code share the same coordinates.

In [13]:
# Keep only the first neighborhood name of each postal code.
# Derived from daf["Neighborhood"] instead of from the list being overwritten, so the
# cell gives the same result on every run and picks up clean-ups applied to `daf`.
Neighborhoods = [name.split(",")[0].strip("'") for name in daf["Neighborhood"]]

In [14]:
# Rebuild the table, now with a single neighborhood name per row.
toronto_df = pd.DataFrame(dict(Postalcode=postcodes,
                               Borough=boroughs,
                               Neighborhood=Neighborhoods,
                               Latitude=latitude,
                               Longitude=longitude))
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.819623,-79.184498
1,M1C,Scarborough,Highland Creek,43.787521,-79.188785
2,M1E,Scarborough,Guildwood,43.742517,-79.207875
3,M1G,Scarborough,Woburn,43.778504,-79.222183
4,M1H,Scarborough,Cedarbrae,43.785792,-79.22781


# part 3 - exploration
I'll replicate the same analysis we did to the New York City data.

first thing I want to do is look at the data on a map, for that I'll import folium

In [15]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [16]:
# create map of Toronto using latitude and longitude values
Location = [43.7, -79.4]
map_toronto = folium.Map(location=Location, zoom_start=10)

# The geocoding step stores None when a postal code could not be resolved;
# such rows cannot be placed on the map, so they are skipped here.
located = toronto_df.dropna(subset=['Latitude', 'Longitude'])

# add one circle marker per row, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(located['Latitude'],
                                           located['Longitude'],
                                           located['Borough'],
                                           located['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

First of all, we see that the data is OK and we got what we expected: a map of Toronto with the neighborhoods marked by postal code. Now I'll look for data about the neighborhoods from the Foursquare API.

In [17]:
# Foursquare credentials are read from the environment so that they are never
# stored in (or published with) the notebook. Keys that were previously committed
# in this cell should be considered exposed and be rotated.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
VERSION = '20191201' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        "Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment "
        "variables before running this notebook."
    )


Let's first check, on a single neighborhood, that the Foursquare API returns the data we want.

In [18]:

radius = 500 # search radius passed to the API
LIMIT = 100 # limit of number of venues returned by Foursquare API

# coordinates of just the first neighborhood in the dataframe
first_latitude = toronto_df['Latitude'][0]
first_longitude = toronto_df['Longitude'][0]

# create URL
url = ('https://api.foursquare.com/v2/venues/explore'
       '?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}').format(
    CLIENT_ID, CLIENT_SECRET, VERSION,
    first_latitude, first_longitude,
    radius, LIMIT)

In [19]:
import requests # library to handle web requests


In [20]:
# Send the GET request and decode the JSON body of the reply.
response = requests.get(url)
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5e0895ac0de0d9001cb67004'},
 'response': {'headerLocation': 'Rouge',
  'headerFullLocation': 'Rouge, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 41,
  'suggestedBounds': {'ne': {'lat': 43.824123304500006,
    'lng': -79.17827251325099},
   'sw': {'lat': 43.8151232955, 'lng': -79.1907228485828}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4cd854fd3ec4b1f71900be3f',
       'name': 'African Rainforest Pavilion',
       'location': {'lat': 43.81772505914066,
        'lng': -79.18343284457424,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.81772505914066,
          'lng': -79.18343284457424}],
        'distance': 227,
        'cc': 'CA',
        'neighborhood': 'Rouge',
        'city

Looks good we get a lot of data back in JSON format 


we need to interpret it and structure it into a pandas dataframe.

In [21]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


In [22]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    The category list is looked up under the key 'categories' first and under
    'venue.categories' if that key is absent. Returns None when the record has
    no usable category (empty list, None or NaN).
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; other errors are not swallowed
        categories_list = row['venue.categories']

    try:
        first_category = categories_list[0]
    except (TypeError, IndexError):
        # TypeError: value is None/NaN (not subscriptable); IndexError: empty list
        return None
    return first_category['name']

In [23]:
venues = results['response']['groups'][0]['items']

# flatten the nested JSON and keep only the columns of interest
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues)[filtered_columns]

# reduce each category list to the name of its first category
nearby_venues = nearby_venues.assign(
    **{'venue.categories': nearby_venues.apply(get_category_type, axis=1)})

# keep only the last dotted component of each column name
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,African Rainforest Pavilion,Zoo Exhibit,43.817725,-79.183433
1,Toronto Zoo,Zoo,43.820582,-79.181551
2,Penguin Exhibit,Zoo Exhibit,43.819435,-79.185959
3,Orangutan Exhibit,Zoo Exhibit,43.818413,-79.182548
4,Polar Bear Exhibit,Zoo,43.823372,-79.185145


lets see for the first neighborhood how many venues were returned by Foursquare

In [24]:
venue_count = nearby_venues.shape[0]  # one row per returned venue
print('{} venues were returned by Foursquare.'.format(venue_count))

41 venues were returned by Foursquare.


Now we can repeat the process for all 103 neighborhoods of Toronto.

In [25]:
# function that gets a neighborhood's location and returns venues nearby

def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Query the Foursquare explore endpoint around each location and collect the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to query.
    radius : int, default 500
        Value sent as the `radius` query parameter.
    limit : int or None, default None
        Value sent as the `limit` query parameter; None falls back to the
        notebook-level LIMIT constant.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        'Venue Category' is None for venues that carry no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    if limit is None:
        limit = LIMIT

    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # locations without coordinates (failed geocoding) cannot be queried
        if pd.isna(lat) or pd.isna(lng):
            continue

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request; fail with the HTTP error rather than a KeyError further down
        response = requests.get(url)
        response.raise_for_status()
        groups = response.json()["response"].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for v in items:
            venue = v['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works when no venue was found
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [26]:
# Fetch the venues around every neighborhood of the table (one API call per row).
TorontoVenues = getNearbyVenues(
    toronto_df['Neighborhood'],
    toronto_df['Latitude'],
    toronto_df['Longitude'],
)



In [27]:
# Report the number of venues (rows); the message previously interpolated the whole
# shape tuple and read "(2940, 7) Venues ...".
print("{} Venues in Toronto returned by Foursquare API".format(TorontoVenues.shape[0]))
TorontoVenues.head()

(2940, 7) Venues in Toronto returned by Foursquare API


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rouge,43.819623,-79.184498,African Rainforest Pavilion,43.817725,-79.183433,Zoo Exhibit
1,Rouge,43.819623,-79.184498,Toronto Zoo,43.820582,-79.181551,Zoo
2,Rouge,43.819623,-79.184498,Penguin Exhibit,43.819435,-79.185959,Zoo Exhibit
3,Rouge,43.819623,-79.184498,Orangutan Exhibit,43.818413,-79.182548,Zoo Exhibit
4,Rouge,43.819623,-79.184498,Polar Bear Exhibit,43.823372,-79.185145,Zoo


Let's check how many venues were returned for each neighborhood

In [28]:
# Non-null count of every column, per neighborhood.
venues_by_neighborhood = TorontoVenues.groupby('Neighborhood')
venues_by_neighborhood.count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Agincourt,14,14,14,14,14,14
Agincourt North,5,5,5,5,5,5
Albion Gardens,3,3,3,3,3,3
Alderwood,5,5,5,5,5,5
Bathurst Manor,4,4,4,4,4,4
Bayview Village,13,13,13,13,13,13
Bedford Park,1,1,1,1,1,1
Berczy Park,100,100,100,100,100,100
Birch Cliff,4,4,4,4,4,4


#### Let's find out how many unique categories can be curated from all the returned venues

In [29]:
unique_categories = TorontoVenues['Venue Category'].unique()
print('There are {} unique categories.'.format(len(unique_categories)))

There are 290 unique categories.


## Analyzing Neighborhoods

In [30]:
# one hot encoding: one indicator column per venue category
OnehotToronto = pd.get_dummies(TorontoVenues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called "Neighborhood" would be silently overwritten
# by the neighborhood-name column added below (the frame then has one column fewer
# than categories + 1). Rename it first so both columns survive.
if 'Neighborhood' in OnehotToronto.columns:
    OnehotToronto = OnehotToronto.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood name as the first column
OnehotToronto.insert(0, 'Neighborhood', TorontoVenues['Neighborhood'])

OnehotToronto.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport Lounge,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Arcade,Art Gallery,...,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [31]:
# (rows, columns) of the one-hot encoded frame
OnehotToronto.shape

(2940, 290)

#### Next, let's group rows by neighborhood and take the mean frequency (MF) of occurrence of each category

In [32]:
# Average the indicator columns within each neighborhood, then turn the
# neighborhood index back into a regular column.
category_means = OnehotToronto.groupby('Neighborhood').mean()
torontoMF = category_means.reset_index()
torontoMF.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport Lounge,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Arcade,Art Gallery,...,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Adelaide,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# (rows, columns) of the per-neighborhood mean-frequency table
torontoMF.shape

(103, 290)

Let's look at the top 5 venues for each neighborhood.

In [34]:
num_top_venues = 5

# Print, for every neighborhood, its most frequent venue categories.
for hood in torontoMF['Neighborhood']:
    print("----"+hood+"----")
    # transpose the neighborhood's row into a two-column (venue, freq) table
    hood_row = torontoMF.loc[torontoMF['Neighborhood'] == hood]
    freq_table = hood_row.T.reset_index()
    freq_table.columns = ['venue','freq']
    # drop the first entry (the neighborhood name) and round the frequencies
    freq_table = freq_table.iloc[1:].astype({'freq': float}).round({'freq': 2})
    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Adelaide----
                 venue  freq
0          Coffee Shop  0.07
1       Clothing Store  0.05
2           Restaurant  0.04
3       Cosmetics Shop  0.04
4  American Restaurant  0.04


----Agincourt----
                    venue  freq
0      Chinese Restaurant  0.21
1     Rental Car Location  0.14
2           Shopping Mall  0.07
3  Peking Duck Restaurant  0.07
4       Korean Restaurant  0.07


----Agincourt North----
                venue  freq
0  Chinese Restaurant   0.4
1  Athletics & Sports   0.2
2         Coffee Shop   0.2
3            Bus Stop   0.2
4   Accessories Store   0.0


----Albion Gardens----
                  venue  freq
0          Burger Joint  0.33
1    Chinese Restaurant  0.33
2  Fast Food Restaurant  0.33
3             Nightclub  0.00
4          Noodle House  0.00


----Alderwood----
                    venue  freq
0                   Trail   0.4
1                   Beach   0.2
2                    Park   0.2
3       College Cafeteria   0.2
4  Peking Duck Res

#### Let's put that into a pandas dataframe

In [35]:
#a function to sort the venues in descending order:

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first entry of `row` is skipped; the remaining entries are sorted in
    descending order and the index labels of the leading ones are returned
    as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [36]:
#creating a new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take 'st'/'nd'/'rd', every later rank takes 'th'
    # (explicit bounds check instead of a bare `except` used as control flow)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = torontoMF['Neighborhood']

# fill the rank columns of each row with that neighborhood's most frequent categories
for ind in np.arange(torontoMF.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(torontoMF.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Clothing Store,Restaurant,American Restaurant,Cosmetics Shop,Italian Restaurant,Bakery,Gym,Tea Room,Plaza
1,Agincourt,Chinese Restaurant,Rental Car Location,Coffee Shop,Hong Kong Restaurant,Shopping Mall,Peking Duck Restaurant,Train Station,Korean Restaurant,Cantonese Restaurant,Asian Restaurant
2,Agincourt North,Chinese Restaurant,Coffee Shop,Bus Stop,Athletics & Sports,Zoo Exhibit,Fish & Chips Shop,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant
3,Albion Gardens,Burger Joint,Chinese Restaurant,Fast Food Restaurant,French Restaurant,Falafel Restaurant,Farmers Market,Festival,Filipino Restaurant,Fish & Chips Shop,Fish Market
4,Alderwood,Trail,Park,Beach,College Cafeteria,Empanada Restaurant,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival


## Clustering Neighborhoods

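Now, based on how frequently each venue category occurs in each neighborhood (the mean-frequency table), we can cluster the neighborhoods and see which ones are similar. The top-10 venue table is then used to describe the resulting clusters.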

We will use k-means with k = 5 to sort them into 5 clusters

In [37]:
# import K-Means algorithm
from sklearn.cluster import KMeans


In [38]:
# set number of clusters
kclusters = 5

# feature matrix: every column except the neighborhood name.
# `columns=` is used because the positional axis argument of DataFrame.drop
# is no longer accepted by current pandas releases.
torontoMFClust = torontoMF.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(torontoMFClust)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [39]:
# add clustering labels; a label column left over from a previous run is dropped
# first, because DataFrame.insert raises when the column already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and ranked venue columns onto toronto_df
# (which holds the latitude/longitude of each neighborhood), matching on the name
TorontoMerged = toronto_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

TorontoMerged.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,Rouge,43.819623,-79.184498,0,Zoo Exhibit,Other Great Outdoors,Zoo,Gift Shop,Restaurant,Fast Food Restaurant,Tram Station,Café,Dessert Shop,Food & Drink Shop
1,M1C,Scarborough,Highland Creek,43.787521,-79.188785,0,Coffee Shop,Burger Joint,Restaurant,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop
2,M1E,Scarborough,Guildwood,43.742517,-79.207875,0,Pub,Shopping Mall,Chinese Restaurant,Coffee Shop,Festival,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant
3,M1G,Scarborough,Woburn,43.778504,-79.222183,0,Bus Stop,Zoo Exhibit,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop,Flea Market
4,M1H,Scarborough,Cedarbrae,43.785792,-79.22781,0,Coffee Shop,Business Service,Spa,Zoo Exhibit,Fish & Chips Shop,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant


In [40]:
# (rows, columns) of the merged table before dropping incomplete rows
TorontoMerged.shape

(103, 16)

In [41]:
# Drop every row that has a missing value in any column, then re-check the size.
TorontoMerged = TorontoMerged.dropna()
TorontoMerged.shape

(103, 16)

In [42]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [43]:
# create map
Location = [43.7, -79.4]

map_clusters = folium.Map(location=Location, zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(TorontoMerged['Latitude'], TorontoMerged['Longitude'],
                                  TorontoMerged['Neighborhood'], TorontoMerged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels start at 0, so they index the palette directly
    # (no "-1" that relies on negative-index wrap-around for cluster 0)
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [44]:
# Rows of cluster 0, showing column 1 plus every column from position 5 onwards.
in_cluster_0 = TorontoMerged['Cluster Labels'] == 0
shown_columns = TorontoMerged.columns[[1] + list(range(5, TorontoMerged.shape[1]))]
TorontoMerged.loc[in_cluster_0, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,0,Zoo Exhibit,Other Great Outdoors,Zoo,Gift Shop,Restaurant,Fast Food Restaurant,Tram Station,Café,Dessert Shop,Food & Drink Shop
1,Scarborough,0,Coffee Shop,Burger Joint,Restaurant,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop
2,Scarborough,0,Pub,Shopping Mall,Chinese Restaurant,Coffee Shop,Festival,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant
3,Scarborough,0,Bus Stop,Zoo Exhibit,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop,Flea Market
4,Scarborough,0,Coffee Shop,Business Service,Spa,Zoo Exhibit,Fish & Chips Shop,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant
5,Scarborough,0,Coffee Shop,Pub,Pharmacy,Gym,Chinese Restaurant,Fast Food Restaurant,Discount Store,Fish & Chips Shop,Filipino Restaurant,Festival
6,Scarborough,0,Factory,Train Station,Coffee Shop,Fish & Chips Shop,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Zoo Exhibit
7,Scarborough,0,Burrito Place,Chinese Restaurant,Fried Chicken Joint,Laser Tag,Burger Joint,Sporting Goods Shop,Pizza Place,Breakfast Spot,Supermarket,Fast Food Restaurant
8,Scarborough,0,Park,Construction & Landscaping,Zoo Exhibit,Fish & Chips Shop,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant
9,Scarborough,0,Park,Gym Pool,Hotel,Zoo Exhibit,Filipino Restaurant,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival


In [45]:
# Rows of cluster 1, showing column 1 plus every column from position 5 onwards.
in_cluster_1 = TorontoMerged['Cluster Labels'] == 1
shown_columns = TorontoMerged.columns[[1] + list(range(5, TorontoMerged.shape[1]))]
TorontoMerged.loc[in_cluster_1, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
62,North York,1,Wine Shop,Zoo Exhibit,Fish Market,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop


In [46]:
# Rows of cluster 2, showing column 1 plus every column from position 5 onwards.
in_cluster_2 = TorontoMerged['Cluster Labels'] == 2
shown_columns = TorontoMerged.columns[[1] + list(range(5, TorontoMerged.shape[1]))]
TorontoMerged.loc[in_cluster_2, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,Scarborough,2,Playground,Zoo Exhibit,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop,Flea Market


In [47]:
# Rows of cluster 3, showing column 1 plus every column from position 5 onwards.
in_cluster_3 = TorontoMerged['Cluster Labels'] == 3
shown_columns = TorontoMerged.columns[[1] + list(range(5, TorontoMerged.shape[1]))]
TorontoMerged.loc[in_cluster_3, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
26,North York,3,Pool,Zoo Exhibit,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop,Flea Market


In [48]:
# Rows of cluster 4, showing column 1 plus every column from position 5 onwards.
in_cluster_4 = TorontoMerged['Cluster Labels'] == 4
shown_columns = TorontoMerged.columns[[1] + list(range(5, TorontoMerged.shape[1]))]
TorontoMerged.loc[in_cluster_4, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
32,North York,4,Insurance Office,Zoo Exhibit,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop,Flea Market


Only one cluster contains more than one neighborhood, so maybe k = 2 would make more sense:

In [49]:
kclusters = 2

# feature matrix: every column except the neighborhood name
# (`columns=` instead of the positional axis argument, which current pandas rejects)
torontoMFClust = torontoMF.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=5).fit(torontoMFClust)

#creating a new dataframe with the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take 'st'/'nd'/'rd', every later rank takes 'th'
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = torontoMF['Neighborhood']

for ind in np.arange(torontoMF.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(torontoMF.iloc[ind, :], num_top_venues)

# add clustering labels (the frame was rebuilt just above, so the column cannot exist yet)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and ranked venue columns onto toronto_df (which holds
# latitude/longitude), then drop rows that have any missing value
TorontoMerged = toronto_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
TorontoMerged = TorontoMerged.dropna()





In [50]:
Location = [43.7, -79.4]

map_clusters = folium.Map(location=Location, zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(TorontoMerged['Latitude'], TorontoMerged['Longitude'],
                                  TorontoMerged['Neighborhood'], TorontoMerged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels start at 0, so they index the palette directly
    # (no "-1" that relies on negative-index wrap-around for cluster 0)
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [51]:
# Rows of cluster 0, showing column 1 plus every column from position 5 onwards.
in_cluster_0 = TorontoMerged['Cluster Labels'] == 0
shown_columns = TorontoMerged.columns[[1] + list(range(5, TorontoMerged.shape[1]))]
TorontoMerged.loc[in_cluster_0, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,0,Zoo Exhibit,Other Great Outdoors,Zoo,Gift Shop,Restaurant,Fast Food Restaurant,Tram Station,Café,Dessert Shop,Food & Drink Shop
1,Scarborough,0,Coffee Shop,Burger Joint,Restaurant,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop
2,Scarborough,0,Pub,Shopping Mall,Chinese Restaurant,Coffee Shop,Festival,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant
3,Scarborough,0,Bus Stop,Zoo Exhibit,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop,Flea Market
4,Scarborough,0,Coffee Shop,Business Service,Spa,Zoo Exhibit,Fish & Chips Shop,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant
5,Scarborough,0,Coffee Shop,Pub,Pharmacy,Gym,Chinese Restaurant,Fast Food Restaurant,Discount Store,Fish & Chips Shop,Filipino Restaurant,Festival
6,Scarborough,0,Factory,Train Station,Coffee Shop,Fish & Chips Shop,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Zoo Exhibit
7,Scarborough,0,Burrito Place,Chinese Restaurant,Fried Chicken Joint,Laser Tag,Burger Joint,Sporting Goods Shop,Pizza Place,Breakfast Spot,Supermarket,Fast Food Restaurant
10,Scarborough,0,Clothing Store,Coffee Shop,Cosmetics Shop,Tea Room,Sporting Goods Shop,Sandwich Place,Pharmacy,Mexican Restaurant,Food Court,Movie Theater
11,Scarborough,0,Burger Joint,Pizza Place,Korean Restaurant,Café,Smoke Shop,Fish Market,Breakfast Spot,Gas Station,Middle Eastern Restaurant,Seafood Restaurant


In [52]:
# Rows of cluster 1, showing column 1 plus every column from position 5 onwards.
in_cluster_1 = TorontoMerged['Cluster Labels'] == 1
shown_columns = TorontoMerged.columns[[1] + list(range(5, TorontoMerged.shape[1]))]
TorontoMerged.loc[in_cluster_1, shown_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Scarborough,1,Park,Construction & Landscaping,Zoo Exhibit,Fish & Chips Shop,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant
9,Scarborough,1,Park,Gym Pool,Hotel,Zoo Exhibit,Filipino Restaurant,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival
62,North York,1,Wine Shop,Zoo Exhibit,Fish Market,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop
80,York,1,Park,Discount Store,Grocery Store,Wine Shop,Sandwich Place,Factory,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival
91,Etobicoke,1,Moving Target,Park,Cycle Studio,Fish Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Festival,Filipino Restaurant,Fish & Chips Shop


This is too much data to take in at once. By counting the 5 most frequent venue categories in each of the top 3 ranks, we can characterise the clusters.

In [53]:
# Cluster label 0: column 1 plus every column from position 5 onwards.
cluster_columns = TorontoMerged.columns[[1] + list(range(5, TorontoMerged.shape[1]))]
clust1 = TorontoMerged.loc[TorontoMerged['Cluster Labels'] == 0, cluster_columns]


In [54]:
# For each of the three top ranks, count how often every venue category appears in
# cluster 1 and keep the 5 most frequent ones; categories absent from a rank show NaN.
# Stored under its own name so the postal-code table `df` is not clobbered.
rank_columns = ["1st Most Common Venue", "2nd Most Common Venue", "3rd Most Common Venue"]
clust1_top_venues = pd.DataFrame({
    rank: clust1[rank].value_counts().sort_values(ascending = False)[:5]
    for rank in rank_columns
})

# total number of appearances across the three ranks
clust1_top_venues['sum'] = clust1_top_venues.sum(axis=1)
clust1_top_venues.sort_values("sum",ascending = False)

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,sum
Coffee Shop,25.0,13.0,8.0,46.0
Café,11.0,6.0,,17.0
Italian Restaurant,,,11.0,11.0
Bookstore,,7.0,,7.0
Clothing Store,,6.0,,6.0
Bakery,,,5.0,5.0
Pizza Place,5.0,,,5.0
Fish Market,,,4.0,4.0
Park,,,4.0,4.0
Pub,4.0,,,4.0


Cluster 1 (label 0) has a lot of coffee shops and cafés, plus some restaurants of other types.

In [55]:
# Cluster label 1: column 1 plus every column from position 5 onwards.
cluster_columns = TorontoMerged.columns[[1] + list(range(5, TorontoMerged.shape[1]))]
clust2 = TorontoMerged.loc[TorontoMerged['Cluster Labels'] == 1, cluster_columns]


In [56]:
# For each of the three top ranks, count how often every venue category appears in
# cluster 2 and keep the 5 most frequent ones; categories absent from a rank show NaN.
# Stored under its own name so the postal-code table `df` is not clobbered.
rank_columns = ["1st Most Common Venue", "2nd Most Common Venue", "3rd Most Common Venue"]
clust2_top_venues = pd.DataFrame({
    rank: clust2[rank].value_counts().sort_values(ascending = False)[:5]
    for rank in rank_columns
})

# total number of appearances across the three ranks
clust2_top_venues['sum'] = clust2_top_venues.sum(axis=1)
clust2_top_venues.sort_values("sum",ascending = False)

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,sum
Park,3.0,1.0,,4.0
Zoo Exhibit,,1.0,1.0,2.0
Construction & Landscaping,,1.0,,1.0
Cycle Studio,,,1.0,1.0
Discount Store,,1.0,,1.0
Fish Market,,,1.0,1.0
Grocery Store,,,1.0,1.0
Gym Pool,,1.0,,1.0
Hotel,,,1.0,1.0
Moving Target,1.0,,,1.0


Cluster 2 (label 1) has a lot of parks and some other types of outdoor venues. It is much smaller than cluster 1; it seems Toronto is somewhat homogeneous.