# Part I - Segmenting and Clustering Neighborhoods in Toronto

## In this assignment we have to create pandas dataframe from the table of the Torronto post codes, borroughs and neighborhoods located on wikipedia web page

In [1]:
#first we have to import needed libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### First we have to scrape table from the web page with requests and BeautifulSoup python libraries

In [2]:
#using requests library in order to get web page that I need and Beautiful Soup library to parse from that web page
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml') #parser is lxml

In [3]:
#print(soup.prettify()) #prettify method is used to get identations of the code like on the real HTML page

In [4]:
#parse table from the whole web page
table = soup.find('table', class_='wikitable sortable')
#print(table.prettify())

In [5]:
#in HTML 'tr' is the tag for table rows
table_rows = table.find_all('tr')
len(table_rows)

289

In [6]:
#create a list with 3 items in every list. Strip is used to remove '\n' from the end of the line.
toronto_list = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text.strip() for i in td]
    #print(row)
    toronto_list.append(row)
toronto_list

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M

In [7]:
len(toronto_list)

289

In [8]:
#remove first item from the list
toronto_list = toronto_list[1:]
toronto_list[:5]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront']]

### Now we have to do the following: 
* create pandas DataFrame from the list of list that was scraped from the web page
* drop all rows with the value of the Borough 'Not Assigned'
* change name of the Neighborhood in the row that has neighborhood value 'Not Assigned' to the corresponding Borough name
* combine all rows that have same PostalCode and Borough, like M5A and Downtown Toronto into one row with 2 or more neighborhoods. For this we have to use groupby method.

In [9]:
#creating DF from the list and renaming columns
df = pd.DataFrame(toronto_list)
df.rename(columns={0 : 'PostalCode', 1 : 'Borough', 2 : 'Neighborhood'}, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
len(df)

288

In [11]:
#We have to drop all rows with the value of the Borough'Not Assigned'. There are 77 rows.
len(df.loc[df['Borough'] == 'Not assigned'])

77

In [12]:
#we have dropped 77 rows, final number of rows is 211
df = df.loc[df['Borough'] != 'Not assigned']
len(df)

211

In [13]:
# Code to drop all rows with the value of the Borough and Neighborhod 'Not Assigned'. There are 78 rows.
#len(df.loc[(df['Borough'] == 'Not assigned') | (df['Neighborhood'] == 'Not assigned')])
#df = df.loc[(df['Borough'] != 'Not assigned') & (df['Neighborhood'] != 'Not assigned')]

In [14]:
#there is one more 'Not assigned' value in the Neighborhood, we will make Nighborhood same as the Borough 
df.loc[df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = "Queen's Park"

In [15]:
#just to check if the name was changed successfuly
df.loc[df['Borough'] == "Queen's Park"]

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Queen's Park


In [16]:
#we have to combine all rows that have same PostalCode and Borough, like M5A and Downtown Toronto 
#into one row with 2 or more neighborhoods
df.loc[df.PostalCode == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park


In [17]:
#use group by method to group items by PostalCode and Borough, result is Series with two indexes and string join by ','.
df_grouped = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(','.join)

In [18]:
df_grouped.head(8)

PostalCode  Borough    
M1B         Scarborough                                Rouge,Malvern
M1C         Scarborough         Highland Creek,Rouge Hill,Port Union
M1E         Scarborough              Guildwood,Morningside,West Hill
M1G         Scarborough                                       Woburn
M1H         Scarborough                                    Cedarbrae
M1J         Scarborough                          Scarborough Village
M1K         Scarborough    East Birchmount Park,Ionview,Kennedy Park
M1L         Scarborough                Clairlea,Golden Mile,Oakridge
Name: Neighborhood, dtype: object

In [19]:
#group by return Pandas Series here multiindex serie.
type(df_grouped)

pandas.core.series.Series

In [20]:
df_toronto = pd.DataFrame(df_grouped)
df_toronto.head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
PostalCode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"


In [21]:
df_toronto = df_toronto.reset_index()
df_toronto.head(8)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"


In [22]:
df_toronto.shape

(103, 3)

# Part 2 - Segmenting and Clustering Neighborhoods in Toronto

Now that we have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

### Geocoder with geocoder.google does not work, it works with geocoder.arcgis but the coordinates are not  the same as the ones in the csv file.
Here is the code used with geocoder.arcgis

In [23]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None
latitude_list = []
longitude_list = []

# loop until you get the coordinates
while(lat_lng_coords is None):
    for i in df_toronto['PostalCode']: 
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(i))
        lat_lng_coords = g.latlng
        latitude_list.append(lat_lng_coords[0])
        longitude_list.append(lat_lng_coords[1])


In [24]:
len(latitude_list)

103

In [25]:
len(longitude_list)

103

In [26]:
df_latitude = pd.DataFrame(latitude_list)
df_latitude.head(6)

Unnamed: 0,0
0,43.811525
1,43.785665
2,43.765815
3,43.768369
4,43.769688
5,43.743125


In [27]:
df_longitude = pd.DataFrame(longitude_list)
df_longitude.head(6)

Unnamed: 0,0
0,-79.195517
1,-79.158725
2,-79.175193
3,-79.21759
4,-79.23944
5,-79.23175


In [28]:
df_toronto.head(6)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village


### I have tested coordinates also with pgeocode and they are not the same as the ones in csv file and also as the ones that I get with the goecoder.arcgis

In [29]:
import pgeocode

nomi = pgeocode.Nominatim('ca')
data = nomi.query_postal_code(["M1B", "M1C", "M1E", "M1G", "M1H"])
data

Unnamed: 0,postal_code,country code,place_name,state_name,state_code,county_name,county_code,community_name,community_code,latitude,longitude,accuracy
0,M1B,CA,Scarborough (Malvern / Rouge River),Ontario,ON,Scarborough,,,,43.8113,-79.193,6.0
1,M1C,CA,Scarborough (Rouge Hill / Port Union / Highlan...,Ontario,ON,Scarborough,,,,43.7878,-79.1564,6.0
2,M1E,CA,Scarborough (Guildwood / Morningside / Ellesmere),Ontario,ON,Scarborough,,,,43.7678,-79.1866,6.0
3,M1G,CA,Scarborough (Woburn),Ontario,ON,Scarborough,,,,43.7712,-79.2144,6.0
4,M1H,CA,Scarborough (Cedarbrae),Ontario,ON,Scarborough,,,,43.7686,-79.2389,6.0


### Due to the differences between values of the coordinates in the csv file and the values with geocoder and pgeocode I will make DF with the values from the csv file.

In [30]:
#import csv file
long_lat = pd.read_csv('Geospatial_Coordinates_Toronto_Neighbours.csv')
long_lat.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [31]:
long_lat.drop('Postal Code', axis= 1, inplace=True)
long_lat.head()

Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476


In [32]:
long_lat.shape

(103, 2)

In [33]:
df_toronto.shape

(103, 3)

In [34]:
df = pd.concat([df_toronto, long_lat], axis=1)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [35]:
df.shape

(103, 5)

## Part 3 - Segmenting and Clustering Neighborhoods in Toronto

### First I will create map of Toronto with the neighborhoods and boroughs we have in the dataframe


In [36]:
df.Borough.value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
York                 5
East Toronto         5
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

In [37]:
df.loc[df.Borough == 'Central Toronto'].head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316


In [38]:
df.loc[28]

PostalCode                                                M3H
Borough                                            North York
Neighborhood    Bathurst Manor,Downsview North,Wilson Heights
Latitude                                              43.7543
Longitude                                            -79.4423
Name: 28, dtype: object

In [39]:
#I will use longitude and latitude values of Central Toronto Davisville as the value to open map.
longitude = -79.388790
latitude = 43.704324

In [40]:
# create map of Toronto using latitude and longitude values
import folium

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

However, for illustration purposes, let's simplify the above map and segment and cluster only 2 neighborhoods in Central Toronto. So let's slice the original dataframe and create a new dataframe.

In [109]:
df_central = df.loc[(df['Borough'] == 'Central Toronto') | (df['Borough'] == 'East York')].reset_index(drop=True)
df_central

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
1,M4C,East York,Woodbine Heights,43.695344,-79.318389
2,M4G,East York,Leaside,43.70906,-79.363452
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372
4,M4J,East York,East Toronto,43.685347,-79.338106
5,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
6,M4P,Central Toronto,Davisville North,43.712751,-79.390197
7,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
8,M4S,Central Toronto,Davisville,43.704324,-79.38879
9,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316


In [110]:
#I will use same latitude and longitude of Davisville as in the map above for open the map
map_central_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_central['Latitude'], df_central['Longitude'], 
                                           df_central['Borough'], df_central['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_central_toronto)  
    
map_central_toronto

## Now we are going to use the Foursquare API to explore and segment the neighborhoods.

### Define Foursquare Credentials and Version

In [111]:
CLIENT_ID = 'G5LVEOE3FLIASFNRITJLWM1Q0GELIKATD3R3ZEQ0YYYTE3DF' # your Foursquare ID
CLIENT_SECRET = 'YHIQ2ZT4CBJZMOMXAHADXPNRKWCWRGOBC2D34U4JHLOUAFHR' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: G5LVEOE3FLIASFNRITJLWM1Q0GELIKATD3R3ZEQ0YYYTE3DF
CLIENT_SECRET:YHIQ2ZT4CBJZMOMXAHADXPNRKWCWRGOBC2D34U4JHLOUAFHR


### Let's explore the first neighborhood in our dataframe.

In [112]:
df_central.loc[0]

PostalCode                                 M4B
Borough                              East York
Neighborhood    Woodbine Gardens,Parkview Hill
Latitude                               43.7064
Longitude                             -79.3099
Name: 0, dtype: object

In [145]:
#I will use location of the Leaside since it is centrally located
neighborhood_latitude = df_central.loc[2, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_central.loc[2, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_central.loc[2, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Leaside are 43.7090604, -79.3634517.


### Now, let's get the top 100 venues that are in Central Toronto within a radius of 500 meters.

First, let's create the GET request URL. Name your URL url.


In [146]:
radius = 500
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=G5LVEOE3FLIASFNRITJLWM1Q0GELIKATD3R3ZEQ0YYYTE3DF&client_secret=YHIQ2ZT4CBJZMOMXAHADXPNRKWCWRGOBC2D34U4JHLOUAFHR&ll=43.7090604,-79.3634517&v=20180605&radius=500&limit=100'

In [147]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5dbc9182cc7d41002b133667'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Leaside',
  'headerFullLocation': 'Leaside, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 35,
  'suggestedBounds': {'ne': {'lat': 43.7135604045, 'lng': -79.3572380270639},
   'sw': {'lat': 43.704560395499996, 'lng': -79.3696653729361}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5531956d498e24c6e9994f2e',
       'name': 'Local Leaside',
       'location': {'address': '180 Laird Drive',
        'lat': 43.71001166793114,
        'lng': -79.36351433524794,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.71001166793114,
 

### Now we have to create DF with the columns, name, categorie of the venue, lat and long. Here bellow we have expressions to extract these values from the json file.
I will try to do this without json normalize function just for the practice

In [148]:
name = results['response']['groups'][0]['items'][0]['venue']['name']
name

'Local Leaside'

In [149]:
category = results['response']['groups'][0]['items'][0]['venue']['categories'][0]['name']
category

'Sports Bar'

In [150]:
lat = results['response']['groups'][0]['items'][0]['venue']['location']['lat']
lat

43.71001166793114

In [151]:
lng = results['response']['groups'][0]['items'][0]['venue']['location']['lng']
lng

-79.36351433524794

In [152]:
#this is expression to loop on the items of the json file, I will use len function for the loop
results['response']['groups'][0]['items']

[{'reasons': {'count': 0,
   'items': [{'summary': 'This spot is popular',
     'type': 'general',
     'reasonName': 'globalInteractionReason'}]},
  'venue': {'id': '5531956d498e24c6e9994f2e',
   'name': 'Local Leaside',
   'location': {'address': '180 Laird Drive',
    'lat': 43.71001166793114,
    'lng': -79.36351433524794,
    'labeledLatLngs': [{'label': 'display',
      'lat': 43.71001166793114,
      'lng': -79.36351433524794}],
    'distance': 106,
    'postalCode': 'M4G 3V7',
    'cc': 'CA',
    'city': 'Toronto',
    'state': 'ON',
    'country': 'Canada',
    'formattedAddress': ['180 Laird Drive', 'Toronto ON M4G 3V7', 'Canada']},
   'categories': [{'id': '4bf58dd8d48988d11d941735',
     'name': 'Sports Bar',
     'pluralName': 'Sports Bars',
     'shortName': 'Sports Bar',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/nightlife/sportsbar_',
      'suffix': '.png'},
     'primary': True}],
   'photos': {'count': 0, 'groups': []}},
  'referralId': 'e-0-5531

In [153]:
name_list = []
category_list = []
lat_list = []
lng_list = []
for i in range(len(results['response']['groups'][0]['items'])):
    name = results['response']['groups'][0]['items'][i]['venue']['name']
    category = results['response']['groups'][0]['items'][i]['venue']['categories'][0]['name']
    lat = results['response']['groups'][0]['items'][i]['venue']['location']['lat']
    lng = results['response']['groups'][0]['items'][i]['venue']['location']['lng']
    name_list.append(name)
    category_list.append(category)
    lat_list.append(lat)
    lng_list.append(lng)



In [154]:
#just to check if we get wanted result
category_list

['Sports Bar',
 'Liquor Store',
 'Fish & Chips Shop',
 'Sporting Goods Shop',
 'Gym',
 'Restaurant',
 'Bike Shop',
 'Coffee Shop',
 'Grocery Store',
 'Supermarket',
 'Sushi Restaurant',
 'Pet Store',
 'Smoothie Shop',
 'Burger Joint',
 'Clothing Store',
 'Coffee Shop',
 'Burger Joint',
 'Sporting Goods Shop',
 'Shopping Mall',
 'Sandwich Place',
 'Dessert Shop',
 'Bank',
 'Coffee Shop',
 'Brewery',
 'Breakfast Spot',
 'Sporting Goods Shop',
 'Record Shop',
 'Furniture / Home Store',
 'Beer Store',
 'Furniture / Home Store',
 'Mexican Restaurant',
 'Electronics Store',
 'Coffee Shop',
 'Bagel Shop',
 'Sandwich Place']

In [155]:
a_list = []
for i in zip(name_list, category_list, lat_list, lng_list):
    a_list.append(i)

a_list

[('Local Leaside', 'Sports Bar', 43.71001166793114, -79.36351433524794),
 ('LCBO', 'Liquor Store', 43.710571304463954, -79.36028703241085),
 ('Olde Yorke Fish & Chips',
  'Fish & Chips Shop',
  43.706141306111306,
  -79.3618288170822),
 ('Rack Attack', 'Sporting Goods Shop', 43.70693351467496, -79.36226108591485),
 ('CrossFit Toronto', 'Gym', 43.7080995557005, -79.35905992984772),
 ('The Leaside Pub', 'Restaurant', 43.710428911854564, -79.36354706165021),
 ('Enduro Sport', 'Bike Shop', 43.70605947915959, -79.36183454315712),
 ('Aroma Espresso Bar', 'Coffee Shop', 43.70561065613986, -79.36077468518098),
 ('Bulk Barn', 'Grocery Store', 43.70611573147322, -79.36054145695127),
 ("Longo's", 'Supermarket', 43.70643262604979, -79.35975310254634),
 ('Kintako Japanese Restaurant',
  'Sushi Restaurant',
  43.71159659251094,
  -79.36396207124376),
 ('PetSmart', 'Pet Store', 43.7126819, -79.3626365),
 ('Booster Juice', 'Smoothie Shop', 43.70617277, -79.36065194),
 ('South St. Burger', 'Burger Join

In [156]:
venues_df = pd.DataFrame(a_list, columns=['name', 'category', 'lat', 'lng'])
venues_df

Unnamed: 0,name,category,lat,lng
0,Local Leaside,Sports Bar,43.710012,-79.363514
1,LCBO,Liquor Store,43.710571,-79.360287
2,Olde Yorke Fish & Chips,Fish & Chips Shop,43.706141,-79.361829
3,Rack Attack,Sporting Goods Shop,43.706934,-79.362261
4,CrossFit Toronto,Gym,43.7081,-79.35906
5,The Leaside Pub,Restaurant,43.710429,-79.363547
6,Enduro Sport,Bike Shop,43.706059,-79.361835
7,Aroma Espresso Bar,Coffee Shop,43.705611,-79.360775
8,Bulk Barn,Grocery Store,43.706116,-79.360541
9,Longo's,Supermarket,43.706433,-79.359753


## Explore neighborhoods in Toronto

In [157]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [158]:
toronto_venues = getNearbyVenues(names=df_central['Neighborhood'],
                                   latitudes=df_central['Latitude'],
                                   longitudes=df_central['Longitude']
                                  )

Woodbine Gardens,Parkview Hill
Woodbine Heights
Leaside
Thorncliffe Park
East Toronto
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville


In [159]:
print(toronto_venues.shape)
toronto_venues.head()

(196, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937,Jawny Bakers,43.705783,-79.312913,Gastropub
1,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937,East York Gymnastics,43.710654,-79.309279,Gym / Fitness Center
2,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937,TD Canada Trust,43.70574,-79.31227,Bank
3,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937,Shoppers Drug Mart,43.705933,-79.312825,Pharmacy
4,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937,Pizza Pizza,43.705159,-79.31313,Pizza Place


Let's check how many venues were returned for each neighborhood

In [160]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,35,35,35,35,35,35
Davisville North,9,9,9,9,9,9
"Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West",15,15,15,15,15,15
East Toronto,4,4,4,4,4,4
"Forest Hill North,Forest Hill West",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
Leaside,35,35,35,35,35,35
"Moore Park,Summerhill East",2,2,2,2,2,2
North Toronto West,24,24,24,24,24,24
Roselawn,2,2,2,2,2,2


Let's find out how many unique categories can be curated from all the returned venues

In [161]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 84 uniques categories.


### Analyze each neighborhood

In [162]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Athletics & Sports,BBQ Joint,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,...,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Yoga Studio
0,"Woodbine Gardens,Parkview Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Woodbine Gardens,Parkview Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Woodbine Gardens,Parkview Hill",0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Woodbine Gardens,Parkview Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Woodbine Gardens,Parkview Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
toronto_onehot.shape

(196, 85)

Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [164]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Athletics & Sports,BBQ Joint,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,...,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Yoga Studio
0,Davisville,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.057143,0.0,0.028571,0.028571,0.0,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",0.066667,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,...,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0
3,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Forest Hill North,Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0
5,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Leaside,0.0,0.0,0.0,0.0,0.028571,0.028571,0.028571,0.028571,0.028571,...,0.028571,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Moore Park,Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667
9,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's print each neighborhood along with the top 5 most common venues

In [165]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
                venue  freq
0         Pizza Place  0.09
1        Dessert Shop  0.09
2      Sandwich Place  0.09
3  Italian Restaurant  0.06
4         Coffee Shop  0.06


----Davisville North----
            venue  freq
0  Clothing Store  0.11
1           Hotel  0.11
2  Sandwich Place  0.11
3             Gym  0.11
4  Breakfast Spot  0.11


----Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West----
                 venue  freq
0          Coffee Shop  0.13
1                  Pub  0.13
2  American Restaurant  0.07
3     Sushi Restaurant  0.07
4          Pizza Place  0.07


----East Toronto----
                venue  freq
0                Park  0.50
1       Metro Station  0.25
2   Convenience Store  0.25
3  Mexican Restaurant  0.00
4                 Pub  0.00


----Forest Hill North,Forest Hill West----
                 venue  freq
0                Trail  0.25
1                 Park  0.25
2        Jewelry Store  0.25
3     Sushi Restaurant  0.25
4  American Res

Let's write a function to sort the venues in descending order.

In [166]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [167]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Pizza Place,Dessert Shop,Sandwich Place,Italian Restaurant,Coffee Shop,Sushi Restaurant,Gym,Café,Deli / Bodega,Diner
1,Davisville North,Hotel,Gym,Park,Convenience Store,Dog Run,Sandwich Place,Breakfast Spot,Clothing Store,Food & Drink Shop,Yoga Studio
2,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",Coffee Shop,Pub,Supermarket,Fried Chicken Joint,Light Rail Station,Liquor Store,Pizza Place,Restaurant,Sports Bar,American Restaurant
3,East Toronto,Park,Convenience Store,Metro Station,Yoga Studio,Deli / Bodega,Diner,Discount Store,Dog Run,Electronics Store,Farmers Market
4,"Forest Hill North,Forest Hill West",Trail,Park,Sushi Restaurant,Jewelry Store,Yoga Studio,Fish & Chips Shop,Dessert Shop,Diner,Discount Store,Dog Run


## Cluster Neighborhoods

In [168]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 1, 4, 1, 0, 3, 0, 2])

In [169]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_central

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937,0,Fast Food Restaurant,Pizza Place,Gym / Fitness Center,Gastropub,Pet Store,Intersection,Breakfast Spot,Bus Line,Bank,Athletics & Sports
1,M4C,East York,Woodbine Heights,43.695344,-79.318389,0,Skating Rink,Curling Ice,Asian Restaurant,Dance Studio,Park,Cosmetics Shop,Beer Store,Pharmacy,Bus Stop,Discount Store
2,M4G,East York,Leaside,43.70906,-79.363452,0,Coffee Shop,Sporting Goods Shop,Sandwich Place,Furniture / Home Store,Burger Joint,Shopping Mall,Pet Store,Clothing Store,Liquor Store,Fish & Chips Shop
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372,0,Indian Restaurant,Burger Joint,Yoga Studio,Sandwich Place,Grocery Store,Gym,Gym / Fitness Center,Warehouse Store,Housing Development,Liquor Store
4,M4J,East York,East Toronto,43.685347,-79.338106,1,Park,Convenience Store,Metro Station,Yoga Studio,Deli / Bodega,Diner,Discount Store,Dog Run,Electronics Store,Farmers Market


In [170]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

In [171]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East York,0,Fast Food Restaurant,Pizza Place,Gym / Fitness Center,Gastropub,Pet Store,Intersection,Breakfast Spot,Bus Line,Bank,Athletics & Sports
1,East York,0,Skating Rink,Curling Ice,Asian Restaurant,Dance Studio,Park,Cosmetics Shop,Beer Store,Pharmacy,Bus Stop,Discount Store
2,East York,0,Coffee Shop,Sporting Goods Shop,Sandwich Place,Furniture / Home Store,Burger Joint,Shopping Mall,Pet Store,Clothing Store,Liquor Store,Fish & Chips Shop
3,East York,0,Indian Restaurant,Burger Joint,Yoga Studio,Sandwich Place,Grocery Store,Gym,Gym / Fitness Center,Warehouse Store,Housing Development,Liquor Store
6,Central Toronto,0,Hotel,Gym,Park,Convenience Store,Dog Run,Sandwich Place,Breakfast Spot,Clothing Store,Food & Drink Shop,Yoga Studio
7,Central Toronto,0,Sporting Goods Shop,Clothing Store,Coffee Shop,Yoga Studio,Gym / Fitness Center,Rental Car Location,Park,Miscellaneous Shop,Mexican Restaurant,Metro Station
8,Central Toronto,0,Pizza Place,Dessert Shop,Sandwich Place,Italian Restaurant,Coffee Shop,Sushi Restaurant,Gym,Café,Deli / Bodega,Diner
10,Central Toronto,0,Coffee Shop,Pub,Supermarket,Fried Chicken Joint,Light Rail Station,Liquor Store,Pizza Place,Restaurant,Sports Bar,American Restaurant
13,Central Toronto,0,Sandwich Place,Café,Coffee Shop,Pub,Jewish Restaurant,Indian Restaurant,Burger Joint,Cosmetics Shop,American Restaurant,History Museum


In [172]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,East York,1,Park,Convenience Store,Metro Station,Yoga Studio,Deli / Bodega,Diner,Discount Store,Dog Run,Electronics Store,Farmers Market
5,Central Toronto,1,Park,Swim School,Bus Line,Yoga Studio,Fish & Chips Shop,Diner,Discount Store,Dog Run,Electronics Store,Farmers Market


In [173]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,Central Toronto,2,Ice Cream Shop,Garden,Yoga Studio,Fish & Chips Shop,Dessert Shop,Diner,Discount Store,Dog Run,Electronics Store,Farmers Market


In [174]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Central Toronto,3,Playground,Restaurant,Dance Studio,Deli / Bodega,Dessert Shop,Diner,Discount Store,Dog Run,Electronics Store,Farmers Market


In [175]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Central Toronto,4,Trail,Park,Sushi Restaurant,Jewelry Store,Yoga Studio,Fish & Chips Shop,Dessert Shop,Diner,Discount Store,Dog Run


## The end