## 1. Getting the dataframe with all data needed
##### As in previous notebook

In [1]:
import os

import numpy as np
import pandas as pd
# Postal-code table scraped from Wikipedia: first table with class "wikitable",
# read with its first row skipped, so the columns arrive numbered 0..2.
raw_codes = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    attrs={"class": "wikitable"},
    skiprows=1,
)[0]
raw_codes = raw_codes.rename(columns={0: 'PostalCode', 1: 'Borough', 2: 'Neighborhood'})

# Discard postal codes that have no borough assigned.
assigned_codes = raw_codes[raw_codes['Borough'] != 'Not assigned']

# Collapse neighbourhoods sharing a postal code and borough into one
# comma-separated string.
df = (
    assigned_codes
    .groupby(['PostalCode', 'Borough'])[['Neighborhood']]
    .agg(', '.join)
    .reset_index()
)

# Entries that still have no neighbourhood name are labelled "Queen's Park".
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = "Queen's Park"

# Coordinates per postal code; the key column is renamed to match `df` so the
# merge (which joins on the columns both frames share) lines up on PostalCode.
cords = pd.read_csv('https://cocl.us/Geospatial_data').rename(columns={'Postal Code': 'PostalCode'})
df1 = df.merge(cords)
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## 2. Importing tools for analysis

In [2]:
import json
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
import requests 
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

   

## 3. Generating map with boroughs
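###### Note: the map originally failed to render once markers were added. The most likely cause is that the popup labels were built with `parse_html=False`, so a neighbourhood name containing an apostrophe (e.g. "Queen's Park") breaks the generated JavaScript. While the marker loop below is commented out, the map is shown without markers (better than nothing...)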

In [38]:
# Geocode the city to find the centre of the base map.
address = 'Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per postal-code area, labelled "<neighbourhood>, <borough>".
# The popup is built with parse_html=True so folium escapes the label text.
# With parse_html=False a raw apostrophe (e.g. "Queen's Park") ends up inside
# the generated JavaScript, which is the likely reason the map stopped
# rendering when markers were enabled.
for lat, lng, borough, neighborhood in zip(df1['Latitude'], df1['Longitude'], df1['Borough'], df1['Neighborhood']):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

## 4. Defining Foursquare credentials

In [39]:
# Foursquare credentials are read from the environment so that secrets are
# neither stored in the notebook nor echoed into its output.
# NOTE(review): the id/secret pair that used to be hard-coded here has been
# published with the notebook and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether the credentials are present, never their values.
_missing_credentials = [
    env_name
    for env_name, value in (('FOURSQUARE_CLIENT_ID', CLIENT_ID),
                            ('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET))
    if not value
]
if _missing_credentials:
    print('Missing Foursquare credentials; set the environment variable(s): '
          + ', '.join(_missing_credentials))
else:
    print('Foursquare credentials loaded from the environment.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


## 5. Let's explore East York
#### Analogous to Manhattan analysis lab

In [40]:
# Keep only the rows of the East York borough and renumber them from 0.
is_east_york = df1['Borough'] == 'East York'
east_york = df1.loc[is_east_york].reset_index(drop=True)
east_york.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
1,M4C,East York,Woodbine Heights,43.695344,-79.318389
2,M4G,East York,Leaside,43.70906,-79.363452
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372
4,M4J,East York,East Toronto,43.685347,-79.338106


In [41]:
# Geocode the borough to find the centre of the map.
address = 'East York, Toronto'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude

map_eastyork = folium.Map(location=[latitude, longitude], zoom_start=14)

# One circle marker per East York neighbourhood. The disabled version of this
# loop referred to `west_toronto` / `map_westtoronto`, which are not defined in
# this notebook; it now uses the East York frame and map.
# parse_html=True makes folium escape the label text.
for lat, lng, label in zip(east_york['Latitude'], east_york['Longitude'], east_york['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_eastyork)

map_eastyork

In [42]:
# Explore the venues around the first East York neighbourhood (row 0).
neighborhood_latitude = east_york.loc[0, 'Latitude']
neighborhood_longitude = east_york.loc[0, 'Longitude']
neighborhood_name = east_york.loc[0, 'Neighborhood']
limit = 100   # value sent as the `limit` query parameter
radius = 500  # value sent as the `radius` query parameter
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    limit)

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (bad credentials, quota, ...) here
# rather than as a confusing KeyError in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5cf77df69fb6b775bddff9cb'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4b5a3842f964a52023b528e3-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/gastropub_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d155941735',
         'name': 'Gastropub',
         'pluralName': 'Gastropubs',
         'primary': True,
         'shortName': 'Gastropub'}],
       'id': '4b5a3842f964a52023b528e3',
       'location': {'address': "804 O'Connor Dr",
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'St Clair E',
        'distance': 249,
        'formattedAddress': ["804 O'Connor Dr (St Clair E)",
         'Toronto ON M4B 2S9',
         'Canada'],
        'labeledLatLngs': [{'label

In [43]:
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    The category list is looked up under the key ``'categories'`` and, when
    that key is absent, under ``'venue.categories'``.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide one of the two keys above; the value is a sequence of
        dicts that each carry a ``'name'`` entry.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the venue has no
    categories (empty or missing list).

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be swallowed.
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    
# Venue entries of the first group in the explore response.
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON into a table with dotted column names.
nearby_venues = json_normalize(venues)

# Keep name, category and coordinates only.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues[filtered_columns]

# Replace each category list by the name of its first category.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Strip the dotted prefixes: 'venue.location.lat' -> 'lat', etc.
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

venue_count = len(nearby_venues)
print('{} venues were returned by Foursquare.'.format(venue_count))

13 venues were returned by Foursquare.


In [44]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Collect the Foursquare venues found around each given location.

    For every (name, latitude, longitude) triple one request is sent to the
    Foursquare ``venues/explore`` endpoint, using the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``. Each name is printed as
    it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to explore.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    limit : int, default 100
        Value sent as the ``limit`` query parameter. It used to be read from
        a global variable set in another cell; the default matches the value
        assigned there.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but has these columns) when no venue is found. 'Venue Category'
        is None for venues that carry no category.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Query parameters are passed separately so requests handles the
        # encoding; the timeout prevents a stalled connection from hanging.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may have no category at all
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards raises a length mismatch.
    return pd.DataFrame(venue_rows, columns=column_names)

In [45]:
# Fetch the venues around every East York neighbourhood.
eastyork_venues = getNearbyVenues(
    east_york['Neighborhood'],
    east_york['Latitude'],
    east_york['Longitude'],
)

Woodbine Gardens, Parkview Hill
Woodbine Heights
Leaside
Thorncliffe Park
East Toronto


In [46]:
# Number of distinct venue categories across all East York venues.
category_count = eastyork_venues['Venue Category'].unique().size
print('There are {} uniques categories.'.format(category_count))

There are 46 uniques categories.


In [47]:
# One indicator column per venue category, one row per venue.
eastyork_onehot = pd.get_dummies(eastyork_venues[['Venue Category']], prefix="", prefix_sep="")

# If a venue category happens to be called "Neighborhood", its indicator
# column would clash with the label column added below (the label would
# overwrite it and the column reordering would move the wrong column).
# Rename the category column so both survive.
if 'Neighborhood' in eastyork_onehot.columns:
    eastyork_onehot = eastyork_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# Put the neighbourhood label in the first column.
eastyork_onehot.insert(0, 'Neighborhood', eastyork_venues['Neighborhood'])

eastyork_onehot.head()

Unnamed: 0,Neighborhood,Athletics & Sports,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,Brewery,Burger Joint,Bus Stop,...,Sandwich Place,Shopping Mall,Skating Rink,Smoothie Shop,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Warehouse Store,Yoga Studio
0,"Woodbine Gardens, Parkview Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Woodbine Gardens, Parkview Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Woodbine Gardens, Parkview Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Woodbine Gardens, Parkview Hill",0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Woodbine Gardens, Parkview Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Average every indicator column per neighbourhood, i.e. the share of each
# category among that neighbourhood's venues; 'Neighborhood' stays a column.
eastyork_grouped = eastyork_onehot.groupby('Neighborhood', as_index=False).mean()
eastyork_grouped

Unnamed: 0,Neighborhood,Athletics & Sports,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,Brewery,Burger Joint,Bus Stop,...,Sandwich Place,Shopping Mall,Skating Rink,Smoothie Shop,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Warehouse Store,Yoga Studio
0,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Leaside,0.0,0.028571,0.028571,0.028571,0.028571,0.028571,0.028571,0.057143,0.0,...,0.028571,0.028571,0.0,0.028571,0.085714,0.028571,0.028571,0.057143,0.0,0.0
2,Thorncliffe Park,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.055556,0.0,...,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.055556,0.055556
3,"Woodbine Gardens, Parkview Hill",0.076923,0.0,0.076923,0.0,0.0,0.076923,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Woodbine Heights,0.125,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.125,...,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
num_top_venues = 5

# Print the most frequent venue categories of every neighbourhood.
for hood in eastyork_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Single-row frame for this neighbourhood, transposed so that every
    # category becomes a row.
    freq_table = eastyork_grouped.loc[eastyork_grouped['Neighborhood'] == hood].T.reset_index()
    freq_table.columns = ['venue','freq']
    # The first row holds the neighbourhood name itself, not a category.
    freq_table = (
        freq_table.iloc[1:]
        .assign(freq=lambda table: table['freq'].astype(float))
        .round({'freq': 2})
    )
    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----East Toronto----
                 venue  freq
0    Convenience Store  0.33
1                 Park  0.33
2          Coffee Shop  0.33
3           Restaurant  0.00
4  Housing Development  0.00


----Leaside----
                 venue  freq
0          Coffee Shop  0.09
1  Sporting Goods Shop  0.09
2        Grocery Store  0.06
3         Burger Joint  0.06
4     Sushi Restaurant  0.06


----Thorncliffe Park----
               venue  freq
0  Indian Restaurant  0.11
1      Grocery Store  0.06
2     Discount Store  0.06
3    Warehouse Store  0.06
4        Supermarket  0.06


----Woodbine Gardens, Parkview Hill----
                  venue  freq
0  Fast Food Restaurant  0.15
1           Pizza Place  0.15
2    Athletics & Sports  0.08
3                  Café  0.08
4          Intersection  0.08


----Woodbine Heights----
                venue  freq
0  Athletics & Sports  0.12
1          Beer Store  0.12
2                Park  0.12
3            Bus Stop  0.12
4        Skating Rink  0.12




In [50]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, largest first.

    The first entry of ``row`` is skipped (in this notebook it holds the
    neighbourhood name); the remaining entries are sorted in descending order
    and the index labels of the first ``num_top_venues`` of them are returned
    as an array.
    """
    frequencies = row.iloc[1:]
    ranked_labels = frequencies.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

num_top_venues = 10

# ordinal suffixes for the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    # explicit bounds check instead of catching every exception
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe: one row per neighbourhood, filled in below
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = eastyork_grouped['Neighborhood']

# fill every row with that neighbourhood's most frequent venue categories
for ind in range(eastyork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(eastyork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,Coffee Shop,Park,Convenience Store,Yoga Studio,Fish & Chips Shop,Fast Food Restaurant,Electronics Store,Discount Store,Dessert Shop,Curling Ice
1,Leaside,Sporting Goods Shop,Coffee Shop,Sushi Restaurant,Furniture / Home Store,Grocery Store,Burger Joint,Liquor Store,Mexican Restaurant,Food & Drink Shop,Electronics Store
2,Thorncliffe Park,Indian Restaurant,Yoga Studio,Pharmacy,Coffee Shop,Warehouse Store,Grocery Store,Gym,Housing Development,Intersection,Liquor Store
3,"Woodbine Gardens, Parkview Hill",Pizza Place,Fast Food Restaurant,Gastropub,Bank,Breakfast Spot,Café,Gym / Fitness Center,Intersection,Pet Store,Pharmacy
4,Woodbine Heights,Athletics & Sports,Skating Rink,Curling Ice,Cosmetics Shop,Park,Bus Stop,Pharmacy,Beer Store,Breakfast Spot,Convenience Store


In [51]:
# set number of clusters
# NOTE(review): East York has only 5 rows in `eastyork_grouped`, so 5 clusters
# put every neighbourhood in a cluster of its own; a smaller k would be needed
# for the grouping to be informative.
kclusters = 5

# Drop the label column so only the category frequencies are clustered.
# The axis is passed by keyword: the positional form is deprecated and is
# rejected by current pandas releases.
eastyork_grouped_clustering = eastyork_grouped.drop('Neighborhood', axis=1)

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(eastyork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 1, 3, 4, 0], dtype=int32)

In [54]:
# add clustering labels as the first column; a copy left over from an earlier
# run of this cell is removed first, because insert() refuses to add a column
# that already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop('Cluster Labels', axis=1)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and top venues onto the East York table, which
# carries the latitude/longitude of each neighbourhood
eastyork_merged = east_york.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
eastyork_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,4,Pizza Place,Fast Food Restaurant,Gastropub,Bank,Breakfast Spot,Café,Gym / Fitness Center,Intersection,Pet Store,Pharmacy
1,M4C,East York,Woodbine Heights,43.695344,-79.318389,0,Athletics & Sports,Skating Rink,Curling Ice,Cosmetics Shop,Park,Bus Stop,Pharmacy,Beer Store,Breakfast Spot,Convenience Store
2,M4G,East York,Leaside,43.70906,-79.363452,1,Sporting Goods Shop,Coffee Shop,Sushi Restaurant,Furniture / Home Store,Grocery Store,Burger Joint,Liquor Store,Mexican Restaurant,Food & Drink Shop,Electronics Store
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372,3,Indian Restaurant,Yoga Studio,Pharmacy,Coffee Shop,Warehouse Store,Grocery Store,Gym,Housing Development,Intersection,Liquor Store
4,M4J,East York,East Toronto,43.685347,-79.338106,2,Coffee Shop,Park,Convenience Store,Yoga Studio,Fish & Chips Shop,Fast Food Restaurant,Electronics Store,Discount Store,Dessert Shop,Curling Ice


In [62]:
# Base map centred on `latitude` / `longitude`, which are set by the geocoding
# cell that was run last.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# one evenly spaced colour of the rainbow colormap per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# One circle marker per neighbourhood, coloured by its cluster label.
# parse_html=True makes folium escape the popup text.
for lat, lon, poi, cluster in zip(eastyork_merged['Latitude'], eastyork_merged['Longitude'], eastyork_merged['Neighborhood'], eastyork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to kclusters - 1, so they index the colour list directly
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [56]:
# Neighbourhoods in cluster 0: the Borough column plus every column from
# 'Cluster Labels' onwards.
cluster_columns = [1] + list(range(5, eastyork_merged.shape[1]))
eastyork_merged[eastyork_merged['Cluster Labels'] == 0].iloc[:, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,East York,0,Athletics & Sports,Skating Rink,Curling Ice,Cosmetics Shop,Park,Bus Stop,Pharmacy,Beer Store,Breakfast Spot,Convenience Store


In [57]:
# Neighbourhoods in cluster 1: the Borough column plus every column from
# 'Cluster Labels' onwards.
cluster_columns = [1] + list(range(5, eastyork_merged.shape[1]))
eastyork_merged[eastyork_merged['Cluster Labels'] == 1].iloc[:, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,East York,1,Sporting Goods Shop,Coffee Shop,Sushi Restaurant,Furniture / Home Store,Grocery Store,Burger Joint,Liquor Store,Mexican Restaurant,Food & Drink Shop,Electronics Store


In [59]:
# Neighbourhoods in cluster 2: the Borough column plus every column from
# 'Cluster Labels' onwards.
cluster_columns = [1] + list(range(5, eastyork_merged.shape[1]))
eastyork_merged[eastyork_merged['Cluster Labels'] == 2].iloc[:, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,East York,2,Coffee Shop,Park,Convenience Store,Yoga Studio,Fish & Chips Shop,Fast Food Restaurant,Electronics Store,Discount Store,Dessert Shop,Curling Ice


In [60]:
# Neighbourhoods in cluster 3: the Borough column plus every column from
# 'Cluster Labels' onwards.
cluster_columns = [1] + list(range(5, eastyork_merged.shape[1]))
eastyork_merged[eastyork_merged['Cluster Labels'] == 3].iloc[:, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,East York,3,Indian Restaurant,Yoga Studio,Pharmacy,Coffee Shop,Warehouse Store,Grocery Store,Gym,Housing Development,Intersection,Liquor Store


In [61]:
# Neighbourhoods in cluster 4: the Borough column plus every column from
# 'Cluster Labels' onwards.
cluster_columns = [1] + list(range(5, eastyork_merged.shape[1]))
eastyork_merged[eastyork_merged['Cluster Labels'] == 4].iloc[:, cluster_columns]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East York,4,Pizza Place,Fast Food Restaurant,Gastropub,Bank,Breakfast Spot,Café,Gym / Fitness Center,Intersection,Pet Store,Pharmacy


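## Got 5 clusters from 5 neighborhoods of a single borough (k equals the number of samples, so every neighborhood ends up in its own cluster). Now, that's epic!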