PART I: Import from Wikipedia List of postal codes of Canada and create a DF

In [1]:
import pandas as pd
import requests
from pandas.io.json import json_normalize

import folium

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
table = pd.read_html(url)[0]

In [3]:
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 3 columns):
Postcode         288 non-null object
Borough          288 non-null object
Neighbourhood    288 non-null object
dtypes: object(3)
memory usage: 6.8+ KB


In [4]:
table = table[table['Borough']!='Not assigned'] #Drop the rows that contatins the "Not assigned" in the column "Borough"

In [5]:
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [6]:
table = table.groupby(['Postcode', 'Borough'],as_index=False).agg(', '.join) #Aggregate rows with same Postcode

In [7]:
table[table['Borough'].str.contains('Queen')]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


In [8]:
#create a function for change "Not assigned" in the "Neighbourhood" for the Borough´s name

def __assigned__(x, y):
    if y == "Not assigned":
        return x
    else:
        return y    

In [9]:
x = table

table["Neighbourhood"] = table.apply(lambda x: __assigned__(x["Borough"], x["Neighbourhood"]), axis=1)

In [10]:
table[table['Borough'].str.contains('Queen')] #Verify that all be ok.

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [11]:
table.shape

(103, 3)

Part II: Getting the Geolocations

In [12]:
geo_coordinates = pd.read_csv('Geospatial_Coordinates.csv')

In [13]:
geo_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
df = table.merge(geo_coordinates, left_on='Postcode', right_on='Postal Code').drop(['Postal Code'], axis=1)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Part III: Working with the data

In [32]:
df_toronto = df[df['Borough'].str.contains('Toronto')]
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [33]:
latitude = 43.6532
longitude = -79.3832
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6532, -79.3832.


In [34]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [35]:
CLIENT_ID = 'WNU4QNPM0NESFXS5PNS4FVNI2LXWXV3F5AVCX2UTZDEZYS5H' # your Foursquare ID
CLIENT_SECRET = 'NY50BSQ1QWU3RAXCKQNFBWIEISZOG2KSYPOKXN40JJFPJ0G2' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 50
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WNU4QNPM0NESFXS5PNS4FVNI2LXWXV3F5AVCX2UTZDEZYS5H
CLIENT_SECRET:NY50BSQ1QWU3RAXCKQNFBWIEISZOG2KSYPOKXN40JJFPJ0G2


In [36]:
## By search query ##

In [37]:
search_query = 'Gym'
radius = 5000
print(search_query + ' .... OK!')

Gym .... OK!


In [38]:
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/search?client_id=WNU4QNPM0NESFXS5PNS4FVNI2LXWXV3F5AVCX2UTZDEZYS5H&client_secret=NY50BSQ1QWU3RAXCKQNFBWIEISZOG2KSYPOKXN40JJFPJ0G2&ll=43.6532,-79.3832&v=20180604&query=Gym&radius=5000&limit=50'

In [39]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d9e553d0d2be70039177f91'},
 'response': {'venues': [{'id': '5133f680e4b02e871367c60c',
    'name': 'The Gym at the Shangri-La',
    'location': {'lat': 43.648773826131794,
     'lng': -79.38651748319191,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.648773826131794,
       'lng': -79.38651748319191}],
     'distance': 560,
     'cc': 'CA',
     'country': 'Canada',
     'formattedAddress': ['Canada']},
    'categories': [{'id': '4bf58dd8d48988d176941735',
      'name': 'Gym',
      'pluralName': 'Gyms',
      'shortName': 'Gym',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/building/gym_',
       'suffix': '.png'},
      'primary': True}],
    'referralId': 'v-1570657597',
    'hasPerk': False},
   {'id': '4cd044c29d87224bf129543b',
    'name': 'University Centre Gym',
    'location': {'lat': 43.653571,
     'lng': -79.386979,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.653571,
       'lng': -79.

In [40]:
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = json_normalize(venues)
dataframe.head()

Unnamed: 0,categories,hasPerk,id,location.address,location.cc,location.city,location.country,location.crossStreet,location.distance,location.formattedAddress,location.labeledLatLngs,location.lat,location.lng,location.postalCode,location.state,name,referralId
0,"[{'id': '4bf58dd8d48988d176941735', 'name': 'G...",False,5133f680e4b02e871367c60c,,CA,,Canada,,560,[Canada],"[{'label': 'display', 'lat': 43.64877382613179...",43.648774,-79.386517,,,The Gym at the Shangri-La,v-1570657597
1,"[{'id': '4bf58dd8d48988d175941735', 'name': 'G...",False,4cd044c29d87224bf129543b,,CA,,Canada,,307,[Canada],"[{'label': 'display', 'lat': 43.653571, 'lng':...",43.653571,-79.386979,,,University Centre Gym,v-1570657597
2,"[{'id': '4bf58dd8d48988d176941735', 'name': 'G...",False,4f61e092e4b0d7325fb1f8a3,145 Richmond Street W,CA,Toronto,Canada,at University Ave.,410,"[145 Richmond Street W (at University Ave.), T...","[{'label': 'display', 'lat': 43.64999445230568...",43.649994,-79.38573,,ON,Hilton Gym,v-1570657597
3,"[{'id': '4bf58dd8d48988d175941735', 'name': 'G...",False,58fb6a1e029a555d117e0cb2,181 Wellington St W,CA,Toronto,Canada,John St,920,"[181 Wellington St W (John St), Toronto ON M5V...","[{'label': 'display', 'lat': 43.645418, 'lng':...",43.645418,-79.387059,M5V 3G7,ON,The Gym,v-1570657597
4,"[{'id': '4bf58dd8d48988d176941735', 'name': 'G...",False,4c2630ca5c5ca59364a545fe,373 Front St. W.,CA,Toronto,Canada,Blue Jays Way,1366,"[373 Front St. W. (Blue Jays Way), Toronto ON,...","[{'label': 'display', 'lat': 43.64291316865826...",43.642913,-79.392462,,ON,The Matrix Gym (City Place),v-1570657597


In [41]:
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]

dataframe_filtered.head()

Unnamed: 0,name,categories,address,cc,city,country,crossStreet,distance,formattedAddress,labeledLatLngs,lat,lng,postalCode,state,id
0,The Gym at the Shangri-La,Gym,,CA,,Canada,,560,[Canada],"[{'label': 'display', 'lat': 43.64877382613179...",43.648774,-79.386517,,,5133f680e4b02e871367c60c
1,University Centre Gym,Gym / Fitness Center,,CA,,Canada,,307,[Canada],"[{'label': 'display', 'lat': 43.653571, 'lng':...",43.653571,-79.386979,,,4cd044c29d87224bf129543b
2,Hilton Gym,Gym,145 Richmond Street W,CA,Toronto,Canada,at University Ave.,410,"[145 Richmond Street W (at University Ave.), T...","[{'label': 'display', 'lat': 43.64999445230568...",43.649994,-79.38573,,ON,4f61e092e4b0d7325fb1f8a3
3,The Gym,Gym / Fitness Center,181 Wellington St W,CA,Toronto,Canada,John St,920,"[181 Wellington St W (John St), Toronto ON M5V...","[{'label': 'display', 'lat': 43.645418, 'lng':...",43.645418,-79.387059,M5V 3G7,ON,58fb6a1e029a555d117e0cb2
4,The Matrix Gym (City Place),Gym,373 Front St. W.,CA,Toronto,Canada,Blue Jays Way,1366,"[373 Front St. W. (Blue Jays Way), Toronto ON,...","[{'label': 'display', 'lat': 43.64291316865826...",43.642913,-79.392462,,ON,4c2630ca5c5ca59364a545fe


In [42]:
dataframe_filtered.name.head()

0      The Gym at the Shangri-La
1          University Centre Gym
2                     Hilton Gym
3                        The Gym
4    The Matrix Gym (City Place)
Name: name, dtype: object

In [43]:
venues_map = folium.Map(location=[latitude, longitude], zoom_start=13) # generate map centred

# add a red circle marker to represent the center
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Toronto',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

# add the venues as blue circle markers
for lat, lng, label in zip(dataframe_filtered.lat, dataframe_filtered.lng, dataframe_filtered.categories):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map

## Explore & Clustering##

Part 1

In [27]:
CLIENT_ID = 'WNU4QNPM0NESFXS5PNS4FVNI2LXWXV3F5AVCX2UTZDEZYS5H' # your Foursquare ID
CLIENT_SECRET = 'NY50BSQ1QWU3RAXCKQNFBWIEISZOG2KSYPOKXN40JJFPJ0G2' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WNU4QNPM0NESFXS5PNS4FVNI2LXWXV3F5AVCX2UTZDEZYS5H
CLIENT_SECRET:NY50BSQ1QWU3RAXCKQNFBWIEISZOG2KSYPOKXN40JJFPJ0G2


In [47]:
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [48]:
latitude = 43.6532
longitude = -79.3832
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6532, -79.3832.


In [49]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [50]:
CLIENT_ID = 'WNU4QNPM0NESFXS5PNS4FVNI2LXWXV3F5AVCX2UTZDEZYS5H' # your Foursquare ID
CLIENT_SECRET = 'NY50BSQ1QWU3RAXCKQNFBWIEISZOG2KSYPOKXN40JJFPJ0G2' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WNU4QNPM0NESFXS5PNS4FVNI2LXWXV3F5AVCX2UTZDEZYS5H
CLIENT_SECRET:NY50BSQ1QWU3RAXCKQNFBWIEISZOG2KSYPOKXN40JJFPJ0G2


Part 2 Explore Neighborhoods

In [99]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [100]:
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

In [101]:
print(toronto_venues.shape)
toronto_venues.head()

(1165, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [103]:
toronto_venues1 = toronto_venues

In [104]:
toronto_venues1.groupby('Neighborhood').count()


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",50,50,50,50,50,50
Berczy Park,50,50,50,50,50,50
"Brockton, Exhibition Place, Parkdale Village",22,22,22,22,22,22
Business Reply Mail Processing Centre 969 Eastern,20,20,20,20,20,20
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",15,15,15,15,15,15
"Cabbagetown, St. James Town",45,45,45,45,45,45
Central Bay Street,50,50,50,50,50,50
"Chinatown, Grange Park, Kensington Market",50,50,50,50,50,50
Christie,16,16,16,16,16,16
Church and Wellesley,50,50,50,50,50,50


In [105]:
print('There are {} uniques categories.'.format(len(toronto_venues1['Venue Category'].unique())))

There are 215 uniques categories.


Part 3

In [106]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues1[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues1['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
