# Question 1: Importing the neighbourhood data and creating the dataframe

In [1]:
# Install the `geocoder` package into the running kernel's environment; it is imported in the
# next cell and used later for the postcode -> coordinate lookups.
# NOTE(review): no version is pinned here, so a fresh run may pick up a different release
# than the one this notebook was developed against -- consider pinning.
%pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [2]:
# importing the necessary packages for data importing and cleanup
import os
from io import StringIO

import geocoder
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Download the Wikipedia page that lists the "M" postal codes; fail loudly on HTTP errors
# instead of silently parsing an error page.
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
response.raise_for_status()
file = response.text
# The first <table> element on the page is taken as the postcode table.
soup = BeautifulSoup(file, 'lxml')
table = soup.find_all('table')[0]
# Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

# The rest of the cell relies on exactly these three columns being present.
# NOTE(review): the column names come from the page layout at the time of writing -- verify
# against the live page if this check fails.
columns = ['Postcode', 'Borough', 'Neighbourhood']
missing = [col for col in columns if col not in df.columns]
if missing:
    raise KeyError('Expected column(s) {} not found in the scraped table; found {}'.format(
        missing, list(df.columns)))

# Drop rows without an assigned borough. The stable sort keeps the page order of the
# neighbourhoods inside each postcode.
df = (
    df[df['Borough'] != 'Not assigned']
    .sort_values(by='Postcode', kind='mergesort')
    .reset_index(drop=True)
)
# Where no neighbourhood is assigned, fall back to the borough name (single .loc assignment
# instead of element-wise chained indexing).
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

# One row per postcode: keep the first borough and join all neighbourhood names with ', '.
# Built in a single groupby rather than appending row by row.
df1 = (
    df.groupby('Postcode', as_index=False, sort=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)
# Fix the column order and make sure the index is a clean 0..n-1 range.
df1 = df1[columns].reset_index(drop=True)
# Preview the resulting dataframe.
df1.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Golden Mile, Oakridge, Clairlea"
8,M1M,Scarborough,"Cliffcrest, Scarborough Village West, Cliffside"
9,M1N,Scarborough,"Cliffside West, Birch Cliff"


# Question 2: Adding latitude and longitude data

In [4]:
# Upper bound on lookups per postcode so a persistently failing query cannot loop forever.
MAX_GEOCODE_ATTEMPTS = 5

# Lists that collect one latitude / longitude per postcode, in df1 row order.
latitudes = []
longitudes = []
# Look up the coordinates of every postcode in df1.
for code in df1.Postcode:
    lat_lng_coords = None
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        # geocoder.google did not work, hence using another search agent, arcgis
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
        lat_lng_coords = g.latlng
        # A falsy result (None or empty) means the lookup failed; try again.
        if lat_lng_coords:
            break
    if not lat_lng_coords:
        raise RuntimeError(
            'Could not geocode postcode {!r} after {} attempts'.format(code, MAX_GEOCODE_ATTEMPTS))
    # Unpack only once a result is known to exist (indexing a failed lookup raised TypeError).
    lat, lng = lat_lng_coords[0], lat_lng_coords[1]
    latitudes.append(lat)
    longitudes.append(lng)

# Add the coordinates to the dataframe as new columns.
df1['Latitude'] = latitudes
df1['Longitude'] = longitudes

# Preview the final dataframe.
df1.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.78573,-79.15875
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944
5,M1J,Scarborough,Scarborough Village,43.743125,-79.23175
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.726245,-79.26367
7,M1L,Scarborough,"Golden Mile, Oakridge, Clairlea",43.713133,-79.285055
8,M1M,Scarborough,"Cliffcrest, Scarborough Village West, Cliffside",43.723575,-79.234976
9,M1N,Scarborough,"Cliffside West, Birch Cliff",43.696665,-79.260163


# Question 3: Foursquare and neighbourhood clustering

In [5]:
# Foursquare credentials are read from the environment so that no secret is stored in the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be committed in this cell should be treated as
# leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
# Value sent as the `v` query parameter of the Foursquare request.
VERSION = '20180605'

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail until they are provided.')

In [15]:
# defining function for extracting venue details in the vicinity of a given location
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=10):
    """Query the Foursquare `venues/explore` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each location; consumed in lockstep with ``zip``.
    radius : int, optional
        Forwarded as the ``radius`` query parameter (default 500).
    limit : int, optional
        Forwarded as the ``limit`` query parameter (default 10).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Borough', 'Borough Latitude',
        'Borough Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status (e.g. invalid credentials).
    """
    column_names = ['Borough',
                    'Borough Latitude',
                    'Borough Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        # Surface API errors here rather than as an obscure KeyError further down.
        response.raise_for_status()

        # Only the first group of the response is used; tolerate it being absent.
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            # A venue may come without any category; record None instead of raising IndexError.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact for an empty result
    # (assigning them afterwards fails on a frame with zero columns).
    return pd.DataFrame(rows, columns=column_names)

In [16]:
# Keep only the boroughs whose name contains the word 'Toronto'; the postcode column is not
# needed from here on.
df2 = (
    df1[df1['Borough'].str.contains('Toronto', na=False)]
    .drop(columns=['Postcode'])
    .reset_index(drop=True)
)
# One coordinate pair per borough: the mean of the coordinates of its postcodes.
# The numeric columns are selected explicitly because df2 still holds the string column
# 'Neighbourhood', which recent pandas versions refuse to average.
df3 = df2.groupby('Borough')[['Latitude', 'Longitude']].mean().reset_index()
# display the resulting reduced dataframe
df3

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.701911,-79.399073
1,Downtown Toronto,43.654065,-79.384545
2,East Toronto,43.667905,-79.337058
3,West Toronto,43.651785,-79.444729


In [17]:
# Ask Foursquare for the venues around each borough in df3, using the borough's coordinates.
toronto_venues = getNearbyVenues(
    df3['Borough'],
    df3['Latitude'],
    df3['Longitude'],
)

In [18]:
# Turn 'Venue Category' into one indicator column per category (no prefix on the names).
toronto_onehot = pd.get_dummies(
    toronto_venues[['Venue Category']],
    prefix="",
    prefix_sep="",
)
# Attach the borough of every venue as an additional (last) column.
toronto_onehot['Borough'] = toronto_venues['Borough']
# Rotate the column order so the last column moves to the front.
fixed_columns = toronto_onehot.columns[-1:].tolist() + toronto_onehot.columns[:-1].tolist()
toronto_onehot = toronto_onehot[fixed_columns]
# Per borough, the mean of each indicator column is the relative frequency of that category.
toronto_grouped = (
    toronto_onehot
    .groupby(['Borough'])
    .mean()
    .reset_index()
)
toronto_grouped

Unnamed: 0,Borough,American Restaurant,Art Museum,Athletics & Sports,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Caribbean Restaurant,Clothing Store,...,Plaza,Poke Place,Polish Restaurant,Ramen Restaurant,Restaurant,Sandwich Place,Sushi Restaurant,Tea Room,Vietnamese Restaurant,Wings Joint
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.1,0.0,0.0,0.1
1,Downtown Toronto,0.0,0.1,0.0,0.1,0.0,0.1,0.0,0.0,0.0,...,0.1,0.1,0.0,0.1,0.0,0.0,0.1,0.1,0.0,0.0
2,East Toronto,0.1,0.0,0.0,0.0,0.0,0.0,0.1,0.2,0.0,...,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.1,0.0
3,West Toronto,0.0,0.0,0.1,0.0,0.2,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.2,0.1,0.0,0.0,0.0,0.0


In [20]:
# borough clustering
from sklearn.cluster import KMeans
# set number of clusters, it has to be smaller than the number of boroughs which in this case is 4
kclusters = 2
# Feature matrix for clustering: the category frequencies without the label column.
# `columns=` is used because the positional axis argument of drop() is no longer accepted
# by current pandas.
toronto_grouped_clustering = toronto_grouped.drop(columns='Borough')
# Run k-means. n_init is spelled out so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

In [23]:
#importing numpy as the function below uses it
import numpy as np
# Rank the venue categories of one row from most to least frequent.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (it is not a frequency), the remaining
    entries are sorted in descending order and the index labels of the leading
    `num_top_venues` entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Build the column headers '1st Most Common Venue', '2nd ...', ... with a correct English
# ordinal suffix: 11-13 are irregular ('th'), otherwise the last digit decides.
columns = ['Borough']
for rank in range(1, num_top_venues + 1):
    if rank % 100 in (11, 12, 13) or rank % 10 not in (1, 2, 3):
        suffix = 'th'
    else:
        suffix = indicators[rank % 10 - 1]
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# One record per borough: its name followed by its most common venue categories.
records = []
for _, grouped_row in toronto_grouped.iterrows():
    top_venues = list(return_most_common_venues(grouped_row, num_top_venues))
    # Pad with None when fewer categories exist than requested, so every record has the
    # same width as `columns`.
    top_venues += [None] * (num_top_venues - len(top_venues))
    records.append([grouped_row['Borough']] + top_venues)
boroughs_venues_sorted = pd.DataFrame(records, columns=columns)

# add clustering labels (kmeans was fitted on toronto_grouped, so the labels are in the
# same row order as the records above)
boroughs_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
# Add the cluster label and top venues to the per-borough coordinates in df3; the join
# keeps every row of df3 and matches on the borough name.
toronto_merged = df3.join(boroughs_venues_sorted.set_index('Borough'), on='Borough')
toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,43.701911,-79.399073,0,Italian Restaurant,Restaurant,Middle Eastern Restaurant,Deli / Bodega,General Entertainment,Indonesian Restaurant,Wings Joint,Sushi Restaurant,Tea Room,Art Museum
1,Downtown Toronto,43.654065,-79.384545,1,Plaza,Tea Room,Sushi Restaurant,Art Museum,Ramen Restaurant,Breakfast Spot,Poke Place,Bubble Tea Shop,Neighborhood,Miscellaneous Shop
2,East Toronto,43.667905,-79.337058,1,Caribbean Restaurant,Burger Joint,Vietnamese Restaurant,Mexican Restaurant,Coffee Shop,Cocktail Bar,Pide Place,American Restaurant,Polish Restaurant,Grocery Store
3,West Toronto,43.651785,-79.444729,0,Restaurant,Brewery,Park,Athletics & Sports,Sandwich Place,Coffee Shop,Grocery Store,Clothing Store,Wings Joint,Deli / Bodega


In [26]:
# Install `geopy` into the running kernel's environment; its Nominatim geocoder is imported
# by the map cell that follows.
# NOTE(review): no version is pinned; the captured output of this cell shows geopy 1.20.0
# being installed -- consider pinning to a known-good release for reproducibility.
%pip install geopy

Collecting geopy
  Downloading https://files.pythonhosted.org/packages/80/93/d384479da0ead712bdaf697a8399c13a9a89bd856ada5a27d462fb45e47b/geopy-1.20.0-py2.py3-none-any.whl (100kB)
Collecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/5b/ac/4f348828091490d77899bc74e92238e2b55c59392f21948f296e94e50e2b/geographiclib-1.49.tar.gz
Building wheels for collected packages: geographiclib
  Building wheel for geographiclib (setup.py): started
  Building wheel for geographiclib (setup.py): finished with status 'done'
  Stored in directory: C:\Users\Lyubomir\AppData\Local\pip\Cache\wheels\99\45\d1\14954797e2a976083182c2e7da9b4e924509e59b6e5c661061
Successfully built geographiclib
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.49 geopy-1.20.0
Note: you may need to restart the kernel to use updated packages.


In [27]:
# create map
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim

# get coordinates for Toronto (used as the centre of the map)
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields no location object when nothing is found; stop with a clear message
# instead of failing on the attribute access below.
if location is None:
    raise RuntimeError('Could not geocode {!r}; cannot centre the map.'.format(address))
latitude = location.latitude
longitude = location.longitude

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# add one circle marker per borough, coloured by its cluster
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
    # A borough without venue data has no cluster label after the join; skip it.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so they index the palette directly (indexing with
    # cluster-1 only worked for label 0 through negative-index wraparound).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters