In [1]:
from bs4 import BeautifulSoup
from tabulate import tabulate
import pandas as pd
import numpy as np

In [2]:
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [3]:
import requests

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [5]:
tb = soup.find('table', class_ = 'wikitable')

In [6]:
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))


In [7]:
postcodes = df[0]

In [15]:
#Remove 'Not assigned' boroughs
postcodes_clean = postcodes[postcodes['Borough']!='Not assigned']
postcodes_clean = postcodes_clean.reset_index(drop=True)

In [16]:
for i in range(len(postcodes_clean)):
    if postcodes_clean['Neighbourhood'][i] == 'Not assigned':
        postcodes_clean['Neighbourhood'][i] = postcodes_clean['Borough'][i]

In [17]:
cordinates = pd.read_csv("Geospatial_Coordinates.csv")

In [18]:
cordinates.rename(columns={'Postal Code':'Postcode'}, 
                 inplace=True)

In [19]:
df = pd.merge(postcodes_clean, cordinates, on="Postcode")
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


## Question 3

Sign in to foursquare api

In [20]:
CLIENT_ID = 'PICYXB14DW25045HX5DL4ZF2GSM1YD3EKATZR5IIJZ1TBAJ3' # your Foursquare ID
CLIENT_SECRET = 'G3SZG1QTMSNCXXIB3ZLMHB2JFIEJNEJP2DN1WIKZA4TUEZOY' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PICYXB14DW25045HX5DL4ZF2GSM1YD3EKATZR5IIJZ1TBAJ3
CLIENT_SECRET:G3SZG1QTMSNCXXIB3ZLMHB2JFIEJNEJP2DN1WIKZA4TUEZOY


Work out which neighbourhoods are in Toronto

In [27]:
df['Borough'].value_counts()

Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

Create a dataframe containing only toronto boroughs

In [28]:

toronto = df[df['Borough'].isin(['Downtown Toronto', 'Central Toronto', 'West Toronto', 'East Toronto'])]
toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
13,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
14,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
27,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


In [29]:

print('Toronto has', len(toronto['Postcode'].unique()), 'different postcodes')

Toronto has 38 different postcodes


In [30]:
neighbourhoods = list(toronto['Neighbourhood'])
print('Toronto has', len(set(neighbourhoods)), 'different neighbourhoods')

Toronto has 73 different neighbourhoods


In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [32]:
LIMIT = 100

toronto_venues = getNearbyVenues(names=toronto['Neighbourhood'],
                                   latitudes=toronto['Latitude'],
                                   longitudes=toronto['Longitude']
                                  )

Harbourfront
Regent Park
Ryerson
Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide
King
Richmond
Dovercourt Village
Dufferin
Harbourfront East
Toronto Islands
Union Station
Little Portugal
Trinity
The Danforth West
Riverdale
Design Exchange
Toronto Dominion Centre
Brockton
Exhibition Place
Parkdale Village
The Beaches West
India Bazaar
Commerce Court
Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North
Forest Hill West
High Park
The Junction South
North Toronto West
The Annex
North Midtown
Yorkville
Parkdale
Roncesvalles
Davisville
Harbord
University of Toronto
Runnymede
Swansea
Moore Park
Summerhill East
Chinatown
Grange Park
Kensington Market
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown
St. James Town
First Canadian Place
Underground city


Lets look at the venues

In [33]:
print(toronto_venues.shape)
toronto_venues.head()

(3314, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [34]:
print('There are {} uniques categories of venue locations.'.format(len(toronto_venues['Venue Category'].unique())))

There are 238 uniques categories of venue locations.


Let's consider how many of each type of venue category is in each neighbourhood on average

In [35]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0
1,Bathurst Quay,0.0,0.0,0.066667,0.066667,0.066667,0.133333,0.2,0.133333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0
3,Brockton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:

num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2           Steakhouse  0.04
3  American Restaurant  0.04
4      Thai Restaurant  0.04


----Bathurst Quay----
              venue  freq
0   Airport Service  0.20
1    Airport Lounge  0.13
2  Airport Terminal  0.13
3   Harbor / Marina  0.07
4           Airport  0.07


----Berczy Park----
            venue  freq
0     Coffee Shop  0.09
1    Cocktail Bar  0.05
2      Steakhouse  0.03
3  Farmers Market  0.03
4          Bakery  0.03


----Brockton----
            venue  freq
0  Breakfast Spot  0.11
1     Coffee Shop  0.11
2            Café  0.11
3             Bar  0.05
4      Restaurant  0.05


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Pizza Place  0.06
2          Restaurant  0.06
3             Brewery  0.06
4                 Spa  0.06


----CN Tower----
              venue  freq
0   Airport Service  0.20
1

                    venue  freq
0        Greek Restaurant  0.18
1             Coffee Shop  0.09
2          Ice Cream Shop  0.07
3  Furniture / Home Store  0.05
4      Italian Restaurant  0.05


----Roncesvalles----
            venue  freq
0  Breakfast Spot  0.12
1       Gift Shop  0.12
2     Coffee Shop  0.06
3            Bank  0.06
4             Bar  0.06


----Rosedale----
                     venue  freq
0                     Park  0.50
1               Playground  0.25
2                    Trail  0.25
3  New American Restaurant  0.00
4              Men's Store  0.00


----Roselawn----
                       venue  freq
0                     Garden   0.5
1             Ice Cream Shop   0.5
2                  Nightclub   0.0
3         Mexican Restaurant   0.0
4  Middle Eastern Restaurant   0.0


----Runnymede----
                venue  freq
0         Pizza Place  0.10
1         Coffee Shop  0.10
2                Café  0.08
3  Italian Restaurant  0.05
4    Sushi Restaurant  0.05


----R

We now organise this in order

In [37]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [38]:

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

We now add the venue information to the original toronto dataset with clusters to see if they have any effect as features

In [39]:
from sklearn.cluster import KMeans
clustering = toronto.drop(['Postcode', 'Borough', 'Neighbourhood'], 1)
kmeans = KMeans(n_clusters=5, random_state=0).fit(clustering)
labels = kmeans.predict(clustering)


In [40]:
toronto = toronto.assign(Cluster = labels) 
centroids = kmeans.cluster_centers_

Now we consider whether there is any relationship between the types of venues and the assigned clusters



In [41]:
toronto_df = pd.merge(toronto, neighbourhoods_venues_sorted, on='Neighbourhood')

In [43]:
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [44]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

#set color scheme for the clusters
x = np.arange(5)
ys = [i + x + (i*x)**2 for i in range(5)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighbourhood'], 
                                  toronto_df['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [45]:
for i in range(5):
    print('Cluster', i, 'consists of:', list(toronto_df.loc[toronto_df['Cluster'] == i]['Borough'].unique()))

Cluster 0 consists of: ['Downtown Toronto', 'Central Toronto']
Cluster 1 consists of: ['West Toronto', 'Downtown Toronto']
Cluster 2 consists of: ['West Toronto']
Cluster 3 consists of: ['Downtown Toronto']
Cluster 4 consists of: ['East Toronto']


We can see that East Toronto as a borough doesn't really mesh with the other boroughs in terms of area. The other boroughs however seem to merge a bit here and there.