In [114]:
'''
Import dependencies
'''

import os
import random
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.cluster import KMeans

In [115]:
'''
Get the latest data from the Wikipedia page
'''

# Shell escape: fetch the page with wget and store it as raw_data.html
# (the file name given to -O) so the next cell can parse it from disk.
!wget -O raw_data.html https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 

--2020-08-21 15:02:51--  https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
Resolving en.wikipedia.org (en.wikipedia.org)... 208.80.154.224, 2620:0:861:ed1a::1
Connecting to en.wikipedia.org (en.wikipedia.org)|208.80.154.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56520 (55K) [text/html]
Saving to: ‘raw_data.html’


2020-08-21 15:02:52 (500 KB/s) - ‘raw_data.html’ saved [56520/56520]



In [116]:
'''
Scan the downloaded page line by line, collect the markup that follows the
first <tbody ...> line up to the line that closes the table, and split the
collected markup into one string per <tr>
'''

process = False
fragments = []

with open("raw_data.html") as raw_data:
    for raw_line in raw_data:
        stripped = raw_line.strip()

        # Start collecting after the opening <tbody ...> line; the line itself
        # is skipped. Only honoured while nothing has been collected yet.
        if stripped.startswith("<tbody") and not any(fragments):
            process = True
            continue

        if process:
            fragments.append(stripped)

        # The closing line is collected above before the scan stops
        if stripped.endswith("</table>"):
            process = False
            break

table_data_string = ''.join(fragments)
table_rows = table_data_string.split("<tr>")

In [117]:
'''
Split and clean the table string and write out the rows to .csv

Each kept row is written as "postal code;borough;neighbourhood". Rows whose
borough is "Not assigned" are dropped; a neighbourhood of "Not assigned" is
replaced by the borough.
'''

with open("clean_data.csv", "w") as clean_data:
    for row in table_rows:
        # Closing cell tags become field separators, then the remaining
        # <tr>/<td>/<th> tags (opening or closing) are removed.
        comma_row = re.sub('</t[dh]>', ';', row)
        clean_row = re.sub('<[/]*t[rdh]>', '', comma_row)
        clean_row = re.sub('&amp;', '&', clean_row)

        # Keep just the three data fields. Anything after the third separator
        # (normally an empty string, or leftover closing markup on the last
        # row) is discarded instead of being written to the file.
        row_items = [item.strip() for item in clean_row.split(';')[:3]]

        # Fragments that are not complete rows (e.g. an empty leading piece
        # produced by the split on "<tr>") are skipped rather than crashing.
        if len(row_items) < 3:
            continue

        # Only proceed if the Borough is assigned
        if row_items[1] == "Not assigned":
            continue

        # Check if the neighbourhood is "Not assigned" and use the "Borough"
        if row_items[2] == "Not assigned":
            row_items[2] = row_items[1]

        # Write the (possibly corrected) fields, not the raw cleaned string,
        # so the neighbourhood fallback above actually reaches the file.
        clean_data.write(';'.join(row_items) + '\n')

In [118]:
'''
Load the cleaned, semicolon-separated table into a data frame and preview
the first rows
'''

df = pd.read_csv("clean_data.csv", delimiter=';')

df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [119]:
'''
Print the data frame dimensions
'''

# (number of rows, number of columns) of the frame read from clean_data.csv
df.shape

(103, 3)

In [120]:
'''
Install and import geocoder
'''

import sys
# Run pip through the interpreter that backs this kernel (sys.executable) so
# the package is installed into the environment the notebook imports from.
!{sys.executable} -m pip install geocoder

# NOTE(review): in the cells visible here geocoder is only referenced by the
# commented-out "Option 1" code - confirm whether it is still needed.
import geocoder



In [121]:
'''
Get coordinates - Option 1 (Note: I never got geocoder to work)
'''
# NOTE: everything below is intentionally disabled (see the note in the string
# above). If it is ever revived, be aware that get_coordinates() keeps retrying
# for as long as g.latlng is None, and that the output file is named
# "data_with_coordinates" without the .csv extension used by the later cells.

# def get_coordinates(postal_code):
#     '''
#     Ping the geocoder.google until it graces us with a response
#     '''
#     lat_lng_coords = None

#     while(lat_lng_coords is None):
#       g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#       lat_lng_coords = g.latlng

#     latitude = lat_lng_coords[0]
#     longitude = lat_lng_coords[1]
    
#     return lat_lng_coords

# with open("clean_data.csv") as data:
#     with open("data_with_coordinates", "w") as completed_data:
#         for line in data:
#             if line.startswith("Postal"):
#                 completed_data.write("%s;Latitude;Longitude\n" % line.strip())
#             else:
#                 postal_code = line.split(";")[0]
#                 coordinates = get_coordinates(postal_code)
#                 completed_line = "%s;%s;%s\n" % (line.strip(), coordinates[0], coordinates[1])
#                 completed_data.write(completed_line)
        


'\nGet coordinates - Option 1 (Note: I never got geocoder to work)\n'

In [122]:
'''
Get coordinates - Option 2
'''

!wget -O coordinates.csv https://cocl.us/Geospatial_data

coordinates_mapping = {}
with open("coordinates.csv") as coordinates:
    for line in coordinates:
        postal_code, latitude, longitude = line.split(',')
        coordinates_mapping[postal_code] = (latitude, longitude)

--2020-08-21 15:03:13--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.55.161.7
Connecting to cocl.us (cocl.us)|169.55.161.7|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-08-21 15:03:14--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.29.197
Connecting to ibm.box.com (ibm.box.com)|107.152.29.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-08-21 15:03:15--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-08-21 15:0

In [123]:
'''
Create updated .csv file: copy clean_data.csv and append the latitude and
longitude looked up for each postal code
'''

with open("clean_data.csv") as data, \
        open("data_with_coordinates.csv", "w") as completed_data:
    for record in data:
        stripped_record = record.strip()

        # Header row: only the two new column names are appended
        if record.startswith("Postal"):
            completed_data.write("%s;Latitude;Longitude\n" % stripped_record)
            continue

        # The postal code is the first field and the key of the mapping
        postal_code = record.split(";")[0]
        coordinate_pair = coordinates_mapping[postal_code]
        completed_data.write(
            "%s;%s;%s\n" % (stripped_record, coordinate_pair[0], coordinate_pair[1])
        )

In [124]:
'''
Inspect the updated .csv file
'''

df = pd.read_csv("data_with_coordinates.csv", delimiter=';')
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [125]:
'''
Get and import folium
'''

# Install through the interpreter that backs this kernel. `sys` comes from the
# geocoder install cell, so that cell has to be executed before this one.
!{sys.executable} -m pip install folium

import folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 7.2MB/s ta 0:00:011
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [130]:
'''
Render map of Toronto with one circle marker per row of the data frame,
labelled "<neighbourhood>, <borough>"
'''

# Toronto central coordinates
latitude, longitude = 43.704689, -79.403590

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Mark-up the Neighbourhoods
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']
for lat, lng, borough, neighborhood in zip(*(df[column] for column in marker_columns)):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='red',
        fill=True,
        fill_color='white',
        fill_opacity=1,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [142]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    # Warn instead of raising so the rest of the notebook can still be run;
    # the API calls themselves will fail until the variables are provided.
    warnings.warn(
        "Foursquare credentials are not set; define FOURSQUARE_CLIENT_ID and "
        "FOURSQUARE_CLIENT_SECRET in the environment before calling the API."
    )

VERSION = '20200201' # Foursquare API version
LIMIT = 100  # value sent as the `limit` query parameter of each request

base_url = "https://api.foursquare.com/v2"
venues_url = base_url + "/venues"

In [143]:
def get_nearby_venues(areas, latitudes, longitudes, radius=1000, timeout=30):
    '''
    Query the Foursquare "explore" endpoint around each area and collect the
    returned venues in one data frame.

    Parameters
    ----------
    areas, latitudes, longitudes : iterables
        Label and centre coordinates of each area; they are iterated in
        lockstep, so iteration stops at the shortest one.
    radius : int, default 1000
        Value sent as the `radius` query parameter (the progress message
        reports it in metres).
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the nine columns listed in `columns`
        below. The frame is empty, but still has those columns, when no venue
        was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers a request with an error status.
    '''
    columns = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue Name',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Distance to Neighbourhood Centre',
        'Venue Category',
        'Venue Category ID',
    ]

    rows = []
    for area, lat, lng in zip(areas, latitudes, longitudes):
        # Let requests build and escape the query string
        response = requests.get(
            venues_url + '/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # Fail with the HTTP error itself rather than a KeyError further down
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            location = venue['location']
            # A venue without any category gets None for both category fields
            categories = venue.get('categories') or [{}]
            primary_category = categories[0]
            rows.append((
                area,
                lat,
                lng,
                venue['name'],
                location['lat'],
                location['lng'],
                location['distance'],
                primary_category.get('name'),
                primary_category.get('id'),
            ))

        print(f"{len(results)} venues within {radius} m of {area} center")

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards would raise a length mismatch.
    return pd.DataFrame(rows, columns=columns)

In [149]:
# Fetch the venues around every postal code centre. The postal codes are
# passed as the area labels, so the 'Neighborhood' column of the result holds
# postal codes.
toronto_venues_by_postal_code = get_nearby_venues(
    areas=df["Postal Code"],
    latitudes=df["Latitude"],
    longitudes=df["Longitude"],
)
toronto_venues_by_postal_code.head()

28 venues within 1000 m of M3A center
13 venues within 1000 m of M4A center
100 venues within 1000 m of M5A center
46 venues within 1000 m of M6A center
100 venues within 1000 m of M7A center
12 venues within 1000 m of M9A center
17 venues within 1000 m of M1B center
29 venues within 1000 m of M3B center
18 venues within 1000 m of M4B center
100 venues within 1000 m of M5B center
31 venues within 1000 m of M6B center
15 venues within 1000 m of M9B center
5 venues within 1000 m of M1C center
45 venues within 1000 m of M3C center
27 venues within 1000 m of M4C center
100 venues within 1000 m of M5C center
30 venues within 1000 m of M6C center
18 venues within 1000 m of M9C center
23 venues within 1000 m of M1E center
78 venues within 1000 m of M4E center
100 venues within 1000 m of M5E center
20 venues within 1000 m of M6E center
9 venues within 1000 m of M1G center
58 venues within 1000 m of M4G center
100 venues within 1000 m of M5G center
100 venues within 1000 m of M6G center
30 venu

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Distance to Neighbourhood Centre,Venue Category,Venue Category ID
0,M3A,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,833,Caribbean Restaurant,4bf58dd8d48988d144941735
1,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,245,Park,4bf58dd8d48988d163941735
2,M3A,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,866,Café,4bf58dd8d48988d16d941735
3,M3A,43.753259,-79.329656,A&W,43.760643,-79.326865,852,Fast Food Restaurant,4bf58dd8d48988d16e941735
4,M3A,43.753259,-79.329656,Bruno's valu-mart,43.746143,-79.32463,889,Grocery Store,4bf58dd8d48988d118951735


In [171]:
# One-hot encode the venue category: one indicator column per category, named
# after the category itself (empty prefix and separator), one row per venue.
toronto_coded_venues_by_postal_code = pd.get_dummies(
    toronto_venues_by_postal_code.loc[:, ['Venue Category']],
    prefix="",
    prefix_sep="",
)
toronto_coded_venues_by_postal_code.head()

Unnamed: 0,ATM,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [198]:
'''
Perform the k-means clustering and add the cluster ids to the data frame
'''

# One-hot encoded venue categories, one row per venue
X = toronto_coded_venues_by_postal_code.values

# A fixed random_state makes the cluster ids (and therefore the map colours)
# identical on every run of the notebook.
k_means = KMeans(init = "k-means++", n_clusters = 5, n_init = 12, random_state = 0)
k_means.fit(X)

# NOTE(review): every row of X is a single venue, so the ids assigned below
# label individual venues rather than postal codes. If the goal is to cluster
# areas, the encoded rows would first have to be aggregated per 'Neighborhood'
# - confirm the intent before relying on the map.
clustered_toronto = toronto_venues_by_postal_code.copy()
clustered_toronto['Cluster ID'] = k_means.labels_
clustered_toronto.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Distance to Neighbourhood Centre,Venue Category,Venue Category ID,Cluster ID
0,M3A,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,833,Caribbean Restaurant,4bf58dd8d48988d144941735,0
1,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,245,Park,4bf58dd8d48988d163941735,0
2,M3A,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,866,Café,4bf58dd8d48988d16d941735,2
3,M3A,43.753259,-79.329656,A&W,43.760643,-79.326865,852,Fast Food Restaurant,4bf58dd8d48988d16e941735,3
4,M3A,43.753259,-79.329656,Bruno's valu-mart,43.746143,-79.32463,889,Grocery Store,4bf58dd8d48988d118951735,0


In [199]:
'''
Generate a map with the neighborhoods coloured by cluster id
'''

cluster_map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11, tiles='cartodbpositron')

# One colour per cluster id, looked up by index
colours = ['#594F4F', '#547980', '#45ADA8', '#9DE0AD', '#E5FCC2']

# Mark-up the Neighbourhoods coloured by clusters: one marker per row of
# clustered_toronto, placed at that row's neighbourhood coordinates
cluster_columns = [
    'Neighborhood Latitude',
    'Neighborhood Longitude',
    'Neighborhood',
    'Cluster ID',
]
for lat, lng, neighborhood, cluster_id in zip(*(clustered_toronto[column] for column in cluster_columns)):
    marker_colour = colours[cluster_id]
    popup = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=1,
        parse_html=False,
    )
    marker.add_to(cluster_map_toronto)

cluster_map_toronto