In [2]:
# import all necessary libraries including BeautifulSoup and requests for 
# reading in html text from Wikipedia webpage and parsing the text for the 
# table of Toronto area postcodes, boroughs and neighborhoods

#!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup

#!conda install -c conda-forge lxml --yes
import lxml

!conda install -c conda-forge geocoder --yes
import geocoder 
from geopy.geocoders import Nominatim

import pandas as pd
import numpy as np

import requests

from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
import folium

Collecting package metadata: done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::anaconda==5.3.1=py37_0
  - defaults/linux-64::astropy==3.0.4=py37h14c3975_0
  - defaults/linux-64::bkcharts==0.2=py37_0
  - defaults/linux-64::blaze==0.11.3=py37_0
  - defaults/linux-64::bokeh==0.13.0=py37_0
  - defaults/linux-64::bottleneck==1.2.1=py37h035aef0_1
  - defaults/linux-64::dask==0.19.1=py37_0
  - defaults/linux-64::datashape==0.5.4=py37_1
  - defaults/linux-64::mkl-service==1.1.2=py37h90e4bf4_5
  - defaults/linux-64::numba==0.39.0=py37h04863e7_0
  - defaults/linux-64::numexpr==2.6.8=py37hd89afb7_0
  - defaults/linux-64::odo==0.5.1=py37_0
  - defaults/linux-64::pytables==3.4.4=py37ha205bf6_0
  - defaults/linux-64::pytest-arraydiff==0.2=py37h39e3cac_0
  - defaults/linux-64::pytest-astropy==0.4.0=py37_0
  - defaults/linux-64::pytest-doctestplus==0.1.3=py37_0
  - defaults

In [3]:
# get html object via request
wikipage = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

# parse html object using BeautifulSoup and lxml
soup = BeautifulSoup(wikipage,'lxml')

# identify portion of the html text containing the table of Toronto postcodes
table = soup.find('table')

# loop through the list of all table entries (identified via tag 'tr')
# and extract and format each row of the table, making sure to split
# each table row by carriage return '\n' and removing any whitespace
tablebody = []
for xx in table.find_all('tr'):
    tablebody.append(xx.text.split('\n')[1:-1])

# create pandas DataFrame to store table    
df = pd.DataFrame(tablebody[1:],columns=tablebody[0])

# only keep those rows that have a Borough identified by name
df = df.loc[df['Borough'] != 'Not assigned',:]

# replace the neighbourhood name with the borough name for those 
# neighborhoods with unassigned names
df.loc[df['Neighbourhood'] == 'Not assigned','Neighbourhood'] = df.loc[df['Neighbourhood'] == 'Not assigned','Borough']

# define function to concatenate a list of names into a list
def f2(x):
    return(list(x.unique()))

# group Dataframe by borough and apply function to Postcode and Neighborhood columns
df = df.groupby(['Postcode']).agg({'Borough':f2, 'Neighbourhood': f2}).reset_index()

print('Number of rows in dataframe (i.e. number of postcodes) ', df.shape[0])

Number of rows in dataframe (i.e. number of postcodes)  103


In [5]:
# import geocoder
lat_long_coords = None

# set up loop to find lat/long for every postcode in the dataframe
# then, add columns for "Latitude" and "Longitude" to the dataframe
for ind in df.index:
    while( lat_long_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(df.loc[ind,'Postcode']))
        lat_long_coords = g.latlng
    
    df.loc[ind,"Latitude"] = lat_long_coords[0]
    df.loc[ind,"Longitude"] = lat_long_coords[1]
    print(df.loc[ind,"Postcode"] + ' Toronto, Ontario has lat/long ' + str(lat_long_coords))

# NOTE: GETTING THE LAT LONG COORDINATES VIA GEOCODER WAS TAKING TOO LONG
# SO I IMPORTED THE LAT/LONG COORDINATES FOR THE VARIOUS POSTCODES FROM
# THE CSV FILE PROVIDED

KeyboardInterrupt: 

In [4]:
# import CSV file of lat/long coordinates into a second dataframe

df_latlong = pd.read_csv('Geospatial_Coordinates.csv')
df_latlong.rename(columns={'Postal Code':'Postcode'},inplace= True)

# merge two dataframes on postcode
df_toronto=df.join(df_latlong.set_index('Postcode'),on='Postcode')

In [5]:
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,[Scarborough],"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,[Scarborough],"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,[Scarborough],"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,[Scarborough],[Woburn],43.770992,-79.216917
4,M1H,[Scarborough],[Cedarbrae],43.773136,-79.239476


In [6]:
#define FourSquare credentials
CLIENT_ID = 'IIU1GYSNI4WI5H4R5CATIZFVN5KGOOZ241N5GICT2NMJKGGB' # your Foursquare ID
CLIENT_SECRET = 'AY1H3BLKCZ3W1W15FLUXWW0Z4ITU5QRYETGIUZX5W2Y3A21X' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: IIU1GYSNI4WI5H4R5CATIZFVN5KGOOZ241N5GICT2NMJKGGB
CLIENT_SECRET:AY1H3BLKCZ3W1W15FLUXWW0Z4ITU5QRYETGIUZX5W2Y3A21X


In [7]:
# create the API request URL
radius=500
LIMIT = 100

counter = 0
col=['Postcode',
                                          'Borough',
                                          'Neighbourhood',
                                          'Latitude',
                                          'Longitude',
                                          'Venue Name',
                                          'Venue Latitude',
                                          'Venue Longitude',
                                          'Venue Category']

df_toronto_venues = pd.DataFrame(columns=col)
for lat,lng, neigh,postcode,borough in zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Neighbourhood'],
    df_toronto['Postcode'],
    df_toronto['Borough']):
    
        # form FourSquare URL request
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # get URL content, formatted as a JSON object
        results = requests.get(url).json()

        # format results object to produce a dictionary of venues (and associated details)
        results_formatted = results['response']['groups'][0]['items']
        
        # extract venues including their name, category, lat, long
        #print('number of venues at {:.2f},{:.2f}'.format(lat,lng) + " in  " + str(neigh) + " is " + str(len(results_formatted)))
      
        venue_list = []
        for r in results_formatted:
            venue_list.append(
                [postcode,
                borough,
                neigh,
                lat,
                lng,
                r['venue']['name'],
                r['venue']['location']['lat'],
                r['venue']['location']['lng'],
                r['venue']['categories'][0]['name']])
        
        x = pd.DataFrame(venue_list,columns=col)
        df_toronto_venues = df_toronto_venues.append(x,ignore_index=True)



In [9]:
# one-hot encoding of toronto DataFrame
toronto_onehot = pd.get_dummies(df_toronto_venues['Venue Category'],prefix="",prefix_sep="")

# add back Postcode column to one-hot encoded Dataframe
toronto_onehot.insert(0,'Postcode',df_toronto_venues['Postcode'])

# now, group one-hot encoded toronto dataframe by neighborhood to compute frequency 
# of occurence of each venue category by neighborhood
df_toronto_venues_freq = toronto_onehot.groupby('Postcode').mean().reset_index()


In [10]:
# Now sort each row of dataframe with frequencies of occurrences of Venue Categories
# to establish top num_top venues in each Postcode

num_top = 5
#create list of column names for num_top most popular venue categories
col = []
for k in np.arange(num_top):
    if k == 0:
        col.append(str(k+1) + "st")
    elif k == 1:
        col.append(str(k+1) + "nd")
    elif k == 2:
        col.append(str(k+1) + "rd")
    else:
        col.append(str(k+1) + "th")

col.insert(0,'Postcode')

# create new dataframe with top num_top most popular venue categories
df_toronto_venues_sorted = pd.DataFrame(columns=col)
for ind in np.arange(df_toronto_venues_freq.shape[0]):
    t = df_toronto_venues_freq.iloc[ind,1:].astype(float)
    t = t.sort_values(ascending=False) # sort based on frequency of occurence
    t = list(t.index.values)[0:num_top] # get venue categories of sorted list
    t.insert(0,df_toronto_venues_freq.loc[ind,'Postcode'])

    df_toronto_venues_sorted.loc[ind] = t

df_toronto_venues_sorted.head()

Unnamed: 0,Postcode,1st,2nd,3rd,4th,5th
0,M1B,Fast Food Restaurant,Print Shop,Dessert Shop,Diner,Discount Store
1,M1C,Construction & Landscaping,Bar,Yoga Studio,Eastern European Restaurant,Dog Run
2,M1E,Intersection,Spa,Electronics Store,Pizza Place,Breakfast Spot
3,M1G,Coffee Shop,Convenience Store,Korean Restaurant,Yoga Studio,Eastern European Restaurant
4,M1H,Hakka Restaurant,Bakery,Lounge,Caribbean Restaurant,Athletics & Sports


In [12]:
# set up kmeans algorithm with 5 clusters
# Go ahead and cluster the original, unsorted dataframe with frequencies of occurences
# of different venue categories
kclusters = 5

# drop all non-numeric columns from input dataframe to kmeans clustering
df_toronto_clustering = df_toronto_venues_freq.drop('Postcode',axis=1)

# run kmeans
k_means = KMeans(n_clusters=kclusters,random_state=1).fit(df_toronto_clustering)

# extract result of kmeans clustering
cluster_labels = k_means.labels_
cluster_centroids = k_means.cluster_centers_

# update toronto data with cluster labels
df_toronto_venues_freq.insert(0,'Cluster labels',cluster_labels)

In [11]:
#append neighbourhood, borough and lat/long information to sorted dataframe
df_toronto_venues_sorted.insert(1,'Borough',df_toronto['Borough'])
df_toronto_venues_sorted.insert(2,'Neighbourhood',df_toronto['Neighbourhood'])
df_toronto_venues_sorted.insert(3,'Latitude',df_toronto['Latitude'])
df_toronto_venues_sorted.insert(4,'Longitude',df_toronto['Longitude'])

df_toronto_venues_sorted.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,1st,2nd,3rd,4th,5th
0,M1B,[Scarborough],"[Rouge, Malvern]",43.806686,-79.194353,Fast Food Restaurant,Print Shop,Dessert Shop,Diner,Discount Store
1,M1C,[Scarborough],"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497,Construction & Landscaping,Bar,Yoga Studio,Eastern European Restaurant,Dog Run
2,M1E,[Scarborough],"[Guildwood, Morningside, West Hill]",43.763573,-79.188711,Intersection,Spa,Electronics Store,Pizza Place,Breakfast Spot
3,M1G,[Scarborough],[Woburn],43.770992,-79.216917,Coffee Shop,Convenience Store,Korean Restaurant,Yoga Studio,Eastern European Restaurant
4,M1H,[Scarborough],[Cedarbrae],43.773136,-79.239476,Hakka Restaurant,Bakery,Lounge,Caribbean Restaurant,Athletics & Sports


In [13]:
# now, insert cluster labels into sorted venue categories
df_toronto_venues_sorted.insert(5,"Cluster labels",cluster_labels)
df_toronto_venues_sorted.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster labels,1st,2nd,3rd,4th,5th
0,M1B,[Scarborough],"[Rouge, Malvern]",43.806686,-79.194353,0,Fast Food Restaurant,Print Shop,Dessert Shop,Diner,Discount Store
1,M1C,[Scarborough],"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497,0,Construction & Landscaping,Bar,Yoga Studio,Eastern European Restaurant,Dog Run
2,M1E,[Scarborough],"[Guildwood, Morningside, West Hill]",43.763573,-79.188711,4,Intersection,Spa,Electronics Store,Pizza Place,Breakfast Spot
3,M1G,[Scarborough],[Woburn],43.770992,-79.216917,0,Coffee Shop,Convenience Store,Korean Restaurant,Yoga Studio,Eastern European Restaurant
4,M1H,[Scarborough],[Cedarbrae],43.773136,-79.239476,0,Hakka Restaurant,Bakery,Lounge,Caribbean Restaurant,Athletics & Sports


In [22]:
# understand cluster_centroids. First sort the cluster centroids in descending order of frequency
# to understand the top num_top popular venue categories in the cluster centroids


df_cluster_centroids = pd.DataFrame(cluster_centroids,columns=df_toronto_clustering.columns)
df_cluster_centroids_sorted = pd.DataFrame(columns=col[1:])

for ind in np.arange(kclusters):
    c = df_cluster_centroids.loc[ind,:].sort_values(ascending=False)
    df_cluster_centroids_sorted.loc[ind,:] = c.index.values[0:num_top]

df_cluster_centroids_sorted.head()

Unnamed: 0,1st,2nd,3rd,4th,5th
0,Coffee Shop,Park,Café,Bank,Bar
1,Cafeteria,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant
2,Playground,Park,Tennis Court,Asian Restaurant,Trail
3,Baseball Field,Yoga Studio,Dog Run,Doner Restaurant,Donut Shop
4,Pizza Place,Pharmacy,Sandwich Place,Coffee Shop,Grocery Store


In [23]:
# get latitude and longitude of Toronto for plotting
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent='Toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('the latitude/longitude of Toronto, Ontario is {}'.format([latitude,longitude]))

the latitude/longitude of Toronto, Ontario is [43.653963, -79.387207]


In [28]:
# create folium map of Toronto
toronto_map = folium.Map(location=[latitude,longitude],zoom_start=10)

#set color scheme for clusters
x = np.arange(kclusters)
colors_array = cm.rainbow(np.linspace(0,1,kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# plot neighborhoods color coded by clusters
for ind,(lat,lng,cluster,neigh,borough,postcode) in enumerate(zip(
    df_toronto_venues_sorted['Latitude'],
    df_toronto_venues_sorted['Longitude'],
    df_toronto_venues_sorted['Cluster labels'],
    df_toronto_venues_sorted['Neighbourhood'],
    df_toronto_venues_sorted['Borough'],
    df_toronto_venues_sorted['Postcode'])):
        
        top_venues = str(df_toronto_venues_sorted.iloc[ind,6:].to_list())
        top_cluster_venues = str(df_cluster_centroids_sorted.loc[cluster,:].to_list())
        label = folium.Popup('Postcode: ' + str(postcode) + 
                             ' contains neighborhoods: ' + str(neigh) + 
                             "with top 5 venue categories " + top_venues + 
                             "in cluster " + str(cluster) + 
                             "with top 5 cluster venues " + top_cluster_venues,parse_html=True)
        folium.CircleMarker(
            [lat,lng],
            radius=5,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color = rainbow[cluster-1],
            fill_opacity=0.7).add_to(toronto_map)

    
toronto_map