In [1]:
import pandas as pd
import numpy as np 

In [2]:
#Wikipedia page listing the Canadian postal codes that begin with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#pd.read_html returns a list of DataFrames, one per HTML table it finds on the page;
#the postal-code table is taken from position 0 of that list.
canada_postal = pd.read_html(url)
df = canada_postal[0]

#Keep only the rows whose Borough differs from the placeholder "Not assigned"
df2 = df.loc[df['Borough'].ne('Not assigned')]
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [3]:
#Sanity check on the filtered table: .shape gives its (rows, columns) tuple.
#When this notebook was last run the result was (103, 3), i.e. 103 rows & 3 columns.
df2.shape

(103, 3)

In [4]:
#Load the latitude/longitude table from the .csv file and key it by postal code
df_geo = pd.read_csv('http://cocl.us/Geospatial_data').set_index('Postal Code')

#Left-join the coordinates onto df2 by matching df2's 'Postal Code' column against df_geo's index,
#which adds the Latitude and Longitude columns for each postal code
df_toronto_final = df2.merge(df_geo, how='left', left_on='Postal Code', right_index=True)
df_toronto_final

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
165,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [5]:
#The following code is used to import necessary mdoules for plotting/creating a folium map of Toronto:
import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes
import folium

#Next, restricting the boroughs in the table down to only those containing the name "Toronto" 
df_toronto_abbrev = df_toronto_final[df_toronto_final['Borough'].str.contains("Toronto")]
df_toronto_abbrev

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |      conda_forge           3 KB  conda-forge
    _openmp_mutex-4.5          |           1_llvm           5 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    _pytorch_select-0.2        |            gpu_0           2 KB
    absl-py-0.11.0             |   py37h89c1867_0         168 KB  conda-forge
    aiohttp-3.7.3              |   py37h5e8e339_2  

libuuid-2.32.1       | 28 KB     | ##################################### | 100% 
pyasn1-0.4.8         | 53 KB     | ##################################### | 100% 
jsonschema-3.2.0     | 45 KB     | ##################################### | 100% 
pcre-8.44            | 261 KB    | ##################################### | 100% 
cached-property-1.5. | 10 KB     | ##################################### | 100% 
snowballstemmer-2.1. | 57 KB     | ##################################### | 100% 
lxml-4.6.2           | 1.5 MB    | ##################################### | 100% 
ibm-wsrt-py37main-ma | 2 KB      | ##################################### | 100% 
ca-certificates-2020 | 137 KB    | ##################################### | 100% 
cffi-1.14.5          | 225 KB    | ##################################### | 100% 
six-1.15.0           | 14 KB     | ##################################### | 100% 
dill-0.3.3           | 60 KB     | ##################################### | 100% 
brotli-1.0.9         | 389 K

libevent-2.1.10      | 1.1 MB    | ##################################### | 100% 
libstdcxx-ng-9.3.0   | 4.0 MB    | ##################################### | 100% 
pyjwt-2.0.1          | 17 KB     | ##################################### | 100% 
wheel-0.36.2         | 31 KB     | ##################################### | 100% 
jedi-0.18.0          | 923 KB    | ##################################### | 100% 
typing-extensions-3. | 8 KB      | ##################################### | 100% 
markdown-3.3.3       | 66 KB     | ##################################### | 100% 
plotly-4.14.3        | 5.9 MB    | ##################################### | 100% 
json5-0.9.5          | 20 KB     | ##################################### | 100% 
folium-0.5.0         | 45 KB     | ##################################### | 100% 
ipywidgets-7.6.3     | 101 KB    | ##################################### | 100% 
typing_extensions-3. | 25 KB     | ##################################### | 100% 
libprotobuf-3.15.2   | 2.5 M

seaborn-base-0.11.1  | 217 KB    | ##################################### | 100% 
pyzmq-22.0.3         | 526 KB    | ##################################### | 100% 
toml-0.10.2          | 18 KB     | ##################################### | 100% 
iniconfig-1.1.1      | 8 KB      | ##################################### | 100% 
h5py-3.1.0           | 1.2 MB    | ##################################### | 100% 
ninja-1.10.2         | 2.4 MB    | ##################################### | 100% 
ipython_genutils-0.2 | 21 KB     | ##################################### | 100% 
pthread-stubs-0.4    | 5 KB      | ##################################### | 100% 
llvm-openmp-11.0.1   | 4.7 MB    | ##################################### | 100% 
openjpeg-2.4.0       | 525 KB    | ##################################### | 100% 
secretstorage-3.3.1  | 24 KB     | ##################################### | 100% 
jupyterlab_widgets-1 | 130 KB    | ##################################### | 100% 
keras-preprocessing- | 34 KB

done


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
22,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
30,M4E,East Toronto,The Beaches,43.676357,-79.293031
31,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
40,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
41,M6G,Downtown Toronto,Christie,43.669542,-79.422564
49,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
50,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [6]:
#Base folium map centered on Toronto; one blue circle marker is added for every row of df_toronto_abbrev
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start = 10)

marker_rows = zip(
    df_toronto_abbrev['Latitude'],
    df_toronto_abbrev['Longitude'],
    df_toronto_abbrev['Neighbourhood'],
)
for lat, lng, hood_name in marker_rows:
    #Popup carrying the neighbourhood name
    popup = folium.Popup(hood_name, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

In [7]:
#Next, loading my Foursquare credentials and the API constants used by the getNearbyVenues function

import json
import os
import requests
from getpass import getpass
from pandas.io.json import json_normalize

#Credentials are never stored in the notebook: each one is read from an environment variable and,
#when that variable is unset or empty, requested interactively (getpass does not echo the input).
#NOTE(review): the key pair that used to be hardcoded here has been exposed and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare CLIENT_ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare CLIENT_SECRET: ')
VERSION = '20180604' #sent to Foursquare as the `v` query parameter
LIMIT = 100 #sent to Foursquare as the `limit` query parameter

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare venues/explore endpoint around each location and tabulate the results.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label, latitude and longitude of each location; they are walked in parallel with zip().
    radius : int, default 500
        Sent as the ``radius`` query parameter (presumably metres -- confirm against the
        Foursquare documentation).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. When no venue is returned at all the frame is empty but still
        has these seven columns. 'Venue Category' is missing (None) for a venue that
        comes back without any category.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        #API request to Foursquare; `params` lets requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30)  #avoid hanging forever on a stalled connection
        response.raise_for_status()

        #A reply without groups/items is treated as "no venues here" instead of raising
        groups = response.json()['response'].get('groups') or [{}]
        for item in groups[0].get('items', []):
            venue = item['venue']
            #Some venues come back with an empty category list
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    #Passing `columns` here (rather than assigning them afterwards) also works for zero rows
    return pd.DataFrame(venue_rows, columns=columns)

#Collecting the venues around every neighbourhood of df_toronto_abbrev, then checking the size
#of the resulting table and displaying its first 5 rows:
toronto_venues = getNearbyVenues(
    names=df_toronto_abbrev['Neighbourhood'],
    latitudes=df_toronto_abbrev['Latitude'],
    longitudes=df_toronto_abbrev['Longitude'],
)

print(toronto_venues.shape)
toronto_venues.head(5)

(1607, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [8]:
#Using one-hot coding to effectively demonstrate the distribution of venues per neighborhood (and displayed below):

#One indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

#A venue category can itself be called "Neighborhood". Left alone, the assignment below would
#overwrite that indicator column instead of appending a new one, and the neighbourhood names
#would no longer be the last column. The category is therefore renamed first (a no-op when
#no such category exists).
toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

#Move the neighbourhood name to the front, selecting it by name rather than by position
fixed_columns = ['Neighborhood'] + [col for col in toronto_onehot.columns if col != 'Neighborhood']
toronto_onehot = toronto_onehot[fixed_columns]

#Mean of each indicator per neighbourhood = share of that category among the neighbourhood's venues
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.066667,0.066667,0.133333,0.2,0.133333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.015873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.015873,0.0,0.0,0.015873
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.025316,0.012658,0.0,0.0,0.0,0.0,0.0,0.012658,0.0,...,0.012658,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Next, printing the 5 most frequent venue categories of each neighborhood as a final check before clustering:

num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    #Turn this neighbourhood's row into a two-column (venue, freq) table
    hood_row = toronto_grouped[toronto_grouped['Neighborhood'] == hood]
    freq_table = hood_row.T.reset_index()
    freq_table.columns = ['venue','freq']
    #The first entry holds the neighbourhood name, not a frequency, so it is dropped
    freq_table = freq_table.iloc[1:].astype({'freq': float}).round({'freq': 2})
    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.05
2              Bakery  0.05
3  Seafood Restaurant  0.04
4         Cheese Shop  0.04


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0            Café  0.12
1  Breakfast Spot  0.08
2       Nightclub  0.08
3     Coffee Shop  0.08
4          Bakery  0.08


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                  venue  freq
0    Light Rail Station  0.12
1           Yoga Studio  0.06
2         Auto Workshop  0.06
3  Fast Food Restaurant  0.06
4            Skate Park  0.06


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                 venue  freq
0      Airport Service  0.20
1       Airport Lounge  0.13
2     Airport Terminal  0.13
3                Plane  0.07
4  Rental Car Location  0.07


----Central Bay Street----
                ve

In [10]:
#Helper used to rank the venue categories of one neighborhood
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest values in `row`, ignoring its first entry.

    The first element of `row` is skipped, the remaining values are sorted in
    descending order, and the labels of the first `num_top_venues` of them are
    returned as an array.
    """
    frequencies = row.iloc[1:]
    ranked_labels = frequencies.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

#Subsequently, setting up a separate df to display the top 10 venues/neighborhood
num_top_venues = 10

def _ordinal(position):
    """Return `position` followed by its English ordinal suffix (1 -> '1st', 11 -> '11th', 22 -> '22nd')."""
    if 10 <= position % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return '{}{}'.format(position, suffix)

#Columns according to n-th top venues (an explicit suffix rule replaces the former bare `except`,
#which would have hidden any unrelated error raised while building the names):
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank)) for rank in range(1, num_top_venues + 1)
]

#New df arranging by most common venues per neighborhood, built in one pass:
#one record per row of toronto_grouped = neighbourhood name followed by its ranked categories
toronto_neighborhoods_venues_sorted = pd.DataFrame(
    [
        [row['Neighborhood']] + list(return_most_common_venues(row, num_top_venues))
        for _, row in toronto_grouped.iterrows()
    ],
    columns=columns,
)

toronto_neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Farmers Market,Cheese Shop,Pharmacy,Beer Bar,Restaurant,Seafood Restaurant,Café
1,"Brockton, Parkdale Village, Exhibition Place",Café,Bakery,Breakfast Spot,Nightclub,Coffee Shop,Grocery Store,Gym,Pet Store,Performing Arts Venue,Italian Restaurant
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Auto Workshop,Park,Comic Shop,Pizza Place,Restaurant,Burrito Place,Brewery,Skate Park,Smoke Shop
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Boat or Ferry,Sculpture Garden,Rental Car Location,Plane,Coffee Shop,Harbor / Marina,Airport Food Court
4,Central Bay Street,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Burger Joint,Thai Restaurant,Bubble Tea Shop,Salad Place,Portuguese Restaurant,Ramen Restaurant


In [11]:
#Setting up to divide the neighborhoods into 5 clusters:
kclusters = 5
#Feature matrix = every column of toronto_grouped except the neighbourhood name
#(keyword form: the positional axis argument of DataFrame.drop is no longer accepted by recent pandas)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

#Implementing K-means clustering fit:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

#Adding Clustering Labels. Labels left over from a previous run of this cell are dropped first,
#because DataFrame.insert raises ValueError when the column already exists.
toronto_neighborhoods_venues_sorted = toronto_neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
toronto_neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#Merging both df's to add latitude/longitude for each neighborhood -- showing the header for successful confirmation
toronto_merged = df_toronto_abbrev.join(
    toronto_neighborhoods_venues_sorted.set_index('Neighborhood'),
    on='Neighbourhood')
toronto_merged.head()

AttributeError: module 'numpy.linalg.lapack_lite' has no attribute '_ilp64'

In [None]:
#Initiating a folium map in order to add the final clustering results, based on top venues
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

#Cluster color scheme: one evenly spaced colour of matplotlib's rainbow colormap per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

#Adding one colored marker per neighbourhood to the map, labelled with its cluster number
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    #A neighbourhood with no match in the join has a missing label, which also turns the whole
    #column into floats; such rows are skipped and the label is cast back to int so it can index the list
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    #cluster-1 is kept from the original colour mapping: cluster 0 wraps around to the last colour
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
#Looking at the map, the following most common venues define the clustering #'s:

#Label 0 = Coffee Shop/Café
#Rows: entries whose 'Cluster Labels' equals 0. Columns: the one at position 1 plus all from position 5 onward.
in_cluster = toronto_merged['Cluster Labels'] == 0
kept_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged.loc[in_cluster, toronto_merged.columns[kept_positions]]

In [None]:
#Label 1 = Trails
#Rows: entries whose 'Cluster Labels' equals 1. Columns: the one at position 1 plus all from position 5 onward.
in_cluster = toronto_merged['Cluster Labels'] == 1
kept_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged.loc[in_cluster, toronto_merged.columns[kept_positions]]

In [None]:
#Label 2 = Parks
#Rows: entries whose 'Cluster Labels' equals 2. Columns: the one at position 1 plus all from position 5 onward.
in_cluster = toronto_merged['Cluster Labels'] == 2
kept_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged.loc[in_cluster, toronto_merged.columns[kept_positions]]

In [None]:
#Label 3 = Playgrounds
#Rows: entries whose 'Cluster Labels' equals 3. Columns: the one at position 1 plus all from position 5 onward.
in_cluster = toronto_merged['Cluster Labels'] == 3
kept_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged.loc[in_cluster, toronto_merged.columns[kept_positions]]

In [None]:
#Label 4 = Fast Food Restaurants
#Rows: entries whose 'Cluster Labels' equals 4. Columns: the one at position 1 plus all from position 5 onward.
in_cluster = toronto_merged['Cluster Labels'] == 4
kept_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged.loc[in_cluster, toronto_merged.columns[kept_positions]]