In [2]:
import pandas as pd
import numpy as np

Part 1

In [2]:
# Webpage url                                                                                                               
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Extract tables
dfs = pd.read_html(url)

df = dfs[0]
#Drop rows without a Borough
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [3]:
df.shape

(103, 3)

Part 2

In [4]:
df2 = pd.read_csv('Geospatial_Coordinates.csv')
latitude = []
longitude = []
for i in df['Postal Code'].values.tolist():
    latitude.append(df2[df2['Postal Code'] == i]['Latitude'].values[0])
    longitude.append(df2[df2['Postal Code'] == i]['Longitude'].values[0])


In [5]:
#Add to DataFrame
df['Latitude'] = latitude
df['Longitude'] = longitude

In [6]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Part 3

In [7]:
from geopy.geocoders import Nominatim
import folium
import requests

In [8]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [9]:
#Just to visualize where each of the Postal Zones were located
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, postal, neighborhood in zip(df['Latitude'], df['Longitude'], df['Postal Code'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, postal)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [10]:
#Make dataframe with only postal codes in the borough of downtown toronto
df_toronto = df[df['Borough'] == 'Downtown Toronto']
df_north = df[df['Borough'] == 'North York']

In [11]:

CLIENT_ID = 'BNJ5AYFFOMQSMI3BNK2CIDFLEI225NQL5FZRUSSQOK0OZWQM' # your Foursquare ID
CLIENT_SECRET = 'N3PUBTON3W0WWDGST5W3OYYGYNRXCGLDYSWLGA4BWKQ32VA5' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius = 1000
limit = 100

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]["groups"][0]['items']
        except:
            print(requests.get(url).json()["response"])
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Postal Code Latitude', 
                  'Postal Code Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
#Get all venues near said postal codes
def return_most_common_venues(row,num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]
   
def prep_cluster(df,num_top_venues):
    df_venues = getNearbyVenues(names=df['Postal Code'],latitudes=df['Latitude'],longitudes=df['Longitude'])
    df_onehot = pd.get_dummies(df_venues[['Venue Category']], prefix="", prefix_sep="")

    # add neighborhood column back to dataframe
    df_onehot['Postal Code'] = df_venues['Postal Code'] 

    # move neighborhood column to the first column
    fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
    df_onehot = df_onehot[fixed_columns]
    df_grouped = df_onehot.groupby('Postal Code').mean().reset_index()
    indicators = ['st', 'nd', 'rd']

    # create columns according to number of top venues
    columns = ['Postal Code']
    for ind in np.arange(num_top_venues):
        try:
            columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
        except:
            columns.append('{}th Most Common Venue'.format(ind+1))

    # create a new dataframe to sort most common venues
    df_venues_sorted = pd.DataFrame(columns=columns)
    df_venues_sorted['Postal Code'] = df_grouped['Postal Code']

    for ind in np.arange(df_grouped.shape[0]):
        df_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_grouped.iloc[ind, :], num_top_venues)
    df_venues_sorted['Cluster Labels'] = ""
    df_clustering = df_grouped.drop('Postal Code', 1)
    return df_clustering,df_venues_sorted

def cluster(df,df_clustering,kclusters,df_venues_sorted):
    # run k-means clustering
    kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_clustering)
    df_venues_sorted['Cluster Labels'] = kmeans.labels_

    df_merged = df

    # merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
    df_merged = df_merged.join(df_venues_sorted.set_index('Postal Code'), on='Postal Code')

    return df_merged

In [14]:
north_clustering,north_sorted = prep_cluster(df_north,10)
downtown_toronto_clustering,downtown_toronto_sorted = prep_cluster(df_toronto,10)

In [15]:
north_clustering.head()

Unnamed: 0,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Baby Store,Bagel Shop,Bakery,...,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.022727,0.0,0.0,0.022727,0.0,0.0,0.0,0.045455,...,0.022727,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
downtown_toronto_clustering.head()

Unnamed: 0,Airport,Airport Lounge,American Restaurant,Animal Shelter,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Trail,Train Station,University,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.01,0.02
3,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
4,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01


In [17]:
final_clustering = pd.concat([downtown_toronto_clustering,north_clustering], ignore_index=True)
final_clustering.fillna(0,inplace = True)
final_clustering.head()

Unnamed: 0,Airport,Airport Lounge,American Restaurant,Animal Shelter,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Ski Chalet,Snack Place,Soccer Field,Sports Club,Storage Facility,Tennis Court,Toy / Game Store,Turkish Restaurant,Video Game Store,Women's Store
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
final_sorted = pd.concat([downtown_toronto_sorted,north_sorted], ignore_index=True)
final_sorted

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
0,M4W,Coffee Shop,Grocery Store,Park,Filipino Restaurant,BBQ Joint,Sandwich Place,Office,Japanese Restaurant,Bank,Breakfast Spot,
1,M4X,Gastropub,Park,Café,Diner,Japanese Restaurant,Restaurant,Jewelry Store,Sushi Restaurant,Italian Restaurant,Steakhouse,
2,M4Y,Coffee Shop,Japanese Restaurant,Restaurant,Burger Joint,Sushi Restaurant,Park,Diner,Men's Store,Dance Studio,Italian Restaurant,
3,M5A,Coffee Shop,Café,Pub,Park,Diner,Theater,Breakfast Spot,Restaurant,Bakery,Italian Restaurant,
4,M5B,Coffee Shop,Gastropub,Japanese Restaurant,Theater,Café,Hotel,Ramen Restaurant,Cosmetics Shop,Diner,Seafood Restaurant,
5,M5C,Coffee Shop,Café,Restaurant,Gastropub,Italian Restaurant,Theater,Creperie,Plaza,Bookstore,Concert Hall,
6,M5E,Coffee Shop,Café,Hotel,Restaurant,Japanese Restaurant,Park,Grocery Store,Gastropub,Gym,Creperie,
7,M5G,Coffee Shop,Café,Park,Clothing Store,Japanese Restaurant,Art Gallery,Sushi Restaurant,Bubble Tea Shop,Burger Joint,Burrito Place,
8,M5H,Coffee Shop,Café,Hotel,Theater,Tea Room,Japanese Restaurant,Furniture / Home Store,Plaza,Pizza Place,Beer Bar,
9,M5J,Coffee Shop,Café,Hotel,Park,Japanese Restaurant,Brewery,Gym,Theater,Scenic Lookout,Plaza,


In [19]:
df_full = pd.concat([df_toronto,df_north], ignore_index=True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [21]:
from sklearn.cluster import KMeans
downtown_toronto_merged = cluster(df_full,final_clustering,5,final_sorted)

In [23]:
import matplotlib.cm as cm
import matplotlib.colors as colors
kclusters = 5
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_toronto_merged['Latitude'], downtown_toronto_merged['Longitude'], downtown_toronto_merged['Postal Code'], downtown_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In Part 3 I grouped the postal codes in the two boroughs, DownTown Toronoto and North York and looked for postal codes with similar venues between the two boroughs