In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import requests

# Read the table from the Wikipedia site

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The result is indexable; the first parsed table is used later on.
# HTML parsing is delegated to the 'bs4' flavor.
data = pd.read_html(io=url, flavor='bs4')

# Define a function to arrange the dataset

In [3]:
def str_extract(x):
    """Split one scraped table cell into postal code, borough and neighborhood.

    The cell text is expected to look like ``'M3ANorth York(Parkwoods)'``:
    a three-character postal code, the borough name, and the neighborhoods
    in parentheses separated by `` /``.

    Parameters
    ----------
    x : str
        Raw text of a single table cell.

    Returns
    -------
    list
        ``[postal_code, borough, neighborhood]``. Borough and neighborhood
        are ``np.nan`` for cells containing ``'Not assigned'``; all three
        values are ``np.nan`` when ``x`` is not a string (e.g. an empty
        cell parsed as NaN).
    """
    # Empty cells come through as NaN (a float); `in` and slicing only make
    # sense on text, so report such cells as entirely missing.
    if not isinstance(x, str):
        return [np.nan, np.nan, np.nan]

    postal_code = x[:3]
    if 'Not assigned' in x:
        return [postal_code, np.nan, np.nan]

    bor_neig = x[3:].split('(')
    borough = bor_neig[0]
    # A cell without "(" has no neighborhood part; treat it like an empty one.
    neighbor = bor_neig[1] if len(bor_neig) > 1 else ''
    neighbor = neighbor.replace(')', '').replace(' /', ',')
    if neighbor == '':
        # No neighborhood listed: fall back to the borough name.
        neighbor = borough

    return [postal_code, borough, neighbor]

# Filter and create a structured dataset

In [4]:
# Parse every cell of the first scraped table, then flatten the grid into a
# single Series whose elements are [postal code, borough, neighborhood] lists.
data_stacked = data[0].applymap(str_extract).stack()
# One list per cell becomes one row of the structured table.
new_list = list(data_stacked)
dataset = pd.DataFrame(new_list, columns = ['PostalCode', 'Borough', 'Neighborhood'])

In [5]:
# Preview the first rows of the parsed postal-code table
dataset.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government
7,M8A,,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


In [6]:
# (rows, columns) of the parsed table
dataset.shape

(180, 3)

# Read the GeoSpatial Dataset

In [7]:
# CSV with one latitude/longitude pair per postal code
loc_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
# header=0 replaces the file's own header row with the names below; otherwise
# that row is parsed as data and the coordinate columns come back as strings.
location = pd.read_csv(loc_url, header = 0, names = ['PostalCode', 'Latitude', 'Longitude'])

# Execute an inner join of both datasets

In [8]:
# Attach coordinates to every postal code. The inner join keeps only postal
# codes present in both tables. Coordinate columns left by an earlier run of
# this cell are dropped first, so re-running it does not produce
# Latitude_x / Latitude_y duplicates.
dataset = (
    dataset
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(location, on='PostalCode', how='inner')
)
# Make sure the coordinates are numeric even if they were read as text
dataset['Latitude'] = pd.to_numeric(dataset['Latitude'])
dataset['Longitude'] = pd.to_numeric(dataset['Longitude'])
dataset.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don MillsNorth,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# Install libraries for next steps

In [9]:
!pip install geopy
from geopy.geocoders import Nominatim

!pip install folium
import folium

from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 5.6 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


# Find Latitude and Longitude for Toronto

In [10]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto")
# Stored under its own name: `location` already holds the postal-code
# coordinate table, and overwriting it would break a re-run of the merge cell.
toronto_location = geolocator.geocode(address)
# geocode() returns None when the service finds no match for the address
if toronto_location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = toronto_location.latitude
longitude = toronto_location.longitude

# Show a Toronto map with the Neighborhoods

In [11]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 12)

# One circle marker per row of `dataset`, with a "<neighborhood>, <borough>" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in dataset[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

# Function to get venues from Foursquare

In [12]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Any key pair that was previously committed in
# this file should be considered exposed and be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare API requests will fail.'
    )
VERSION = '20190425'  # sent as the `v` query parameter of each API request
LIMIT = 100  # sent as the `limit` query parameter of each API request

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences giving a label and the coordinates of each
        location to search around.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame is empty (but still has these columns) when no venue is
        returned. ``Venue Category`` is ``None`` for a venue without any
        category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout keeps one stalled call from
        # hanging the whole loop, and HTTP errors surface immediately
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        groups = response.json()["response"].get('groups') or []
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come back without any category
                categories[0]['name'] if categories else None))

    # Naming the columns in the constructor also works for zero rows, where
    # assigning `.columns` afterwards would raise a length mismatch.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

# Get venues and present the information
The boroughs were filtered to only those containing the word Toronto 

In [14]:
# Keep only boroughs whose name contains "Toronto". na=False counts a missing
# borough as "no match" instead of putting NaN into the boolean mask, which
# pandas refuses to index with.
dataset_filtered = dataset[dataset['Borough'].str.contains("Toronto", na=False)]

# Fetch the venues around each remaining neighborhood
toronto_venues = getNearbyVenues(dataset_filtered['Neighborhood'],
                                 dataset_filtered['Latitude'],
                                 dataset_filtered['Longitude'], radius = 500)

In [15]:
# Number of venues returned for each neighborhood
toronto_venues.groupby('Neighborhood')['Venue'].count()

Neighborhood
Berczy Park                                                                                                    58
Brockton, Parkdale Village, Exhibition Place                                                                   24
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport     18
Central Bay Street                                                                                             62
Christie                                                                                                       16
Church and Wellesley                                                                                           81
Commerce Court, Victoria Hotel                                                                                100
Davisville                                                                                                     35
Davisville North                                                           

In [16]:
# Preview the first venue records
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


# Encode the venues

In [17]:
# One indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called "Neighborhood" would collide with the
# identifier column added below, so rename it out of the way first.
if 'Neighborhood' in toronto_onehot.columns:
    toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# Put the neighborhood name in the first column; later cells treat column 0
# as the identifier and every other column as a category.
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

# Mean of the indicators = share of each category among a neighborhood's venues
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [18]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (callers pass rows whose first
    entry is the neighborhood name); the remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues``
    are returned as an array. Entries are taken purely by rank, whatever
    their value.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

# Define the most common venues for each neighborhood

In [19]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# Column headers: 'Neighborhood', '1st Most Common Venue', '2nd Most Common Venue', ...
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    last_digit = rank % 10
    # 11-13 take 'th'; otherwise the last digit picks st/nd/rd, else 'th'.
    # An explicit test replaces the previous bare `except`, which would have
    # hidden any unrelated error as well.
    if rank % 100 in (11, 12, 13) or not 1 <= last_digit <= len(indicators):
        suffix = 'th'
    else:
        suffix = indicators[last_digit - 1]
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Rank the categories of every neighborhood, then build the table in one go
# instead of filling it row by row.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[i, :], num_top_venues)
    for i in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Cocktail Bar,Coffee Shop,Bakery,Cheese Shop,Restaurant,Pharmacy,Pub,Seafood Restaurant,Farmers Market,Beer Bar
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Pet Store,Bakery,Office,Performing Arts Venue,Nightclub,Climbing Gym,Restaurant
2,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Coffee Shop,Bar,Plane,Rental Car Location,Sculpture Garden,Boutique,Boat or Ferry
3,Central Bay Street,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Bubble Tea Shop,Burger Joint,Restaurant,Japanese Restaurant,Salad Place,Portuguese Restaurant
4,Christie,Grocery Store,Café,Park,Nightclub,Italian Restaurant,Restaurant,Baby Store,Candy Store,Athletics & Sports,Coffee Shop


# Cluster the neighborhoods by venues
KMeans did not work well, so SpectralClustering was applied instead.

In [20]:
from sklearn.cluster import SpectralClustering

kclusters = 5
# Only the category-frequency columns are features; the neighborhood name is
# an identifier. `columns=` replaces the positional axis argument, which
# recent pandas versions reject. A fixed random_state makes the cluster
# assignment reproducible between runs.
# NOTE: the variable is still called `kmeans` although it holds a
# SpectralClustering model.
kmeans = SpectralClustering(n_clusters = kclusters, random_state = 0).fit(toronto_grouped.drop(columns='Neighborhood'))

# Remove labels left by an earlier run of this cell, then place the fresh
# ones in the first column. Re-running the cell therefore always yields the
# same column order.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


# Merge and present the results

In [21]:
# Attach the cluster label and top venues to every filtered neighborhood.
# A neighborhood for which no venue was returned has no cluster: the join
# leaves NaN there (turning the label column into floats), so such rows are
# dropped and the labels are cast back to integers.
toronto_merged = (
    dataset_filtered
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    .dropna(subset=['Cluster Labels'])
    .astype({'Cluster Labels': int})
)
toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Bakery,Park,Pub,Theater,Café,Breakfast Spot,Bank,Dessert Shop,Mexican Restaurant
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Cosmetics Shop,Japanese Restaurant,Middle Eastern Restaurant,Bubble Tea Shop,Café,Diner,Ramen Restaurant,Bookstore
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Cosmetics Shop,Restaurant,Cocktail Bar,Creperie,Art Gallery,Farmers Market,Beer Bar,Bakery
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,4,Pub,Health Food Store,Trail,Wine Shop,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Eastern European Restaurant
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Cocktail Bar,Coffee Shop,Bakery,Cheese Shop,Restaurant,Pharmacy,Pub,Seafood Restaurant,Farmers Market,Beer Bar
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Bubble Tea Shop,Burger Joint,Restaurant,Japanese Restaurant,Salad Place,Portuguese Restaurant
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564,1,Grocery Store,Café,Park,Nightclub,Italian Restaurant,Restaurant,Baby Store,Candy Store,Athletics & Sports,Coffee Shop
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,0,Coffee Shop,Café,Restaurant,Thai Restaurant,Clothing Store,Gym,Deli / Bodega,Steakhouse,Pizza Place,Concert Hall
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,1,Pharmacy,Bakery,Supermarket,Middle Eastern Restaurant,Music Venue,Park,Pet Store,Café,Brewery,Bar
35,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106,2,Park,Convenience Store,Wine Shop,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant


# Show a map of Toronto with the clustered neighborhoods

In [22]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start = 12)

# set color scheme for the clusters: one evenly spaced rainbow color each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'],
                                  toronto_merged['Longitude'],
                                  toronto_merged['Neighborhood'],
                                  toronto_merged['Cluster Labels']):
    # A row without a cluster label cannot be colored; skip it.
    if pd.isna(cluster):
        continue
    # Labels may arrive as floats; list indexing needs an int.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # Labels are 0-based, so index directly instead of using cluster-1,
        # which sent cluster 0 to the last color via negative indexing.
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters