# Scraping Toronto Postal Codes, Borough, and Neighborhood

In [1]:
# import libraries for scraping
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

In [2]:
# request html contained in Wiki page
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# the context manager closes the HTTP response once the body has been read
with urllib.request.urlopen(url) as response:
    html = response.read()
# parse the raw bytes with the parser bundled with the standard library
soup = BeautifulSoup(html, 'html.parser')

In [3]:
# find all tables in scraped data
# NOTE(review): all_tables is not referenced again in the visible cells
all_tables = soup.find_all('table')
# pick the table whose class attribute is 'wikitable sortable' (the postal code listing)
table = soup.find('table', class_='wikitable sortable')
# display the raw HTML of the selected table for inspection
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park, Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor, Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park, Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern, Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3B
</td>
<td>North York
</td>
<td>Don Mills
</td></tr>
<tr>
<td>M4B
</td>
<td>East Y

In [6]:
# extract useful info: one list per output column
post_code = []
bor = []
neigh = []

# Only rows holding exactly three <td> cells are data rows; the header row is
# built from <th> cells, so it has no <td> and is skipped by the length check.
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == 3:
        # get_text() concatenates every text node inside the cell, so a cell that
        # wraps its content in links is captured in full and an empty cell yields
        # '' rather than None. Whitespace is left untouched here.
        code, borough, neighborhood = (cell.get_text() for cell in cells)
        post_code.append(code)
        bor.append(borough)
        neigh.append(neighborhood)

In [45]:
import pandas as pd

# assemble the scraped columns into a dataframe and trim the whitespace
# (e.g. trailing newlines) around every value
d = {'PostalCode': post_code, 'Borough': bor, 'Neighborhood': neigh}
df = pd.DataFrame(d).apply(lambda column: column.str.strip())
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [46]:
# summarize dtypes and non-null counts of the scraped table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
PostalCode      180 non-null object
Borough         180 non-null object
Neighborhood    180 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


In [47]:
# count rows whose Borough is 'Not assigned' and rows whose Neighborhood is empty
bor_NA = int((df['Borough'] == 'Not assigned').sum())
neig_NA = int((df['Neighborhood'] == '').sum())
print('Number of Not assigned Borough: {}, \nNumber of empty entries for Neighborhood: {}.'.format(bor_NA, neig_NA))

Number of Not assigned Borough: 77, 
Number of empty entries for Neighborhood: 77.


In [None]:
# remove 'Not assigned' Borough

In [50]:
# keep only rows with an assigned Borough; chaining reset_index returns a fresh,
# renumbered frame instead of mutating a filtered slice in place
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [51]:
# summarize the dataframe again after removing 'Not assigned' boroughs
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode      103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


In [52]:
# confirm there are no empty Neighborhoods left after the filtering
len(df[df['Neighborhood'] == ''])

0

In [53]:
# check shape (rows, columns) of final dataframe
df.shape

(103, 3)

In [54]:
# save file in csv format (index=False leaves the row index out of the file)
# NOTE: the same 'toronto.csv' path is written again after coordinates are added
df.to_csv('toronto.csv', index=False)

# Adding Latitude and Longitude to each Postal Code

In [55]:
#!pip install geocoder

In [56]:
# retrieve latitude and longitude of each postal code with geocoder

import time

import geocoder

MAX_GEOCODE_ATTEMPTS = 5    # upper bound on lookups per postal code
GEOCODE_RETRY_DELAY = 1.0   # seconds to wait between failed lookups

# start with empty coordinate columns; they are filled in per postal code below
df['Latitude'] = None
df['Longitude'] = None

for postal_code in df['PostalCode'].unique():
    lat_lng_coords = None
    # Retry a bounded number of times: an unbounded loop would hang the notebook
    # forever if the service kept returning no result for a postal code.
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        # treat both None and an empty result as "no coordinates yet"
        if lat_lng_coords:
            break
        if attempt < MAX_GEOCODE_ATTEMPTS - 1:
            time.sleep(GEOCODE_RETRY_DELAY)

    if not lat_lng_coords:
        # leave the coordinates as None so the failure stays visible in the data
        print('No coordinates found for {} after {} attempts.'.format(postal_code, MAX_GEOCODE_ATTEMPTS))
        continue

    # write the coordinates to every row carrying this postal code
    matching_rows = df['PostalCode'] == postal_code
    df.loc[matching_rows, 'Latitude'] = lat_lng_coords[0]
    df.loc[matching_rows, 'Longitude'] = lat_lng_coords[1]

In [57]:
# preview the first rows, now including the Latitude and Longitude columns
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7529,-79.3356
1,M4A,North York,Victoria Village,43.7281,-79.3119
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.651,-79.353
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7233,-79.4512
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6618,-79.3894


In [58]:
# count postal codes that ended up without a latitude or longitude
null_lat = int(df['Latitude'].isnull().sum())
null_lon = int(df['Longitude'].isnull().sum())
print('Number of empty entries for Latitude: {}, \nNumber of empty entries for Longitude: {}.'.format(null_lat, null_lon))

Number of empty entries for Latitude: 0, 
Number of empty entries for Longitude: 0.


In [59]:
# save file in csv format, now with the Latitude/Longitude columns
# (overwrites the 'toronto.csv' written before the coordinates were added)
df.to_csv('toronto.csv', index=False)

# Segmenting and Clustering Neighborhoods in Toronto

In [24]:
#!pip install folium==0.5.0
#!conda install -c conda-forge geopy --yes

In [60]:
# import useful libraries
import numpy as np 

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# library to handle JSON files
import json  

# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

# library to handle requests
import requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

In [183]:
# Foursquare credentials (placeholders only; the real values are intentionally not shared)
# NOTE(review): avoid committing real keys here; consider reading them from
# environment variables or prompting for them instead of hardcoding.
CLIENT_ID = '************' # your Foursquare ID
CLIENT_SECRET = '***************' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version (sent as the `v` query parameter)

In [86]:
# Select Boroughs with the word 'Toronto' in their name to reduce the size of the dataset for this assignment
# (regex=False makes this a plain substring match)
df_1 = df[df['Borough'].str.contains('Toronto', regex=False)]
df_1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.651,-79.353
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6618,-79.3894
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6575,-79.3775
15,M5C,Downtown Toronto,St. James Town,43.6517,-79.3756
19,M4E,East Toronto,The Beaches,43.6781,-79.2953


In [88]:
# number of (rows, columns) left after keeping only the Toronto boroughs
df_1.shape

(39, 5)

In [89]:
# generate function to obtain venues nearby a certain location

def getNearbyVenues(names, latitudes, longitudes, radius, limit):
    """Collect the venues Foursquare reports around each given location.

    For every (name, latitude, longitude) triple a request is sent to the
    Foursquare ``venues/explore`` endpoint, authenticated with the module-level
    CLIENT_ID, CLIENT_SECRET and VERSION values.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; iterated together with zip().
    radius : search radius forwarded as the `radius` query parameter
        (presumably meters -- confirm against the Foursquare API docs).
    limit : maximum number of venues forwarded as the `limit` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but keeps these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status (e.g. bad credentials).
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # progress indicator: one line per location queried
        print(name)

        # let requests build and escape the query string
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; fail loudly on HTTP errors and never hang forever
        response = requests.get(endpoint, params=query, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come back without any category; record it as missing
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # passing the column names to the constructor also works for zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [None]:
# call the function and organize data in dataframe
# radius=500 and limit=100 are forwarded to every Foursquare request
# (radius presumably in meters -- confirm against the Foursquare API docs)
toronto_venues = getNearbyVenues(names=df_1['Neighborhood'],
                                 latitudes=df_1['Latitude'],
                                 longitudes=df_1['Longitude'],
                                 radius = 500,
                                 limit = 100)

# shape[0] is the total number of venue rows collected across all neighborhoods
print('\nFoursquare API returned {} venues'.format(toronto_venues.shape[0]))
toronto_venues.head()

Venues returned for each neighborhood

In [94]:
# number of non-null entries per column for each neighborhood (i.e. venues returned per neighborhood)
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,63,63,63,63,63,63
"Brockton, Parkdale Village, Exhibition Place",44,44,44,44,44,44
Business reply mail Processing Centre,100,100,100,100,100,100
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",64,64,64,64,64,64
Central Bay Street,54,54,54,54,54,54
Christie,12,12,12,12,12,12
Church and Wellesley,86,86,86,86,86,86
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,29,29,29,29,29,29
Davisville North,5,5,5,5,5,5


Unique venue categories

In [152]:
# count the distinct values found in the 'Venue Category' column
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 223 uniques categories.


In [185]:
# I noticed some venues were returned with the venue category called 'Neighborhood'
# so I remove them since they do not mean anything to the analysis

In [155]:
# drop venues whose category is literally 'Neighborhood', then recount the distinct categories
toronto_venues = toronto_venues[toronto_venues['Venue Category'] != 'Neighborhood']
print('After removing venues that were not specified we have {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

After removing venues that were not specified we have 222 uniques categories.


Neighborhood analysis 

In [156]:
# one hot encoding: one indicator column per venue category, named after the category itself
toronto_onehot = pd.get_dummies(
    toronto_venues[['Venue Category']],
    prefix='',
    prefix_sep='',
)

# attach the neighborhood of each venue (rows are aligned on the shared index)
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# reorder so that the neighborhood column comes first, followed by the category columns
cols = ['Neighborhood'] + list(toronto_onehot.columns.drop('Neighborhood'))
toronto_onehot = toronto_onehot[cols]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [157]:
# check that shape is consistent (one row per venue, one column per category plus Neighborhood)
toronto_onehot.shape

(1593, 223)

In [158]:
# get mean values for neighborhood: averaging the one-hot columns gives, per neighborhood,
# the share of its venues that fall in each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
# NOTE(review): this displays the whole frame; .head() would keep the output short
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.015873
1,"Brockton, Parkdale Village, Exhibition Place",0.023256,0.0,0.0,0.023256,0.0,0.023256,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing Centre,0.0,0.0,0.02,0.01,0.0,0.0,0.03,0.0,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.018868,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018868,0.018868,0.0,0.0,0.0,0.0
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.011628,0.011628,0.0,0.0,0.011628,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,0.011628
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.04,0.01,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [165]:
# print top 3 venues per Neighborhood

# NOTE: num_top_venues is reassigned to 10 in a later cell
num_top_venues = 3

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # select this neighborhood's row and transpose it so each category becomes a row
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # the first row holds the neighborhood name itself, not a category frequency
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    # most frequent categories first; frequencies are compared after rounding to 2 decimals
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
          venue  freq
0   Coffee Shop  0.10
1  Cocktail Bar  0.05
2    Restaurant  0.03


----Brockton, Parkdale Village, Exhibition Place----
         venue  freq
0         Café  0.07
1  Coffee Shop  0.07
2        Diner  0.05


----Business reply mail Processing Centre----
         venue  freq
0  Coffee Shop  0.07
1        Hotel  0.05
2   Restaurant  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
               venue  freq
0        Coffee Shop  0.06
1               Café  0.06
2  French Restaurant  0.05


----Central Bay Street----
             venue  freq
0      Coffee Shop  0.15
1       Restaurant  0.04
2  Bubble Tea Shop  0.04


----Christie----
           venue  freq
0           Café  0.25
1  Grocery Store  0.25
2    Coffee Shop  0.08


----Church and Wellesley----
                 venue  freq
0          Coffee Shop  0.12
1  Japanese Restaurant  0.07
2     Sushi Restaurant  0.05


----Com

In [166]:
# generate function that returns the most common venues of one row

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first entry of `row` is skipped (the caller passes rows whose first
    field is the neighborhood name). The remaining entries are ranked from
    largest to smallest value and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [167]:
# apply function to every neighborhood

num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take 'st'/'nd'/'rd'; every later rank takes 'th'
    # (explicit bounds check instead of a bare except used as control flow)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill the ranking columns row by row with the top categories of each neighborhood
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Breakfast Spot,Beer Bar,Bakery,Seafood Restaurant,Restaurant,Cheese Shop,Hotel,Café
1,"Brockton, Parkdale Village, Exhibition Place",Coffee Shop,Café,Diner,Pizza Place,Gift Shop,Thrift / Vintage Store,Accessories Store,Caribbean Restaurant,Boutique,Italian Restaurant
2,Business reply mail Processing Centre,Coffee Shop,Hotel,Restaurant,Japanese Restaurant,Café,Asian Restaurant,Taco Place,Seafood Restaurant,Gym,Steakhouse
3,"CN Tower, King and Spadina, Railway Lands, Har...",Café,Coffee Shop,Restaurant,French Restaurant,Park,Lounge,Speakeasy,Spa,Italian Restaurant,Gym / Fitness Center
4,Central Bay Street,Coffee Shop,Clothing Store,Bubble Tea Shop,Middle Eastern Restaurant,Plaza,Restaurant,Sandwich Place,Japanese Restaurant,Gastropub,Spa


Neighborhood clustering

In [174]:
# set number of clusters
kclusters = 10

# cluster on the category frequencies only; the axis is named via `columns=`
# because newer pandas versions no longer accept it as a positional argument
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (random_state is fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100]

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 6, 1, 4, 4, 3, 4, 1, 8,
       4, 4, 4, 4, 0, 2, 4, 4, 4, 4, 9, 4, 4, 7, 0, 4, 4])

In [179]:
# make the labelling re-runnable: remove a stale 'Cluster Labels' column if one exists,
# and do nothing on a fresh run where the column has not been added yet
neighborhoods_venues_sorted.drop(columns=['Cluster Labels'], errors='ignore', inplace=True)

In [180]:
# add clustering labels as the first column of the ranked-venues table
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and the ranked venue columns to the Toronto borough rows
# (which carry postal code, latitude and longitude), matching on the neighborhood name
toronto_merged = df_1.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'),
    on='Neighborhood',
)

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.651,-79.353,4,Pub,Café,Athletics & Sports,French Restaurant,Bank,Thai Restaurant,Theater,Bakery,Distribution Center,Chocolate Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6618,-79.3894,4,Coffee Shop,Café,Sushi Restaurant,College Cafeteria,Yoga Studio,Bookstore,Fried Chicken Joint,Burrito Place,Smoothie Shop,Middle Eastern Restaurant
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6575,-79.3775,4,Coffee Shop,Clothing Store,Sandwich Place,Middle Eastern Restaurant,Italian Restaurant,Restaurant,Café,Cosmetics Shop,Bar,Hotel
15,M5C,Downtown Toronto,St. James Town,43.6517,-79.3756,4,Coffee Shop,Café,Cocktail Bar,Cosmetics Shop,American Restaurant,Gastropub,Italian Restaurant,Theater,Creperie,Clothing Store
19,M4E,East Toronto,The Beaches,43.6781,-79.2953,7,Health Food Store,Pub,Trail,Church,Cuban Restaurant,Donut Shop,Flower Shop,Fish Market,Fish & Chips Shop,Fast Food Restaurant


Generate map with labeled neighborhoods (10 clusters)

In [181]:
# retrieve Toronto coordinates
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# the geocoder yields no location object when the address cannot be resolved;
# fail with a clear message instead of an AttributeError on the next line
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [186]:
# create map centered on the Toronto coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per neighborhood to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # a neighborhood without a matching row in the join has no cluster label; skip it
    if pd.isna(cluster):
        continue
    # a missing label anywhere turns the whole column into floats, so cast before indexing
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # index the palette directly by the 0-based label instead of relying on
    # `cluster-1` wrapping around to the last color for cluster 0
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.9).add_to(map_clusters)

map_clusters