# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd  # DataFrame handling and HTML table parsing (pd.read_html)
import numpy as np
# One-time environment setup, left commented out: uncomment to install the geocoding packages
#!conda install -c conda-forge geopy
#!pip install geocoder
import geocoder # geocoding client; its Google provider is used for the postal-code lookup below
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# Simple confirmation that every import above succeeded
print('Libraries Imported')

Libraries Imported


# HTML Parsing & Data Pre-Processing
### - Using pandas for parsing HTML & Data Wrangling

In [2]:
# Parse the Wikipedia page, keep the first <table> it contains, and clean it in a
# single method chain so the cell yields the same frame every time it is re-run.
canada_neighborhood_df = (
    pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)[0]
    # drop rows whose Borough is 'Not assigned', then renumber the remaining rows
    .loc[lambda frame: frame.Borough != 'Not assigned']
    .reset_index(drop=True)
    # rename column : Postal code -> PostalCode
    .rename(columns={'Postal code': 'PostalCode'})
    # Neighborhood clean-up: " /" separators become ",", apostrophes are removed
    .assign(
        Neighborhood=lambda frame: (
            frame['Neighborhood']
            .replace(r' /', ',', regex=True)
            .replace(r'\'', '', regex=True)
        )
    )
    # order by postal code for readability and renumber once more
    .sort_values('PostalCode')
    .reset_index(drop=True)
)

# last expression of the cell: render the cleaned frame
canada_neighborhood_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Canada Neighborhood Data Frame Shape

In [3]:
# Show the cleaned frame's dimensions as a (rows, columns) tuple
canada_neighborhood_df.shape

(103, 3)

# Method # 1. Use Postal Codes For Determining Lat/Long [use google geocoder]

### Assumptions
<ol>
<li>Use google geocoder API</li>
<li>Use postal code to determine Lat/Long</li>
<li>Attempt multiple retries</li>
</ol>

In [4]:
import geocoder # import geocoder

import time  # local to this cell: only needed for the pause between retry attempts

# define the dataframe columns
column_names_geocoder = ['Latitude', 'Longitude']

# Retry policy: each postal code is tried a bounded number of times with a short
# pause in between, so a failing provider cannot make this cell loop forever.
# Worst case (every lookup fails) this adds roughly
# (GEOCODER_MAX_ATTEMPTS - 1) * GEOCODER_RETRY_DELAY_SECONDS per postal code.
GEOCODER_MAX_ATTEMPTS = 3
GEOCODER_RETRY_DELAY_SECONDS = 1

# One record per postal code, collected in a list and converted to a DataFrame
# once after the loop (cheaper than growing a DataFrame row by row).
geocoder_records = []

for idx_geo in canada_neighborhood_df.index:
    postal_code = canada_neighborhood_df.loc[idx_geo, 'PostalCode']
    query = '{}, Toronto, Ontario'.format(postal_code)

    lat_lng_coords = None
    for attempt in range(1, GEOCODER_MAX_ATTEMPTS + 1):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords:
            break
        if attempt < GEOCODER_MAX_ATTEMPTS:
            time.sleep(GEOCODER_RETRY_DELAY_SECONDS)

    # Truthiness check covers both "no result" shapes (None and an empty list),
    # so the indexing below can never hit an empty sequence.
    if lat_lng_coords:
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
        print('Postal Code :', postal_code, ' Lat/Long : ', latitude, longitude)
    else:
        # keep a NaN placeholder so rows stay aligned with canada_neighborhood_df
        latitude = longitude = float('nan')
        print("No Lat/Long for PostalCode: ", postal_code)

    geocoder_records.append({'Latitude': latitude, 'Longitude': longitude})

# instantiate the dataframe: row i holds the result for row i of canada_neighborhood_df
neighborhoods_geocoder = pd.DataFrame(
    geocoder_records,
    columns=column_names_geocoder,
    index=canada_neighborhood_df.index,
)

No Lat/Long for PostalCode:  M1B
No Lat/Long for PostalCode:  M1C
No Lat/Long for PostalCode:  M1E
No Lat/Long for PostalCode:  M1G
No Lat/Long for PostalCode:  M1H
No Lat/Long for PostalCode:  M1J
No Lat/Long for PostalCode:  M1K
No Lat/Long for PostalCode:  M1L
No Lat/Long for PostalCode:  M1M
No Lat/Long for PostalCode:  M1N
No Lat/Long for PostalCode:  M1P
No Lat/Long for PostalCode:  M1R
No Lat/Long for PostalCode:  M1S
No Lat/Long for PostalCode:  M1T
No Lat/Long for PostalCode:  M1V
No Lat/Long for PostalCode:  M1W
No Lat/Long for PostalCode:  M1X
No Lat/Long for PostalCode:  M2H
No Lat/Long for PostalCode:  M2J
No Lat/Long for PostalCode:  M2K
No Lat/Long for PostalCode:  M2L
No Lat/Long for PostalCode:  M2M
No Lat/Long for PostalCode:  M2N
No Lat/Long for PostalCode:  M2P
No Lat/Long for PostalCode:  M2R
No Lat/Long for PostalCode:  M3A
No Lat/Long for PostalCode:  M3B
No Lat/Long for PostalCode:  M3C
No Lat/Long for PostalCode:  M3H
No Lat/Long for PostalCode:  M3J
No Lat/Lon

### Result :
<ol>
<li>Despite multiple retries (including retrying inside a while loop), the Google geocoder did not return Lat/Long for the postal codes</li>
<li>Attempt to derive Lat/Long using Nominatim geolocator [with Neighborhood name], as shown below in Method # 2</li>
</ol>

# Method # 2. Use Neighborhood For Determining Lat/Long [use geolocator]

### Assumptions
<ol>
<li>Use Nominatim geolocator API</li>
<li>Use Neighborhood name to determine Lat/Long</li>
<li>Use delay, to avoid timeouts</li>
<li>Use the downtown Toronto Lat/Long as the default if the Lat/Long cannot be resolved</li>
</ol>

In [7]:
# Initialize
from geopy.extra.rate_limiter import RateLimiter

# define the dataframe columns
column_names = ['Latitude', 'Longitude']

geolocator = Nominatim(user_agent="toronto_geo_explorer")

# Wrap the geocode call ONCE and route every lookup through the wrapper, so the
# minimum delay between consecutive requests is actually enforced. Building a new
# RateLimiter per iteration and calling geolocator.geocode directly applies no delay.
# NOTE(review): with its default settings RateLimiter is documented to retry failed
# calls and return None when they keep failing; confirm against the installed geopy version.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=3)

# Use default downtown Toronto Lat/Long, if the address cannot be found using Neighborhood address
lat_default = 43.6547567
lon_default = -79.3966769

# Collect one record per row and build the DataFrame once after the loop;
# DataFrame.append was removed in pandas 2.0 and was quadratic when used in a loop.
coordinate_records = []

for idx in canada_neighborhood_df.index:
    # for postal codes with multiple neighborhoods, take the first one
    first_neighborhood = canada_neighborhood_df.loc[idx, 'Neighborhood'].split(',')[0]
    address = first_neighborhood + ',ON'

    # rate-limited lookup (adds the delay that avoids timeouts)
    location = geocode(address)

    if location is not None:
        latitude = location.latitude
        longitude = location.longitude
    else:
        # Address cannot be located by the API. Assign default address
        latitude = lat_default
        longitude = lon_default

    coordinate_records.append({'Latitude': latitude, 'Longitude': longitude})

# instantiate the dataframe; sharing the source index keeps the column assignments below aligned row-for-row
neighborhoods = pd.DataFrame(
    coordinate_records,
    columns=column_names,
    index=canada_neighborhood_df.index,
)

canada_neighborhood_df['Latitude'] = neighborhoods['Latitude']
canada_neighborhood_df['Longitude'] = neighborhoods['Longitude']

### Result :
<ol>
<li>Adding delays between requests resolved the timeout issue</li>
<li>Able to retrieve Lat/Long. Here is the output</li>
</ol>

In [8]:
# Render the frame as the cell output; it now carries the Latitude and Longitude columns assigned above
canada_neighborhood_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.809196,-79.221701
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.780271,-79.130499
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.755225,-79.198229
3,M1G,Scarborough,Woburn,43.759824,-79.225291
4,M1H,Scarborough,Cedarbrae,43.756467,-79.226692
5,M1J,Scarborough,Scarborough Village,43.743742,-79.211632
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.724878,-79.253969
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.727841,-79.287622
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.711170,-79.248177
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.691639,-79.266110
