# Segmenting and Clustering Neighborhoods in Toronto

## Part 1. Web Scraping with Beautiful Soup

In [21]:
import pandas as pd
from bs4 import BeautifulSoup
import requests 

Define the dataframe:

In [22]:
# Column layout of the table scraped below; the frame starts empty with the
# default index. (The former `neighborhoods.set_index('PostalCode')` call was
# dropped: set_index returns a new frame, so the discarded result had no effect.)
column_names = ['PostalCode', 'Borough', 'Neighborhood']
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


Get the table of interest using BeautifulSoup:

In [23]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast instead of hanging on a stalled connection or silently parsing an
# HTTP error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

# First table on the page carrying the "wikitable" class.
table = BeautifulSoup(source, 'lxml').find('table', class_ = "wikitable")
if table is None:
    raise ValueError('No "wikitable" table found at {}'.format(url))

The first row contains the headings, but since its cells are 'th' elements we can simply skip it by collecting only the 'td' cells of each row.

Iterate through the table line by line:

In [24]:
na = 'Not assigned'

# Collect one record per postal code and build the dataframe once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is
# quadratic. Starting from an empty dict also makes this cell safe to re-run.
records = {}  # postal code -> [postal code, borough, [neighborhood names]]

for line in table.tbody.find_all('tr'):
    # Keep only the text of each cell, without surrounding whitespace/newlines.
    cells = [cell.text.strip() for cell in line.find_all('td')]
    if len(cells) < 3:  # the heading row has no 'td' cells
        continue
    postal_code, borough, neighborhood = cells[:3]
    if borough == na:  # ignore rows whose borough is Not assigned
        continue
    if neighborhood.startswith(na):  # case with 'Queen's Park': reuse the borough name
        neighborhood = borough
    if postal_code in records:  # postal code seen before: add to its neighborhoods
        records[postal_code][2].append(neighborhood)
    else:
        records[postal_code] = [postal_code, borough, [neighborhood]]

# Rows keep first-seen order and are indexed by postal code, with the
# neighborhoods of a shared postal code joined by ', '.
neighborhoods = pd.DataFrame(
    [[code, borough, ', '.join(names)] for code, borough, names in records.values()],
    columns=column_names,
    index=list(records),
)

In [25]:
# Replace the postal-code index with a plain 0..n-1 index; the old index is dropped.
neighborhoods = neighborhoods.reset_index(drop=True)

In [26]:
# Preview the first ten rows of the scraped table.
neighborhoods.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [27]:
# (rows, columns) of the scraped table: one row per postal code kept by the loop above.
neighborhoods.shape

(103, 3)

----
## Part 2. Setting coordinates

### Using geocoder 
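__Note: the geocoder request to Google is denied (`<[REQUEST_DENIED] Google - Geocode [empty]>`, see the check below), so the coordinates are taken from the provided csv file instead.__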

In [8]:
import geocoder

def get_lat_lng(postal_code, max_attempts=5):
    """Look up the coordinates of a Toronto postal code with ``geocoder.google``.

    The query sent is ``'<postal_code>, Toronto, Ontario'``. The lookup is
    retried while the response carries no coordinates, but only up to
    ``max_attempts`` times: a service that denies every request would
    otherwise keep this function looping forever.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. ``'M3A'``.
    max_attempts : int, optional
        Maximum number of requests to send before giving up (default 5).

    Returns
    -------
    The ``latlng`` value of the first response that has one.

    Raises
    ------
    RuntimeError
        If no response contained coordinates within ``max_attempts`` requests.
    """
    g = None
    for _ in range(max_attempts):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
    raise RuntimeError(
        'No coordinates for {!r} after {} attempt(s); last response: {!r}'.format(
            postal_code, max_attempts, g))

In [9]:
# Check whether geocoder is working: send one request for a known postal code
# and display the response object.
g = geocoder.google('M3A, Toronto, Ontario')
g

<[REQUEST_DENIED] Google - Geocode [empty]>

### Using provided csv file

In [None]:
#!pip install --upgrade wget

Downloading provided data:

In [30]:
import os
import wget

# Download the coordinates file once and reuse the local copy afterwards.
# wget.download never overwrites: every extra call would leave another
# "name (1)", "name (2)", ... copy on disk when this cell is re-run.
coords_url = 'http://cocl.us/Geospatial_data'
coords_file = 'Geospatial_data.csv'
if not os.path.exists(coords_file):
    wget.download(coords_url, out=coords_file)

# Rename the key column so it matches the 'PostalCode' column of `neighborhoods`.
coords = pd.read_csv(coords_file).rename(columns={"Postal Code": "PostalCode"})
coords.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging two dataframes on column 'PostalCode':

In [51]:
# This cell overwrites `neighborhoods`, so make it safe to re-run: drop any
# coordinate columns left by a previous run before merging, otherwise the
# second merge would produce suffixed duplicates ('Latitude_x', 'Latitude_y').
coord_columns = [col for col in coords.columns if col != 'PostalCode']
neighborhoods = pd.merge(
    neighborhoods.drop(columns=coord_columns, errors='ignore'),
    coords,
    on = 'PostalCode',
    how = 'inner',
    validate = 'one_to_one',  # raises if a postal code is duplicated on either side
)
neighborhoods.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


Check shape:

In [52]:
# (rows, columns) after the merge; the inner join keeps only postal codes present in both frames.
neighborhoods.shape

(103, 5)

----
## Part 3. Exploring and clustering the neighborhoods

In [49]:
#import numpy as np # library to handle data in a vectorized manner

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
#import matplotlib.cm as cm
#import matplotlib.colors as colors

# import k-means from clustering stage
#from sklearn.cluster import KMeans

import folium # map rendering library


Use geopy library to get the latitude and longitude values of Toronto.

In [50]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the attribute accesses below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Create a map of Toronto

In [66]:
# Base map centred on the coordinates found for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row, with a "<neighborhood>, <borough>" popup.
marker_rows = zip(
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
    neighborhoods['Borough'],
    neighborhoods['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

Select only boroughs that contain the word Toronto 

In [71]:
# Keep the rows whose borough name contains 'Toronto', renumbered from 0.
is_toronto_borough = neighborhoods['Borough'].str.contains('Toronto')
toronto = neighborhoods[is_toronto_borough].reset_index(drop=True)
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [72]:
# (rows, columns) of the subset restricted to boroughs containing 'Toronto'.
toronto.shape

(38, 5)

In [75]:
# Fresh, closer-zoomed map showing only the rows in `toronto`.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Appearance shared by every marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row, with the neighborhood name as its popup.
for lat, lng, name in zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighborhood']):
    popup = folium.Popup(name, parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **marker_style).add_to(map_toronto)

map_toronto