This notebook will explore, segment, and cluster the neighborhoods in the city of Toronto

<h4>Import Library and Parser</h4>

In [1]:
# Installed beautifulsoup4
# Installed the lxml and html5lib parsers and the requests library
# Note: To simplify, outputs for 'pip install' were cleared out

In [None]:
pip install beautifulsoup4

In [None]:
pip install lxml

In [None]:
pip install html5lib

In [None]:
pip install requests

In [6]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

<h4>Reading the data set from the URL</h4>

In [7]:
# Download the Wikipedia page listing the "M" postal codes with the requests library.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# Parse the downloaded HTML with Beautiful Soup, using the lxml parser
soup = BeautifulSoup(source, 'lxml')

# print(soup.prettify()) pretty-prints the parsed HTML for inspection
# The tag of interest found there is '<table class="wikitable sortable">'
# Note: to simplify, the long output was cleared out

In [8]:
# Pick the postal-code table by its 'wikitable' CSS class (the tag identified
# when inspecting the page source) rather than by its position among all
# <table> elements, so unrelated tables added to the page are not picked up.
table = soup.find('table', class_='wikitable')
if table is None:
    # Fail with a clear message instead of a bare IndexError/None further down
    raise ValueError("No <table class='wikitable'> found in the downloaded page")

In [9]:
# Convert the <table> markup into a DataFrame. read_html returns a list with
# one frame per table it finds, so take the first (and only) one.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in pandas >= 2.1.
df = pd.read_html(StringIO(str(table)))[0]

In [10]:
# Display the scraped table to check what was parsed
# (columns: Postcode, Borough, Neighbourhood)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


<h4>Evaluating and Cleaning Data</h4>

In [11]:
# Pick out the rows whose Borough holds the placeholder 'Not assigned'
df_Na = df[df['Borough'] == 'Not assigned']

In [12]:
# Let's see all rows whose Borough is 'Not assigned'
df_Na

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
8,M8A,Not assigned,Not assigned
12,M2B,Not assigned,Not assigned
19,M7B,Not assigned,Not assigned
...,...,...,...
277,M4Z,Not assigned,Not assigned
278,M5Z,Not assigned,Not assigned
279,M6Z,Not assigned,Not assigned
280,M7Z,Not assigned,Not assigned


In [13]:
# Now keep only the rows that do have a borough, i.e. leave out the
# 'Not assigned' ones (the name df_Na is reused for this cleaned frame)
df_Na = df[df['Borough'] != 'Not assigned']

In [14]:
# Row counts as noted for the original run: 287 rows minus the 77 rows whose
# Borough is 'Not assigned' should leave 210 rows
df_Na

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


In [15]:
# Collapse rows that share the same Postcode and Borough into a single row
# whose Neighbourhood is the comma-separated list of their neighbourhoods
df_same_postcode = (
    df_Na
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(','.join)
    .reset_index()
)
df_same_postcode.tail(18)

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto"
89,M8W,Etobicoke,"Alderwood,Long Branch"
90,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
91,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."
92,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."
93,M9A,Queen's Park,Queen's Park
94,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar..."


In [16]:
# Notice that in the first row shown (Postcode M7A) the Neighbourhood is still 'Not assigned'

<h4>Identify and Handle values</h4>


In [17]:
# A Neighbourhood that is still the placeholder 'Not assigned' is given the
# name of its Borough instead
needs_name = df_same_postcode['Neighbourhood'] == "Not assigned"
df_same_postcode.loc[needs_name, 'Neighbourhood'] = df_same_postcode.loc[needs_name, 'Borough']

In [18]:
# Here, we can see that for Postcode M7A the Neighbourhood is now the same as the Borough
df_same_postcode.tail(18)

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto"
89,M8W,Etobicoke,"Alderwood,Long Branch"
90,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
91,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."
92,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."
93,M9A,Queen's Park,Queen's Park
94,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar..."


<h4>Checking the Data</h4>

In [19]:
# Finally, use the '.shape' attribute to check the number of rows and columns
df_same_postcode.shape

(103, 3)

<h4>Latitude and Longitude coordinates of each neighborhood</h4>

In [20]:
# Load the geospatial lookup table from the local file 'Geo_data.csv'
# (columns: Postal Code, Latitude, Longitude) and display it
df_gdata = pd.read_csv('Geo_data.csv')
df_gdata

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [21]:
# Attach the coordinates to the neighbourhood table with a left join
# (Postcode on our side, 'Postal Code' on the geospatial side), discard the
# now-redundant 'Postal Code' column, and call the key column 'PostalCode'
df_gsd = (
    df_same_postcode
    .merge(df_gdata, how='left', left_on='Postcode', right_on='Postal Code')
    .drop('Postal Code', axis=1)
    .rename(columns={'Postcode': 'PostalCode'})
)
df_gsd

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


<h4>Replicating same analysis on our lab for New York City data</h4>

In [None]:
# Import (and, for geopy, install) the libraries used for geocoding and mapping
# Note: To simplify, outputs for all dependencies were cleared out

import numpy as np

# None removes pandas' display limits, so frames are rendered with all
# columns and all rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Shell command: installs geopy from conda-forge without prompting
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

# Already imported in an earlier cell; repeated here
import requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# The folium install below is commented out, so folium must already be available
#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

In [23]:
# Report the number of distinct boroughs and the number of rows in df_gsd
borough_count = len(df_gsd['Borough'].unique())
row_count = df_gsd.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))
   

The dataframe has 11 boroughs and 103 neighborhoods.


In [24]:
# Keep only the rows whose Borough name contains the word 'Toronto',
# renumbering the remaining rows from 0

is_toronto_borough = df_gsd['Borough'].str.contains('Toronto')
df_word = df_gsd[is_toronto_borough].reset_index(drop=True)
df_word

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


In [25]:
# Summarise the Toronto-only frame: number of distinct boroughs and number of
# rows. The wording matches the earlier summary of the full frame; the previous
# message here ended in the garbled '{} Toronto.'.

print('The Toronto dataframe has {} boroughs and {} neighborhoods.'.format(
    len(df_word['Borough'].unique()),
    df_word.shape[0],
))

The dataframe has 4 Boroughs and 38 Toronto.


In [26]:
# Use the geopy library (Nominatim geocoder) to get the latitude and longitude of Toronto

address = 'Toronto, ON'
geolocator = Nominatim(user_agent="toronto_city_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; stop with a clear
# message rather than an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [27]:
# Build a map centred on the geocoded latitude/longitude and place one blue
# circle marker on it for every row of df_gsd
# Note: Generated map is similar to our lab on New York City
# Note: Image may not display correctly on Github

map_toronto_city = folium.Map(location=[latitude, longitude], zoom_start=10)

# one marker per row; its popup reads "<neighbourhood>, <borough>"
marker_rows = zip(
    df_gsd['Latitude'],
    df_gsd['Longitude'],
    df_gsd['Borough'],
    df_gsd['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto_city)

map_toronto_city

In [28]:
# Narrow the analysis to the Downtown Toronto borough: take its rows from
# df_word into a new frame and renumber them from 0

in_downtown = df_word['Borough'].eq('Downtown Toronto')
Downtown_data = df_word.loc[in_downtown].reset_index(drop=True)
Downtown_data

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752


In [29]:
# Use the geopy library (Nominatim geocoder) to get the latitude and longitude of Downtown Toronto
# Note: this reassigns the `latitude` / `longitude` variables set by the
# earlier Toronto-wide lookup

address = 'Downtown Toronto, ON'
geolocator = Nominatim(user_agent="Downtown_Toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; stop with a clear
# message rather than an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.655115, -79.380219.


In [30]:
# Visualize the Downtown Toronto neighbourhoods on their own map
# Marker outline colour is purple here instead of blue
# The map should show a cluster of 18 markers (count as noted for the original run)
# The map is centred on the current latitude and longitude values
# Note: Generated map is similar to our lab on New York City
# Note: Image may not display correctly on Github

map_downtown = folium.Map(location=[latitude, longitude], zoom_start=11)

# one marker per Downtown_data row; its popup shows the neighbourhood name(s)
downtown_points = zip(
    Downtown_data['Latitude'],
    Downtown_data['Longitude'],
    Downtown_data['Neighbourhood'],
)
for lat, lng, neighbourhood_name in downtown_points:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(neighbourhood_name, parse_html=True),
        color='purple',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_downtown)

map_downtown

<h4>Observation</h4>

In [31]:
# Retrieving the word ‘Toronto’ from our dataframe returns four groups of boroughs
# The values are East Toronto, Central Toronto, Downtown Toronto and West Toronto
# First map created is a visualization of the four groups and their neighbourhood
# We then focused more on the Downtown Toronto values
# In doing so, we can see the distance of neighbourhoods from each other
# Now we zoom in to examine;
# We can see that the Financial District has the largest number of Downtown Toronto neighbourhoods in close proximity
# Note: As I stated before, Github may not display images correctly