<h1>Segmenting and Clustering Neighborhoods in Toronto</h1>

<h4>Importing packages and linking to Wiki page

In [1]:
import requests
from bs4 import BeautifulSoup

URL = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050.'
page = requests.get(URL)

soup = BeautifulSoup(page.content, 'html.parser')

<h4>Extracting table from page

In [2]:
to_table = soup.find("table", attrs={"class": "wikitable sortable"})
to_table_data = to_table.tbody.find_all("tr")

In [3]:
to_table_data

[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>,
 <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>,
 <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>,
 <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>,
 <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>,
 <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
 </td></tr>,
 <tr>
 <td>M6A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
 </td></tr>,
 <tr>
 <td>M6A</td>
 <td><a href="/wiki/No

In [4]:
to_table_data[0].find_all("th")

[<th>Postcode</th>,
 <th>Borough</th>,
 <th>Neighbourhood
 </th>]

<h4>Creating headings list

In [5]:
headings = []
for th in to_table_data[0].find_all("th"):
    headings.append(th.text.replace('\n', ''))
print(headings)

['Postcode', 'Borough', 'Neighbourhood']


<h4>Inserting all rows into a list

In [6]:
arrup = []
arrdown = []
i = 1
while i < len(to_table_data):
    for td in to_table_data[i].find_all("td"):
        arrup.append(td.text.replace('\n', ''))
    arrdown.append(arrup)
    arrup = []
    i = i + 1

In [7]:
arrdown

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', 'Downtown Toronto', "Queen's Park"],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B', 'Etobicoke', 'Martin Grove'],
 ['M9B', 'E

In [8]:
import pandas as pd

In [9]:
df = pd.DataFrame.from_records(arrdown, columns = headings)

In [10]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


<h4>Remove 'Not assigned' boroughs

In [11]:
ndf = df[df['Borough'] != 'Not assigned']

In [20]:
ndf

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


<h4>Check for 'Not assigned' neighbourhoods

In [13]:
ndf[ndf['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood


In [14]:
gdf = ndf.groupby(["Postcode","Borough"])

In [18]:
gdf

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022DFEA3BB20>

In [21]:
gdf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


In [22]:
gldf = gdf["Neighbourhood"].apply(list)

In [23]:
gldf = gldf.reset_index()

In [24]:
gldf

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]
...,...,...,...
98,M9N,York,[Weston]
99,M9P,Etobicoke,[Westmount]
100,M9R,Etobicoke,"[Kingsview Village, Martin Grove Gardens, Rich..."
101,M9V,Etobicoke,"[Albion Gardens, Beaumond Heights, Humbergate,..."


In [27]:
gldf[gldf['Postcode'] == 'M4B']

Unnamed: 0,Postcode,Borough,Neighbourhood
35,M4B,East York,"[Woodbine Gardens, Parkview Hill]"


In [25]:
gldf.shape

(103, 3)

<h3>Q4 Location Data</h3>

In [35]:
longlat_df = pd.read_csv("Geospatial_Coordinates.csv")

In [36]:
longlat_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h4>Rename postcal code column for join</h4>

In [37]:
longlat_df.rename(columns = {'Postal Code':'Postcode'}, inplace = True)

In [38]:
longlat_df.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h4>Join dataframes into new dataframe on Postcode

In [40]:
finaldf = pd.merge(gldf, longlat_df, on='Postcode')

In [58]:
finaldf

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[Woburn],43.770992,-79.216917
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,[Weston],43.706876,-79.518188
99,M9P,Etobicoke,[Westmount],43.696319,-79.532242
100,M9R,Etobicoke,"[Kingsview Village, Martin Grove Gardens, Rich...",43.688905,-79.554724
101,M9V,Etobicoke,"[Albion Gardens, Beaumond Heights, Humbergate,...",43.739416,-79.588437


<h3>Q5 Clustering

In [48]:
todf = finaldf[finaldf['Borough'].str.contains('Toronto')]

In [49]:
todf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,[The Beaches],43.676357,-79.293031
41,M4K,East Toronto,"[The Danforth West, Riverdale]",43.679557,-79.352188
42,M4L,East Toronto,"[The Beaches West, India Bazaar]",43.668999,-79.315572
43,M4M,East Toronto,[Studio District],43.659526,-79.340923
44,M4N,Central Toronto,[Lawrence Park],43.72802,-79.38879


<h4>Import libraries

In [52]:
from geopy.geocoders import Nominatim

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

<h4>Toronto location

In [53]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


<h4>Map all neighbouhoods

In [61]:
map_to_all = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(finaldf['Latitude'], finaldf['Longitude'], finaldf['Borough'], finaldf['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_to_all)  
    
map_to_all

<h4>Map only boroughs with Toronto in name

In [63]:
map_to = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(todf['Latitude'], todf['Longitude'], todf['Borough'], todf['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_to)  
    
map_to