# Segmenting and Clustering Neighborhoods in Toronto

In [22]:
#import necessary libraries
from bs4 import BeautifulSoup
from pathlib import Path

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [23]:
# Load the postal-code coordinates table from the project's data folder.
coordinates_path = Path('data/Geospatial_Coordinates.csv')
latlong = pd.read_csv(coordinates_path)

In [24]:
# Preview the first rows of the coordinates table to check the loaded columns.
latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [25]:
# (rows, columns) of the coordinates table.
latlong.shape

(103, 3)

#### Scrape the postal code table from Wikipedia and convert it into a dataframe.

In [26]:
# Fetch the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# surfaces an HTTP error here instead of as a confusing parsing failure below.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
table = soup.find('table', {'class':'wikitable sortable'})
# soup.find returns None when nothing matches; fail loudly rather than
# passing the text "None" on to pd.read_html.
if table is None:
    raise ValueError("No 'wikitable sortable' table was found on the page.")

# read_html returns a list of DataFrames; the table we selected is the first entry.
df = pd.read_html(str(table))[0]
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


#### Drop rows whose Borough is "Not assigned".

In [27]:
# Keep only the rows that have a borough assigned and renumber the index from 0.
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)

# Sanity check: per-column count of rows still marked "Not assigned".
df[df['Borough'] == 'Not assigned'].count()

Postal code     0
Borough         0
Neighborhood    0
dtype: int64

#### Replace / with ,

In [28]:
# Turn the " /" separators between neighborhood names into commas
# ("Regent Park / Harbourfront" -> "Regent Park, Harbourfront").
# regex=False makes the literal match explicit, because the default of `regex`
# differs between pandas versions. assign() builds a new frame instead of
# writing a column into a frame that came from an earlier boolean filter.
df = df.assign(Neighborhood=df['Neighborhood'].str.replace(' /', ',', regex=False))
df.head(12)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Fill missing Neighborhood values with the Borough name.

In [29]:
# Where a neighborhood name is missing, fall back to the borough name.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a selected column works on an intermediate
# object and does not update `df` under pandas Copy-on-Write.
df = df.assign(Neighborhood=df['Neighborhood'].fillna(df['Borough']))

# Confirm how many missing values remain in each column.
df.isna().sum()

Postal code     0
Borough         0
Neighborhood    0
dtype: int64

#### Rename column to match with latlong dataframe.

In [30]:
# Give the key column the same name it has in `latlong` ("Postal Code").
df = df.rename(columns={"Postal code": "Postal Code"})

#### Show the data.

In [31]:
# Preview the first 12 rows of the cleaned postal code table.
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Show shape of dataframe.

In [32]:
# (rows, columns) of the cleaned postal code table.
df.shape

(103, 3)

#### Merge the df and latlong dataframes and show the result.

In [33]:
# Attach latitude/longitude to each row by joining on the shared 'Postal Code' column.
df_map = df.merge(latlong, on='Postal Code')
df_map.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


#### Drop Postal Code column

In [34]:
# Drop the join key now that the coordinates have been merged in.
# Returning a new frame with errors='ignore' keeps this cell safe to re-run:
# the in-place drop raised KeyError on a second execution because the
# column had already been removed.
df_map = df_map.drop(columns=['Postal Code'], errors='ignore')
df_map

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.753259,-79.329656
1,North York,Victoria Village,43.725882,-79.315572
2,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...
98,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


#### Select only the boroughs whose name contains "Toronto".

In [35]:
# Keep the rows whose borough name contains "Toronto" and renumber the index from 0.
df_map = df_map.loc[df_map['Borough'].str.contains('Toronto')].reset_index(drop=True)
df_map.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,Downtown Toronto,St. James Town,43.651494,-79.375418
4,East Toronto,The Beaches,43.676357,-79.293031


#### Count the unique neighborhoods in Toronto

In [36]:
# Tally how many rows carry each neighborhood label.
df_map['Neighborhood'].value_counts()

The Beaches                                                                                                   1
Central Bay Street                                                                                            1
Stn A PO Boxes                                                                                                1
Commerce Court, Victoria Hotel                                                                                1
Roselawn                                                                                                      1
Runnymede, Swansea                                                                                            1
University of Toronto, Harbord                                                                                1
St. James Town                                                                                                1
Dufferin, Dovercourt Village                                                                            

#### Find geolocation of Toronto

In [37]:
address = 'Toronto'

# Look up the coordinates of the address through the Nominatim geocoder.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; raise a clear error instead of
# an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}.'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Visualize the Toronto map and its neighborhoods

In [38]:
# create map of Toronto centered on the geocoded latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, with a "Neighborhood, Borough" popup
for lat, lng, borough, neighborhood in zip(df_map['Latitude'], df_map['Longitude'], df_map['Borough'], df_map['Neighborhood']):
    # parse_html is a Popup argument, so it is passed only here; it is not a
    # CircleMarker styling option and was dropped from the call below.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto