# Segmenting and Clustering Neighborhoods in Toronto

## Part I - Structuring Data

In [67]:
# Importing libraries

#!pip install beautifulsoup4
#!pip install lxml
import random
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
from IPython.core.display import HTML
from IPython.display import Image
from IPython.display import display_html
from pandas.io.json import json_normalize


Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


### Scraping data from a Wikipedia page

In [68]:
# Download the Wikipedia page listing Toronto ("M") postal codes.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# the notebook on an HTTP error instead of silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML and show the page title as a quick sanity check.
soup = BeautifulSoup(source, 'lxml')
print(soup.title)

# Keep the first <table> of the page as an HTML string; a later cell parses it with pandas.
tab = str(soup.table)
display_html(tab, raw=True)

<title>List of postal codes of Canada: M - Wikipedia</title>


Postal Code,Borough,Neighbourhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,Not assigned
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"


In [46]:
# Parse the scraped HTML table into a DataFrame.
# The HTML string is wrapped in StringIO because passing literal HTML directly
# to read_html is deprecated in recent pandas; a file-like object works everywhere.
dfs = pd.read_html(StringIO(tab))
df = dfs[0]

# Renaming the column "Postal Code" to "Postcode" so it uses the same key name
# that the later cells (merge with the coordinates CSV) rely on.
df = df.rename(columns={"Postal Code": "Postcode"})
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [69]:
# Drop postal codes without a borough, collapse rows sharing a postcode/borough
# into one comma-separated neighbourhood string, and fall back to the borough
# name wherever the neighbourhood is still 'Not assigned'.
df1 = df.loc[df['Borough'].ne('Not assigned')]
df2 = (
    df1.groupby(['Postcode', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)
df2['Neighbourhood'] = df2['Neighbourhood'].mask(
    df2['Neighbourhood'].eq('Not assigned'),
    df2['Borough'],
)

In [71]:
# `shape` is an attribute (not a method) holding (number of rows, number of columns);
# the first value is the row count of the cleaned dataframe.
df2.shape

(103, 3)

## Part II - Importing geographical coordinates from a CSV file and merging dataframes

In [48]:
# Read the hosted CSV that maps each postal code to a latitude/longitude pair.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
lat_lon = pd.read_csv(geospatial_csv_url)
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
# Give the coordinates table the same key name as df2 ('Postcode'), then join
# the two frames on that key so every postal code carries its coordinates.
lat_lon = lat_lon.rename(columns={'Postal Code': 'Postcode'})
df3 = df2.merge(lat_lon, on='Postcode')
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [50]:
# Keep only the rows whose "Borough" contains the literal text "Toronto".
# .copy() makes df4 an independent frame rather than a slice of df3, so columns
# can be added to it later without SettingWithCopyWarning or ambiguous writes.
df4 = df3[df3['Borough'].str.contains('Toronto', regex=False)].copy()
df4

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Part III - Visualizing clusters
### Note: the actual maps might not be displayed on GitHub

In [None]:
# Importing Folium and the clustering/plotting libraries (the install command below is kept for reference)

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Folium installed')

In [82]:
# Base map centred on Toronto, with one circle marker per row of df4.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

for lat, lng, borough, neighbourhood in zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Neighbourhood']):
    # Popup text shown when a marker is clicked: "<neighbourhood>, <borough>".
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option (set above); it is not a CircleMarker option,
    # so it is no longer passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        popup=label,
        color='red',
        fill=True,
        fill_color='#CC8631',
        fill_opacity=0.2,
    ).add_to(map_toronto)

map_toronto

In [76]:
# Running k-means to cluster the neighbourhoods into 4 clusters by location.
k = 4

# Select the coordinate columns explicitly. The previous `df4.drop([...], 1)`
# passed the axis positionally, which newer pandas releases reject, and it would
# also have fed any extra column added to df4 into the model as a feature.
toronto_clustering = df4[['Latitude', 'Longitude']]

# n_init is pinned to 10 (the long-standing default) so the result does not
# change with the scikit-learn version; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_clustering)
kmeans.labels_

array([2, 2, 2, 2, 1, 2, 2, 3, 2, 3, 2, 3, 1, 2, 3, 1, 2, 1, 0, 0, 0, 0,
       3, 0, 3, 3, 0, 3, 3, 0, 3, 0, 2, 2, 2, 2, 2, 2, 1], dtype=int32)

In [84]:
# Create a map of Toronto with one marker per row of df4, coloured by k-means cluster.
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# One distinct hex colour per cluster, sampled evenly from the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# The labels come straight from the fitted model: df4 has no 'Cluster Labels'
# column unless another cell adds one, which made this cell fail on a fresh run.
# kmeans was fitted on df4's coordinates, so kmeans.labels_ is in df4's row
# order and can be paired with the rows positionally.
for lat, lon, cluster in zip(df4['Latitude'], df4['Longitude'], kmeans.labels_):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to k-1, so they index the colour list directly
    # (the former `cluster-1` only worked through negative-index wraparound).
    folium.CircleMarker(
        [lat, lon],
        radius=8,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.2).add_to(map_clusters)

map_clusters