# Segmenting and Clustering Neighborhoods in Toronto

**This Notebook will explore and cluster the neighborhoods in Toronto**

**Scraping data from Wikipedia Page**


In [2]:
import requests 
website_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

from bs4 import BeautifulSoup
soup = BeautifulSoup(website_url,'lxml')

In [5]:
My_table = soup.find('table',{'class':'wikitable sortable'})


In [6]:
Postcode = []
Borough = []
Neighbourhood = []

for row in My_table.findAll('tr'):
    cells = row.findAll('td')
    if len(cells)==3: #Only extract table body not heading
        Postcode.append(cells[0].find(text=True))
        Borough.append(cells[1].find(text=True))
        Neighbourhood.append(cells[2].find(text=True))

In [8]:
import pandas as pd
df=pd.DataFrame()
df['PostalCode']=Postcode
df['Borough']=Borough
df['Neighbourhood']=Neighbourhood
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
df1 = df[df.Borough != 'Not assigned']
df1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [11]:
df2 = df1.groupby("PostalCode").agg(lambda x:','.join(set(x)))
df2.head()

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Rouge Hill,Highland Creek,Port Union"
M1E,Scarborough,"Morningside,West Hill,Guildwood\n"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae\n


In [12]:
df2.shape

(103, 2)

**Importing Geospatial data into dataframe (data)**

In [13]:
data = pd.read_csv('http://cocl.us/Geospatial_data', delimiter = ',')
data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
data_merge = pd.merge(left=df2,right=data, left_on='PostalCode', right_on='Postal Code')
data_merge.head()

Unnamed: 0,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,Scarborough,"Malvern,Rouge",M1B,43.806686,-79.194353
1,Scarborough,"Rouge Hill,Highland Creek,Port Union",M1C,43.784535,-79.160497
2,Scarborough,"Morningside,West Hill,Guildwood\n",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae\n,M1H,43.773136,-79.239476


In [15]:
data_merge = data_merge[['Postal Code', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']]
data_merge.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Highland Creek,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,West Hill,Guildwood\n",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476


**Segmenting Borough that contains the word Toronto**

In [19]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans


In [20]:
conda install -c conda-forge folium 

Collecting package metadata: done
Solving environment: done


  current version: 4.6.11
  latest version: 4.6.14

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /Users/Mohideen/anaconda3

  added / updated specs:
    - folium


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-3.0.1               |           py37_0         726 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    certifi-2019.3.9           |           py37_0         149 KB  conda-forge
    conda-4.6.14               |           py37_0         2.1 MB  conda-forge
    folium-0.9.0               |             py_0          59 KB  conda-forge
    openssl-1.1.1b             |       h1de35cc_1         3.5 MB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    -

In [21]:
import folium

In [22]:
data_toronto = data_merge[data_merge['Borough'].str.contains('Toronto')].reset_index(drop=True)
data_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West\n,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West\n,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District\n,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [25]:
import numpy as np
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=12)

X = data_toronto['Latitude']
Y = data_toronto['Longitude']
Z = np.stack((X, Y), axis=1)

kmeans = KMeans(n_clusters=4, random_state=0).fit(Z)

clusters = kmeans.labels_
colors = ['red', 'green', 'blue', 'yellow']
data_toronto['Cluster'] = clusters

for latitude, longitude, borough, cluster in zip(data_toronto['Latitude'], data_toronto['Longitude'], data_toronto['Borough'], data_toronto['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)  

toronto_map