# Segmenting and Clustering Neighborhoods in Toronto

### Importing python libraries and dependencies

In [1]:
!pip install bs4
!pip install folium
!pip install html5lib
!pip install geopy

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 12.0 MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1273 sha256=ff1c9d27133a4b113aa1c36599096c73a603dbb2fa87776f31951e5d7816e471
  Stored in directory: /home/jovyan/.cache/pip/wheels/19/f5/6d/a97dd4f22376d4472d5f4c76c7646876052ff3166b3cf71050
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.9.3 bs4-0.0.1 soupsieve-2.2.1
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 5.1 MB/s eta 0:00:011
Collecting branca>=0.3.0
  Downloading br

In [2]:
import pandas as pd
import requests
import numpy as np
import folium
import requests 
import matplotlib.cm as cm
import matplotlib.colors as colors
import json
import xml
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

print("All Required Libraries Imported!")

All Required Libraries Imported!


### Web scraping and data extraction

In [3]:
# Download the Wikipedia page listing the "M" postal codes and parse it.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the table.
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html.parser')

In [4]:
# Take the first <table> element of the page; `find` returns a single element
# (or None when the page has no table), not a list of tables.
tables = soup.find('table')
if tables is None:
    raise ValueError('No <table> element found in the downloaded page')

In [5]:
# Four independent, initially empty accumulators used while parsing the table.
postal_codes, boros, neiros, titles = ([] for _ in range(4))

In [6]:
# Walk every cell of the table and collect one (postal code, borough,
# neighborhood) triple per cell.
# Presumably each cell holds the postal code in a <b> tag, the borough as its
# first link and the neighborhoods as the remaining links -- this is how the
# fields are read below; verify against the page markup.

# Start from empty lists so re-running this cell does not append duplicates.
postal_codes, boros, neiros = [], [], []

for row in tables.find_all('tr'):
    for cell in row.find_all('td'):
        code_tag = cell.find('b')
        if code_tag is None:
            # No postal code in this cell: skip it entirely so the three
            # lists always stay the same length.
            continue

        anchors = cell.find_all('a')
        if not anchors:
            # No links at all: borough and neighborhood are missing.
            # These rows are removed later by dropna().
            boro = np.nan
            neiro = np.nan
        else:
            boro = anchors[0].text
            if len(anchors) == 1:
                # Only a borough link: the borough doubles as the neighborhood.
                neiro = boro
            else:
                # Remaining links are the neighborhoods; fall back to the link
                # text when a link carries no title attribute.
                neiro = [a.get('title', a.text) for a in anchors[1:]]

        # NOTE(review): Neighborhood is a plain string for single-link cells
        # and a list otherwise; kept as-is so later cells see the same values.
        postal_codes.append(code_tag.text)
        boros.append(boro)
        neiros.append(neiro)

### Creating dataframe

In [7]:
# Empty frame declaring the three columns of the postal-code table.
column_names = ['Postalcode','Borough','Neighborhood']
df = pd.DataFrame(data=None, index=None, columns=column_names)

In [8]:
# Build the frame from the parsed lists in a single step.
# Constructing it here (instead of assigning columns onto the existing `df`)
# keeps the cell re-runnable: assigning onto an already filtered `df` raises a
# length-mismatch error.
df = pd.DataFrame({
    'Postalcode': postal_codes,
    'Borough': boros,
    'Neighborhood': neiros,
})

### Cleaning dataframe

In [9]:
# Remove every row that contains a missing value, then renumber rows from 0.
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True)

In [10]:
# Preview up to the first 50 rows of the frame after missing rows were dropped.
df.head(50)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,[Parkwoods]
1,M4A,North York,[Victoria Village]
2,M5A,Downtown Toronto,"[Regent Park, Harbourfront, Toronto]"
3,M6A,North York,"[Lawrence Manor, Lawrence Heights]"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,[Islington Avenue]
6,M1B,Scarborough,"[Malvern, Toronto, Rouge, Toronto]"
7,M3B,North York,[Don Mills]
8,M4B,East York,"[Parkview Hill, Woodbine Gardens]"
9,M5B,Downtown Toronto,"[Garden District, Toronto, Ryerson University]"


### Dataframe shape

In [11]:
# (number of rows, number of columns) of the cleaned frame.
df.shape

(101, 3)

### Load the geospatial coordinates CSV

In [12]:
# Read the coordinates file; the path is relative, so it is resolved against
# the notebook's working directory.
# The merge cell that follows joins on 'Postalcode', so the file must provide
# a column with exactly that name.
geo_loc = pd.read_csv('Geospatial_Coordinates.csv')

### Merging the dataframes

In [13]:
# Inner join: keep only postal codes present in both the scraped table and the
# coordinates file.
final_df = df.merge(geo_loc, on='Postalcode', how='inner')

In [26]:
# Preview up to the first 50 rows of the merged frame.
final_df.head(50)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,[Parkwoods],43.753259,-79.329656
1,M4A,North York,[Victoria Village],43.725882,-79.315572
2,M5A,Downtown Toronto,"[Regent Park, Harbourfront, Toronto]",43.65426,-79.360636
3,M6A,North York,"[Lawrence Manor, Lawrence Heights]",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,[Islington Avenue],43.667856,-79.532242
6,M1B,Scarborough,"[Malvern, Toronto, Rouge, Toronto]",43.806686,-79.194353
7,M3B,North York,[Don Mills],43.745906,-79.352188
8,M4B,East York,"[Parkview Hill, Woodbine Gardens]",43.706397,-79.309937
9,M5B,Downtown Toronto,"[Garden District, Toronto, Ryerson University]",43.657162,-79.378937


.

## Clusters Analysis

### Use geopy library to get the latitude and longitude values of Toronto

In [15]:
# Look up the coordinates of Toronto; they are used to centre the maps.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop here with a
# clear message instead of failing on the attribute access below.
if location is None:
    raise ValueError('No geocoding result for address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


### Create a map of Toronto with neighborhoods superimposed on top

In [16]:
# Create a map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of final_df.
for lat, lng, borough, neighborhood in zip(final_df['Latitude'], final_df['Longitude'], final_df['Borough'], final_df['Neighborhood']):
    # Neighborhood is either a plain string or a list of names; join lists so
    # the popup shows readable text instead of a Python list repr.
    if isinstance(neighborhood, (list, tuple)):
        neighborhood = ', '.join(str(name) for name in neighborhood)
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium escape the label text.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

### Slicing dataframe

For illustration purposes, let's simplify the above map and segment and cluster only the boroughs with 'Toronto' in their name. To do that, we slice the merged dataframe and create a new dataframe.

In [30]:
# Keep only the rows whose borough name contains "Toronto", renumbered from 0.
is_toronto_borough = final_df['Borough'].str.contains("Toronto")
toronto = final_df.loc[is_toronto_borough].reset_index(drop=True)

In [31]:
# Preview up to the first 50 rows of the Toronto-only frame.
toronto.head(50)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"[Regent Park, Harbourfront, Toronto]",43.65426,-79.360636
1,M5B,Downtown Toronto,"[Garden District, Toronto, Ryerson University]",43.657162,-79.378937
2,M5C,Downtown Toronto,[St. James Town],43.651494,-79.375418
3,M5E,Downtown Toronto,Downtown Toronto,43.644771,-79.373306
4,M5G,Downtown Toronto,[Bay Street],43.657952,-79.387383
5,M6G,Downtown Toronto,Downtown Toronto,43.669542,-79.422564
6,M5H,Downtown Toronto,"[Richmond Street, King Street (Toronto)]",43.650571,-79.384568
7,M5J,Downtown Toronto,"[Harbourfront (Toronto), Union Station (Toront...",43.640816,-79.381752
8,M5K,Downtown Toronto,"[Toronto Dominion Centre, Design Exchange]",43.647177,-79.381576
9,M5L,Downtown Toronto,"[Commerce Court, Hotel Victoria (Toronto)]",43.648198,-79.379817


In [32]:
# Look up the coordinates of Toronto again to centre the borough-filtered map.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; report that explicitly rather
# than failing on `.latitude`.
if location is None:
    raise ValueError('No geocoding result for address: {}'.format(address))
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [33]:
# Create a map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of the Toronto-only frame.
for lat, lng, borough, neighborhood in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    # Neighborhood is either a plain string or a list of names; join lists so
    # the popup shows readable text instead of a Python list repr.
    if isinstance(neighborhood, (list, tuple)):
        neighborhood = ', '.join(str(name) for name in neighborhood)
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium escape the label text.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto