# Segmenting and Clustering Neighborhoods in Toronto

## Import statements

In [31]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

## Getting the table from Wikipedia and loading it into a DataFrame

In [32]:
# Download the Wikipedia list of Canadian postal codes starting with "M" and
# parse the tables whose class attribute is "wikitable sortable", using each
# table's first row as the header.
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(
    io=postal_codes_url,
    flavor='bs4',
    header=0,
    attrs={'class': 'wikitable sortable'},
)
# Keep the first matching table.
df = wiki_tables[0]

#### A look at the data

In [33]:
# Preview the first eight rows of the scraped table.
df.head(n=8)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor


#### Ignoring rows whose borough is 'Not assigned'

In [34]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df.head(n=8)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue


#### A 'Not assigned' neighbourhood takes the name of its borough (check the row with index 8)

In [35]:
# Wherever Neighbourhood is 'Not assigned', use the row's Borough instead.
# This is done column-wise with Series.mask, so columns are addressed by name
# rather than by position and no per-row Series objects are built.
neighbourhood_missing = df['Neighbourhood'].eq('Not assigned')
df = df.assign(
    Neighbourhood=df['Neighbourhood'].mask(neighbourhood_missing, df['Borough'])
)
df.head(8)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue


#### Shape

In [36]:
# Show the (rows, columns) size of the table after the cleaning steps above.
df.shape


(212, 3)

#### Loading the latitude and longitude data into `locations`

In [37]:
# Read the postal-code coordinates from the CSV file and preview the first rows.
coordinates_csv = 'labs/DP0701EN/Geospatial_Coordinates.csv'
locations = pd.read_csv(coordinates_csv)
locations.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Joining the postcode DataFrame with the `locations` DataFrame into `neighborhoods`

In [38]:
# Look up each row's Postcode in the coordinates table (re-indexed by
# 'Postal Code') so its Latitude and Longitude are attached to the row.
coords_by_postal_code = locations.set_index('Postal Code')
neighborhoods = df.join(coords_by_postal_code, on='Postcode')
neighborhoods.head(n=8)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
5,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
6,M6A,North York,Lawrence Heights,43.718518,-79.464763
7,M6A,North York,Lawrence Manor,43.718518,-79.464763
8,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
10,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
