## Segmenting and Clustering Neighborhoods in Toronto

### Date: 23 March 2019
### Author: Min Jung Kang

In [1]:
import pandas as pd
import numpy as np
import requests
import lxml

In [3]:
from bs4 import BeautifulSoup

In [21]:
# Download the Wikipedia page listing Canadian postal codes starting with "M"
# and parse its HTML.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of letting an error
# page be parsed as though it contained the postal-code table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [22]:
# Walk the first <table> on the page and collect the first three <td> cells of
# every data row into three parallel lists.
PostalCode = []
Borough = []
Neighborhood = []

table = soup.find('table')
if table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError('No <table> element found in the downloaded page')

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Header rows contain only <th> cells (so no <td>); rows with fewer than
    # three <td> cells cannot supply all three fields, so skip them as well.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace from every field so later exact-match
    # comparisons are not defeated by a stray newline.
    postal_code, borough, neighborhood = (cell.text.strip() for cell in cells[:3])
    PostalCode.append(postal_code)
    Borough.append(borough)
    Neighborhood.append(neighborhood)

In [23]:
# Assemble the three scraped lists into a DataFrame, one column per list.
# The mapping is passed inline instead of being bound to a variable named
# `dict`, which would shadow the built-in type for the rest of the session.
df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [24]:
# Remove 'Not assigned' Boroughs
# .copy() makes the filtered result an independent frame, so modifying it
# afterwards neither raises SettingWithCopyWarning nor depends on whether
# pandas returned a view of the unfiltered data.
df = df[df.Borough != 'Not assigned'].copy()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [25]:
# Copy Borough values to 'Not assigned' Neighborhoods
# Build the corrected column and bind it with assign() rather than calling
# fillna(inplace=True) on df['Neighborhood']: that chained in-place call acts
# on a temporary object and does not update df under pandas Copy-on-Write.
neighborhood = df['Neighborhood']
# Treat both the literal placeholder and genuinely missing values as unassigned.
unassigned = neighborhood.isna() | (neighborhood == 'Not assigned')
df = df.assign(Neighborhood=neighborhood.mask(unassigned, df['Borough']))
df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [28]:
# Collapse rows that share a PostalCode/Borough pair into a single row whose
# Neighborhood is the comma-separated list of that pair's neighborhoods.
df = (
    df.groupby(by=['PostalCode', 'Borough'], as_index=False)
      .agg({'Neighborhood': ','.join})
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [29]:
# Show the dimensions of df as a (rows, columns) tuple.
df.shape

(103, 3)