## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [23]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed further down.
res = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
res.raise_for_status()

In [204]:
# Parse the raw response bytes into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=res.content, features='lxml')

In [205]:
# Take the first <table> element on the page; limit=1 stops the search after
# the first match, and indexing [0] still raises IndexError if none exists.
table = soup.find_all('table', limit=1)[0]

In [206]:
# Convert the <table> element into a DataFrame. The markup is wrapped in
# StringIO because pandas >= 2.1 deprecates passing a literal HTML string
# to read_html; file-like input is accepted by older versions as well.
df = pd.read_html(StringIO(str(table)))[0]

In [207]:
# Preview the first rows of the freshly parsed table.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [208]:
# Keep only the rows whose borough is not the 'Not assigned' placeholder.
df = df.loc[df['Borough'].ne('Not assigned')]

In [209]:
# Renumber the rows from 0; drop=True discards the old index rather than
# keeping it as an extra column.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### If a cell has a borough but its neighborhood is "Not assigned", the neighborhood is set to the borough name.

In [210]:
# Where a row has a real borough but a 'Not assigned' neighbourhood, copy the
# borough name into the neighbourhood. A single boolean-mask assignment replaces
# the row-by-row iterrows() loop (which also combined Python bools with `&`).
needs_borough_name = (df['Borough'] != 'Not assigned') & (df['Neighbourhood'] == 'Not assigned')
df.loc[needs_borough_name, 'Neighbourhood'] = df.loc[needs_borough_name, 'Borough']

#### More than one neighborhood can exist in one postal code area. Rows that share a postal code will be combined into one row, with the neighborhoods separated by commas.

In [211]:
# Collapse rows that share a postcode and borough into a single row whose
# 'Neighbourhood' cell is the list of every neighbourhood in that group.
# agg(list) builds the lists directly, replacing the tuple -> applymap(list)
# round trip (DataFrame.applymap is deprecated since pandas 2.1).
df = df.groupby(['Postcode', 'Borough']).agg(list)
df = df.reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [212]:
# Turn each list of neighbourhoods into one comma-separated string.
# ', '.join replaces the manual "append name + ', ' then trim the last two
# characters" loop; it gives the same text, including '' for an empty list,
# without writing to the frame one cell at a time.
df['Neighbourhood2'] = df['Neighbourhood'].apply(', '.join)

In [213]:
# Drop the original 'Neighbourhood' column and give 'Neighbourhood2' its name,
# leaving the columns Postcode, Borough, Neighbourhood.
df = (
    df
    .drop(columns='Neighbourhood')
    .rename(columns={'Neighbourhood2': 'Neighbourhood'})
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [214]:
# Dimensions of the final table as (rows, columns).
df.shape

(103, 3)