# Segmenting and Clustering
Toronto neighbourhood data is read from a Wikipedia table with pandas `read_html` and cleaned into a pandas DataFrame.
We then attach coordinates to each postal code so the neighbourhoods can be segmented and clustered to develop insights.

In [202]:
import pandas as pd
import numpy as np

In [242]:
# Read the Toronto postal-code table (the first table on the Wikipedia page).
NOT_ASSIGNED = 'Not assigned'
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

# Fail early with a readable message if the page layout no longer matches.
missing_columns = {'Postcode', 'Borough', 'Neighbourhood'} - set(df.columns)
assert not missing_columns, 'Wikipedia table is missing columns: {}'.format(sorted(missing_columns))

# Build one row per (Postcode, Borough) with the list of neighbourhoods it covers:
#   1. drop rows whose borough is not assigned;
#   2. where a neighbourhood is not assigned but its borough is, use the borough
#      name as the neighbourhood (done before grouping so every entry is covered,
#      not only the first one in each list);
#   3. collect the neighbourhoods sharing a postal code into a list.
# Working on a fresh frame (no inplace edits on a slice of `df`) keeps the raw
# table intact and avoids SettingWithCopyWarning.
df2 = (
    df.loc[df['Borough'] != NOT_ASSIGNED, ['Postcode', 'Borough', 'Neighbourhood']]
    .assign(
        Neighbourhood=lambda rows: rows['Neighbourhood'].mask(
            rows['Neighbourhood'] == NOT_ASSIGNED, rows['Borough']
        )
    )
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(list)
    .reset_index()
)

print(df2.shape)
df2.head()

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


<h2>Downloading latitude/longitude coordinates for postal codes</h2>

In [268]:
# Load the latitude/longitude lookup and align its key column name with df2.
postal_df = pd.read_csv('http://cocl.us/Geospatial_data').rename(
    columns={'Postal Code': 'Postcode'}
)

# Left-join the coordinates onto the neighbourhood table by postal code;
# joining on the column keeps 'Postcode' as a regular first column.
coords_by_postcode = postal_df.set_index('Postcode')
df = df2.set_index('Postcode').join(coords_by_postcode).reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[Woburn],43.770992,-79.216917
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476
