# Segmenting and Clustering Neighbourhoods in Toronto

## Import required libraries

In [2]:
# required for pulling down web data (standard library)
# urllib.request is a submodule and must be imported explicitly;
# a bare `import urllib` does not guarantee it is loaded
import urllib
import urllib.request

# general imports
import numpy as np
import pandas as pd

# required for parsing an HTML document
from bs4 import BeautifulSoup

## Pull down a copy of the data

In [3]:
# obtain a copy of the web page
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read the whole body inside a context manager so the HTTP connection is
# closed as soon as the download finishes; `page` holds the raw HTML bytes
with urllib.request.urlopen(url) as response:
    page = response.read()

## Parse the downloaded data

In [4]:
# parse the downloaded HTML page
soup = BeautifulSoup(page, 'html.parser')

# get a reference to the correct table; find() returns None when nothing
# matches, so fail here with a clear message rather than with an obscure
# AttributeError in a later cell
table = soup.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError("no 'wikitable sortable' table found in the downloaded page")

## Extract the column names

In [5]:
# obtain the column names from the table's header cells, stripping off whitespace
# get_text() returns '' for a header cell without text, whereas
# find(text=True) returns None there and the following .strip() would fail
columnNames = [header.get_text().strip() for header in table.find_all('th')]

# let's see what we've got
print(columnNames)

['Postcode', 'Borough', 'Neighbourhood']


## Extract the data into a DataFrame

In [6]:
# rows that pass the filters below, one list of cell strings per table row
rowData = []

# pull out each row of data
for row in table.find_all('tr'):

    # text of every data cell in this row, stripped of surrounding whitespace;
    # get_text() yields '' for an empty cell (find(text=True) would return
    # None and break on .strip()) and keeps the full text of cells that
    # contain more than one text node
    cellData = [cell.get_text().strip() for cell in row.find_all('td')]

    # only keep rows that hold exactly one cell per column
    # (a row without any <td> cells yields an empty list and is skipped)
    if len(cellData) != len(columnNames):
        continue

    # exclude rows whose borough (second cell) is 'Not assigned'
    if cellData[1] == "Not assigned":
        continue

    rowData.append(cellData)

# once we have all the rows, create a data frame with this data; the list of
# rows is passed directly, so no intermediate numpy array is needed and an
# empty result still produces a frame with the expected columns
df = pd.DataFrame(rowData, columns=columnNames)

## Replace 'Not assigned' in Neighbourhoods

In [None]:
# NOTE: this cell works on `df` (the un-grouped rows) so that the notebook runs
# top to bottom; `grouped` is only created in the "Combine Neighbourhoods" cell
# further down, which then picks up the corrected neighbourhood values

# rows whose neighbourhood is 'Not assigned'
unassigned = df['Neighbourhood'] == 'Not assigned'

# validate we have a 'Not assigned' neighbourhood
print( df.loc[unassigned] )
print()

# when the neighbourhood is 'Not assigned', replace with the borough
# (np.where is used directly: the pd.np alias was deprecated in pandas 1.0
# and is no longer available in current pandas releases)
df['Neighbourhood'] = np.where(unassigned, df['Borough'], df['Neighbourhood'])

# validate we do NOT have a 'Not assigned' neighbourhood
print( df.loc[df['Neighbourhood'] == 'Not assigned'] )
print()

## Combine Neighbourhoods

In [17]:
# group the data by post code (and borough), combining the Neighbourhood
# values of each group into one comma-separated string
grouped = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(','.join).reset_index()

# postcodes that appear in the given screenshot - not required, just for fun
screenshotPostcodes = ['M5G', 'M2H', 'M4B', 'M1J', 'M4G', 'M4M',
                       'M1R', 'M9V', 'M9L', 'M5V', 'M1B', 'M5A']

# show only those postcodes; isin() selects the same rows as a chain of
# `==` comparisons joined with `|`, and the rows keep the frame's own order
grouped.loc[grouped.Postcode.isin(screenshotPostcodes)]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
5,M1J,Scarborough,Scarborough Village
11,M1R,Scarborough,"Maryvale,Wexford"
17,M2H,North York,Hillcrest Village
35,M4B,East York,"Woodbine Gardens,Parkview Hill"
38,M4G,East York,Leaside
43,M4M,East Toronto,Studio District
53,M5A,Downtown Toronto,"Harbourfront,Regent Park"
57,M5G,Downtown Toronto,Central Bay Street
68,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf..."


In [12]:
# (number of rows, number of columns) of the grouped frame
grouped.shape

(103, 3)