# Clustering Neighbourhoods

The aim of this notebook is to extract data on Toronto neighbourhoods from Wikipedia and clean it up.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Scrape the page; read_html returns a list with one DataFrame per HTML table found
tables = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)

# The first table in that list is the one we work with
df = tables[0]

In [3]:
# Flag the rows whose Borough has the placeholder value 'Not assigned'
not_assigned = df['Borough'] == 'Not assigned'

# Keep every other row. Filtering with a mask and rebinding `df` (rather than
# drop(..., inplace=True)) leaves the originally scraped table unmodified, and
# .copy() makes the result an independent frame so that assigning columns to it
# later does not trigger SettingWithCopyWarning.
df = df[~not_assigned].copy()

In [4]:
# Look for rows that have a borough but are missing a neighbourhood name
missing_neighborhood = df["Neighborhood"].isna()
df.loc[missing_neighborhood]

# The result is empty, so no neighbourhood needs to be filled in with its borough name

Unnamed: 0,Postal code,Borough,Neighborhood


Since the list contains only unique postcodes, with all neighborhoods relating to a particular postcode already provided in one row, the only thing needed to match the requested data format is to change the '/' separators to ',', which is done below.

In [5]:
# Replace each ' /' with ',' so that e.g. 'A / B' becomes 'A, B'.
# regex=False forces a literal (non-regex) replacement regardless of the
# default of the installed pandas version, and assign() returns a new frame
# instead of writing the column into the existing one.
df = df.assign(
    Neighborhood=df['Neighborhood'].str.replace(' /', ',', regex=False)
)

In [6]:
# Renumber the rows from 0 (the old index is discarded), then display the frame
df = df.reset_index(drop=True)
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [8]:
# (number of rows, number of columns) of the cleaned frame
df.shape

(103, 3)