## Exploring and clustering the neighborhoods in Toronto - Part 1

In [1]:
import pandas as pd
import numpy as np

In [2]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


1. Importing the table from Wikipedia

In [3]:
# pd.read_html returns a list with one DataFrame per <table> parsed from the
# page. The postal-code table is taken to be the first entry (the same table
# the previous `pd.DataFrame(tables)[0][0]` expression resolved to), so index
# the list directly instead of wrapping it in another DataFrame.
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
df = tables[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


2. Removing rows with unassigned Borough

In [4]:
# Keep only the postal codes whose Borough is not the 'Not assigned'
# placeholder, then renumber the surviving rows 0..n-1.
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


3. Separating neighborhoods that share a postal code into individual rows to ease the Foursquare venue search

In [5]:
# Give every neighborhood its own row. A postal code that lists several
# comma-separated neighborhoods keeps its first name in its original row; the
# remaining names become copies of that row placed after all original rows
# (the same row order the earlier append-based loop produced).
#
# This replaces a row-by-row loop built on DataFrame.append (removed in
# pandas 2.0) and on chained assignment (df.Neighborhood[i] = ...), which
# pandas does not guarantee to write through to the frame.

# Work from a 0..n-1 index so duplicated labels after explode() can only come
# from the explode itself.
source = df.reset_index(drop=True)

split_names = source['Neighborhood'].str.split(',')
exploded = source.assign(Neighborhood=split_names).explode('Neighborhood')
# Names after a comma carry a leading space ("A, B" -> ["A", " B"]).
exploded['Neighborhood'] = exploded['Neighborhood'].str.strip()

# explode() repeats the source row's index label for each name, so a repeated
# label marks the 2nd, 3rd, ... name of a postal code.
is_extra_name = exploded.index.duplicated(keep='first')
df = pd.concat(
    [exploded[~is_extra_name], exploded[is_extra_name]],
    ignore_index=True,
)

4. Finding the number of boroughs and neighborhoods in the dataframe

In [6]:
# Flatten every comma-separated entry of the Neighborhood column into one list.
# Repeated names are kept, so len(unique) counts entries rather than distinct
# neighborhood names.
unique = [
    name
    for entry in df.Neighborhood
    for name in entry.split(",")
]

borough_count = df['Borough'].unique().size
summary = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary.format(borough_count, len(unique)))

The dataframe has 10 boroughs and 209 neighborhoods.


In [7]:
# (rows, columns) of the frame, displayed as the cell's output.
df.shape

(209, 3)

In [8]:
# Write the frame to a CSV file in the working directory. index=True is the
# to_csv default, spelled out here: the row index is written as the first,
# unnamed column of the file.
df.to_csv('Postal_Codes_of_Canada.csv', index=True)