# Segmenting and Clustering Neighborhoods in Toronto

## Import Libraries

In [1]:
#high-level mathematical functions
import numpy as np 
#data manipulation and analysis
import pandas as pd

In [3]:
# URL of the Wikipedia page listing Canadian postal codes that start with "M";
# the table on this page is the data source read by pd.read_html in the next cell.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [4]:
# Scrape the postal-code table from the Wikipedia page. `match` restricts the
# result to tables containing the text "Postal code"; unpacking into a
# one-element target raises ValueError unless exactly one table matched.
# NOTE(review): skiprows=1 drops one row while parsing -- the preview below
# starts at M2A, so confirm this removes only the header and not a data row.
[toronto_df] = pd.read_html(wiki_url, match="Postal code", skiprows=1)

# Apply the three column names required by the assignment.
toronto_df.columns = ["PostalCode", "Borough", "Neighborhood"]
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M2A,Not assigned,
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Regent Park / Harbourfront
4,M6A,North York,Lawrence Manor / Lawrence Heights


In [5]:
# Keep only the postal codes that have a borough: every row whose Borough is
# the literal string "Not assigned" is dropped.
has_borough = toronto_df["Borough"].ne("Not assigned")
toronto_df = toronto_df.loc[has_borough]
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Regent Park / Harbourfront
4,M6A,North York,Lawrence Manor / Lawrence Heights
5,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [6]:
# Collapse the table to one row per postal code: keep the first borough seen in
# each group and join the group's neighbourhood strings into a single string.
def borough_fn(b):
    """Return the first borough value of a postal-code group."""
    return b.iloc[0]


def neighborhood_fn(b):
    """Join the neighbourhood strings of a postal-code group with ', '."""
    return ", ".join(b)


all_fns = {"Borough": borough_fn, "Neighborhood": neighborhood_fn}
temp_toronto_df = toronto_df.groupby("PostalCode").agg(all_fns)
temp_toronto_df.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,Malvern / Rouge
M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
M1E,Scarborough,Guildwood / Morningside / West Hill
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [9]:
# Turn the PostalCode index back into an ordinary column, then select the
# columns in the same order as the pre-grouping frame.
toronto_df = temp_toronto_df.reset_index().loc[:, toronto_df.columns]
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
# If a row has a borough but its neighbourhood is "Not assigned", use the
# borough name as the neighbourhood.
#
# This is done with a boolean mask rather than by assigning to the `row`
# yielded by DataFrame.iterrows(): that row can be a copy, so writing to it is
# not guaranteed to change the DataFrame.
unassigned = toronto_df["Neighborhood"] == "Not assigned"

# Report each replacement, one line per affected row (same message as before).
for j, borough_each in toronto_df.loc[unassigned, "Borough"].items():
    print("Replace \"Not assigned\" => %s in row %i" % (borough_each, j))

# Series.mask replaces the values where the condition is True and leaves the
# rest untouched; assign() builds a new frame instead of mutating in place.
toronto_df = toronto_df.assign(
    Neighborhood=toronto_df["Neighborhood"].mask(unassigned, toronto_df["Borough"])
)

In [11]:
# More than one neighbourhood can exist in one postal code area. The scraped
# table separates them with "/" (e.g. "Regent Park / Harbourfront"); the
# assignment asks for a comma-separated list, so every "/" becomes ",".
#
# A vectorised string replace is used instead of assigning to the `row` yielded
# by DataFrame.iterrows(): that row can be a copy, so writing to it is not
# guaranteed to change the DataFrame. regex=False makes "/" a literal match.
toronto_df = toronto_df.assign(
    Neighborhood=toronto_df["Neighborhood"].str.replace("/", ",", regex=False)
)

In [12]:
# Display the first ten rows to inspect the Neighborhood column after the
# separator replacement.
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


In [13]:
# Final check: .shape is an attribute (not a method) holding (rows, columns);
# displaying it shows how many rows the cleaned dataframe has.
toronto_df.shape

(103, 3)