# Segmenting and Clustering Neighborhoods in Toronto
## Data Cleaning

In [2]:
import numpy as np
import pandas as pd

In [3]:
# import data from the csv file scraped from the web page
df = pd.read_csv("postcode_canada.csv")
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M6C,York,Humewood-Cedarvale
1,M6E,York,Caledonia-Fairbanks
2,M6M,York,Del Ray
3,M6M,York,Keelesdale
4,M6M,York,Mount Dennis


After importing the data, we drop cells with  a borough that is Not assigned.

In [4]:
print(df.shape)

# drop the data which "Borough" is "Not assigned"
df = df[df["Borough"]!="Not assigned"]
print(df.shape)
df.head()

(287, 3)
(210, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M6C,York,Humewood-Cedarvale
1,M6E,York,Caledonia-Fairbanks
2,M6M,York,Del Ray
3,M6M,York,Keelesdale
4,M6M,York,Mount Dennis


Then, we will combine the neighborhoods with the same postcode into a single cell.

In [53]:
# get the data where the postcode has more than 1 neighborhood value.
df_pc = pd.DataFrame(df["Postcode"].value_counts())
df_pc = df_pc[df_pc["Postcode"]>1]

# copy the original df
df_new = df

for postcode, count in df_pc.iterrows():
    
    # collect all neighborhood names under the same postcode to a string
    df1 = df[df["Postcode"]==postcode]
    borough = df[df["Postcode"]== postcode].iloc[0, 1]
    
    neighborhood_string = ""
    for neighbor in df1["Neighborhood"]:
        neighborhood_string += neighbor
        neighborhood_string += ", "
    
    neighborhood_string = neighborhood_string[:-2] # remove the last 2 elements, which is ", "
    
    # make a temporary data frame to append later
    df_append = pd.DataFrame([[postcode, borough, neighborhood_string]],
                             columns=['Postcode', 'Borough', 'Neighborhood'])
    
    # drop all the cells with the postcode from the data frame
    df_new = df_new[df_new["Postcode"] != postcode]
    
    # add the processed cell of the postcode
    df_new = pd.concat([df_new, df_append])
    

In [54]:
# make postcode the index
df_new.reset_index(inplace=True, drop=True)
df_new.set_index(["Postcode"], inplace = True, drop=True)
df_new

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M6C,York,Humewood-Cedarvale
M6E,York,Caledonia-Fairbanks
M9N,York,Weston
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M4X,Downtown Toronto,"Cabbagetown, St. James Town"
M4T,Central Toronto,"Moore Park, Summerhill East"
M8W,Etobicoke,"Alderwood, Long Branch"
M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre"


Next, we will deal with the cell with a "Not assigned" neighborhood and assign its borough value to it.

In [55]:
# data frame of Not assigned Neighborhood
df_nn = df_new[df_new["Neighborhood"]=="Not assigned"]
df_nn

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M9A,Queen's Park,Not assigned


In [56]:
# assign Borough value to Neighborhood
boroughs = df_nn.loc[:,"Borough"]
df_nn.loc[:,"Neighborhood"] = boroughs

# drop the original cell in data frame, and append the new result
df_new.drop("M9A", inplace=True)
df_new = pd.concat([df_new, df_nn])

After appending the result into the data frame, we can see the cell "M9A" is at the bottom and with a name of Neighborhood. 

In [57]:
df_new.tail()

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M4T,Central Toronto,"Moore Park, Summerhill East"
M8W,Etobicoke,"Alderwood, Long Branch"
M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre"
M6H,West Toronto,"Dovercourt Village, Dufferin"
M9A,Queen's Park,Queen's Park


In [58]:
df_new.to_csv("postcode_CA_cleaned.csv", index=True)