# Segmenting and Clustering Neighborhoods in Toronto - Notebook 01

## Importing the Pandas library:

In [None]:
import pandas as pd

## Reading the data from the URL and selecting the first returned table:

In [None]:
# Wikipedia page holding the postal code table used in this notebook.
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per HTML table it finds on the page;
# only the first returned table is kept.
page_tables = pd.read_html(postal_codes_url)
postal_df = page_tables[0]

## Naming the columns according to what is expected:

In [None]:
# Column labels used throughout the rest of the notebook, in table order.
expected_columns = ['PostalCode', 'Borough', 'Neighborhood']
postal_df.columns = expected_columns

## A first look at our data:

In [None]:
# Preview the first ten rows of the freshly loaded table.
postal_df.head(n=10)

## Removing the rows with invalid borough entries: 

In [None]:
# Keep only the rows whose borough is known ('Not assigned' marks a missing
# borough). Chaining reset_index(drop=True) renumbers the rows 0..n-1 and
# returns a new, independent frame, so later in-place assignments do not act
# on a filtered slice of the original table (which pandas may flag with a
# SettingWithCopyWarning).
postal_df = postal_df[postal_df['Borough'] != 'Not assigned'].reset_index(drop=True)

## Correcting the invalid neighborhood entries:

In [None]:
# Rows whose neighborhood is the 'Not assigned' placeholder take the borough
# name instead. A boolean mask does this in a single vectorized assignment
# and, unlike a loop over range(len(postal_df)), does not require the index
# labels to be exactly 0..n-1.
needs_fix = postal_df['Neighborhood'] == 'Not assigned'
postal_df.loc[needs_fix, 'Neighborhood'] = postal_df.loc[needs_fix, 'Borough']

## Looking at our dataframe with the corrected values for boroughs and neighborhoods:

In [None]:
# Preview the first ten rows after the borough/neighborhood clean-up.
postal_df.head(n=10)

## Creating a new dataframe to contain the grouped neighborhood entries:

In [None]:
# Collapse postal_df to one row per (PostalCode, Borough) pair and join the
# neighborhoods of each pair into a single ', '-separated string.
#
# - sort=False keeps the pairs in order of first appearance, matching what
#   drop_duplicates() on the same two columns would produce.
# - Rows inside each group keep their original relative order, so the joined
#   names appear in the same order as in postal_df.
# - reset_index() turns the two grouping keys back into regular columns,
#   giving the columns PostalCode, Borough, Neighborhood with a 0..n-1 index.
#
# This replaces a nested row-by-row scan (quadratic in the number of rows)
# with a single grouped aggregation.
output_df = (
    postal_df
    .groupby(['PostalCode', 'Borough'], sort=False)['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

## Looking at the first ten results of our operations:

In [None]:
# Preview the first ten rows of the grouped result.
output_df.head(n=10)

## Checking the number of rows of our dataframe with grouped neighborhoods: 

In [None]:
# Report how many rows the grouped dataframe contains.
row_count = len(output_df)
print("Number of rows: {}".format(row_count))