# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from io import StringIO

import pandas as pd
import requests
#!conda install -c conda-forge lxml --yes
import lxml

# Part 1: Create the DataFrame

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M";
# this is the page fetched and parsed in the following cells.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of letting a later cell try to parse an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()

## 1. Scraping Toronto data with the help of Pandas library

In [4]:
# Parse every HTML table on the page into a list of DataFrames, using the
# first row of each table as its header. The markup is wrapped in StringIO
# because recent pandas versions deprecate passing a literal HTML string.
dataframes = pd.read_html(StringIO(r.text), header=0)

## 2. Get the DataFrame with the Toronto neighborhood data

In [5]:
# The first table parsed from the page is the one with the
# Postal Code / Borough / Neighborhood columns.
toronto = dataframes[0]

In [6]:
# Preview the first rows of the scraped table.
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## 3. Drop all rows whose Borough is "Not assigned"; any "Not assigned" Neighborhood will be renamed after its corresponding Borough

In [7]:
# Show the rows whose borough is still marked as 'Not assigned'.
toronto[toronto['Borough'] == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
7,M8A,Not assigned,Not assigned
10,M2B,Not assigned,Not assigned
15,M7B,Not assigned,Not assigned
...,...,...,...
174,M4Z,Not assigned,Not assigned
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned


In [8]:
# Keep only the rows with an assigned borough. A filtered copy is bound to the
# same name instead of calling drop(..., inplace=True), so the scraped table
# held in `dataframes` is left untouched. Row labels are preserved.
toronto = toronto.loc[toronto.Borough != 'Not assigned'].copy()

In [9]:
# Sanity check: after dropping the not assigned boroughs, no row with a
# 'Not assigned' neighborhood is left (the result below is empty).
toronto[toronto['Neighborhood'] == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighborhood


In [10]:
# Preview the table after removing the rows with unassigned boroughs.
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [11]:
# (rows, columns) remaining after the unassigned boroughs were dropped.
toronto.shape

(103, 3)

## 4. Combine Neighborhoods with the same Postal Code

In [12]:
def new_neighborhoods(data):
    """Combine neighborhoods that share a postal code into a single row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing the 'Postal Code', 'Borough' and 'Neighborhood'
        columns. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct postal code, in order of first appearance, with
        the columns 'Postal Code', 'Borough' and 'Neighborhood'. 'Borough' is
        taken from the first row seen for that code. 'Neighborhood' is the
        original value when the code occurs once; otherwise it is every
        neighborhood of that code joined with ', '.
    """
    # A dict keeps insertion order (first appearance of each code) and gives
    # constant-time lookup, replacing the repeated list.index() scans.
    grouped = {}
    for c, b, n in zip(data['Postal Code'], data['Borough'], data['Neighborhood']):
        if c not in grouped:
            grouped[c] = (b, [n])
        else:
            # Same postal code seen again: collect the extra neighborhood.
            grouped[c][1].append(n)

    def _merge(names):
        # A single value is passed through untouched (it may not be a string);
        # several values are joined so that none of them is lost.
        if len(names) == 1:
            return names[0]
        return ', '.join(map(str, names))

    return pd.DataFrame({
        'Postal Code': list(grouped),
        'Borough': [borough for borough, _ in grouped.values()],
        'Neighborhood': [_merge(names) for _, names in grouped.values()],
    })
        
        
        

In [13]:
# Build the frame with one row per postal code from the cleaned table,
# then preview its first rows.
toronto_df = new_neighborhoods(toronto)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## 5. Print the shape of the DataFrame

In [14]:
# (rows, columns) of the frame returned by new_neighborhoods.
toronto_df.shape

(103, 3)