# Segmenting and Clustering Neighborhoods in Toronto

## Part 1 - Create Notebook
Done — this is the notebook I created.

## Part 2 - Create Dataframe from Wikipedia Page Containing Postal Codes in Canada Starting with M

In [56]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [102]:
# Download the Wikipedia page and parse its HTML with BeautifulSoup.
headers = requests.utils.default_headers()
# NOTE(review): 'Sfari' looks like a typo for 'Safari' -- confirm before
# changing the string that is actually sent to the server.
headers.update({ 'User-Agent': 'Chrome/6.0.472.63 Sfari/534.3a'})

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# `headers` must be passed by keyword: the second positional argument of
# requests.get() is `params`, so passing the dict positionally would append it
# to the URL as a query string instead of sending it as HTTP headers.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get(url, headers=headers, timeout=30)

# Fail here with a clear HTTP error instead of parsing an error page and
# breaking later when the expected table cannot be found.
req.raise_for_status()

soup = BeautifulSoup(req.content, 'html.parser')

In [69]:
# Locate the table whose class attribute is 'wikitable sortable' and gather its <tr> rows
tables = soup.find('table', attrs={'class': 'wikitable sortable'})
table_rows = tables.find_all('tr')

# Collect one list of stripped cell texts per table row.
# A row without any <td> cells contributes an empty list.
data = []
for row in table_rows:
    cells = row.find_all('td')
    data.append([cell.text.strip() for cell in cells])

# Build the dataframe with the three expected columns, then discard every row
# that contains a null (rows that produced an empty list end up all-null).
df_tdot = pd.DataFrame(
    data,
    columns=['Postal Code', 'Borough', 'Neighborhood'],
).dropna()
df_tdot.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront


## Part 3 - Restructure Dataframe

In [97]:
# 1. Keep only rows whose Borough is assigned (drop 'Not assigned').
# 2. Collapse rows that share a Postal Code into one row. The original cell
#    called groupby() but discarded the result, so nothing was combined; here
#    the aggregation is actually applied. Neighborhoods of a shared postal code
#    are joined with ' / ', the same separator the scraped cells already use,
#    so the single replacement further down treats them uniformly.
#    sort=False keeps the order in which postal codes first appear, and
#    as_index=False returns a fresh 0..n-1 index, so no reset_index is needed.
df_tdot = (
    df_tdot[df_tdot['Borough'] != 'Not assigned']
    .groupby('Postal Code', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighborhood': ' / '.join})
)

# The '/' between neighborhoods comes from the scraped cell text itself (it is
# already visible in the raw table), not from groupby. Replace it with a comma.
# regex=False makes the literal replacement explicit across pandas versions.
df_tdot['Neighborhood'] = df_tdot['Neighborhood'].str.replace('/', ',', regex=False)

# If a row has a borough but no usable neighborhood, fall back to the borough.
# An empty string is treated like 'Not assigned', because the scraped table
# shows unassigned neighborhoods as empty cells.
mask = df_tdot['Neighborhood'].str.strip().isin(['', 'Not assigned'])
df_tdot.loc[mask, 'Neighborhood'] = df_tdot.loc[mask, 'Borough']

df_tdot.head()


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [100]:
# Display the (rows, columns) shape of df_tdot after the pre-processing steps above
df_tdot.shape

(103, 3)