# Segmenting and Clustering Neighborhoods in Toronto
---------
Applied Data Science Capstone > 
Week 3

In [137]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests


### Download the Wikipedia page and parse the column titles

In [266]:
# Download the Wikipedia list of postal codes starting with "M" and locate its table.
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'html.parser')

# find() returns the first <table> in the document, or None when there is none.
# NOTE(review): assumes the first table is the postal-code table — confirm if the page layout changes.
post_table = soup.find('table')
if post_table is None:
    raise RuntimeError('No <table> element found in the downloaded page')

# Column titles are the <th> cells of the table's first row, with newlines removed.
column_titles = [header.text.replace('\n', '') for header in post_table.tr.find_all('th')]
column_titles

['Postcode', 'Borough', 'Neighbourhood']

### Parse out the rows 

In [267]:
# Turn every body row of the table into a list of cell strings.
# Each <td> is read on its own rather than splitting the whole row's text on
# newlines and discarding empty pieces: that approach shifts the columns when a
# cell is blank or contains a line break.
data = []
rows = post_table.find_all('tr')
for table_row in rows[1:]:  # rows[0] is the header row
    cells = [cell.get_text().strip() for cell in table_row.find_all('td')]
    if cells:  # skip rows that carry no data cells
        data.append(cells)

data[:5]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront']]

### Create a dataframe with the data

In [268]:
# One row per scraped table row; columns are named after the header cells.
df_all = pd.DataFrame(data, columns=column_titles)
df_all.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Remove rows whose borough is "Not assigned"

In [269]:
# Keep only the rows whose Borough holds something other than the placeholder.
has_borough = df_all['Borough'] != 'Not assigned'
df_all = df_all.loc[has_borough]
df_all.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Combine neighborhoods that share the same postal code

In [270]:
unique_post = df_all['Postcode'].unique()  # all distinct postcodes, in order of first appearance

# Collapse to one row per postcode: keep the first borough listed for it and join
# its neighbourhoods into a single comma-separated string.
# sort=False keeps the postcodes in order of first appearance (same order as
# unique_post); as_index=False leaves Postcode as a regular column with a fresh
# 0..n-1 index.
# A groupby replaces the earlier Series.to_string() approach, which let display
# padding leak into the values, and avoids growing the frame one row at a time.
df = (
    df_all
    .groupby('Postcode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


### Use the borough's name as the neighborhood where the neighborhood is "Not assigned"

In [271]:
# Where a postcode has no neighbourhood of its own, fall back to its borough name.
# The assignment goes through .loc with a boolean mask so it is written to `df`
# itself. The rows yielded by iterrows() may be copies, so assigning to them is
# not guaranteed to change the frame.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
