# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import pandas as pd
import numpy as np

### Load Webpage

In [2]:
# Wikipedia page that holds the postal-code table scraped by this notebook.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

http = urllib3.PoolManager()
response = http.request('GET', url)

# Stop here with a clear message if the download failed; otherwise an
# error page would be handed to the parser as if it were the real table.
if response.status != 200:
    raise RuntimeError(
        'GET {} failed with HTTP status {}'.format(url, response.status)
    )

### Parse webpage

In [3]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(response.data, 'html.parser')

# Take the first <table> element found in the document.
table = soup.find('table')

In [4]:
# Collect every <tr> element contained in the table.
rows = table.find_all(name='tr')

In [5]:
# Turn each table row into a list of stripped cell texts.
l = []
for table_row in rows:
    cells = table_row.find_all('td')

    # Rows without any <td> cells (e.g. the header row) carry no data.
    if not cells:
        continue

    l.append([cell.text.strip() for cell in cells])

### Create dataframe and clean up data

In [6]:
# Build the dataframe from the parsed rows, with one column each for
# PostalCode, Borough, and Neighborhood.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
data = pd.DataFrame(data=l, columns=column_names)

In [7]:
# Keep only the rows that have an assigned borough; rows whose Borough
# is 'Not assigned' are dropped.
has_borough = data['Borough'] != 'Not assigned'
data = data.loc[has_borough]

In [8]:
# A postal code area can appear on several rows, one per neighborhood.
# Collapse each (PostalCode, Borough) pair into a single row whose
# Neighborhood is the comma-separated list of its neighborhoods.
data = (
    data
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
)

In [9]:
# Where a row has a borough but its neighborhood is 'Not assigned',
# use the borough name as the neighborhood.
neighborhood_missing = data['Neighborhood'] == 'Not assigned'
data['Neighborhood'] = np.where(
    neighborhood_missing, data['Borough'], data['Neighborhood']
)

In [10]:
# Show the first 20 rows of the cleaned dataframe.
data.head(n=20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [11]:
# Display the dataframe's dimensions via the .shape attribute, which is a
# (number of rows, number of columns) tuple.
data.shape

(103, 3)