# Segmenting and Clustering Neighborhoods in Toronto

## Step 1: Scraping Wikipedia

In [65]:
# Importing libraries
#!conda install -c conda-forge beautifulsoup4=4.6.3 --yes  # Installing beautifulsoup, the most popular Python web scraping tool
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd 

In [66]:
# Scrape the Toronto postal-code table from Wikipedia into the `toronto`
# DataFrame (one row per table row: PostCode, Borough, Neighborhood).
pageurl = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
column_names = ['PostCode', 'Borough', 'Neighborhood']

# Parse inside the `with` block so the HTTP response is closed as soon as
# BeautifulSoup has consumed it.
with urlopen(pageurl) as page:
    soup = BeautifulSoup(page, 'lxml')

lis = []
listes = soup.find_all('table', class_="wikitable sortable")
for elt in listes:
    for ligne in elt.find_all('tr'):
        cels = ligne.find_all('td')
        # Rows without any <td> (e.g. header rows made of <th>) carry no data.
        # Skipping them here, rather than dropping only the first collected
        # row afterwards, also works when several tables match.
        if not cels:
            continue
        # strip() removes surrounding whitespace/newlines from every column
        # without cutting a real character off values that have no trailing
        # newline; extra cells beyond the expected columns are ignored.
        lin = [cel.get_text().strip() for cel in cels[:len(column_names)]]
        # Pad short rows so every record has one value per column.
        lin += [''] * (len(column_names) - len(lin))
        lis.append(lin)

toronto = pd.DataFrame(lis, columns=column_names)
toronto.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Step 2: Removing the 'Not assigned' entries

In [67]:
# Drop postal codes that have no borough, and renumber the rows from 0.
# Chaining reset_index (instead of inplace=True on the filtered slice) yields
# an independent frame, so the assignment below cannot hit a stale copy.
toronto = toronto[toronto.Borough != 'Not assigned'].reset_index(drop=True)

# A neighbourhood that is still 'Not assigned' takes its borough's name.
# This is done column-wise: rows yielded by DataFrame.iterrows() are copies,
# so assigning to them is not guaranteed to change the DataFrame.
toronto['Neighborhood'] = toronto['Neighborhood'].where(
    toronto['Neighborhood'] != 'Not assigned',
    toronto['Borough'],
)

## Step 3: Grouping Neighborhoods

In [72]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood value is the comma-separated list of the group's neighbourhoods.
clean_df = (
    toronto
    .groupby(['PostCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
clean_df.shape

(103, 3)