# 🍁Segmenting and Clustering Neighborhoods in Toronto

## 1. Web Scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

In [2]:
# Record the version and install location of each key dependency so the
# environment that produced the outputs below can be identified.
for label, module in (("requests\t", requests), ("numpy\t\t", np), ("pandas\t\t", pd)):
    print(label, module.__version__, "\t| file: ", module.__file__)

requests	 2.18.4 	| file:  /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages/requests/__init__.py
numpy		 1.13.3 	| file:  /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages/numpy/__init__.py
pandas		 0.21.0 	| file:  /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages/pandas/__init__.py


### 1.1. URL

In [3]:
url_postal_canada = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# requests applies no timeout by default; without one a stalled connection
# would hang this cell indefinitely.
resp = requests.get(url_postal_canada, timeout=30)
# Stop here on an HTTP error status (4xx/5xx) instead of parsing an error page later.
resp.raise_for_status()

In [4]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=resp.text, features='html.parser')

### 1.2. Extract table

In [5]:
# Select by individual CSS classes: find(class_='wikitable sortable') compares the
# exact class-attribute string, so it would miss the table if the classes were
# reordered or an extra class were added.
table = soup.select_one('table.wikitable.sortable')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("Could not find a table with classes 'wikitable sortable' in the page")
table_rows = table.find_all('tr')

In [6]:
# One record per <tr>: the text of each <td> cell with newlines removed.
# A row without any <td> cells yields an empty record, which pandas turns
# into an all-missing row (dropped later by the Borough filter).
records = [
    [cell.text.replace('\n', '') for cell in table_row.find_all('td')]
    for table_row in table_rows
]

canada = pd.DataFrame(records, columns=['Postcode','Borough','Neighbourhood'])
canada.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned


## 2. Create the dataframe

### 2.1. Drop rows whose Borough is 'Not assigned'.

In [7]:
# Keep only rows with a real Borough: drop missing values (None) and the
# literal 'Not assigned'. .copy() makes the result an independent frame, so
# assigning to one of its columns later does not raise SettingWithCopyWarning.
canada = canada[~canada['Borough'].isin([None, 'Not assigned'])].copy()
canada.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


### 2.2. Fill in cells whose Neighbourhood is 'Not assigned' with the Borough name.

In [8]:
# Show the rows whose Neighbourhood still holds the 'Not assigned' placeholder.
canada.loc[canada['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M7A,Queen's Park,Not assigned


In [9]:
# Where Neighbourhood is 'Not assigned', fall back to the row's Borough.
# assign() builds a new frame instead of writing into a frame that may be a
# filtered slice of another, which avoids pandas' SettingWithCopyWarning.
canada = canada.assign(
    Neighbourhood=np.where(
        canada['Neighbourhood'] == 'Not assigned',
        canada['Borough'],
        canada['Neighbourhood']
    )
)

# Check: no 'Not assigned' neighbourhoods should remain (expect an empty frame).
canada[canada['Neighbourhood'].isin(['Not assigned'])]

Unnamed: 0,Postcode,Borough,Neighbourhood


In [10]:
# Spot-check postcode M7A, the row whose Neighbourhood was 'Not assigned'.
canada.loc[canada['Postcode'] == 'M7A']

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M7A,Queen's Park,Queen's Park


In [11]:
# (rows, columns) of the frame before rows sharing a Postcode are merged in the next step.
canada.shape

(212, 3)

### 2.3. Combine rows with the same 'Postcode' & 'Borough' into one row, with the 'Neighbourhood' values separated by commas

In [14]:
# Collapse rows sharing a (Postcode, Borough) pair into one row whose
# Neighbourhood is the comma-separated list of the group's neighbourhoods.
neighbourhoods_by_postcode = canada.groupby(['Postcode', 'Borough'])['Neighbourhood']
canada = neighbourhoods_by_postcode.agg(', '.join).reset_index()
canada.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### 2.4. Number of rows of dataframe

In [13]:
# len() of a DataFrame is its number of rows.
print('Number of rows of dataframe is: ', len(canada))

Number of rows of dataframe is:  103
