# Assignment: Segmenting and Clustering Neighborhoods in Toronto
Notebook to scrape Neighborhood information

### Step 1: Install and Import Packages

In [117]:
# library import Section 
import numpy as np
import pandas as pd


In [118]:
# install scraping package
#!conda install -c conda-forge beautifulsoup4

In [119]:
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

### Step 2: Scrape the input data

In [120]:
# Read every HTML table on the Wikipedia postal-code page; the first table is
# the Postcode / Borough / Neighbourhood listing used by the rest of the notebook.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
tables = pd.read_html(url)
df = tables[0]
print(f'Inital shape: {df.shape}')
df.head(n=5)

Inital shape: (287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Step 3 - Filter out rows whose Borough is Not Assigned

In [121]:
# Keep only the rows whose Borough is assigned and renumber the index from 0,
# producing a fresh frame instead of resetting the index in place.
df_filtered = df[df['Borough'].ne("Not assigned")].reset_index(drop=True)
df_filtered.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


### Step 4 - Merge rows that share the same Postcode and Borough

In [122]:
# Collapse all rows sharing a (Postcode, Borough) pair into one row whose
# remaining column values are joined with ", ". sort=False keeps the groups in
# first-seen order, and reset_index() turns the group keys back into columns.
df_filtered = (
    df_filtered
    .groupby(['Postcode', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)
df_filtered.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


### Step 5 - Check rows with Neighbourhood that are Not Assigned

In [123]:
# Show (without modifying df_filtered) the rows whose Neighbourhood is "Not assigned"
df_filtered[df_filtered['Neighbourhood'].eq("Not assigned")]

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M9A,Queen's Park,Not assigned


### Step 6 - Update Neighbourhoods that are Not Assigned

In [124]:
# Wherever Neighbourhood is "Not assigned", fall back to the row's Borough;
# every other Neighbourhood value is kept unchanged.
df_filtered['Neighbourhood'] = np.where(
    df_filtered['Neighbourhood'] == "Not assigned",
    df_filtered['Borough'],
    df_filtered['Neighbourhood'],
)
# Spot-check postcode M9A to confirm the replacement took effect
df_filtered[df_filtered['Postcode'] == "M9A"]

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M9A,Queen's Park,Queen's Park


### Step 7 - Check final state of Dataframe

In [128]:
# Preview the first 12 rows of the cleaned frame
df_filtered.head(n=12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### Step 8 - Final shape of Dataframe

In [126]:
# Report (rows, columns) of the cleaned frame
print(f'Final shape: {df_filtered.shape}')

Final shape: (103, 3)


In [129]:
# Persist the cleaned table. index=False leaves out the positional 0..n-1 index
# (it carries no information after reset_index), so the CSV contains only the
# Postcode, Borough and Neighbourhood columns rather than an extra unnamed
# leading column.
df_filtered.to_csv('Neighbourhoods.csv', index=False)

### Step 9 - load GPS file and process

In [132]:
# Download the lookup table of postal code -> Latitude / Longitude
df_gps = pd.read_csv('https://cocl.us/Geospatial_data')
df_gps.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [136]:
# Rename the GPS key column to 'Postcode' so it matches the key column name
# used in df_filtered; rebinding keeps the cell free of in-place mutation.
df_gps = df_gps.rename(columns={'Postal Code': 'Postcode'})

In [139]:
# Attach Latitude / Longitude to each postcode row.
# how='inner' is the merge default, spelled out here because it drops any
# postcode that has no match in df_gps. validate='many_to_one' makes pandas
# raise a MergeError if df_gps contains duplicate Postcode keys, instead of
# silently multiplying neighbourhood rows.
df_nbrhoodswithgps = df_filtered.merge(
    df_gps,
    how='inner',
    on='Postcode',
    validate='many_to_one',
)
df_nbrhoodswithgps.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
