<h1 align="center">Segmenting and Clustering Neighborhoods in Toronto</h1>

## Part 1 - Scrape the Toronto postal code data from Wikipedia

### 1. Scrape the Wikipedia page

In [2]:
import requests
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plot
from bs4 import BeautifulSoup

# Wikipedia page that holds the table of postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_data = response.text
# Parse the downloaded markup into a tree that the next cell searches for table rows.
html_tree = BeautifulSoup(html_data, 'html5lib')

### 2. Parse the table data and create a DataFrame with PostalCode, Borough and Neighborhood columns

In [3]:
# Take every row of the first <tbody> found in the parsed page.
trs = html_tree.find('tbody').find_all('tr')
# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
column = ['PostalCode', 'Borough', 'Neighborhood']
# Collect the rows as plain dicts and build the dataframe once at the end.
# Growing a frame row by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
records = []
for tr in trs:
    tds = tr.find_all('td')
    # Rows without enough <td> cells (e.g. a header row) cannot supply all three
    # fields, so they are skipped instead of raising IndexError.
    if len(tds) >= len(column):
        records.append(dict(zip(column, (td.text.strip() for td in tds))))
toronto_df = pd.DataFrame(records, columns=column)
# Only keep the cells that have an assigned borough; renumber the remaining rows from 0.
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned'].reset_index(drop=True)
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
unassigned = toronto_df['Neighborhood'] == 'Not assigned'
if unassigned.any():
    print('Found Neighborhood is not assigned; use Borough value')
    toronto_df.loc[unassigned, 'Neighborhood'] = toronto_df.loc[unassigned, 'Borough']
# Report the size of the cleaned dataframe.
print(toronto_df.shape)
print('The dataframe has {} rows with {} columns'.format(toronto_df.shape[0], toronto_df.shape[1]))
    

(103, 3)
The dataframe has 103 rows with 3 columns


## Part 2 - Retrieve the latitude and longitude coordinates of each neighborhood
##### The coordinates are read from a CSV file because the geocoder package was not working.

In [15]:
# Using csv as geospatial input since geocoder is not working at all.
import requests
from io import StringIO
path = 'http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
# Fail fast on a stalled connection or an HTTP error rather than parsing a bad payload.
geo_response = requests.get(path, timeout=30)
geo_response.raise_for_status()
s = geo_response.content
geospatial_df = pd.read_csv(StringIO(s.decode("utf-8")))
# Rename the CSV columns so the postal code column matches the name used in toronto_df.
geospatial_df.columns = ['PostalCode', 'Latitude', 'Longitude']
# Build a lookup keyed by postal code. Keeping only the first row per code
# mirrors the first-match behaviour of a row-by-row lookup and keeps the index unique.
coords = geospatial_df.drop_duplicates(subset='PostalCode').set_index('PostalCode')
# Column assignment via map (rather than merge) keeps this cell safe to re-run:
# the columns are overwritten, not duplicated.
toronto_df['Latitude'] = toronto_df['PostalCode'].map(coords['Latitude'])
toronto_df['Longitude'] = toronto_df['PostalCode'].map(coords['Longitude'])
# Postal codes without a coordinate value end up as NaN; report them so they are not missed.
missing_codes = toronto_df.loc[toronto_df['Latitude'].isna(), 'PostalCode']
if not missing_codes.empty:
    print('No coordinates found for postal codes: {}'.format(', '.join(missing_codes)))
toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
