# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import pandas as pd
import numpy as np

### Load Webpage

In [2]:
# Wikipedia list of Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page through a urllib3 connection pool.
http = urllib3.PoolManager()
response = http.request('GET', url)

# Fail fast on an error response instead of parsing an error page downstream.
if response.status != 200:
    raise RuntimeError('GET {} failed with HTTP status {}'.format(url, response.status))

### Parse webpage

In [3]:
# Build the parse tree from the raw response body with the built-in HTML parser.
page_html = response.data
soup = BeautifulSoup(page_html, features='html.parser')

# Take the first <table> element found in the document.
table = soup.find('table')

In [4]:
# Collect every <tr> element contained in the table.
rows = table.find_all(name='tr')

In [5]:
# Turn each table row into a list of stripped cell texts.
# Rows without any <td> cell (the header row) are skipped.
l = [
    [cell.text.strip() for cell in cells]
    for cells in (table_row.find_all('td') for table_row in rows)
    if cells
]

### Create dataframe and clean up data

In [6]:
# Build the dataframe from the parsed rows; one column per table cell.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
data = pd.DataFrame(data=l, columns=column_names)

In [7]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = data['Borough'] != 'Not assigned'
data = data[has_borough]

In [8]:
# A postal code area can contain several neighborhoods.
# Collapse them into one row per (PostalCode, Borough), joining the names with a comma.
postal_area_keys = ['PostalCode', 'Borough']
data = (
    data
    .groupby(postal_area_keys)['Neighborhood']
    .agg(','.join)
    .reset_index()
)

In [9]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
neighborhood_unassigned = data['Neighborhood'] == 'Not assigned'
data['Neighborhood'] = data['Neighborhood'].mask(neighborhood_unassigned, data['Borough'])

In [10]:
# Display the dataframe dimensions as a (rows, columns) tuple via the .shape attribute.
data.shape

(103, 3)

### Get coordinates

In [11]:
import geocoder # import geocoder

In [12]:
# Map each distinct postal code to None; the coordinates are filled in later.
postal_codes = {code: None for code in data['PostalCode']}

In [13]:
# The Google geocoder didn't work, so the ArcGIS one is used instead.
# The coordinates it returns differ slightly from the Google ones.

# Upper bound on lookups per postal code, so a persistent failure cannot hang the cell.
MAX_GEOCODE_ATTEMPTS = 10

for postal_code in postal_codes:
    lat_lng_coords = None

    # Retry until coordinates come back or the attempt budget is used up.
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break
    else:
        # Every attempt returned None: stop with a clear error instead of looping forever.
        raise RuntimeError(
            'No coordinates returned for postal code {} after {} attempts'.format(
                postal_code, MAX_GEOCODE_ATTEMPTS
            )
        )

    postal_codes[postal_code] = lat_lng_coords

In [14]:
# Look up each row's stored coordinate pair: element 0 goes to Latitude, element 1 to Longitude.
data['Latitude'] = data['PostalCode'].map(lambda code: postal_codes[code][0])
data['Longitude'] = data['PostalCode'].map(lambda code: postal_codes[code][1])

In [15]:
# Preview the first rows, now including the Latitude and Longitude columns.
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.78573,-79.15875
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.76569,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


###### The coordinates returned by ArcGIS differ slightly from the Google ones, so use the coordinates from the CSV file instead

In [16]:
# Load the coordinates table from the local CSV file.
# Note: this rebinds `postal_codes` from the earlier dict to a dataframe.
coordinates_csv = 'Geospatial_Coordinates.csv'
postal_codes = pd.read_csv(coordinates_csv)

In [17]:
# Remove the existing Latitude/Longitude columns.
# Reassign instead of using inplace=True, so the frame is not mutated behind earlier outputs.
data = data.drop(columns=['Latitude', 'Longitude'])

In [18]:
# Merge the coordinates from the CSV file.
# The merge is an inner join, which would silently drop any postal code that is
# absent from the CSV, so check for missing keys first and fail loudly.
missing_codes = set(data['PostalCode']) - set(postal_codes['Postal Code'])
if missing_codes:
    raise ValueError(
        'No coordinates in the CSV for postal codes: {}'.format(sorted(missing_codes))
    )

# validate='many_to_one' raises if the CSV repeats a postal code, which would
# otherwise duplicate rows. The redundant join key from the CSV is dropped afterwards.
data = (
    data
    .merge(postal_codes, left_on='PostalCode', right_on='Postal Code', validate='many_to_one')
    .drop(columns=['Postal Code'])
)

In [19]:
# Preview the first rows after merging in the coordinates from the CSV file.
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
