# Take geographical coordinates for Toronto boroughs

## Table of Contents
1. <a href="#1.-Get-data-from-Wikipedia">Get data from Wikipedia</a>
2. <a href="#2.-Add-geographical-coordinates">Add geographical coordinates</a>
3. <a href="#3.-Result">Result</a>

__Note: the table-of-contents links do not work in GitHub's notebook renderer.__

Initialize libraries

In [24]:
from bs4 import BeautifulSoup
import requests
import pandas as pd # library for data analsysis
import numpy as np
import geocoder # import geocoder
import json
from pandas import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

## 1. Get data from Wikipedia

Download Toronto wiki page

In [25]:
# Source page: Wikipedia's list of Canadian postal codes starting with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
wiki = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of handing an error page to the parser.
wiki.raise_for_status()
print(wiki.status_code)

200


In [26]:
# Parse the downloaded page and collect one record per assigned postal code.
soup = BeautifulSoup(wiki.text, "html.parser")
table_contents = []
table = soup.find('table')  # first <table> element in the document
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

for td in table.find_all('td'):
    # The fields are read from the cell's <p> and <span>; skip cells lacking either.
    if td.p is None or td.span is None:
        continue
    span_text = td.span.text
    if span_text == 'Not assigned':
        continue
    # Text before the first '(' is the borough; the first parenthesised chunk,
    # when present, holds the neighborhood list. A cell without parentheses
    # yields an empty neighborhood instead of raising IndexError.
    parts = span_text.split('(')
    raw_neighborhood = parts[1] if len(parts) > 1 else ''
    table_contents.append({
        'PostalCode': td.p.text[:3],
        'Borough': parts[0],
        'Neighborhood': raw_neighborhood.strip(')').replace(' /', ',').replace(')', ' ').strip(' '),
    })

print(table_contents[0:10])

[{'PostalCode': 'M3A', 'Borough': 'North York', 'Neighborhood': 'Parkwoods'}, {'PostalCode': 'M4A', 'Borough': 'North York', 'Neighborhood': 'Victoria Village'}, {'PostalCode': 'M5A', 'Borough': 'Downtown Toronto', 'Neighborhood': 'Regent Park, Harbourfront'}, {'PostalCode': 'M6A', 'Borough': 'North York', 'Neighborhood': 'Lawrence Manor, Lawrence Heights'}, {'PostalCode': 'M7A', 'Borough': "Queen's Park", 'Neighborhood': 'Ontario Provincial Government'}, {'PostalCode': 'M9A', 'Borough': 'Etobicoke', 'Neighborhood': 'Islington Avenue'}, {'PostalCode': 'M1B', 'Borough': 'Scarborough', 'Neighborhood': 'Malvern, Rouge'}, {'PostalCode': 'M3B', 'Borough': 'North York', 'Neighborhood': 'Don Mills North'}, {'PostalCode': 'M4B', 'Borough': 'East York', 'Neighborhood': 'Parkview Hill, Woodbine Gardens'}, {'PostalCode': 'M5B', 'Borough': 'Downtown Toronto', 'Neighborhood': 'Garden District, Ryerson'}]


Create the dataframe with the list of neighborhoods

In [27]:
# Build the dataframe from the scraped records, then tidy the borough labels
# whose text was run together with extra wording in the scraped cell.
df = pd.DataFrame(table_contents).assign(
    Borough=lambda frame: frame['Borough'].replace({
        'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
        'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
        'EtobicokeNorthwest': 'Etobicoke Northwest',
        'East YorkEast Toronto': 'East York/East Toronto',
        'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
    })
)
df['Borough'].head()

0          North York
1          North York
2    Downtown Toronto
3          North York
4        Queen's Park
Name: Borough, dtype: object

Drop the rows whose borough is not assigned

In [28]:
# Turn empty borough strings into NaN, drop those rows, and renumber the index.
# Everything is assigned back to `df`: an in-place replace on the selected
# column is not guaranteed to modify the frame, and reset_index returns a new
# frame rather than changing `df`.
df = (
    df.assign(Borough=df['Borough'].replace('', np.nan))
      .dropna(subset=['Borough'])
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


If a cell has a borough but a "Not assigned" neighborhood, the neighborhood is set to the borough name

In [29]:
# Keep only rows with a non-empty borough. reset_index returns a new frame,
# so its result is assigned back to actually renumber the rows.
df = df[df['Borough'] != ''].reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [30]:
# Where the neighborhood is empty, use the borough name instead.
# The assignment targets only the 'Neighborhood' column; without a column
# selector it would apply to every column of the matching rows.
missing_neighborhood = df['Neighborhood'] == ''
df.loc[missing_neighborhood, 'Neighborhood'] = df.loc[missing_neighborhood, 'Borough']
# reset_index returns a new frame, so keep the result before displaying it.
df = df.reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


The number of rows and columns in the Toronto neighborhoods dataframe

In [31]:
# Sanity check: (row count, column count) of the cleaned dataframe.
df.shape

(103, 3)

## 2. Add geographical coordinates

In [32]:
def getCoordinatesFromCSV(df, csv_path='Geospatial_Coordinates.csv'):
    """Attach the columns of a coordinates CSV to ``df`` by postal code.

    The CSV is left-joined onto ``df`` (``df['PostalCode']`` against the
    CSV's ``'Postal Code'`` column), so every row of ``df`` is kept and rows
    without a match get NaN in the added columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'PostalCode'`` column. It is not modified.
    csv_path : str, optional
        Path of the CSV file to read. It must contain a ``'Postal Code'``
        column. Defaults to ``'Geospatial_Coordinates.csv'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the CSV's remaining columns appended, the duplicate
        ``'Postal Code'`` key column removed, and a fresh 0..n-1 index.
    """
    # The file can be fetched beforehand with:
    #!wget -q -O 'Geospatial_Coordinates.csv' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv
    coordinates = pd.read_csv(csv_path)
    merged = pd.merge(left=df, right=coordinates, how='left', left_on='PostalCode', right_on='Postal Code')
    # Return the chained result: drop/reset_index produce new frames, so
    # calling them without using the return value would have no effect.
    return merged.drop(columns='Postal Code').reset_index(drop=True)

def getCoordinatesByPostcode(postal_code, max_attempts=10):
    """Look up the latitude/longitude of a Toronto postal code via geocoder.

    The query ``'<postal_code>, Toronto, Ontario'`` is sent through
    ``geocoder.google`` repeatedly until a result with coordinates comes back
    or ``max_attempts`` requests have been made.

    Parameters
    ----------
    postal_code : str
        Postal code to geocode.
    max_attempts : int, optional
        Maximum number of requests before giving up (default 10).

    Returns
    -------
    tuple
        ``(latitude, longitude, attempts)`` on success, where ``attempts`` is
        the number of requests made. On failure the sentinel
        ``(0, 0, max_attempts)`` is returned; callers detect failure by
        checking for a latitude of 0.
    """
    # NOTE(review): the Google provider may need an API key configured for
    # lookups to succeed - confirm against the geocoder documentation.
    query = '{}, Toronto, Ontario'.format(postal_code)
    for attempts in range(1, max_attempts + 1):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude, attempts
    # Every attempt came back without coordinates.
    return 0, 0, max_attempts


# Geocode every postal code; if any lookup fails, fall back to the CSV file
# for the whole frame so all rows come from one consistent source.
latitudes = []
longitudes = []
for postal_code in df['PostalCode']:
    la, lo, _attempts = getCoordinatesByPostcode(postal_code)
    if la==0:
        # load coordinates from csv if API returns nothing
        df=getCoordinatesFromCSV(df)
        break
    latitudes.append(la)
    longitudes.append(lo)
else:
    # Reached only when no lookup failed. The values are collected and
    # assigned as whole columns because writing to the row yielded by
    # iterrows() changes a copy and never reaches the dataframe.
    df = df.assign(Latitude=latitudes, Longitude=longitudes)

## 3. Result

In [33]:
# Preview the first 12 rows of the final dataframe.
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
