In [92]:
'''
Import dependencies
'''

import random
import re
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [93]:
'''
Get the latest data from the Wikipedia page
'''

# Shell out to wget; -O names the local file (raw_data.html) that the parsing cell opens afterwards
!wget -O raw_data.html https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 

--2020-08-21 14:10:15--  https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
Resolving en.wikipedia.org (en.wikipedia.org)... 208.80.154.224, 2620:0:861:ed1a::1
Connecting to en.wikipedia.org (en.wikipedia.org)|208.80.154.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56520 (55K) [text/html]
Saving to: ‘raw_data.html’


2020-08-21 14:10:16 (415 KB/s) - ‘raw_data.html’ saved [56520/56520]



In [94]:
'''
Process the raw source code and grab the <table> string

Produces:
    table_data_string - the stripped lines of the first table body, joined
                        without separators
    table_rows        - table_data_string split on "<tr>", one entry per row
'''

process = False
table_fragments = []

# The page is served as UTF-8, so decode it explicitly instead of relying on
# the platform default encoding.
with open("raw_data.html", encoding="utf-8") as raw_data:
    for line in raw_data:
        stripped = line.strip()

        # Capturing starts on the line after the first one that opens a
        # <tbody>; once something has been captured, later <tbody> lines no
        # longer restart it.
        if stripped.startswith("<tbody") and not table_fragments:
            process = True
            continue

        if process:
            # Only non-empty fragments are kept, so an empty list means
            # "nothing captured yet".
            if stripped:
                table_fragments.append(stripped)

            # Stop at the end of the table being captured. The check is made
            # only while capturing so that a table closing earlier in the
            # page cannot end the scan before anything was collected.
            if stripped.endswith("</table>"):
                process = False
                break

# Join once at the end instead of growing a string inside the loop
table_data_string = ''.join(table_fragments)
table_rows = table_data_string.split("<tr>")

In [95]:
'''
Split and clean the table string and write out the rows to .csv

Each entry of table_rows becomes one "Postal Code;Borough;Neighbourhood"
line in clean_data.csv. Rows whose borough is "Not assigned" are dropped;
a "Not assigned" neighbourhood is replaced by the borough name.
'''

# Written as UTF-8 because pandas reads the file back with that encoding
with open("clean_data.csv", "w", encoding="utf-8") as clean_data:
    for row in table_rows:
        # Closing cell tags become field separators, the remaining
        # row/cell tags are removed, and the one HTML entity is decoded.
        comma_row = re.sub('</t[dh]>', ';', row)
        clean_row = re.sub('<[/]*t[rdh]>', '', comma_row)
        clean_row = re.sub('&amp;', '&', clean_row)

        # Keep the three data columns only; anything after them is the empty
        # tail left by the final separator or leftover closing markup.
        row_items = clean_row.split(';')[:3]
        if len(row_items) < 3:
            # Empty or malformed fragment (e.g. text before the first <tr>)
            continue
        postal_code, borough, neighbourhood = row_items

        # Only proceed if the Borough is assigned
        if borough == "Not assigned":
            continue

        # Check if the neighbourhood is "Not assigned" and use the "Borough"
        if neighbourhood == "Not assigned":
            neighbourhood = borough

        # Write the corrected fields (not the raw cleaned string) so the
        # neighbourhood substitution above actually reaches the file.
        clean_data.write(';'.join((postal_code, borough, neighbourhood)) + '\n')

In [96]:
'''
Load the cleaned, semicolon-separated rows into a data frame and preview them
'''

df = pd.read_csv("clean_data.csv", delimiter=';')

df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [97]:
'''
Display the data frame dimensions as a (rows, columns) tuple
'''

df.shape

(103, 3)

In [None]:
'''
Install and import geocoder
'''

import sys
# Run pip through the interpreter that backs this kernel (sys.executable)
# rather than whichever pip happens to be first on PATH
!{sys.executable} -m pip install geocoder

import geocoder

In [None]:
'''
Get coordinates - Option 1 (Note: I never got geocoder to work)
'''
def get_coordinates(postal_code, max_attempts=10, delay=1.0):
    '''
    Look up the coordinates of a postal code with geocoder.google.

    The query sent is "<postal_code>, Toronto, Ontario". The lookup is
    repeated while the service returns no coordinates, with a pause between
    attempts, and gives up after a bounded number of tries instead of
    spinning forever.

    Parameters:
        postal_code  - value interpolated into the query string
        max_attempts - maximum number of requests; None retries indefinitely
        delay        - seconds to wait between consecutive requests

    Returns:
        The `latlng` value reported by geocoder; callers read index 0 as the
        latitude and index 1 as the longitude.

    Raises:
        RuntimeError if no coordinates were obtained within max_attempts.
    '''
    query = '{}, Toronto, Ontario'.format(postal_code)

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        # Pause before retries only, never before the first request
        if attempts and delay > 0:
            time.sleep(delay)

        lat_lng_coords = geocoder.google(query).latlng

        # None and an empty result both count as "no answer yet"
        if lat_lng_coords:
            return lat_lng_coords
        attempts += 1

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, attempts)
    )

# Append a latitude and longitude column to every row of clean_data.csv.
# The result goes to data_with_coordinates.csv, the same file name the
# Option 2 path writes and the inspection cell reads, so both options
# produce the same artifact.
with open("clean_data.csv", encoding="utf-8") as data, \
        open("data_with_coordinates.csv", "w", encoding="utf-8") as completed_data:
    for line in data:
        record = line.strip()
        if not record:
            # Nothing to look up on an empty line
            continue

        if line.startswith("Postal"):
            # Header row: extend it with the two new column names
            completed_data.write("%s;Latitude;Longitude\n" % record)
        else:
            # The postal code is the first ';'-separated field
            postal_code = record.split(";", 1)[0]
            coordinates = get_coordinates(postal_code)
            completed_data.write("%s;%s;%s\n" % (record, coordinates[0], coordinates[1]))
        


In [107]:
'''
Get coordinates - Option 2
'''

!wget -O coordinates.csv https://cocl.us/Geospatial_data

coordinates_mapping = {}
with open("coordinates.csv") as coordinates:
    for line in coordinates:
        postal_code, latitude, longitude = line.split(',')
        coordinates_mapping[postal_code] = (latitude, longitude)

--2020-08-21 14:41:08--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.55.161.7
Connecting to cocl.us (cocl.us)|169.55.161.7|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-08-21 14:41:09--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 185.235.236.197
Connecting to ibm.box.com (ibm.box.com)|185.235.236.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-08-21 14:41:10--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-08-21 14

In [111]:
'''
Create updated .csv file

Copies clean_data.csv to data_with_coordinates.csv, appending the latitude
and longitude found in coordinates_mapping for each row's postal code.
A postal code missing from the mapping raises KeyError.
'''

# Both files are handled as UTF-8, the encoding pandas uses to read them
with open("clean_data.csv", encoding="utf-8") as data, \
        open("data_with_coordinates.csv", "w", encoding="utf-8") as completed_data:
    for line in data:
        record = line.strip()
        if not record:
            # Skip empty lines instead of looking them up as postal codes
            continue

        if line.startswith("Postal"):
            # Header row: extend it with the two new column names
            completed_data.write("%s;Latitude;Longitude\n" % record)
        else:
            # The postal code is the first ';'-separated field
            postal_code = record.split(";", 1)[0]
            latitude, longitude = coordinates_mapping[postal_code]

            # Strip the values so a line terminator carried along with a
            # coordinate cannot put an empty line after every record.
            completed_line = "%s;%s;%s\n" % (record,
                                             str(latitude).strip(),
                                             str(longitude).strip())
            completed_data.write(completed_line)

In [112]:
'''
Inspect the updated .csv file
'''

df = pd.read_csv("data_with_coordinates.csv", delimiter=';')
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
