# importing necessary modules

In [81]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

print('Libraries imported.')

Libraries imported.


## Let's obtain the table from the wikipedia page with the read_html method from pandas

In [3]:
page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
dfs = pd.read_html(page,header=0)
canada_df = dfs[0]

In [4]:
#head of the canada dataframe
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Clean the dataframe
***
 1.  First we will ignore the rows that have not an assigned row
 -  Second we will assign the borough as neighbourhood for rows that have a borough but not a neighbourhood
 -  Third we will merge in one row all the neighbourhood belonging to the same borough

In [5]:
#ignoring cells with not assigned borough
canada_df = canada_df[canada_df.Borough != "Not assigned"]

In [6]:
#assigning borough as neighbourhood for cells 
#that have a borought but not a neighborhood assigned
canada_df.Neighbourhood[canada_df.Neighbourhood == "Not assigned"]=canada_df.Borough[canada_df.Neighbourhood == "Not assigned"].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
#defining custom function that will group in one cell the neighborhood cells that 
#correspond to the same neighbourhood
def custom_neighbourhood_groupping(canada_df):
    a=str([str(k) for k in list(canada_df.values)])
    a= a.translate(None ,"'[]")
    return a

In [8]:
new_canada_df= canada_df.groupby(['Postcode','Borough']).agg(custom_neighbourhood_groupping).reset_index()

In [9]:
new_canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Second part
## Obtaining latitude and longitude for every postal code

In [10]:
import geocoder

## First option
***
We will use the geocoder  package

In [11]:
def persistent_neighboorhood_geocoder(postal_code):
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format('M1B'))
        lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude,longitude

Below is an example of how to use the geocoder package

In [12]:
# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
    g = geocoder.google('{}, Toronto, Ontario'.format('M1B'))
    lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

KeyboardInterrupt: 

### Using the geocoder option is very slow ( after 5 minutes my first query had not an answer yet)

# Option 2: Using the Geospatial coordinates csv

In [13]:
coordinates_df=pd.read_csv('D:\Usuario Pau\Descargas\Geospatial_Coordinates.csv')

In [14]:
coordinates_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### We will rename the Postal code columns in both dataframes and then use the concat method from pandas to create the final dataframe

In [15]:
coordinates_df.rename(columns={'Postal Code':'PostalCode'},inplace=True)

In [16]:
new_canada_df.rename(columns={'Postcode':'PostalCode'},inplace=True)

In [17]:
new_canada_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [18]:
# Number of different postal codes
new_canada_df.shape

(103, 3)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [36]:
final_df =pd.concat([new_canada_df.set_index('PostalCode'),coordinates_df.set_index('PostalCode')],axis=1,join='inner').reset_index()

In [37]:
final_df.shape

(103, 5)

We did not loose any postal code during the assignment of the latitudes and longitudes

In [38]:
final_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
