## Import data from Wikipedia and clean it

In [91]:
## Install the wikipedia package
!conda install -c conda-forge wikipedia --yes

## Import libraries
import pandas as pd
import wikipedia as wp

## Fetch the "List of postal codes of Canada: M" page through the Wikipedia
## API and parse the first HTML table on it into a dataframe
html = wp.page("List_of_postal_codes_of_Canada:_M").html().encode("UTF-8")
df = pd.read_html(html)[0]

## Keep only the rows whose Borough is assigned. The explicit .copy() makes
## the result an independent frame, so the column assignment below does not
## write to a slice of the raw table (which raises SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

## Where the Neighbourhood is 'Not assigned', fall back to the Borough name
## (vectorized replacement instead of a row-by-row apply)
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned',
    df['Borough'],
)

## Collapse to one row per Postcode/Borough pair, joining the neighbourhood
## names of each pair into a single comma-separated string
df = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].agg(','.join).reset_index()

## Using geocoder to get the latitude and longitude

In [5]:
## Install geocoder

!conda install -c conda-forge geocoder --yes

Solving environment: done

# All requested packages already installed.



In [47]:
## Build a dataframe of the distinct postcodes, with blank placeholder
## columns for the latitude and the longitude
df_pc = (
    df['Postcode']
    .drop_duplicates()
    .to_frame()
    .assign(Latitude='', Longitude='')
)

### Notes

Geocoder is very unreliable. If we were to use it to get the latitude and longitude of each postcode, we would need a for loop that goes through the postcodes and, for each one, a while loop that retries the geocoder API call until the latitude and longitude are returned, using code similar to the cell below.

In [None]:
import geocoder # import geocoder

## Upper bound on geocoder calls per postcode, so that an unresponsive
## service cannot stall the notebook in an endless retry loop
MAX_GEOCODE_ATTEMPTS = 10

for row_label, postcode in df_pc['Postcode'].items():

    lat_lng_coords = None
    attempts = 0

    ## Retry until coordinates come back or the attempts run out; both None
    ## and an empty result are treated as "no coordinates yet"
    while not lat_lng_coords and attempts < MAX_GEOCODE_ATTEMPTS:
        ## Query the current postcode only (not the whole Postcode column)
        g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
        lat_lng_coords = g.latlng
        attempts += 1

    ## Write the result into this postcode's row only; a postcode that could
    ## not be geocoded keeps its blank placeholder values
    if lat_lng_coords:
        df_pc.loc[row_label, 'Latitude'] = lat_lng_coords[0]
        df_pc.loc[row_label, 'Longitude'] = lat_lng_coords[1]

## Using a CSV file to get the latitude and longitude

In [95]:
## Import csv library
import csv

## Read the postcode coordinates from the CSV file and rename the
## 'Postal Code' column to 'Postcode' in the same step
postcode_data = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code': 'Postcode'})
)

postcode_data.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [94]:
## Attach the latitude and longitude from the csv data to the main
## dataframe, matching rows on the shared 'Postcode' column
df = df.merge(
    postcode_data.loc[:, ['Postcode', 'Latitude', 'Longitude']],
    how='inner',
    on='Postcode',
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [100]:
## Show the row for postcode M5A, to compare with the exercise image

df[df['Postcode'].eq('M5A')]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
53,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
