### Importing the libraries

In [23]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Display options: None lifts the limit, so displayed frames are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): newer pandas exposes this as pd.json_normalize; this import path
# may warn or fail depending on the installed version — confirm.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

from bs4 import BeautifulSoup # HTML parser used by the scraping cells
print('Libraries imported.')

Libraries imported.


### Data scraping

In [0]:
# Send the GET request for the Wikipedia list of Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error page
# be parsed as if it were the postal-code table.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text

In [0]:
# Turn the downloaded HTML text into a navigable BeautifulSoup tree
# (parsed with the 'html.parser' backend).
soup = BeautifulSoup(markup=data, features='html.parser')

In [0]:
# One empty accumulator list per column of the table to be scraped
postalcode_list, borough_list, neighborhood_list = [], [], []

In [0]:
# Fill the three column lists from the rows of the first <table> on the page.
# The lists are emptied first so that re-running this cell does not append the
# same rows a second time.
postalcode_list.clear()
borough_list.clear()
neighborhood_list.clear()
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    # A data row needs all three <td> cells; rows with fewer (e.g. a header row)
    # are skipped, which also prevents an IndexError on cells[2].
    if len(cells) >= 3:
        # strip() removes the trailing newline and any surrounding whitespace,
        # so later exact comparisons such as == 'Not assigned' are reliable.
        postalcode_list.append(cells[0].text.strip())
        borough_list.append(cells[1].text.strip())
        neighborhood_list.append(cells[2].text.strip())
    

In [28]:
# Instantiate the dataframe: three columns, PostalCode, Borough and Neighborhood,
# built from the scraped column lists.
postal_df = pd.DataFrame({
    "PostalCode": postalcode_list,
    "Borough": borough_list,
    "Neighborhood": neighborhood_list,
})

# Ignore cells with a borough that is "Not assigned".
postal_df = postal_df[postal_df["Borough"] != "Not assigned"].reset_index(drop=True)

# If a cell has a borough but a "Not assigned" neighborhood, the neighborhood
# becomes the borough.
# This is done with a boolean mask and .loc because assigning to the rows yielded
# by iterrows() only changes a copy and leaves the dataframe untouched. It also has
# to run BEFORE the rows are combined, otherwise a joined string would never be
# exactly equal to "Not assigned".
unassigned = postal_df["Neighborhood"] == "Not assigned"
postal_df.loc[unassigned, "Neighborhood"] = postal_df.loc[unassigned, "Borough"]

# More than one neighborhood can exist in one postal code area (e.g. M5A with
# Harbourfront and Regent Park). Such rows are combined into one row per
# (PostalCode, Borough) with the neighborhoods separated by ", ", matching the
# comma-separated format already used in the table.
postal_df = (
    postal_df
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg({"Neighborhood": ", ".join})
)

#show
postal_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Import the CSV


In [29]:
# Google Colab only: files.upload() prompts for a file to upload into the runtime.
# NOTE(review): the recorded output shows this upload being saved as
# "Geospatial_Coordinates (1).csv", which suggests a file with the original name
# already existed. The later read_csv('Geospatial_Coordinates.csv') would then
# read that earlier file rather than this upload — confirm this is intended.
from google.colab import files
data_to_load = files.upload()

Saving Geospatial_Coordinates.csv to Geospatial_Coordinates (1).csv


In [0]:
# Load the postal-code coordinates table from the CSV in the working directory
coordinate = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

In [31]:
# Preview the first five rows of the coordinates table as loaded
coordinate.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [0]:
# Rename the key column so it matches postal_df's 'PostalCode' and the two frames
# can be joined on it. Reassigning the result (instead of inplace=True) keeps the
# step explicit and chainable.
coordinate = coordinate.rename(columns={'Postal Code': 'PostalCode'})

In [33]:
# Preview the first five rows again to check the renamed key column
coordinate.head(n=5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
# Attach the coordinate columns to every postal code; the left join keeps all
# rows of postal_df even when no coordinates are found for a code.
# Columns contributed by a previous run of this cell are dropped first, so that
# re-running the cell does not produce suffixed duplicates (Latitude_x, Latitude_y, ...).
coordinate_columns = [col for col in coordinate.columns if col != 'PostalCode']
postal_df = pd.merge(
    postal_df.drop(columns=coordinate_columns, errors='ignore'),
    coordinate,
    how='left',
    on=['PostalCode'],
)
postal_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [35]:
#compare to the dataframe of the course
listcodepostal=["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration. The matching slices are collected and concatenated once instead.
# Rows come out in the order of listcodepostal; a code that is absent from
# postal_df simply contributes no row. Columns follow postal_df's column order.
compare = pd.concat(
    [postal_df[postal_df["PostalCode"] == codepostal] for codepostal in listcodepostal]
)

# Displayed with a fresh 0..n-1 index; `compare` itself keeps postal_df's index labels.
compare.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
