In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## We will scrape the postal code table from Wikipedia using urllib and BeautifulSoup

In [11]:
## Scrape the postal code table from Wikipedia using urllib and BeautifulSoup

import urllib.request

# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Open the page inside a `with` block so the HTTP response is closed as soon
# as it has been parsed, instead of staying open for the life of the kernel.
with urllib.request.urlopen(url) as page:
    # parse the HTML from our URL into the BeautifulSoup parse tree format
    soup = BeautifulSoup(page, "lxml")

# Select the table tagged with the 'wikitable sortable' class.
# find() yields None when nothing matches, so a None here means the page
# layout no longer contains such a table.
right_table = soup.find('table', class_='wikitable sortable')

In [3]:
# Column accumulators: A = first cell, B = second cell, C = third cell of
# every table row that has exactly three <td> cells.
A, B, C = [], [], []

for row in right_table.findAll('tr'):
    cells = row.findAll('td')
    # Rows without exactly three data cells (e.g. header rows) are skipped.
    if len(cells) != 3:
        continue
    first_cell, second_cell, third_cell = cells
    # find(text=True) takes the first text node inside each cell.
    A.append(first_cell.find(text=True))
    B.append(second_cell.find(text=True))
    C.append(third_cell.find(text=True))


In [4]:
# Assemble the three scraped columns into one frame; dict order fixes the
# column order (Postal Code, Borough, Neighborhood).
df = pd.DataFrame({
    'Postal Code': A,
    'Borough': B,
    'Neighborhood': C,
})
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


## Every scraped value ends with a trailing \n, so we are going to cut it off

In [5]:
## Every scraped value ends with \n, so cut each one at its first newline


def _cut_at_newline(words):
    """Return a new list with each string truncated at its first newline.

    Strings that contain no newline are returned unchanged. (Slicing with
    the result of ``str.find`` would drop their last character, because
    ``find`` returns -1 when nothing is found.)

    Parameters
    ----------
    words : iterable of str
        Raw text values to clean.

    Returns
    -------
    list of str
        One cleaned value per input value, in the same order.
    """
    # partition() splits around the first '\n'; element 0 is everything
    # before it, or the whole string when there is no '\n' at all.
    return [word.partition('\n')[0] for word in words]


A1 = _cut_at_newline(A)
B1 = _cut_at_newline(B)
C1 = _cut_at_newline(C)

# Rebuild the frame from the cleaned columns.
df = pd.DataFrame(A1, columns=['Postal Code'])
df['Borough'] = B1
df['Neighborhood'] = C1
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## We check whether any rows have an assigned Borough but a "Not assigned" Neighborhood, or vice versa

In [6]:
## Check for rows where exactly one of Borough / Neighborhood is 'Not assigned'

# filter1: Borough is assigned while Neighborhood is not.
filter1 = df['Borough'].ne('Not assigned') & df['Neighborhood'].eq('Not assigned')
# filter2: the opposite case, Neighborhood is assigned while Borough is not.
filter2 = df['Borough'].eq('Not assigned') & df['Neighborhood'].ne('Not assigned')

df1 = df.loc[filter1]
df2 = df.loc[filter2]

# A row count of 0 in either shape means no such mixed rows exist.
print('df1 Shape is', df1.shape)
print('df2 Shape is', df2.shape)

df1 Shape is (0, 3)
df2 Shape is (0, 3)


#### The result above shows that there are no such rows



## We now apply a filter to drop the "Not assigned" rows

In [8]:
## Drop the rows where both Borough and Neighborhood are 'Not assigned'

# A row is kept unless BOTH columns are 'Not assigned'.
# NOTE(review): the name `filter` shadows the Python builtin; it is kept here
# because other cells may refer to it. Confirm before renaming.
both_unassigned = (df['Borough'] == 'Not assigned') & (df['Neighborhood'] == 'Not assigned')
filter = ~both_unassigned
df = df.loc[filter]

print('Filtered Dataframe Shape is', df.shape)

Filtered Dataframe Shape is (103, 3)


## We look for duplicate postal codes, to find out whether some neighborhoods have to be merged

In [9]:
## Occurrences per postal code, in descending order, so a duplicated
## postal code (count > 1) would show up in the first rows printed.

print(
    df['Postal Code']
    .value_counts()
    .sort_values(ascending=False)
    .head()
)

M1V    1
M2L    1
M2H    1
M5V    1
M5M    1
Name: Postal Code, dtype: int64


In [12]:
# Preview the first 15 rows of the filtered frame.
df.head(n=15)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


## We load the latitude and longitude of each postal code from the linked CSV file. Then we are going to merge on the postal code

In [13]:
## Load the latitude and longitude of each postal code from the CSV link given
## in the assignment. It is merged with the main frame on the postal code later.
LAT_LON = pd.read_csv('http://cocl.us/Geospatial_data')

# Show only a preview: display.max_rows is set to None in the import cell, so
# a bare `LAT_LON` would render every row of the frame into the notebook.
LAT_LON.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


## We merge the original dataframe with the LAT_LON DF, using postal code as the common column

In [14]:
# Left join: every row of df is kept, and Latitude/Longitude are attached
# wherever LAT_LON has a matching 'Postal Code'.
df_merged = pd.merge(
    df,
    LAT_LON,
    how='left',
    on='Postal Code',
)
df_merged.head(n=15)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
