## I import all the required libraries for the code

In [None]:
from bs4 import BeautifulSoup

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): this import path has been deprecated since pandas 1.0 in favour of pandas.json_normalize.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

## I download the Wikipedia page with requests and pass its HTML content to BeautifulSoup

In [91]:
# Download the Wikipedia page and parse it into a BeautifulSoup tree.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# requests has no default timeout; without one this cell can hang forever on a stalled connection.
html_file = requests.get(url, timeout=30)
# Stop on an HTTP error status instead of parsing an error page as if it were the article.
html_file.raise_for_status()
# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(html_file.content, 'lxml')

In [None]:
# Print the whole parsed document as indented HTML to inspect its structure.
print(soup.prettify())

## This extracts the title of the webpage

In [None]:
# Print the text of the page's <title> element.
print(soup.title.text)

### Since we are interested in the data in the table, this next step extracts the required data from the webpage

In [93]:
# Locate the first <table> element that carries the "wikitable" CSS class.
table = soup.find('table', class_='wikitable')
# find() returns None when nothing matches; fail here with a clear message
# instead of with an AttributeError in a later cell.
if table is None:
    raise ValueError("No table with class 'wikitable' was found on the page")

### I extract the table headers from the "th" tags

In [None]:
# Collect the text of every header cell (<th>) in the table, with newline characters removed.
columns = [header_cell.text.replace("\n", '') for header_cell in table.find_all('th')]

columns

### I extract the row data from the "td" tags

In [None]:
# Collect the text of every data cell (<td>) in the table as one flat list,
# with newline characters removed.
rows = [data_cell.text.replace("\n", '') for data_cell in table.find_all('td')]

rows

In [96]:
# Fold the flat list of cell texts into a 2-D array with three values per row,
# matching the three columns of the target DataFrame.
a = len(rows) // 3
x = np.array(rows).reshape(a, 3)

In [97]:
# Build the table from the reshaped records, using the scraped header texts as column names.
# from_records is a classmethod, so it is called on the class itself.
toronto_df = pd.DataFrame.from_records(x, columns=columns)

In [98]:
# Preview the first rows of the freshly built table.
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [99]:
# (rows, columns) of the table before any cleaning.
toronto_df.shape

(289, 3)

In [100]:
# Mark unassigned boroughs as missing values so they can be dropped afterwards.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# toronto_df['Borough'] acts on a temporary Series and does not update the frame under
# pandas Copy-on-Write. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
toronto_df['Borough'] = toronto_df['Borough'].replace('Not assigned', np.nan)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,Not assigned
1,M2A,,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [101]:
# Keep only the rows that have a borough; rebind the name instead of mutating in place.
toronto_df = toronto_df.dropna(subset=['Borough'])

In [102]:
# Preview the table after dropping the rows that have no borough.
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [103]:
# Where a neighbourhood is 'Not assigned', fall back to the borough of the same row.
# Series.mask substitutes row by row (aligned on the index). Passing a whole Series as the
# `value` of Series.replace is not an element-wise operation and is rejected by recent pandas,
# and inplace=True on a selected column does not update the frame under Copy-on-Write.
is_unassigned = toronto_df['Neighbourhood'] == 'Not assigned'
toronto_df['Neighbourhood'] = toronto_df['Neighbourhood'].mask(is_unassigned, toronto_df['Borough'])

In [104]:
# Preview the table after the 'Not assigned' neighbourhood substitution.
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [105]:
def foo(a):
    """Join the strings in `a` into a single string separated by '/'."""
    return "/".join(a)
# Collapse the table to one row per (Postcode, Borough) pair; the neighbourhood names
# of each pair are combined into one string by foo.
grouped = toronto_df.groupby(['Postcode', 'Borough'])
toronto_df = grouped.agg({'Neighbourhood': foo})

In [106]:
# Turn the (Postcode, Borough) group keys back into ordinary columns.
toronto_df = toronto_df.reset_index()

In [107]:
# Display the grouped table (one row per Postcode/Borough pair).
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge/Malvern
1,M1C,Scarborough,Highland Creek/Rouge Hill/Port Union
2,M1E,Scarborough,Guildwood/Morningside/West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,East Birchmount Park/Ionview/Kennedy Park
7,M1L,Scarborough,Clairlea/Golden Mile/Oakridge
8,M1M,Scarborough,Cliffcrest/Cliffside/Scarborough Village West
9,M1N,Scarborough,Birch Cliff/Cliffside West


In [108]:
# (rows, columns) of the table after grouping.
toronto_df.shape

(103, 3)

In [109]:
# Display the grouped table once more; nothing has changed it since the previous display.
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge/Malvern
1,M1C,Scarborough,Highland Creek/Rouge Hill/Port Union
2,M1E,Scarborough,Guildwood/Morningside/West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,East Birchmount Park/Ionview/Kennedy Park
7,M1L,Scarborough,Clairlea/Golden Mile/Oakridge
8,M1M,Scarborough,Cliffcrest/Cliffside/Scarborough Village West
9,M1N,Scarborough,Birch Cliff/Cliffside West


## I download the geospatial data file and split its comma-separated text into an array

In [110]:
# Download the geospatial data file and keep its raw bytes in html_file.
url = 'https://cocl.us/Geospatial_data'
# requests has no default timeout; without one this cell can hang forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop on an HTTP error status instead of treating an error page as data.
response.raise_for_status()
html_file = response.content

In [111]:
# Split the downloaded text into one flat list of fields, line by line.
# splitlines() accepts both "\r\n" and "\n" line endings, and blank lines are skipped,
# so a trailing newline does not add an empty field that would break the
# three-column reshape that follows.
lonlat = [
    field
    for line in html_file.decode('ascii').splitlines()
    if line
    for field in line.split(',')
]

In [112]:
# Count the parsed fields; the reshape in the next cell needs a multiple of three.
len(lonlat)

312

In [113]:
# Fold the flat list of fields into a 2-D array with three values per row.
a = len(lonlat) // 3
x = np.array(lonlat).reshape(a, 3)

In [114]:
# The first row of the array holds the column names; every following row is a data record.
columns, rows = x[0], x[1:]

In [115]:
# Build the coordinates table from the parsed records. from_records is a classmethod,
# so it is called on the class directly rather than first binding the DataFrame class
# itself to lonlat_df.
lonlat_df = pd.DataFrame.from_records(rows, columns=columns)
# The values were split out of text, so the coordinates are still strings at this point;
# convert them to floats so they can be used numerically (distances, clustering, mapping).
lonlat_df = lonlat_df.astype({'Latitude': float, 'Longitude': float})

In [116]:
# Use the same key column name as toronto_df ('Postcode') so the two tables can be joined.
lonlat_df = lonlat_df.rename(columns={'Postal Code': 'Postcode'})

In [117]:
# Index the coordinates table by postcode.
lonlat_df = lonlat_df.set_index('Postcode')

In [118]:
# Preview the first rows of the coordinates table, now indexed by postcode.
lonlat_df.head()

Unnamed: 0_level_0,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.8066863,-79.1943534
M1C,43.7845351,-79.1604971
M1E,43.7635726,-79.1887115
M1G,43.7709921,-79.2169174
M1H,43.773136,-79.2394761


In [119]:
# Index the postcode table by postcode as well, so the join can match on the index.
toronto_df = toronto_df.set_index('Postcode')

In [120]:
# Attach the coordinates to each postcode; the inner join keeps only postcodes present in
# both tables. Afterwards the postcode index is turned back into an ordinary column.
joined = toronto_df.join(lonlat_df, how='inner')
toronto_df = joined.reset_index()

In [121]:
# Display the postcode table together with the joined Latitude/Longitude columns.
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Rouge/Malvern,43.8066863,-79.1943534
1,M1C,Scarborough,Highland Creek/Rouge Hill/Port Union,43.7845351,-79.1604971
2,M1E,Scarborough,Guildwood/Morningside/West Hill,43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761
5,M1J,Scarborough,Scarborough Village,43.7447342,-79.2394761
6,M1K,Scarborough,East Birchmount Park/Ionview/Kennedy Park,43.7279292,-79.2620294
7,M1L,Scarborough,Clairlea/Golden Mile/Oakridge,43.7111117,-79.2845772
8,M1M,Scarborough,Cliffcrest/Cliffside/Scarborough Village West,43.716316,-79.2394761
9,M1N,Scarborough,Birch Cliff/Cliffside West,43.692657,-79.2648481
