In [32]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Fetch the web page and parse its HTML

In [33]:
# Wikipedia page listing the Canadian postal codes that start with "M".
page_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Give up after 5 seconds without a response instead of hanging the notebook.
page_response = requests.get(page_link, timeout=5)
# Stop on an HTTP error status (4xx/5xx) so that an error page is never
# parsed further down as if it were the postal-code table.
page_response.raise_for_status()

In [34]:
page_content = BeautifulSoup(page_response.content, "html.parser")

# Number of leading <td> cells to keep (867 = 289 rows x 3 columns).
# NOTE(review): presumably the size of the postal-code table when the notebook
# was written -- confirm against the current page layout.
n_cells = 867

# Query the document for <td> cells once instead of once per loop iteration,
# then keep the text of the first `n_cells` of them. If the page has fewer
# cells, all of them are kept rather than raising an IndexError.
cells = page_content.find_all('td')
text = [cell.text for cell in cells[:n_cells]]

# Preview the first three (postcode, borough, neighbourhood) triples.
text[0:9]

['M1A',
 'Not assigned',
 'Not assigned\n',
 'M2A',
 'Not assigned',
 'Not assigned\n',
 'M3A',
 'North York',
 'Parkwoods\n']

## Transform the text into a DataFrame

In [35]:
# `text` is a flat list of table cells laid out as repeating
# (postcode, borough, neighbourhood) triples, so each column is every third item.
postcode = text[0::3]
borough = text[1::3]
# Only the last cell of each row carries the trailing newline.
neighbourhood = [cell.replace('\n', '') for cell in text[2::3]]

table = pd.DataFrame({
    'Postcode': postcode,
    'Borough': borough,
    'Neighbourhood': neighbourhood,
})

# Drop postcodes that have no borough. The explicit copy makes the result an
# independent frame, so the assignment below modifies it directly.
table = table[table['Borough'] != 'Not assigned'].copy()

# A row with a borough but no neighbourhood takes the borough name as its
# neighbourhood. `.loc` replaces the original chained indexing
# (table[col][mask] = ...), which assigns to a temporary object and has no
# effect under pandas copy-on-write.
unnamed = table['Neighbourhood'] == 'Not assigned'
table.loc[unnamed, 'Neighbourhood'] = table.loc[unnamed, 'Borough']

# Collapse to one row per (Postcode, Borough), joining its neighbourhoods
# into a single comma-separated string.
table = table.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
table.head(30)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Shape of the table

In [36]:
# (rows, columns) of the cleaned table, which has one row per (Postcode, Borough) group.
table.shape

(103, 3)

In [37]:
# Load the coordinates from a local CSV file named 'Geospatial_data' in the
# working directory. The commented-out command below appears to be how that
# file was downloaded -- NOTE(review): confirm the URL still serves it.
#!wget 'http://cocl.us/Geospatial_data'
Geospatial_data=pd.read_csv('Geospatial_data')
# Preview the first five rows.
Geospatial_data.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [42]:
# Index the coordinates by postal code so they can be looked up from the
# 'Postcode' column of the cleaned table.
coords_by_postcode = Geospatial_data.set_index('Postal Code')
# Left join: every row of `table` is kept and gains the columns of the
# coordinate frame for its postcode.
table_geo = table.join(coords_by_postcode, on='Postcode')
table_geo.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [59]:
# Keep only the rows whose borough name contains the literal text "Toronto",
# and renumber them from 0.
is_toronto = table_geo['Borough'].str.contains('Toronto', regex=False)
table_toronto = table_geo[is_toronto].reset_index(drop=True)

# Feature matrix for clustering: everything except the descriptive columns.
# `columns=` replaces the positional axis argument of `drop(labels, 1)`,
# which pandas 2.0 and later reject with a TypeError.
table_toronto_clustering = table_toronto.drop(columns=['Postcode', 'Borough', 'Neighbourhood'])
table_toronto_clustering.head(5)

Unnamed: 0,Latitude,Longitude
0,43.676357,-79.293031
1,43.679557,-79.352188
2,43.668999,-79.315572
3,43.659526,-79.340923
4,43.72802,-79.38879


In [74]:
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [61]:
# Number of clusters to split the Toronto postcodes into.
kclusters=5

# Run k-means clustering on the feature matrix. random_state fixes the
# centroid initialisation. n_init is pinned to 10 (the former default)
# because scikit-learn 1.4 changed the default to 'auto', which would
# otherwise make the labels depend on the installed version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(table_toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 0, 0, 0, 2, 2, 2, 0, 4, 0, 0, 4, 4, 4, 3], dtype=int32)

In [65]:
# Attach each row's k-means label. The labels line up with the rows because
# the model was fitted on table_toronto_clustering, which was derived from
# table_toronto without reordering.
table_toronto['Cluster Labels']=kmeans.labels_
table_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,3
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,3
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,3
3,M4M,East Toronto,Studio District,43.659526,-79.340923,3
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1


In [80]:
# create map
# The hard-coded centre equals the coordinates listed for postcode M4N in
# table_toronto.
map_clusters = folium.Map(location=[43.728020,-79.388790], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per
# cluster, converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(table_toronto['Latitude'], table_toronto['Longitude'], table_toronto['Neighbourhood'], table_toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly.
    # The previous `cluster-1` only worked because index -1 wraps around to
    # the last color for cluster 0.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters