### This notebook contains the clustering code and exercise for Week 3 of the Coursera Capstone Project

In [92]:
# standard library
import json
from io import StringIO

# third-party
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# display options: do not truncate columns or rows when rendering DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


print('Libraries imported.')

Libraries imported.


#### 1. Scraping data and converting it to a DataFrame

In [133]:
# Scrape the postal-code table from Wikipedia.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not let a stalled connection hang the kernel
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
# NOTE: despite its name, this variable holds the page HTML, not a URL.
canapedia_url = response.text

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
soup = BeautifulSoup(canapedia_url, 'html.parser')

# Convert the HTML tables to DataFrames and keep the first one found.
# read_html is handed a file-like object because newer pandas versions
# deprecate passing a raw HTML string.
data_html = soup.find_all('table')
canapedia_df = pd.read_html(StringIO(str(data_html)))[0]
canapedia_df.shape

(180, 3)

In [185]:
# Preview the first five rows of the scraped table.
canapedia_df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [239]:
# Clean the data: drop rows containing NaN and rows whose borough is the
# "not assigned" placeholder.

# axis must be passed by keyword; pandas 2.x rejects a positional axis.
canapedia_df = canapedia_df.dropna(axis=0)

# Case- and whitespace-insensitive match so the placeholder is removed
# however the source table happens to spell it.
is_assigned = canapedia_df['Borough'].str.strip().str.lower() != 'not assigned'
canapedia_df = canapedia_df[is_assigned]

# drop=True discards the old index directly instead of materialising it as
# an 'index' column that then has to be dropped again.
canapedia_df = canapedia_df.reset_index(drop=True)
canapedia_df.head()


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### 2. Get Toronto's coordinates by postal code

In [187]:
# Download the postal-code coordinates CSV next to the notebook.
# Uses requests (already imported) instead of shelling out to wget, so the
# cell works on systems without wget and a failed download raises instead of
# being reported as a success.
coord_response = requests.get(
    'http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv',
    timeout=30,
)
coord_response.raise_for_status()
with open('Geospatial_Coordinates.csv', 'wb') as coord_file:
    coord_file.write(coord_response.content)
print('Data downloaded!')

Data downloaded!


In [240]:
# Load the downloaded coordinates file into a DataFrame and preview it.
coord_csv_path = 'Geospatial_Coordinates.csv'
can_coord = pd.read_csv(coord_csv_path)
can_coord.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### 3. Clustering neighbourhoods

In [241]:
# Number of clusters for k-means.
k = 7

# Cluster on the coordinate columns only. Selecting them by name (instead of
# dropping 'Postal Code') keeps this cell re-runnable: once a 'Labels' column
# has been added to can_coord, a drop-based selection would feed the old
# labels back in as a feature. It also avoids the positional `axis` argument
# that pandas 2.x no longer accepts.
cneigh_clustering = can_coord[['Latitude', 'Longitude']]

# Standardise both features to zero mean / unit variance before clustering.
cneigh_clustering_transform = StandardScaler().fit_transform(cneigh_clustering)

# run k-means clustering
# n_init is pinned explicitly because its default differs between
# scikit-learn releases; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(cneigh_clustering_transform)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

array([5, 5, 5, 5, 5, 0, 0, 0, 0, 0], dtype=int32)

In [242]:
# Attach each row's cluster label as the first column of can_coord.
# DataFrame.insert raises ValueError if the column already exists, so any
# 'Labels' column left over from a previous run is removed first; this makes
# the cell safe to re-run.
can_coord = can_coord.drop(columns='Labels', errors='ignore')
can_coord.insert(0, 'Labels', kmeans.labels_)
can_coord.head()

Unnamed: 0,Labels,Postal Code,Latitude,Longitude
0,5,M1B,43.806686,-79.194353
1,5,M1C,43.784535,-79.160497
2,5,M1E,43.763573,-79.188711
3,5,M1G,43.770992,-79.216917
4,5,M1H,43.773136,-79.239476


In [243]:
# Join the neighbourhood table with the labelled coordinates on postal code.
# The default join is an inner join, so only codes present in both frames remain.
can_neigh = pd.merge(canapedia_df, can_coord, on='Postal Code')
can_neigh.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Labels,Latitude,Longitude
0,M3A,North York,Parkwoods,4,43.753259,-79.329656
1,M4A,North York,Victoria Village,0,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",3,43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",1,43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",3,43.662301,-79.389494


#### 4. Visualisation

In [244]:
# create map, centred on the hard-coded coordinates below
# (presumably central Toronto — confirm if the data source changes)
latitude = 43.6532
longitude = -79.3832
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per neighbourhood row to the map
for lat, lon, poi, cluster in zip(can_neigh['Latitude'], can_neigh['Longitude'], can_neigh['Neighborhood'], can_neigh['Labels']):
    # Labels run 0..k-1, so index the palette directly; the previous
    # `cluster-1` only worked because index -1 wraps around to the last colour.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters