In [2]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/4f/86/1ab30184cb60bc2b95deffe2bd86b8ddbab65a4fac9f7313c278c6e8d049/folium-0.9.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 14.1MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.9.1


### load libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

### scraping data

In [2]:
def load_page():
    page_url = "https://en.m.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
    rs = requests.get(page_url)
    soup = BeautifulSoup(rs.text)
    selected = soup.find(class_ = "wikitable sortable")
    return selected

### process dataframe
 _1. Drop Borough equals Not assigned_<br/>
 _2. Subsitute for Neighbour Not assigned with its Borough_<br/>
 _3. Join Neighbour with same Postcode_<br/>

In [3]:
def table_to_dataframe(soup):
    df = pd.read_html(str(soup))[0]
    df = df[df.Borough!='Not assigned']
    df.at[df.Neighbourhood=='Not assigned', 'Neighbourhood'] = df.Borough
    df = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()
    return df

### load dataframe

In [4]:
df_post = table_to_dataframe(load_page())
df_post.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Load Geopatial data

In [5]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.rename(columns = {'Postal Code':'Postcode'},inplace=True)

### Merge data

In [6]:
df = df_post.merge(df_geo,on='Postcode')

### Filtering Toronto

In [7]:
df_toronto = df[df.Borough.str.contains('Toronto')]
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Kmean Clustering

In [8]:
def fit_model(k,data):
    model = KMeans(n_clusters=k,init='k-means++',random_state=0).fit(data)
    return model.labels_

### Visulization
 _1.Use unique borough count in Toronto as the number of clusters_<br/>
 _2.Use latitude and longitude in the Central Toronto as central point of the Map_<br/> 
 _2.Show clusters on Map_<br/>

In [12]:
#Clustering
k = df_toronto.Borough.nunique()
data = df_toronto[['Latitude','Longitude']]
labels = fit_model(k,data)
df_toronto['Label'] = labels
#Define Map central point
latitude,longitude = df_toronto[df_toronto.Borough=='Central Toronto'][['Latitude','Longitude']].values[0]

#Create Map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

#Create Color Palette
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#Add Markers with cluster labels
for lat, lon, post, neigh, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Postcode'],df_toronto['Neighbourhood'], df_toronto['Label']):
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(str(post) + ' ' + str(neigh), parse_html=True),
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
#Show map
map_clusters

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Examine

In [27]:
df_toronto[['Borough','Label']].groupby(['Borough']).Label.value_counts()

Borough           Label
Central Toronto   2         8
                  1         1
Downtown Toronto  1        17
                  3         1
East Toronto      0         5
West Toronto      3         6
Name: Label, dtype: int64

### The assignment of postal code was based on it's geographic position. Through kmean clustering we can find that the majority of labels match the related postal codes, and only few points in the middle of two boroughs were mislabeled.