## This notebook will be mainly used for the Segmenting and Clustering Neighborhoods in Toronto.

### Initial Requirements

In [None]:
!conda install lxml --y

Solving environment: \ 

In [1]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 16.0MB/s eta 0:00:01 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install beautifulsoup4

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 15.6MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.8.2 soupsieve-1.9.5
Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
from pandas.io.html import read_html
import numpy as np # library to handle data in a vectorized manner
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

## Part 1- Getting the table

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikitables = read_html(url, attrs={"class":"wikitable"})
print("Extracted {num} wikitables".format(num=len(wikitables)))

Extracted 1 wikitables


In [3]:
wikitables[0].head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Dropping Brough 'Not assigned'

In [4]:
# Get names of indexes for which column Age has value 30
indexNames = wikitables[0][ wikitables[0]['Borough'] == 'Not assigned' ].index
 
# Delete these row indexes from dataFrame
wikitables[0].drop(indexNames , inplace=True)
wikitables[0]

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


### Filtering of postcodes

In [8]:
#join of Neighbourhoods
wikitables_grouped_neigh = wikitables[0].groupby('Postcode').Neighbourhood.agg([('Neighbourhood', ', '.join)])
#group of Boroughs comma separated
wikitables_grouped_bor = wikitables[0].groupby('Postcode').Borough.agg([('Borough', ', '.join)])
#Postcode from index to column
wikitables_grouped_bor.reset_index(inplace=True)
#Merge of columns
wikitables_grouped_all = pd.DataFrame(zip(wikitables_grouped_bor.Postcode, wikitables_grouped_bor.Borough, wikitables_grouped_neigh.Neighbourhood))
#Column renaming
wikitables_grouped_all = wikitables_grouped_all.rename(columns={0: 'Postcodes', 1: 'Borough', 2:'Neighbourhood'})
#Borough only lasting of the last item after the last comma
wikitables_grouped_all['Borough'] = wikitables_grouped_all['Borough'].str.rsplit(',').str[-1] 
#Print
wikitables_grouped_all

Unnamed: 0,Postcodes,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


### Not assigned Neighbourhoods copied from its Boroughs

In [9]:
indexNames = wikitables[0][ wikitables[0]['Neighbourhood'] == 'Not assigned' ].index
print(indexNames)
wikitables_grouped_all.loc[9]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(wikitables_grouped_all)

Int64Index([9], dtype='int64')
    Postcodes            Borough  \
0         M1B        Scarborough   
1         M1C        Scarborough   
2         M1E        Scarborough   
3         M1G        Scarborough   
4         M1H        Scarborough   
5         M1J        Scarborough   
6         M1K        Scarborough   
7         M1L        Scarborough   
8         M1M        Scarborough   
9         M1N        Scarborough   
10        M1P        Scarborough   
11        M1R        Scarborough   
12        M1S        Scarborough   
13        M1T        Scarborough   
14        M1V        Scarborough   
15        M1W        Scarborough   
16        M1X        Scarborough   
17        M2H         North York   
18        M2J         North York   
19        M2K         North York   
20        M2L         North York   
21        M2M         North York   
22        M2N         North York   
23        M2P         North York   
24        M2R         North York   
25        M3A         North York 

### Actual shape

In [11]:
wikitables_grouped_all.shape

(103, 3)

## Part 2- Adding Latitude & Longitude

In [41]:
latlong = pd.read_csv("https://cocl.us/Geospatial_data")
wikitables_grouped_all['Latitude'] = latlong['Latitude']
wikitables_grouped_all['Longitude'] = latlong['Longitude']
wikitables_grouped_all

Unnamed: 0,Postcodes,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


## Part 3- cluster areas k=5

In [55]:
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [47]:
#df reset
wikitables_grouped_all = wikitables_grouped_all.drop('Cluster Labels', 1)
wikitables_merged = wikitables_merged.drop('Cluster Labels', 1)

In [48]:
# set number of clusters
kclusters = 5
wikitables_grouped_all_clustering = latlong.drop('Postal Code', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(wikitables_grouped_all_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 
# add clustering labels
wikitables_merged = wikitables_grouped_all
wikitables_merged.insert(0, 'Cluster Labels', kmeans.labels_)
wikitables_merged

Unnamed: 0,Cluster Labels,Postcodes,Borough,Neighbourhood,Latitude,Longitude
0,0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,0,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,0,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,0,M1G,Scarborough,Woburn,43.770992,-79.216917
4,0,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...,...
98,1,M9N,York,Weston,43.706876,-79.518188
99,1,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,1,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,1,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


In [67]:
# create map
latitude = 43.706748
longitude = -79.4
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(wikitables_merged['Latitude'], wikitables_merged['Longitude'], wikitables_merged['Neighbourhood'], wikitables_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters