### Installing the libraries

In [2]:
# Install geopy into the user site-packages; it supplies the Nominatim geocoder imported further down.
!pip install --user geopy

Collecting geopy
  Using cached https://files.pythonhosted.org/packages/53/fc/3d1b47e8e82ea12c25203929efb1b964918a77067a874b2c7631e2ec35ec/geopy-1.21.0-py2.py3-none-any.whl
Collecting geographiclib<2,>=1.49 (from geopy)
  Using cached https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.21.0


In [3]:
# Install folium into the user site-packages; it renders the maps drawn later in this notebook.
!pip install --user folium

Collecting folium
  Using cached https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [5]:
!pip install --user sklearn

Collecting sklearn
Collecting scikit-learn (from sklearn)
  Downloading https://files.pythonhosted.org/packages/59/49/a6e1f2b9f94e4fca0c04f166db5c713c6d0a81c2f039fb0c66e770bbbcb1/scikit_learn-0.22.2.post1-cp36-cp36m-win_amd64.whl (6.5MB)
Collecting joblib>=0.11 (from scikit-learn->sklearn)
  Using cached https://files.pythonhosted.org/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14.1-py2.py3-none-any.whl
Installing collected packages: joblib, scikit-learn, sklearn
Successfully installed joblib-0.14.1 scikit-learn-0.22.2.post1 sklearn-0.0


### Importing the libraries

In [6]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# None removes the display limit, so frames are shown with every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): this import path was deprecated in pandas 1.0 in favour of
# `pandas.json_normalize` — confirm against the pandas version in use.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Loading the dataset

In [7]:
# Read the postcode / borough / neighbourhood table; the relative path is
# resolved against the notebook's working directory.
data = pd.read_csv('postalcode_canada.csv')
# Preview the first rows.
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,Rouge
2,M1B,Scarborough,Malvern
3,M1C,Scarborough,Highland Creek
4,M1C,Scarborough,Rouge Hill


### Find the number of rows and columns of the dataset

In [8]:
# (rows, columns) of the table as loaded, before any cleaning.
data.shape

(287, 3)

### Removing rows with an unassigned borough

In [9]:
# Index labels of every row whose Borough holds the 'Not assigned' placeholder.
nonassigned = data.loc[data['Borough'].eq('Not assigned')].index

In [10]:
# Show the row labels that were selected for removal.
nonassigned

Int64Index([  0,  38,  39,  40,  41,  42,  43,  44,  57,  58,  59,  60,  61,
             62,  63,  68,  69,  80,  81,  82,  83,  84,  85,  86,  87,  88,
            117, 157, 158, 187, 188, 189, 190, 191, 192, 194, 195, 196, 197,
            198, 199, 200, 201, 202, 203, 204, 206, 207, 208, 209, 210, 212,
            213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225,
            226, 227, 259, 260, 261, 262, 263, 273, 274, 284, 285, 286],
           dtype='int64')

In [11]:
# Remove the rows whose labels were collected in `nonassigned`.
# `data` is rebound to the result rather than mutated with inplace=True, so the
# frame displayed by earlier cells is not silently altered underneath them.
data = data.drop(index=nonassigned)
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,Rouge
2,M1B,Scarborough,Malvern
3,M1C,Scarborough,Highland Creek
4,M1C,Scarborough,Rouge Hill
5,M1C,Scarborough,Port Union


### The number of rows and columns after removing rows where borough is not assigned

In [12]:
# (rows, columns) once the rows with an unassigned borough have been dropped.
data.shape

(210, 3)

### Merging rows that share a postal code but list different neighbourhoods

In [13]:
# Collapse the table to one row per postcode.
#
# Each postcode keeps the borough of its first row, and its neighbourhoods are
# joined into one comma-separated string. A single groupby replaces the former
# per-postcode loop, which told "one row" from "several rows" with
# `df_tmp.size > 3` -- a test that only works while the frame has exactly three
# columns.
separator = ', '
data = (
    data
    # sort=False keeps postcodes in order of first appearance.
    .groupby('Postcode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': separator.join})
)

# A postcode whose only neighbourhood is the 'Not assigned' placeholder carries
# no usable name, so it is discarded; the index is then renumbered from 0.
data = data[data['Neighbourhood'] != 'Not assigned'].reset_index(drop=True)
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# (rows, columns) after collapsing to one row per postcode.
data.shape

(103, 3)

### Loading geospatial data

In [15]:
# Read the per-postcode latitude/longitude table; the relative path is resolved
# against the notebook's working directory.
geodata = pd.read_csv('Geospatial_Coordinates.csv')
# Preview the first rows.
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging the two datasets

In [16]:
# Give the coordinate table the same key column name as the postcode table so
# that the two can be joined on 'Postcode'.
geodf = geodata.rename({'Postal Code': 'Postcode'}, axis='columns')
geodf.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# Attach coordinates to each postcode row by joining on the shared 'Postcode'
# key (default inner join).
dataset = data.merge(geodf, on='Postcode')
dataset.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [18]:
# (rows, columns) of the merged postcode + coordinates table.
dataset.shape

(103, 5)

### Getting the latitude and longitude of Toronto using the geopy library

In [19]:
# Look up Toronto's coordinates through the Nominatim geocoding service.
address = 'Toronto'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create a map of Toronto with neighborhoods superimposed on top.

In [16]:
# create map of toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
# The neighbourhood loop variable has its own name on purpose: calling it
# `dataset` would rebind the `dataset` DataFrame to a plain string once the
# loop finishes and break every later cell that uses the frame.
for lat, lng, borough, neighbourhood in zip(dataset['Latitude'], dataset['Longitude'], dataset['Borough'], dataset['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    # parse_html is a folium.Popup argument, so it is set here only.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [20]:
# Preview the merged table that the clustering step reads from.
dataset.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Using k-means to cluster neighbourhoods

In [21]:
# Feature matrix for clustering: only the two coordinate columns.
cluster = dataset.loc[:, ['Latitude', 'Longitude']]

In [22]:
# Number of clusters to form.
k = 8
# random_state is fixed so the cluster assignment is reproducible between runs.
kmeans = KMeans(n_clusters=k, random_state=0).fit(cluster)

# DataFrame.insert raises ValueError if the column already exists, so discard
# any labels left by a previous run of this cell before inserting the new ones
# as the first column.
dataset = dataset.drop(columns='Cluster Labels', errors='ignore')
dataset.insert(0, 'Cluster Labels', kmeans.labels_)

In [23]:
# Cluster index assigned to each row, in the same order as the rows that were fitted.
kmeans.labels_

array([2, 2, 2, 2, 2, 2, 2, 5, 2, 5, 2, 5, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0,
       6, 6, 6, 0, 0, 5, 6, 4, 6, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       6, 6, 6, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 3,
       3, 3, 3, 3, 3, 6, 6, 6, 1, 3, 1, 3, 3, 4, 1, 1, 1, 1, 1, 3, 7, 5,
       1, 7, 1, 1, 7, 7, 7, 7, 4, 4, 4, 4, 4, 4, 4])

### Creating a map with clusters

In [25]:

# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters: k evenly spaced colours from the rainbow
# colormap, converted to hex strings and indexed by cluster label
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
# The label loop variable is deliberately not called `cluster`: that name holds
# the Latitude/Longitude feature frame passed to KMeans, and rebinding it to an
# integer would break a re-run of the clustering cell.
for lat, lon, cluster_label in zip(dataset['Latitude'], dataset['Longitude'], dataset['Cluster Labels']):
    popup = folium.Popup(' Cluster ' + str(cluster_label), parse_html=True)
    # Index the palette directly by label; `label - 1` would make label 0 wrap
    # around to the last colour through negative indexing.
    marker_colour = rainbow[cluster_label]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters