In [45]:
!pip install beautifulsoup4
!pip install lxml
!pip install geopy

import numpy as np
import pandas as pd
import requests
import random
import folium
import matplotlib.colors as colors
import matplotlib.cm as cm
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from IPython.display import Image 
from IPython.core.display import HTML
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup

print('Libraries installed and imported')

Libraries installed and imported


# Get data from Wikipedia

In [9]:
from IPython.display import display_html

# Wikipedia page holding the table of "M" postal codes
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
data = response.text

soup = BeautifulSoup(data, 'lxml')
print(soup.title)

# soup.table is the first <table> element in the document, or None if there is
# none; without this check str(None) would be passed on as the string 'None'.
first_table = soup.table
if first_table is None:
    raise ValueError('No <table> element found at ' + WIKI_URL)

# Keep the table as an HTML string: it is rendered here and parsed later
tab = str(first_table)
display_html(tab, raw=True)

<title>List of postal codes of Canada: M - Wikipedia</title>


Postal Code,Borough,Neighbourhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,Not assigned
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"


# Convert data from html table to Pandas DataFrame

In [16]:
from io import StringIO

# pd.read_html returns a list of DataFrames, one per parsed <table>.
# The markup is wrapped in StringIO because passing a literal HTML string
# directly is deprecated in recent pandas releases.
df1 = pd.read_html(StringIO(tab))

# `tab` holds a single table, so the first (only) frame is the one we want
df = df1[0]
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


# Drop unassigned rows and clean the data

In [19]:
# Discard rows whose Borough is the placeholder 'Not assigned'
df2 = df.loc[df['Borough'] != 'Not assigned']

# Collapse to one row per (Postal Code, Borough) pair, joining the remaining
# text values with ', ' and keeping the groups in first-seen order
clean_df = (
    df2
    .groupby(['Postal Code', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# A neighbourhood still marked 'Not assigned' takes its borough's name instead
needs_name = clean_df['Neighbourhood'] == 'Not assigned'
clean_df['Neighbourhood'] = np.where(needs_name, clean_df['Borough'], clean_df['Neighbourhood'])

clean_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [21]:
# Shape of the cleaned data as a (rows, columns) tuple
clean_df.shape

(103, 3)

# Import the CSV file with the latitude and longitude of each postal code

In [33]:
# Remote CSV with one row per postal code and its Latitude / Longitude
GEO_CSV_URL = 'https://cocl.us/Geospatial_data'

lat_and_lon = pd.read_csv(GEO_CSV_URL)
lat_and_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Merge the two tables to get the latitude and longitude of each neighbourhood

In [34]:
# Attach coordinates to every cleaned row by matching on 'Postal Code'
# (default inner join: only postal codes present in both tables are kept)
new_df = clean_df.merge(lat_and_lon, on='Postal Code')
new_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Cluster and plot the neighbourhoods whose Borough name contains 'Toronto'

In [35]:
# True for rows whose Borough contains the literal substring 'Toronto'
is_toronto = new_df['Borough'].str.contains('Toronto', regex=False)

new_df1 = new_df.loc[is_toronto]
new_df1


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


# Using Folium to visualize all the Neighbourhoods of the above data frame

In [36]:
# Base map centred on the given coordinates
toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# Walk the four columns needed for each marker as plain tuples
marker_fields = new_df1[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighbourhood in marker_fields.itertuples(index=False, name=None):
    # Popup text reads "<neighbourhood>, <borough>"
    popup_text = '{}, {}'.format(neighbourhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3192cc',
        fill_opacity=0.6,
        parse_html=False,
    )
    marker.add_to(toronto)

# Last expression of the cell: renders the map
toronto

# Using KMeans clustering

In [40]:
# Number of clusters to fit
k = 6

# Cluster on the coordinates only. Selecting the two columns by name (instead
# of dropping the others) keeps the feature set identical when this cell is
# re-run after 'Cluster Labels' has been added, and avoids the positional
# `axis` argument to drop(), which pandas 2.0 no longer accepts.
toronto_clustering = new_df1[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_clustering)

# insert() raises if the column already exists, so discard labels left over
# from an earlier run first. drop() returns a new frame, so the frame that
# new_df1 was sliced from is not modified.
new_df1 = new_df1.drop(columns='Cluster Labels', errors='ignore')
new_df1.insert(0, 'Cluster Labels', kmeans.labels_)

In [41]:
# Display new_df1, which now has 'Cluster Labels' as its first column
new_df1

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,2,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,2,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,2,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,5,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,2,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,3,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [49]:
# Create the map
clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One evenly spaced colour from the rainbow colormap per cluster, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per row, coloured by its cluster label
for lat, lon, cluster in zip(new_df1['Latitude'], new_df1['Longitude'], new_df1['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # `rainbow` has k entries, so a label indexes its colour directly
    # (assumes labels run 0..k-1, as in the displayed frame)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(clusters)  # was the undefined name `map_clusters`

In [50]:
# Display the `clusters` folium map
clusters