# Segmenting and Clustering Neighbourhoods in Toronto

# Task 1

### Installing and Importing the required Libraries

In [1]:
!pip install beautifulsoup4
!pip install lxml
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 


from IPython.display import display_html
import pandas as pd
import numpy as np
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Folium installed')
print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


### Scraping the Wikipedia page and transforming the data into a pandas dataframe

In [32]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(source,'lxml')
print(soup.title)
dfs = pd.read_html(tab)
df=dfs[0]
df.head(10)

<title>List of postal codes of Canada: M - Wikipedia</title>


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Data preprocessing and cleaning

In [30]:
# Dropping the rows where Borough is 'Not assigned'
df1 = df[df.Borough != 'Not assigned']

# Combining the neighbourhoods with same Postalcode
df2 = df1.groupby(['Postal Code','Borough'], sort=False).agg(', '.join)
df2.reset_index(inplace=True)

# Replacing the name of the neighbourhoods which are 'Not assigned' with names of Borough
df2['Neighbourhood'] = np.where(df2['Neighbourhood'] == 'Not assigned',df2['Borough'], df2['Neighbourhood'])

df2.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [31]:
df2.shape

(103, 3)

# Task 2

### Importing the csv file conatining the latitudes and longitudes for various neighbourhoods in Canada

In [27]:
lat_lon = pd.read_csv('https://cocl.us/Geospatial_data')
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [28]:
lat_lon.shape

(103, 3)

### Merging the two tables for getting the Latitudes and Longitudes for various neighbourhoods in Canada

In [25]:
df3 = pd.merge(df2, lat_lon, left_index = True , right_index = True ).drop(columns='Postal Code')
df3.head(10)

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,North York,Parkwoods,M1B,43.806686,-79.194353
1,North York,Victoria Village,M1C,43.784535,-79.160497
2,Downtown Toronto,"Regent Park, Harbourfront",M1E,43.763573,-79.188711
3,North York,"Lawrence Manor, Lawrence Heights",M1G,43.770992,-79.216917
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1H,43.773136,-79.239476
5,Etobicoke,"Islington Avenue, Humber Valley Village",M1J,43.744734,-79.239476
6,Scarborough,"Malvern, Rouge",M1K,43.727929,-79.262029
7,North York,Don Mills,M1L,43.711112,-79.284577
8,East York,"Parkview Hill, Woodbine Gardens",M1M,43.716316,-79.239476
9,Downtown Toronto,"Garden District, Ryerson",M1N,43.692657,-79.264848


In [29]:
df3.shape

(103, 5)

# Task 3

## Map of Toronto

### Getting all the rows from the data frame which contains Toronto in their Borough.

In [34]:
df4 = df3[df3['Borough'].str.contains('Toronto',regex=False)]
df4.head(10)

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
2,Downtown Toronto,"Regent Park, Harbourfront",M1E,43.763573,-79.188711
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1H,43.773136,-79.239476
9,Downtown Toronto,"Garden District, Ryerson",M1N,43.692657,-79.264848
15,Downtown Toronto,St. James Town,M1W,43.799525,-79.318389
19,East Toronto,The Beaches,M2K,43.786947,-79.385975
20,Downtown Toronto,Berczy Park,M2L,43.75749,-79.374714
24,Downtown Toronto,Central Bay Street,M2R,43.782736,-79.442259
25,Downtown Toronto,Christie,M3A,43.753259,-79.329656
30,Downtown Toronto,"Richmond, Adelaide, King",M3K,43.737473,-79.464763
31,West Toronto,"Dufferin, Dovercourt Village",M3L,43.739015,-79.506944


### Visualizing all the Neighbourhoods of the above data frame using Folium

## the maps may not appeared in Github, check out toronto map and cluster map in repository for more information

In [45]:
map_toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

for lat,lng,borough,neighbourhood in zip(df4['Latitude'],df4['Longitude'],df4['Borough'],df4['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
map_toronto

### Using KMeans clustering for the clsutering of the neighbourhoods

In [37]:
k=5
toronto_clustering = df4.drop(['Postcode','Borough','Neighbourhood'],1)
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_clustering)
kmeans.labels_
df4.insert(0, 'Cluster Labels', kmeans.labels_)

In [38]:
df4.head(10)

Unnamed: 0,Cluster Labels,Borough,Neighbourhood,Postcode,Latitude,Longitude
2,4,Downtown Toronto,"Regent Park, Harbourfront",M1E,43.763573,-79.188711
4,4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1H,43.773136,-79.239476
9,2,Downtown Toronto,"Garden District, Ryerson",M1N,43.692657,-79.264848
15,0,Downtown Toronto,St. James Town,M1W,43.799525,-79.318389
19,0,East Toronto,The Beaches,M2K,43.786947,-79.385975
20,0,Downtown Toronto,Berczy Park,M2L,43.75749,-79.374714
24,0,Downtown Toronto,Central Bay Street,M2R,43.782736,-79.442259
25,0,Downtown Toronto,Christie,M3A,43.753259,-79.329656
30,3,Downtown Toronto,"Richmond, Adelaide, King",M3K,43.737473,-79.464763
31,1,West Toronto,"Dufferin, Dovercourt Village",M3L,43.739015,-79.506944


In [39]:
# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighbourhood, cluster in zip(df4['Latitude'], df4['Longitude'], df4['Neighbourhood'], df4['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters