# Segmenting and Clustering Neighborhoods in Toronto
For this assignment, you will be required to explore and cluster the neighborhoods in Toronto.

import libraries

In [1]:
import requests 
import pandas as pd 
import numpy as np 
import random
from sklearn.cluster import KMeans
!pip install bs4
from bs4 import BeautifulSoup
print('Imported!')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1272 sha256=dddaa5e8fefcd3f799c16a09b1015f103f2504c3bac96e3887e4b356a80b33d2
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/0a/9e/ba/20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Imported!


Scrape data and create object

In [2]:
scrape = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(scrape,'lxml')
print(soup.title)

<title>List of postal codes of Canada: M - Wikipedia</title>


Create list, find table, create dictionary called cell having 3 keys PostalCode, Borough and Neighborhood, ignore Not Assigned, get Borough and Neighborhood information, append

In [3]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

print(table_contents)

[{'PostalCode': 'M3A', 'Borough': 'North York', 'Neighborhood': 'Parkwoods'}, {'PostalCode': 'M4A', 'Borough': 'North York', 'Neighborhood': 'Victoria Village'}, {'PostalCode': 'M5A', 'Borough': 'Downtown Toronto', 'Neighborhood': 'Regent Park, Harbourfront'}, {'PostalCode': 'M6A', 'Borough': 'North York', 'Neighborhood': 'Lawrence Manor, Lawrence Heights'}, {'PostalCode': 'M7A', 'Borough': "Queen's Park", 'Neighborhood': 'Ontario Provincial Government'}, {'PostalCode': 'M9A', 'Borough': 'Etobicoke', 'Neighborhood': 'Islington Avenue'}, {'PostalCode': 'M1B', 'Borough': 'Scarborough', 'Neighborhood': 'Malvern, Rouge'}, {'PostalCode': 'M3B', 'Borough': 'North York', 'Neighborhood': 'Don Mills North'}, {'PostalCode': 'M4B', 'Borough': 'East York', 'Neighborhood': 'Parkview Hill, Woodbine Gardens'}, {'PostalCode': 'M5B', 'Borough': 'Downtown Toronto', 'Neighborhood': 'Garden District, Ryerson'}, {'PostalCode': 'M6B', 'Borough': 'North York', 'Neighborhood': 'Glencairn'}, {'PostalCode': 'M9

Create a dataframe with list

In [4]:
df_Toronto=pd.DataFrame(table_contents)
df_Toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [5]:
#replace long Borough names
df_Toronto['Borough']=df_Toronto['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df_Toronto

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [6]:
#print the number of rows of your dataframe
df_Toronto.shape

(103, 3)

### get the latitude and the longitude coordinates of each neighborhood. 

In [12]:
df_Toronto_coordinates = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv')
df_Toronto_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
df_Toronto_coordinates.shape

(103, 3)

In [14]:
#merge dataframes
df_Toronto_coordinates.rename(columns = {'Postal Code':'PostalCode'},inplace = True)
df_Toronto_coordinates_merged = pd.merge(df_Toronto,df_Toronto_coordinates,on = 'PostalCode')
df_Toronto_coordinates_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


### Explore and Cluster
work with only boroughs that contain the word Toronto 

In [15]:
df_Toronto_Boroughs = df_Toronto_coordinates_merged[df_Toronto_coordinates_merged['Borough'].str.contains('Toronto')]
df_Toronto_Boroughs

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
35,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


In [17]:
#import visualization libraries

!pip install geocoder
!pip install folium
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import geocoder
import folium 
print('Installed!')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 3.9 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1
Installed!


In [19]:
#retrieve Toronto coordinates
g = geocoder.arcgis('Toronto, Ontario')
lat_long = g.latlng
latitude = lat_long[0]
longitude = lat_long[1]

print('latitude',latitude,'longitude',longitude)

latitude 43.648690000000045 longitude -79.38543999999996


In [21]:
#Folim map
to_map = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, borough, neighbourhood in zip(df_Toronto_Boroughs['Latitude'],df_Toronto_Boroughs['Longitude'],df_Toronto_Boroughs['Borough'],df_Toronto_Boroughs['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(to_map)
    
to_map

In [25]:
# cluster neighborhoods using k-means
k = 6
df_To_cluster = df_Toronto_Boroughs.drop(['PostalCode','Borough','Neighborhood'],1)
kmeans = KMeans(n_clusters = k,random_state=0).fit(df_To_cluster)
kmeans.labels_

array([4, 4, 4, 2, 4, 4, 1, 4, 5, 2, 4, 1, 0, 4, 1, 2, 4, 2, 3, 3, 3, 3,
       5, 3, 1, 5, 3, 1, 5, 0, 1, 0, 4, 0, 4, 0, 4, 0, 2], dtype=int32)

In [23]:
df_Toronto_Boroughs.insert(0, 'ClusterLabels', kmeans.labels_)
df_Toronto_Boroughs

Unnamed: 0,ClusterLabels,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,5,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,2,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,0,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,5,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,3,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
35,2,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


In [27]:
#map with Clusters
import matplotlib.colors as colors

Toronto_cluster_map = folium.Map(location=[latitude, longitude],zoom_start=10)

# Set color
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers   
markers_colors = []

for lat, lng, neighborhood, cluster in zip(df_Toronto_Boroughs['Latitude'], df_Toronto_Boroughs['Longitude'], df_Toronto_Boroughs['Neighborhood'], df_Toronto_Boroughs['ClusterLabels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(Toronto_cluster_map)

    
Toronto_cluster_map