# Segmenting and Clustering Neighborhoods in Toronto

### Import the libraries

In [1]:
import pandas as pd
import requests
import geocoder
from sklearn.cluster import KMeans
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

### Using pandas to read the web page

In [2]:
# Source page; pd.read_html returns a list with one DataFrame per HTML table found.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Keep only the first table on the page.
df = pd.read_html(url)[0]

### The first 5 rows

In [3]:
# Preview the first five rows of the scraped table.
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Excluding the rows with 'Not assigned' in the Borough column

In [4]:
# Keep only the rows whose Borough holds a real value.
df = df.loc[df['Borough'] != 'Not assigned']

### Resetting the index

In [5]:
# Renumber the rows from 0 after filtering; the old index is discarded.
df = df.reset_index(drop=True)

### Looking for 'Not assigned' in the 'Neighborhood' column

In [6]:
# Row labels whose Neighborhood is still 'Not assigned'; an empty list means none remain.
# Series.index is an Index object, not a callable, so the rows are located with a boolean mask.
df.index[df['Neighborhood'] == 'Not assigned'].tolist()

[]

### Showing the dataframe

In [7]:
# Show up to the first twelve rows of the cleaned table.
df.head(n=12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Shape of dataframe

In [8]:
# (rows, columns) of df after the 'Not assigned' boroughs were filtered out.
df.shape

(103, 3)

### Getting coordinates from the geospatial CSV (used in place of geocoder)

In [9]:
# Download the CSV of postal-code coordinates and load it into a DataFrame.
df_geo = pd.read_csv(
    'http://cocl.us/Geospatial_data'
)

In [10]:
# Display the coordinate table loaded from the CSV (pandas truncates the middle rows).
df_geo

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [11]:
# Attach each postal code's latitude/longitude to the cleaned borough table.
# - Join on the shared 'Postal Code' column only: pandas rejects `on` combined with
#   `right_index=True` (MergeError).
# - how='left' keeps every row of df in its existing order.
# - validate='many_to_one' fails loudly if df_geo ever lists a postal code twice,
#   which would otherwise silently duplicate rows.
result = pd.merge(df, df_geo, how = 'left', on = 'Postal Code', validate = 'many_to_one')

In [12]:
# Preview the first five merged rows (boroughs plus coordinates).
result.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [13]:
# Restrict the analysis to the rows whose Borough is 'Downtown Toronto'.
result = result.loc[result['Borough'] == 'Downtown Toronto']

In [14]:
# Renumber the remaining rows from 0; the old index is discarded.
result = result.reset_index(drop=True)

In [15]:
# Preview the first five Downtown Toronto rows.
result.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


## Print the map

In [16]:
# Coordinates and names of the selected neighborhoods.
latitude = result['Latitude']
longitude = result['Longitude']
neigh = result['Neighborhood']

# Centre the map on the first row; positional access does not depend on the index labels.
toronto = folium.Map(location = [latitude.iloc[0], longitude.iloc[0]], zoom_start = 12)

# One marker per neighborhood. `popup` (not `pop`) is the Marker keyword that
# attaches the text shown when the marker is clicked.
for lat, lon, n in zip(latitude, longitude, neigh):
    folium.Marker(location = [lat, lon],
                  popup = n,
                  icon = folium.Icon(icon = 'info-sign')
                  ).add_to(toronto)

# Last expression of the cell: render the map.
toronto

## Cluster the neighborhoods

In [17]:
# set number of clusters
k = 4

# Cluster on the coordinates only.
neigh_cluster = result[['Latitude', 'Longitude']]

# random_state fixes the initialisation; n_init is pinned explicitly so the number of
# restarts does not change with the scikit-learn version's default.
clusters = KMeans(n_clusters = k, random_state = 0, n_init = 10).fit(neigh_cluster)

# DataFrame.insert raises ValueError if the column already exists, so drop any labels
# left over from a previous run of this cell before inserting the new ones.
result = result.drop(columns = 'Cluster Labels', errors = 'ignore')
result.insert(0, 'Cluster Labels', clusters.labels_)

In [18]:
# Display the Downtown Toronto rows together with their 'Cluster Labels' column.
result

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,1,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,1,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,2,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,1,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,1,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
9,1,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576


## Print the clusters

In [19]:
# Coordinates and names of the clustered neighborhoods.
latitude = result['Latitude']
longitude = result['Longitude']
neigh = result['Neighborhood']

# create map, centred on the first row (positional access, independent of index labels)
map_clusters = folium.Map(location=[latitude.iloc[0], longitude.iloc[0]], zoom_start=12)

# set color scheme for the clusters: k evenly spaced rainbow colours as hex strings,
# so rainbow[label] is the colour of cluster `label`
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# add markers to the map
for lat, lon, poi, cluster in zip(latitude, longitude, neigh, result['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to k-1, so they index the palette directly
    # (no `- 1`, which would send label 0 to the last colour via negative indexing).
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: render the map.
map_clusters