# Segmenting and Clustering Neighborhoods in Toronto

#### Begin by using pandas to read in the postal code table from Wikipedia

In [1]:
import pandas as pd

# Wikipedia page that holds the postal code table read below.
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html parses every HTML table on the page into its own DataFrame;
# header=0 takes the first row of each table as the column names.
table = pd.read_html(postal_codes_url, header=0)

# The first parsed table is the Postcode / Borough / Neighbourhood listing.
df = table[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Drop rows whose borough is "Not assigned"; where only the neighbourhood is "Not assigned", use the borough name instead

In [2]:
# Keep only the rows whose borough was assigned. The explicit .copy() gives a
# frame of our own, so the fill below never writes into the table parsed from
# the page and the cell does not depend on the frame having a default index.
df = df[df['Borough'] != 'Not assigned'].copy()

# A row that has a borough but no neighbourhood takes the borough's name as
# its neighbourhood.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']

# Renumber the surviving rows 0..n-1 without keeping the old index as a column.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Group neighbourhoods together by postal code

In [3]:
# Gather the neighbourhood names that share a (Postcode, Borough) pair ...
neighbourhoods_by_code = df.groupby(['Postcode', 'Borough'])['Neighbourhood']

# ... and collapse each group into a single comma-separated string, turning
# the group keys back into ordinary columns.
newdf = neighbourhoods_by_code.apply(lambda names: ','.join(names)).reset_index()
newdf.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


#### Shape of dataframe

In [4]:
# (rows, columns) of the grouped frame: one row per Postcode/Borough pair.
newdf.shape

(103, 3)

#### Add latitude and longitude for each postal code (geocoded with ArcGIS)

In [5]:
import geocoder

# Upper bound on geocoder calls per row, so a lookup that never resolves
# cannot stall the notebook forever.
MAX_GEOCODE_ATTEMPTS = 5


def geocode_with_retry(query, max_attempts=MAX_GEOCODE_ATTEMPTS):
    """Look up ``query`` with the ArcGIS geocoder and return its coordinates.

    The lookup is repeated until it yields coordinates, at most
    ``max_attempts`` times.

    Args:
        query: free-text address string handed to ``geocoder.arcgis``.
        max_attempts: maximum number of lookups before giving up.

    Returns:
        The ``latlng`` value of the first successful lookup; the code below
        unpacks it as (latitude, longitude).

    Raises:
        RuntimeError: if no attempt returned coordinates.
    """
    for _ in range(max_attempts):
        coords = geocoder.arcgis(query).latlng
        # A failed lookup gives an empty/None latlng; only accept real values.
        if coords:
            return coords
    raise RuntimeError(
        "No coordinates returned for {!r} after {} attempts".format(query, max_attempts)
    )


latitude = []
longitude = []

for postcode, borough, hoods in zip(newdf['Postcode'], newdf['Borough'], newdf['Neighbourhood']):
    # Several neighbourhoods can share one postal code; geocode using the first.
    first_hood = hoods.split(',')[0]
    lat, lng = geocode_with_retry(first_hood + " " + borough + " " + postcode)
    latitude.append(lat)
    longitude.append(lng)

newdf['Latitude'] = latitude
newdf['Longitude'] = longitude
newdf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.779805,-79.167484
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.749788,-79.188752
3,M1G,Scarborough,Woburn,43.768359,-79.21759
4,M1H,Scarborough,Cedarbrae,43.747758,-79.235186


#### Cluster postal codes by location

Cluster the postal codes on their latitude and longitude with k-means, where k is the number of distinct boroughs.

In [12]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# One cluster per distinct borough.
kclusters = len(newdf['Borough'].unique())

# Cluster on the coordinates only. Selecting the two columns by name (rather
# than dropping the others with a positional axis argument, which pandas 2.0
# no longer accepts) also keeps an already-inserted 'Cluster Labels' column
# out of the features when this cell is run again.
neighborhoods_clustering = newdf[['Latitude', 'Longitude']]

# run k-means clustering; random_state is fixed so the labels are reproducible
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(neighborhoods_clustering)

# DataFrame.insert raises if the column already exists, so overwrite it in
# place on a re-run and only insert it the first time.
if 'Cluster Labels' in newdf.columns:
    newdf['Cluster Labels'] = kmeans.labels_
else:
    newdf.insert(0, 'Cluster Labels', kmeans.labels_)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]


#### Map the postal codes, coloured by cluster

In [17]:
# create map
import folium # map rendering library
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map; the initial view is centred on these coordinates
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters: kclusters evenly spaced rainbow colours,
# converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map: one circle per row, with the borough name and the
# cluster label in the popup
for lat, lon, poi, cluster in zip(newdf['Latitude'], newdf['Longitude'], newdf['Borough'], newdf['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels run 0..kclusters-1, so they index the palette directly
    # (indexing with cluster-1 would send cluster 0 to the last colour via
    # negative-index wraparound).
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters