# Coursera Capstone Project Notebook
This notebook will be used throughout the course.

In [2]:
import pandas as pd
import numpy as np
# Smoke test: print a greeting to show that this cell (including the imports above) ran
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


# Segmenting and Clustering Neighborhoods in Toronto Part 1

In [9]:
# Load the postal code / borough / neighborhood table from the local CSV
df = pd.read_csv(filepath_or_buffer='Week3.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [11]:
# Collapse the table to one row per postal code. When a postal code is listed
# on several rows with different neighborhoods, those neighborhoods are joined
# into a single comma-separated string.
#
# The grouping key must be the postal code alone: grouping on
# ['Postal Code', 'Neighborhood'] keeps every distinct neighborhood in its own
# group, so nothing is ever combined.
df_group = (
    df.groupby('Postal Code', sort=False)      # sort=False keeps first-appearance order
      .agg({'Neighborhood': ', '.join,         # "A, B, C" for all neighborhoods of the code
            'Borough': 'first'})               # borough taken from the first row of each code
      .reset_index(drop=False)
      .rename(columns={'Neighborhood': 'Neighborhood_joined'})
)
# Resulting columns, in order: Postal Code, Neighborhood_joined, Borough
df_group.head()

Unnamed: 0,Postal Code,Neighborhood_joined,Borough
0,M3A,Parkwoods,North York
1,M4A,Victoria Village,North York
2,M5A,"Regent Park, Harbourfront",Downtown Toronto
3,M6A,"Lawrence Manor, Lawrence Heights",North York
4,M7A,"Queen's Park, Ontario Provincial Government",Downtown Toronto


In [12]:
# Attach the joined neighborhood string to every row of df, then reduce the
# result to unique (Postal Code, Borough, Neighborhood) rows.
df_merge = (
    df.merge(df_group, on='Postal Code')                  # shared 'Borough' column gets _x / _y suffixes
      .drop(columns=['Neighborhood', 'Borough_y'])        # keep df's borough and the joined neighborhoods
      .drop_duplicates()
      .rename(columns={'Neighborhood_joined': 'Neighborhood',
                       'Borough_x': 'Borough'})
)
df_merge.head(n=20)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [13]:
# Sanity check: (row count, column count) of the merged table
df_merge.shape

(103, 3)

# Segmenting and Clustering Neighborhoods in Toronto Part 2

In [14]:
# Download the coordinates table (read over HTTP, so this cell needs network access)
geo_df = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')

In [15]:
# Join the coordinates onto the cleaned borough/neighborhood table by postal code.
# pd.merge defaults to an inner join; it is spelled out here for the reader.
geo_merged = geo_df.merge(df_merge, how='inner', on='Postal Code')
geo_merged.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [16]:
# Reorder the columns so the descriptive fields come before the coordinates
column_order = ['Postal Code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
geo_data = geo_merged.loc[:, column_order]
geo_data.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Segmenting and Clustering Neighborhoods in Toronto Part 3

In [17]:
# Keep the rows whose Borough name contains the substring 'Toronto'
in_toronto = geo_data['Borough'].str.contains('Toronto')
df_toronto = geo_data.loc[in_toronto]
df_toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [18]:
!python3 -m pip install folium
import folium



In [19]:
# Create a map with one labelled marker per Toronto neighborhood.
# NOTE(review): this map is described as centered on the CN Tower, but the
# hard-coded point below does not look like the tower's published coordinates
# (roughly 43.6426, -79.3871) — confirm which center is intended.

map_toronto = folium.Map(location=[43.628947,-79.3384420],zoom_start=10)

for lat,lon,borough,neighborhood in zip(df_toronto['Latitude'],df_toronto['Longitude'],df_toronto['Borough'],df_toronto['Neighborhood']):
    # Popup text shown on click: "<neighborhood>, <borough>"
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup argument, not a marker option, so it is not
    # passed to CircleMarker.
    folium.CircleMarker(
        [lat,lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
    ).add_to(map_toronto)
map_toronto

In [20]:
import sklearn
from sklearn.cluster import KMeans

In [21]:
# Build the clustering feature matrix from the two coordinate columns only.
# Selecting the features by name (instead of dropping the other columns) keeps
# this cell correct when it is re-run after a 'Cluster Number' column has been
# added to df_toronto: the previous labels cannot leak into the features.
toronto_parts = df_toronto[['Latitude', 'Longitude']]
toronto_parts

Unnamed: 0,Latitude,Longitude
37,43.676357,-79.293031
41,43.679557,-79.352188
42,43.668999,-79.315572
43,43.659526,-79.340923
44,43.72802,-79.38879
45,43.712751,-79.390197
46,43.715383,-79.405678
47,43.704324,-79.38879
48,43.689574,-79.38316
49,43.686412,-79.400049


In [22]:
# Fit KMeans on the coordinate features and show each neighborhood's cluster number
k = 4
# n_init is set explicitly because its default differs between scikit-learn
# releases; random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_parts)
# Make the cell re-runnable: DataFrame.insert raises ValueError when the column
# already exists, so remove any label column left by a previous run first.
# drop() returns a new frame, so the insert below does not modify the object
# that earlier cells displayed.
df_toronto = df_toronto.drop(columns='Cluster Number', errors='ignore')
df_toronto.insert(0, 'Cluster Number', kmeans.labels_)
df_toronto

Unnamed: 0,Cluster Number,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,1,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,1,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,1,M4M,East Toronto,Studio District,43.659526,-79.340923
44,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,2,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,2,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [23]:
import random
import matplotlib.cm as cm
import matplotlib.colors as colors

In [24]:
# Create a map with one marker per Toronto neighborhood, colored by cluster;
# each popup shows the cluster number.
# NOTE(review): this map is described as centered on the CN Tower, but the
# hard-coded point below does not look like the tower's published coordinates
# (roughly 43.6426, -79.3871) — confirm which center is intended.

map_clusters = folium.Map(location=[43.628947,-79.3384420],zoom_start=10)

# One hex color per cluster, sampled evenly across matplotlib's rainbow colormap
col = cm.rainbow(np.linspace(0, 1, k))
final_colors = [colors.rgb2hex(i) for i in col]

for lat, lon, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Cluster Number']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels run from 0 to k-1, so they index the palette directly.
    # Border and fill both come from final_colors, the only palette defined here.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=final_colors[cluster],
        fill=True,
        fill_color=final_colors[cluster],
        fill_opacity=0.5).add_to(map_clusters)

map_clusters