# First task: Creating the dataframe

In [181]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

In [182]:
# Download the Wikipedia page listing the Canadian postal codes of area "M"
# and collect every table row (<tr>) found anywhere in the document.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
doc = lh.fromstring(page.content)
# NOTE: '//tr' matches the rows of *every* table on the page, not only the
# postal-code table; the row-extraction code slices this list accordingly.
tr_elements = doc.xpath('//tr')

In [183]:
# Column names: the text of each cell in the first table row,
# truncated at the first newline.
columns = [cell.text_content().split('\n')[0] for cell in tr_elements[0]]
print(columns)

['Postal Code', 'Borough', 'Neighbourhood']


In [184]:
# Build one list of cell texts per table row.
# The slice 1:-4 skips the header row (index 0) and the last four <tr> elements,
# which are additional rows at the bottom of the page rather than part of the
# Wikipedia postal-code table.
rows = [
    [cell.text_content().split('\n')[0] for cell in table_row]
    for table_row in tr_elements[1:-4]
]
print(rows)

[['M1A', 'Not assigned', 'Not assigned'], ['M2A', 'Not assigned', 'Not assigned'], ['M3A', 'North York', 'Parkwoods'], ['M4A', 'North York', 'Victoria Village'], ['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'], ['M6A', 'North York', 'Lawrence Manor, Lawrence Heights'], ['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"], ['M8A', 'Not assigned', 'Not assigned'], ['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village'], ['M1B', 'Scarborough', 'Malvern, Rouge'], ['M2B', 'Not assigned', 'Not assigned'], ['M3B', 'North York', 'Don Mills'], ['M4B', 'East York', 'Parkview Hill, Woodbine Gardens'], ['M5B', 'Downtown Toronto', 'Garden District, Ryerson'], ['M6B', 'North York', 'Glencairn'], ['M7B', 'Not assigned', 'Not assigned'], ['M8B', 'Not assigned', 'Not assigned'], ['M9B', 'Etobicoke', 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale'], ['M1C', 'Scarborough', 'Rouge Hill, Port Union, Highland Creek'], ['M2C', 'Not assigned', 'N

In [185]:
# Assemble the scraped rows into a dataframe labelled with the scraped column names,
# then show its (rows, columns) shape.
row_matrix = np.array(rows)
df = pd.DataFrame(data=row_matrix, columns=columns)
df.shape

(180, 3)

In [186]:
# Only process the cells that have an assigned borough; rows whose Borough is
# "Not assigned" are discarded.
# Filtering with a boolean mask keeps the surviving rows' index labels and avoids
# mutating the dataframe in place, so re-running this cell is always safe.
has_borough = df['Borough'] != "Not assigned"
df = df[has_borough]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [187]:
# Sanity check: (rows, columns) of the dataframe at this point in the notebook.
df.shape

(103, 3)

# Second task: Adding Latitude and Longitude to dataframe

In [188]:
# Load the per-postal-code coordinates from the CSV file in the working directory
# and preview the first rows as a sanity check.
geo_data = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [189]:
# Attach the coordinate columns of geo_data to the existing dataframe:
# look up each row's 'Postal Code' in geo_data (left join), then make
# 'Postal Code' the index of the combined frame.
geo_by_code = geo_data.set_index('Postal Code')
neighborhoods = df.join(geo_by_code, on='Postal Code').set_index('Postal Code')
neighborhoods.head()

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Third task: Explore and cluster the neighborhoods in Toronto

In [190]:
# create map of Toronto using latitude and longitude values
import folium # map rendering library

# Map centre; these two values are reused when the cluster map is created.
latitude = 43.651070
longitude = -79.347015
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, with a popup reading "<neighbourhood>, <borough>"
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
    # parse_html belongs to the Popup; it is not a CircleMarker option, so it is
    # passed here only and not to the marker below.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)

toronto_map

# After showing the neighbourhoods, let's cluster them as in the Cognitive Class course

In [191]:
# NOTE: the "manhattan_" prefix is a leftover naming choice; all data here is Toronto.
# The names are kept because the cluster-map cell reads `manhattan_merged` and `kclusters`.

# one hot encoding of the borough
# (kept for inspection only; it is NOT an input to the clustering below)
manhattan_onehot = pd.get_dummies(neighborhoods[['Borough']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighbourhood'] = neighborhoods['Neighbourhood']

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

# mean coordinates per neighbourhood name
# The numeric columns are selected explicitly: averaging the string column 'Borough'
# raises a TypeError on pandas >= 2.0 (older versions silently dropped it).
manhattan_grouped = (
    neighborhoods
    .groupby('Neighbourhood')[['Latitude', 'Longitude']]
    .mean()
    .reset_index()
)

# set number of clusters
kclusters = 5

# clustering features = every column except the neighbourhood name
# (keyword `columns=` is used because the positional axis argument of drop()
# was removed in pandas 2.0)
manhattan_grouped_clustering = manhattan_grouped.drop(columns='Neighbourhood')

# import k-means from clustering stage
from sklearn.cluster import KMeans

# run k-means clustering
# n_init is pinned to 10 (the long-standing scikit-learn default) so that results do
# not depend on which default the installed scikit-learn release happens to use.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(manhattan_grouped_clustering)

# add clustering labels (one label per row of manhattan_grouped)
manhattan_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

# look up borough/latitude/longitude for each clustered neighbourhood name;
# the two frames share no column names, so no suffixes are needed
manhattan_merged = manhattan_grouped[["Cluster Labels", "Neighbourhood"]].join(
    neighborhoods.set_index('Neighbourhood'), on='Neighbourhood'
)

manhattan_merged.head() # preview: cluster label, neighbourhood, borough, coordinates

Unnamed: 0,Cluster Labels,Neighbourhood,Borough,Latitude,Longitude
0,4,Agincourt,Scarborough,43.7942,-79.262029
1,2,"Alderwood, Long Branch",Etobicoke,43.602414,-79.543484
2,0,"Bathurst Manor, Wilson Heights, Downsview North",North York,43.754328,-79.442259
3,0,Bayview Village,North York,43.786947,-79.385975
4,0,"Bedford Park, Lawrence Manor East",North York,43.733283,-79.41975


In [194]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
import matplotlib.cm as cm
import matplotlib.colors as colors

# one evenly spaced rainbow colour (as a hex string) per cluster
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighbourhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the colour list directly
    # (indexing with cluster-1 would make label 0 wrap around to the last colour).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters