# 1. import library

In [30]:
#!pip install geocoder --user
from bs4 import BeautifulSoup
import requests
import pandas as pd
import geocoder
import folium
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np


# 2. download data
scrap data and compute in structural form

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
source = requests.get(url).text
can_html  = BeautifulSoup(source, 'xml')
can_html = can_html.find('table')


In [74]:
row_data = []
for tr_cell in can_html.find_all('tr'):
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
        #print(td_cell.text.strip())
print(len(row_data))
row_data[:10]

180


['M1ANot assigned',
 'M2ANot assigned',
 'M3ANorth York(Parkwoods)',
 'M4ANorth York(Victoria Village)',
 'M5ADowntown Toronto(Regent Park / Harbourfront)',
 'M6ANorth York(Lawrence Manor / Lawrence Heights)',
 "M7AQueen's Park(Ontario Provincial Government)",
 'M8ANot assigned',
 'M9AEtobicoke(Islington Avenue)',
 'M1BScarborough(Malvern / Rouge)']

In [75]:
#extract data into a list
borough  = [row_data[i][3:] for i in range(len(row_data))]
postalCode = [row_data[i][:3] for i in range(len(row_data))]
print(postalCode[:5])
print(borough[:5])

['M1A', 'M2A', 'M3A', 'M4A', 'M5A']
['Not assigned', 'Not assigned', 'North York(Parkwoods)', 'North York(Victoria Village)', 'Downtown Toronto(Regent Park / Harbourfront)']


In [121]:
# make df 
can_df = pd.DataFrame()
can_df['Postalcode'] = postalCode
can_df['temp'] = borough
can_df.head()


Unnamed: 0,Postalcode,temp
0,M1A,Not assigned
1,M2A,Not assigned
2,M3A,North York(Parkwoods)
3,M4A,North York(Victoria Village)
4,M5A,Downtown Toronto(Regent Park / Harbourfront)


# 3. data cleaning
we will clean the data in df: remove na, split data.

then we will add geodata to the dataframe



In [122]:
can_df = can_df[can_df.temp != "Not assigned"] # delect df row which are 'Not assigned'
can_df[['Borough','Neighborhood','na']] = can_df.temp.str.split("(",expand = True,) # split col:temp
can_df['Neighborhood'] = can_df['Neighborhood'].str.replace(')','') 
can_df['Neighborhood'] = can_df['Neighborhood'].str.replace('/',',')
can_df = can_df.drop(['temp','na'],axis=1) # drop columns
can_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government


Get geodata and merge into the dataframe

In [194]:
def geodata(alistPostal):
    list_geo = []  
    for i in range(len(alistPostal)):
            code = alistPostal[i]
            g = geocoder.arcgis('{},Toronto, Ontario'.format(code)).latlng
            list_geo.append(g)
    df_geo = pd.DataFrame()
    df_geo['Postalcode'] = alistPostal
    df_geo['Latitude'] = [i[0] for i in list_geo]
    df_geo['Longitude'] = [i[1] for i in list_geo]
    return df_geo

In [196]:
list_postal = can_df['Postalcode'].tolist()
df_geo = geodata(list_postal)
df_geo.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M3A,43.75245,-79.32991
1,M4A,43.73057,-79.31306
2,M5A,43.65512,-79.36264
3,M6A,43.72327,-79.45042
4,M7A,43.66253,-79.39188


In [202]:
df_toronto = can_df.merge(df_geo,on = 'Postalcode')
df_toronto.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.72327,-79.45042
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188


In [203]:
df_toronto.to_excel('TorontoGeo.xlsx')

# 4. Visualization

In [205]:

# Create Toronto map
map_toronto = folium.Map(location=[43.65, -79.38], zoom_start=10)

# Add markers
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=7,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [24]:

df_toronto = pd.read_excel("TorontoGeo.xlsx")

# 5. K-mean Clustering

In [25]:
df_toronto.head()

Unnamed: 0.1,Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.75245,-79.32991
1,1,M4A,North York,Victoria Village,43.73057,-79.31306
2,2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65512,-79.36264
3,3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.72327,-79.45042
4,4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188


In [26]:
k=5
toronto_clustering = df_toronto.drop(['Postalcode','Borough','Neighborhood'],axis=1)
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_clustering)
kmeans.labels_
df_toronto.insert(0, 'Cluster_Labels', kmeans.labels_)

In [27]:
df_toronto.head()

Unnamed: 0.1,Cluster_Labels,Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,0,0,M3A,North York,Parkwoods,43.75245,-79.32991
1,0,1,M4A,North York,Victoria Village,43.73057,-79.31306
2,0,2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65512,-79.36264
3,0,3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.72327,-79.45042
4,0,4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188


In [34]:
df_toronto['Cluster_Labels'].value_counts()

4    21
2    21
1    21
0    21
3    19
Name: Cluster_Labels, dtype: int64

In [35]:
# create map
map_clusters = folium.Map(location=[43.65, -79.38],zoom_start=10)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighbourhood, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood'], df_toronto['Cluster_Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters