In [None]:
!pip install bs4
!pip install lxml

# Imports and function definitions to get the table as a DataFrame

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
def get_request(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30 so a stalled connection cannot hang the notebook.

    Returns
    -------
    requests.Response
        The response of a request that did not end in an HTTP error status.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear error instead of later, while parsing an error page.
    response.raise_for_status()
    return response

def get_empty_table_df():
    """Return an empty DataFrame with the scraped table's three columns.

    Returns
    -------
    pandas.DataFrame
        A frame with no rows and the columns 'Postcode', 'Borough' and
        'Neighborhood', in that order.
    """
    columns = ['Postcode', 'Borough', 'Neighborhood']
    # The previous `reset_index()` call discarded its result and had no effect.
    return pd.DataFrame(columns=columns)

def get_df_line(line):
    """Extract the first three cell values of an HTML table row.

    Parameters
    ----------
    line : bs4.element.Tag
        A ``<tr>`` element holding at least three ``<td>`` cells.

    Returns
    -------
    list of str
        The first text node of each of the first three cells, with
        surrounding whitespace removed.

    Raises
    ------
    IndexError
        If the row has fewer than three ``<td>`` cells or a cell has no text.
    """
    cells = line.findAll('td')

    values = []
    for position in range(3):
        first_text = cells[position].findAll(text=True)[0]
        # Strip every cell, not only one trailing newline on the last one, so
        # markers such as 'Not assigned\n' are recognised in any column.
        # `.strip()` also yields a plain `str` rather than a bs4 string object.
        values.append(first_text.strip())

    return values

def treat_not_assigned(df):
    """Clean the 'Not assigned' placeholders out of the scraped table.

    Steps, all applied to ``df`` in place:

    1. Every 'Not assigned' value becomes NaN.
    2. A missing 'Neighborhood' is filled with the row's 'Borough'.
    3. Rows that still contain NaN are dropped.
    4. The index is reset to 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns 'Borough' and 'Neighborhood'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.
    """
    df.replace('Not assigned', np.nan, inplace=True)

    # Vectorized fill: works for any index, not only labels 0..n-1. Where the
    # borough is missing too, the neighborhood stays NaN and the row is dropped.
    df['Neighborhood'] = df['Neighborhood'].fillna(df['Borough'])

    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

def group_by_postcode(df):
    """Group ``df`` on its 'Postcode' column and return the groupby object.

    The grouping is created with ``observed=True``.
    """
    return df.groupby(by='Postcode', observed=True)
    

def get_table_df(response):
    """Build the cleaned postcode table from a downloaded HTML page.

    The first ``<table>`` of the page is used; its first ``<tr>`` is treated
    as the header and skipped. Each remaining row is converted with
    ``get_df_line`` and the result is cleaned with ``treat_not_assigned``.

    Parameters
    ----------
    response : requests.Response
        Response whose ``text`` holds the HTML of the page.

    Returns
    -------
    pandas.DataFrame
        Cleaned table with the columns defined by ``get_empty_table_df``.

    Raises
    ------
    IndexError
        If the page has no ``<table>`` or the table has no ``<tbody>``.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.findAll('table')[0]
    tbody = table.findAll('tbody')[0]
    lines = tbody.findAll('tr')

    # Collect all rows first and build the frame once; appending row by row
    # through `.loc` re-allocates the frame on every insert.
    rows = [get_df_line(line) for line in lines[1:]]  # Skip header line
    columns = get_empty_table_df().columns
    table_df = pd.DataFrame(rows, columns=columns)

    return treat_not_assigned(table_df)

# Get the page and clean the data

In [3]:
# Address of the Wikipedia page to scrape.
postcodes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = get_request(postcodes_url)

In [4]:
# Parse the downloaded page into the cleaned table of postcode rows.
table_df = get_table_df(response)

# Function definitions to group elements in the DataFrame

In [5]:
def get_list_of_postcodes(df):
    """Return the distinct values found in the 'Postcode' column of ``df``."""
    postcode_column = df['Postcode']
    return postcode_column.unique()

def get_empty_final_df_with_postcodes(postcodes):
    """Create the accumulator frame: one row per postcode, empty lists as cells.

    Parameters
    ----------
    postcodes : sequence of str
        Postcodes to use as row labels. Repeated values yield a single row.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'Postcode' with the columns 'Borough' and
        'Neighborhood'. Every cell holds its own, independent empty list.
    """
    # Keep the first occurrence of each label, in order.
    labels = list(dict.fromkeys(postcodes))

    # Fill object arrays explicitly so each cell is a distinct list, and build
    # the frame once instead of enlarging it row by row through `.loc`.
    data = {}
    for column in ('Borough', 'Neighborhood'):
        cells = np.empty(len(labels), dtype=object)
        for position in range(len(labels)):
            cells[position] = []
        data[column] = cells

    return pd.DataFrame(data, index=pd.Index(labels, name='Postcode'))

def fill_final_df(table_df, postcodes, final_df):
    """Append every borough and neighborhood to its postcode row.

    Parameters
    ----------
    table_df : pandas.DataFrame
        Row-level table with the columns 'Postcode', 'Borough' and
        'Neighborhood'.
    postcodes : sequence of str
        Postcodes whose rows in ``final_df`` should be filled.
    final_df : pandas.DataFrame
        Frame indexed by postcode whose 'Borough' and 'Neighborhood' cells
        are lists. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``final_df`` itself, with the postcode index moved into a regular
        column by ``reset_index``.
    """
    # One pass over the table groups the values per postcode; this replaces a
    # scan of the whole table for every postcode and does not depend on
    # `table_df` having the labels 0..n-1.
    boroughs_by_postcode = {}
    neighborhoods_by_postcode = {}
    rows = zip(table_df['Postcode'], table_df['Borough'], table_df['Neighborhood'])
    for postcode, borough, neighborhood in rows:
        boroughs_by_postcode.setdefault(postcode, []).append(borough)
        neighborhoods_by_postcode.setdefault(postcode, []).append(neighborhood)

    for postcode in postcodes:
        if postcode not in boroughs_by_postcode:
            # No matching table row: leave the target row untouched.
            continue
        final_df.loc[postcode, 'Borough'].extend(boroughs_by_postcode[postcode])
        final_df.loc[postcode, 'Neighborhood'].extend(neighborhoods_by_postcode[postcode])

    final_df.reset_index(inplace=True)
    return final_df

def format_final_df(final_df):
    """Turn the list-valued cells of ``final_df`` into plain strings.

    'Borough' keeps only the first element of each list; 'Neighborhood'
    becomes the list elements joined with ', '.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Frame whose 'Borough' and 'Neighborhood' cells are lists of strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``final_df`` itself, after formatting.

    Raises
    ------
    IndexError
        If any 'Borough' list is empty.
    """
    # Column-wise mapping works for any index; the former per-cell loop
    # required the labels to be exactly 0..n-1.
    final_df['Borough'] = final_df['Borough'].map(lambda boroughs: boroughs[0])
    final_df['Neighborhood'] = final_df['Neighborhood'].map(', '.join)

    return final_df

def get_final_df(table_df):
    """Run the grouping pipeline on ``table_df`` and return the result.

    The pipeline lists the postcodes, creates the empty accumulator frame,
    fills it from ``table_df`` and finally formats its cells.
    """
    unique_postcodes = get_list_of_postcodes(table_df)
    accumulator = get_empty_final_df_with_postcodes(unique_postcodes)
    filled = fill_final_df(table_df, unique_postcodes, accumulator)
    return format_final_df(filled)



In [6]:
# Collapse the row-level table into one row per postcode.
final_df = get_final_df(table_df)

# Print the first 12 rows and the shape of the final DataFrame

In [7]:
# Preview the grouped table (up to the first 12 rows).
final_df.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [8]:
# (number of rows, number of columns) of the grouped table.
final_df.shape

(103, 3)

# Get latitude and longitude from postcodes

In [9]:
# Load the coordinates file and key it by postal code for label-based lookups.
lat_long_csv = pd.read_csv('Geospatial_Coordinates.csv').set_index('Postal Code')

In [10]:
# Fetch the coordinates of all postcodes with a single label-based selection
# instead of several scalar lookups per row. This no longer requires `final_df`
# to carry the labels 0..n-1; a postcode missing from the CSV still raises KeyError.
coordinates = lat_long_csv.loc[final_df['Postcode'].tolist(), ['Latitude', 'Longitude']]
# Assign by position: `coordinates` is indexed by postal code, not by row label.
final_df['Latitude'] = coordinates['Latitude'].to_numpy()
final_df['Longitude'] = coordinates['Longitude'].to_numpy()

In [11]:
# Preview the table again, now including the Latitude and Longitude columns.
final_df.head(12)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


# Clustering

In [12]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

In [13]:
num_clusters = 4

# Cluster the postcodes on their coordinates only. `random_state` is fixed so
# the cluster assignment and label numbering are the same on every run.
k_means = KMeans(init='k-means++', n_clusters=num_clusters, n_init=1000, random_state=0)
k_means.fit(final_df[['Latitude', 'Longitude']])
labels = k_means.labels_
final_df['Cluster Labels'] = labels

In [14]:
# Initial centre of the map.
latitude = 43.651070
longitude = -79.347015
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# One evenly spaced colour of the rainbow colormap per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, num_clusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per postcode, coloured by its cluster label.
for lat, lon, poi, cluster in zip(final_df['Latitude'], final_df['Longitude'], final_df['Neighborhood'], final_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels start at 0, so they index the palette directly; the former
    # `cluster-1` made cluster 0 wrap around to the last colour.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Conclusions

The clustering was performed using the only numerical values available: latitude and longitude. This made it possible to group all 103 postcodes by location, but it does not support any conclusions about similarities between them beyond geographical proximity. Additional features are needed to study how similar the neighborhoods really are.