
 # 1. Data scraping for Toronto neighborhoods

In [1]:
# importing the libraries for data scraping
import pandas as pd
import numpy as np

- We're going to use data from a Wikipedia page that has information about all the neighborhoods of the city of Toronto, Canada.
- The link is: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
- We will extract the data from the URL with pandas.

In [2]:
# Wikipedia page "List of postal codes of Canada: M", the source of the neighborhood table
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# None removes pandas' display limit, so every column / every row is shown in cell outputs
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# read_html fetches the page and returns a list of DataFrames, one per HTML table it finds
data = pd.read_html(url)
# The table we need is picked by position (first table on the page)
# NOTE(review): positional lookup depends on the page layout staying the same - confirm
df = data[0] 

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


- We then proceed to clean the data by filtering out the rows where the Borough is labeled 'Not assigned'.

In [3]:
# (rows, columns) of the scraped table, before any cleaning
df.shape

(180, 3)

In [4]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0
# (drop=True discards the old index instead of keeping it as a column)
df = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
# Count how often each Neighborhood value occurs; any remaining 'Not assigned'
# label would appear as an entry in this listing
df['Neighborhood'].value_counts()

Downsview                                                                                                                                 4
Don Mills                                                                                                                                 2
Scarborough Village                                                                                                                       1
Davisville North                                                                                                                          1
Victoria Village                                                                                                                          1
Christie                                                                                                                                  1
Woburn                                                                                                                                    1
Stn A PO Boxes      

In [6]:
# (rows, columns) after removing the rows with an unassigned Borough
df.shape

(103, 3)

# 2. Merging the geospatial coordinates with the dataframe

For each postal code, we're going to associate coordinates (latitude, longitude) using the geospatial coordinates CSV file.

In [7]:
# Download the geospatial CSV into a DataFrame; it is joined to df on 'Postal Code' later
coordinates = pd.read_csv('https://cocl.us/Geospatial_data')
# Preview the first rows to check the columns that were loaded
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# merging the coordinates with the dataframe with the .merge() function
df = df.merge(coordinates, on = 'Postal Code', how = 'left')
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# 3. Clustering the 