## Segmenting and Clustering Neighborhoods in Toronto - Assignment - 1.2

In [1]:
# import libraries
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests as rq

## Fetch the data into a pandas DataFrame

In [2]:
# URL of the Wikipedia page listing the postal codes of Canada that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() surfaces HTTP errors (404, 5xx, ...) here
# instead of handing an error page to the HTML parser.
response = rq.get(url, timeout=30)
response.raise_for_status()
page = response.text

# Parse the first table with class "wikitable sortable" into a DataFrame, using its
# first row as the header. The markup is wrapped in StringIO because passing a
# literal HTML string to pd.read_html is deprecated in recent pandas releases.
data = pd.read_html(StringIO(page), header=0, attrs={"class": "wikitable sortable"})[0]

In [3]:
# Preview the first rows of the scraped table.
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### dimensions of data

In [4]:
# (rows, columns) of the raw table
data.shape

(289, 3)

## Data Cleaning

In [5]:
# Count rows per borough; this also shows how many rows carry the 'Not assigned' placeholder.
data['Borough'].value_counts()

Not assigned        77
Etobicoke           45
North York          38
Scarborough         38
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

In [6]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = data['Borough'].ne('Not assigned')
df = data.loc[has_borough]

In [7]:
# Re-count rows per borough to confirm 'Not assigned' no longer appears.
df['Borough'].value_counts()

Etobicoke           45
North York          38
Scarborough         38
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

In [8]:
# Preview the filtered rows; the index still carries the row labels from `data`.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [9]:
# Reset the index so it starts from 0 again; drop=True discards the old labels
# instead of keeping them as a new column.
df=df.reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [10]:
# List the remaining rows whose neighbourhood is still the 'Not assigned' placeholder.
df.loc[df['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood
6,M7A,Queen's Park,Not assigned


In [11]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name;
# every other neighbourhood value is kept as it is.
neighbourhood_known = df['Neighbourhood'] != 'Not assigned'
df['Neighbourhood'] = df['Neighbourhood'].where(neighbourhood_known, df['Borough'])

## Data Aggregation / Transformation

In [12]:
# Collapse rows that share a postcode and borough into a single row, joining their
# neighbourhood names with commas. sort=False keeps the groups in order of first
# appearance rather than sorting them by key.
grouped = df.groupby(['Postcode', 'Borough'], sort=False)
df = grouped['Neighbourhood'].agg(','.join).reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


Check the final shape of the cleaned data:

In [13]:
# (rows, columns) after cleaning and aggregation
df.shape

(103, 3)

## Getting Coordinates for each neighborhood

In [14]:
# Load the coordinates from the provided CSV file. The location can be overridden
# through the GEOSPATIAL_CSV environment variable so the notebook also runs on
# machines without this exact path; when the variable is unset the original path is used.
geocodes = pd.read_csv(os.environ.get('GEOSPATIAL_CSV', 'C:/Data/Geospatial_Coordinates.csv'))

In [15]:
# Relabel the three columns (positionally) so the first one is named 'Postcode',
# the same column name used in df.
coordinate_columns = ['Postcode', 'Latitude', 'Longitude']
geocodes.columns = coordinate_columns

In [16]:
# Preview the latitude/longitude lookup table.
geocodes.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# Attach the coordinates to each row of df by postcode. The inner join keeps only
# postcodes present in both frames. validate='many_to_one' makes pandas raise a
# MergeError if a postcode occurs more than once in geocodes, which would otherwise
# silently duplicate rows in the result.
dff = pd.merge(df, geocodes, on='Postcode', how='inner', validate='many_to_one')

In [18]:
# Preview the merged frame with the added Latitude and Longitude columns.
dff.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
