### Loading Libraries

In [1]:
import pandas as pd
import geocoder
pd.__version__

'1.1.1'

### Reading data from Wikipedia

In [2]:
# Download the Wikipedia page of Canadian postal codes starting with "M" and parse
# every HTML table on it; read_html hands back a list of DataFrames.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
canada_wiki = pd.read_html(wiki_url)
print("Type : ", type(canada_wiki))
print("Length is : ", len(canada_wiki))

Type :  <class 'list'>
Length is :  3


## Data pre-processing

In [3]:
# The table we need is the first element of the list returned by read_html.
# After stripping spaces from the headers its columns are PostalCode, Borough
# and Neighbourhood.
# Work on a copy so the parsed table kept in canada_wiki is not modified in place
# when the column labels are rewritten below.
df = canada_wiki[0].copy()
# regex=False: the space is a literal character, not a pattern.
df.columns = df.columns.str.replace(" ", "", regex=False)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Basic data-quality report: NA count per column and number of fully duplicated rows.
missing_per_column = df.isna().sum()
duplicated_row_count = df.duplicated().sum()
print("Checking for missing or NA values:", "\n", missing_per_column, "\n")
print("Checking for duplicated values:", "\n", duplicated_row_count)

Checking for missing or NA values: 
 PostalCode       0
Borough          0
Neighbourhood    0
dtype: int64 

Checking for duplicated values: 
 0


In [5]:
# Show which borough labels occur, then keep only rows with an assigned borough
# (rows whose Borough is "Not assigned" are dropped) and renumber the index from 0.
print(df.Borough.unique())
has_borough = df["Borough"] != "Not assigned"
df = df.loc[has_borough].reset_index(drop=True)
df.head()

['Not assigned' 'North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough'
 'East York' 'York' 'East Toronto' 'West Toronto' 'Central Toronto'
 'Mississauga']


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Dataframe info: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   PostalCode     103 non-null    object
 1   Borough        103 non-null    object
 2   Neighbourhood  103 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [7]:
# One postal code area can cover several neighbourhoods; they appear together,
# comma-separated, in a single row. M5A is an example.
df.loc[df["PostalCode"] == "M5A"]

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
# List every Neighbourhood value that begins with "Not assigned" (case-insensitive).
# An empty list means no remaining row has an unassigned neighbourhood.
unassigned_prefix = "Not assigned".lower()
starts_unassigned = df["Neighbourhood"].str.lower().str.startswith(unassigned_prefix)
df.loc[starts_unassigned, "Neighbourhood"].tolist()

[]

No neighbourhood name starts with "Not assigned", so no further rows need to be removed.

In [9]:
# Renumber the rows from 0 (discarding the old index) and report (rows, columns).
df = df.reset_index(drop=True)
df.shape

(103, 3)

## Getting latitude, longitude data using Geocoder

In [10]:

# Geocode every postal code with the ArcGIS provider, collecting one
# [latitude, longitude] pair per row of df, in the same order as df.PostalCode.
MAX_GEOCODE_ATTEMPTS = 10  # upper bound on lookups per postal code

lat_long_list = []
for postal_code in df.PostalCode:
    lat_lng_coords = None
    # A lookup may yield no coordinates (latlng is None); retry a bounded number
    # of times rather than looping forever on a lookup that keeps failing.
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break
    if lat_lng_coords is None:
        raise RuntimeError(
            "No coordinates returned for postal code {} after {} attempts".format(
                postal_code, MAX_GEOCODE_ATTEMPTS
            )
        )
    # Append exactly once per postal code, after a successful lookup, so failed
    # attempts never add None entries and the list stays aligned with df's rows.
    lat_long_list.append(lat_lng_coords)

In [11]:
# Turn the collected coordinate pairs into a two-column frame and preview it.
coordinate_columns = ["Latitude", "Longitude"]
lat_long_df = pd.DataFrame(data=lat_long_list, columns=coordinate_columns)
print(lat_long_df.shape)
lat_long_df.head()

(103, 2)


Unnamed: 0,Latitude,Longitude
0,43.75245,-79.32991
1,43.73057,-79.31306
2,43.65512,-79.36264
3,43.72327,-79.45042
4,43.66253,-79.39188


## Getting the final dataframe for use

In [12]:
# Attach the coordinates to the postal-code table column-wise.
# pd.concat(axis=1) aligns on the index and would silently pad with NaN if the two
# frames had different numbers of rows, so fail loudly on a mismatch instead of
# producing rows whose coordinates are missing or belong to another postal code.
if len(df) != len(lat_long_df):
    raise ValueError(
        "Row count mismatch: {} postal codes vs {} coordinate pairs".format(
            len(df), len(lat_long_df)
        )
    )
final_df = pd.concat([df, lat_long_df], axis = 1)
print("Rows, Columns are : ", final_df.shape)
final_df.head()

Rows, Columns are :  (103, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188


### Analysis

In [13]:
# Take an independent copy for the analysis so final_df itself stays unchanged.
final_df2 = final_df.copy(deep=True)

In [15]:
# Distinct borough labels present in the analysis frame.
final_df2["Borough"].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)