# Imports

In [5]:
import pandas as pd
import numpy as np

## For web scrapping
import requests
from bs4 import BeautifulSoup

# Part One:

### Scrapping and first cleaning
Scrape the website using BeautifulSoup and extract the column names and the data ros

In [6]:
## Scrape wikipedia site
html_doc = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(html_doc, "html.parser")

tables = soup.find_all("table")
print(f"Found {len(tables)} tables in the document.")
table = tables[0]   # You have to look for which one is the correct one

# Get table rows and column_names
table_body = table.find("tbody")
table_rows = table_body.find_all("tr")
print(f"Found {len(table_rows)} rows in the table.")

column_names = table_rows[0]
rows = table_rows[1:]
print(f"Found {len(rows)} rows with values in the table.")
print(f"Column names: {column_names}.")

Found 3 tables in the document.
Found 181 rows in the table.
Found 180 rows with values in the table.
Column names: <tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighbourhood
</th></tr>.


### Data refinement
Refine data into a dataframe

In [7]:
# Process scraped values in a dataframe
column_names = [x.text.strip() for x in column_names.find_all("th")]

postal_code = []
borough = []
neighbourhood = []

for r in rows:
    values = r.find_all("td")
    postal_code.append(values[0].text.strip())
    borough.append(values[1].text.strip())
    neighbourhood.append(values[2].text.strip())

# Create the DataFrame
df = pd.DataFrame({"Postal Code": postal_code, "Borough": borough, "Neighbourhood": neighbourhood})

print(f"Shape of raw data: {df.shape}")
df.head()

Shape of raw data: (180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Filtering
Filter data so it does not contain invalid data.

In [8]:
# Filter the data so only rows with assigned boroughs are kept
df = df[df["Borough"] != "Not assigned"]

# Aggregate rows with the same boroughs but different neighbourhoods
df = df.groupby(["Postal Code", "Borough"])["Neighbourhood"].apply(lambda x: ",".join(x)).reset_index()


print(df.head())
df.shape

  Postal Code      Borough                           Neighbourhood
0         M1B  Scarborough                          Malvern, Rouge
1         M1C  Scarborough  Rouge Hill, Port Union, Highland Creek
2         M1E  Scarborough       Guildwood, Morningside, West Hill
3         M1G  Scarborough                                  Woburn
4         M1H  Scarborough                               Cedarbrae


(103, 3)

# Part Two:

In [9]:
import geocoder # import geocoder

print("Finished importing libraries.")

Finished importing libraries.


The code bellow is based on the code given in the exercise. Sadly, as already indicated, it does not work. Therefore I will use the .csv file.

In [10]:
# This cell does not work.

def try_coordinates(postal_code):
    """
        Code based on snipped in corsera course
    """

    coords = None
    coords = geocoder.google('{}, Toronto, Ontario'.format(postal_code)).latlng
    
    if coords is None: print("Failed, will try again:")
    
    while(coords is None):
        print("\tFailure")
        coords = geocoder.google('{}, Toronto, Ontario'.format(postal_code)).latlng
        if (not coords is None): print()
        
    return coords

coordinates = try_coordinates("M5G")
print(coordinates)

Failed, will try again:
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure
	Failure


KeyboardInterrupt: 

In [11]:
# Load data from website
location_df = pd.read_csv("https://cocl.us/Geospatial_data")
location_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the two dataframes:

In [12]:
print("DataFrame from Part One:")
print(df.head())
print(df.shape)
print()
print("DataFrame containing the coordinates:")
print(location_df.head())
print(df.shape)

DataFrame from Part One:
  Postal Code      Borough                           Neighbourhood
0         M1B  Scarborough                          Malvern, Rouge
1         M1C  Scarborough  Rouge Hill, Port Union, Highland Creek
2         M1E  Scarborough       Guildwood, Morningside, West Hill
3         M1G  Scarborough                                  Woburn
4         M1H  Scarborough                               Cedarbrae
(103, 3)

DataFrame containing the coordinates:
  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476
(103, 3)


In [13]:
merged_df = pd.merge(df, location_df, on="Postal Code")

print(merged_df.head())
print(merged_df.shape)

  Postal Code      Borough                           Neighbourhood   Latitude  \
0         M1B  Scarborough                          Malvern, Rouge  43.806686   
1         M1C  Scarborough  Rouge Hill, Port Union, Highland Creek  43.784535   
2         M1E  Scarborough       Guildwood, Morningside, West Hill  43.763573   
3         M1G  Scarborough                                  Woburn  43.770992   
4         M1H  Scarborough                               Cedarbrae  43.773136   

   Longitude  
0 -79.194353  
1 -79.160497  
2 -79.188711  
3 -79.216917  
4 -79.239476  
(103, 5)


# Part Three:

In [14]:
# Rename for convenience
df = merged_df

In [16]:
df["Borough"].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

In [22]:
# Filter df to only contain boroughs whose name contains "Toronto" or "toronto"
df = df[df["Borough"].str.contains(".*(T|t)oronto.*")]

print(df["Borough"].value_counts())
df.head()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64


  return func(self, *args, **kwargs)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [23]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
