In [2]:
# Scraping Toronto postal codes from a Wikipedia page
## The second notebook

In [3]:
# Address of the wiki page that the later cells download and parse.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
# modules importing
import pandas as pd
import requests
from bs4 import BeautifulSoup



We will create an empty DataFrame with three columns: _PostalCodes_, _Borough_, _Neighborhood_:

In [5]:
# Declare the three target columns, then build a frame with no rows yet.
columns = ['PostalCodes', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=columns)

## Scraping the Wikipedia page to extract boroughs and neighborhoods for each postal code in Toronto, Canada

The next cell extracts the data and cleans it:

In [6]:
# wiki table extracting
def _parse_cell(cell):
    """Convert one <td> of the postal-code table into a record.

    Parameters
    ----------
    cell : bs4 Tag
        A table cell expected to hold the postal code in ``<p><b>`` and a
        ``<span>`` whose text looks like ``"Borough(Neighborhood / ...)"``.

    Returns
    -------
    dict or None
        A dict with keys 'PostalCodes', 'Borough' and 'Neighborhood', or
        None when the cell lacks the expected tags or its span text has no
        '(' separating borough from neighborhoods (such cells are skipped).
    """
    code_tag = cell.p.b if cell.p is not None else None
    span = cell.span
    if code_tag is None or span is None:
        return None  # cell does not follow the expected layout

    # Only the first two '('-separated pieces are used: borough, neighborhoods.
    parts = span.text.split('(')[:2]
    if len(parts) != 2:
        return None  # no '(' in the text, so there is nothing to split out

    borough, neighborhood = parts
    # Drop the closing parenthesis and turn the ' /' separators into commas.
    neighborhood = neighborhood.strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    borough = borough.strip(' ')
    return {'PostalCodes': code_tag.text,
            'Borough': borough,
            'Neighborhood': neighborhood}


# Download the page; fail loudly on HTTP errors instead of parsing an error page.
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()
results = response.text

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
table = BeautifulSoup(results, "html.parser").find("table")  # extracting the table from a wiki page
if table is None:
    raise ValueError("No <table> element found at " + wiki_url)

rows = table.find_all("tr")
raw_data = []  # list of dictionaries in order to build DataFrame
for row in rows:
    for cell in row.find_all("td"):
        record = _parse_cell(cell)
        if record is not None:
            raw_data.append(record)

# Passing the column names keeps the frame usable even if nothing was parsed.
df = pd.DataFrame(raw_data, columns=['PostalCodes', 'Borough', 'Neighborhood'])

# Some borough strings come out with extra text run together; map each such
# exact string to a cleaned-up borough name.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)
df.head(11)

Unnamed: 0,PostalCodes,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
df.shape  # (number of rows, number of columns) of the scraped frame

(103, 3)

## Adding coordinates to the dataframe

In [8]:
# Shell escape: fetch the coordinates CSV quietly (-q) and save it as 'geodata.csv' (-O).
!wget -q -O 'geodata.csv' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv

In [10]:
# Read the downloaded CSV and use its "Postal Code" column as the index,
# so coordinates can be looked up by postal code.
geospatial_dataset = (
    pd.read_csv("geodata.csv")
    .set_index("Postal Code")
)
geospatial_dataset.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [13]:
# Attach latitude/longitude to every row of df by looking its postal code up
# in geospatial_dataset (which is indexed by postal code).

# Check up front that every postal code has coordinates, and report all the
# missing ones at once rather than failing on the first lookup.
missing_codes = df.loc[~df['PostalCodes'].isin(geospatial_dataset.index), 'PostalCodes']
if not missing_codes.empty:
    raise KeyError("No coordinates found for postal codes: {}".format(
        sorted(missing_codes.unique())))

# Series.map with a Series argument looks each value up in that Series' index,
# replacing the per-row Python loop with one vectorised lookup per column.
latitude = df['PostalCodes'].map(geospatial_dataset['Latitude']).tolist()
longitude = df['PostalCodes'].map(geospatial_dataset['Longitude']).tolist()

df['Latitude'] = latitude  # add new columns
df['Longitude'] = longitude
df.head(11)

Unnamed: 0,PostalCodes,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
