# Gathering Toronto Postal Codes: Web Scraping and Data Wrangling (Part 1)

Coursera Capstone - Week 3, Part 1

In [1]:
#importing required libraries
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd

In [2]:
# Download the Wikipedia page that lists the postal codes to scrape and parse
# it into a BeautifulSoup tree using the lxml parser.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing
# an error page as if it were the postal-code table.
page = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
page.raise_for_status()
response = page.text
soup = BeautifulSoup(response, 'lxml')

In [3]:
# Take the first <table> element in the parsed page and, for each of its <tr>
# rows, collect the text of every <td> cell. The result `l` is a list with one
# inner list of cell strings per table row.
# NOTE(review): a row without any <td> cells (presumably the header row, if it
# uses <th>) produces an empty inner list -- confirm against the page markup.
table = soup.find('table')
rows = table.find_all('tr')
l = [[cell.text for cell in tr.find_all('td')] for tr in rows]

In [4]:
# Build the postcode table from the scraped rows and tidy it up:
#   * name the three columns,
#   * discard the first scraped row (index label 0),
#   * remove newline characters from the Neighborhood text.
column_names = ["Postcode", "Borough", "Neighborhood"]
postcodes = pd.DataFrame(l, columns=column_names).drop(index=[0])
postcodes['Neighborhood'] = postcodes['Neighborhood'].str.replace('\n', '')
postcodes.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [5]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = postcodes['Borough'].ne('Not assigned')
postcodes = postcodes.loc[has_borough]

In [6]:
# Collapse the table to one row per (Postcode, Borough) pair; the remaining
# column values of rows sharing a pair are joined into one ', '-separated
# string. reset_index() turns the two group keys back into ordinary columns.
cols = ['Postcode', 'Borough']
postcodes = postcodes.groupby(cols).agg(', '.join).reset_index()
postcodes.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# Where a Neighborhood is 'Not assigned', fall back to the row's Borough name.
# The write goes through a single .loc call on `postcodes` itself. Chained
# indexing (postcodes.Neighborhood[mask] = ...) assigns into an intermediate
# object, which raises SettingWithCopyWarning and is not guaranteed to update
# the DataFrame.
unassigned = postcodes['Neighborhood'] == 'Not assigned'
updated_value = postcodes.loc[unassigned, 'Borough']
postcodes.loc[unassigned, 'Neighborhood'] = updated_value

# Spot-check the rows for the "Queen's Park" borough.
postcodes[postcodes.Borough == "Queen's Park"]

Unnamed: 0,Postcode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park
93,M9A,Queen's Park,Queen's Park


In [8]:
# Checking the shape of the final data set: (number of rows, number of columns)
postcodes.shape

(103, 3)

# Adding Geographic Coordinates to the Data Set (Part 2)

In [67]:
# Install the third-party `geocoder` package from the conda-forge channel
# (--yes skips the confirmation prompt), then import it.
# NOTE(review): the install is unpinned; the recorded output of this cell shows
# geocoder 1.38.1 being installed -- consider pinning that version.
# NOTE(review): this cell's execution count (In [67]) is out of sequence with
# the surrounding cells; confirm the notebook runs top to bottom on a fresh
# kernel.
!conda install -c conda-forge geocoder --yes
import geocoder

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.4 MB

The following NEW packages will be INSTALLED:

    geocoder:        1.38.1-py_1       conda-forge
    ratelim:         0.1.6-py_2        conda-forge

The following packages will be UPDATED:

    

In [14]:
# Look up latitude/longitude for every postal code through the ArcGIS geocoder
# and collect the results in `d` as [postal_code, latitude, longitude] rows.
#
# Each lookup is a network request that can come back without coordinates, so
# it is retried up to MAX_GEOCODE_ATTEMPTS times. If every attempt fails, the
# coordinates are recorded as None rather than crashing on None[0] and losing
# the results already collected.
MAX_GEOCODE_ATTEMPTS = 5

# Keep the Series under its own name so the loop variable does not shadow it.
postal_codes = postcodes['Postcode']
d = []

for postal_code in postal_codes:
    # Reset per postal code so a previous result is never reused by accident.
    lat_lng_coords = None
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break

    if lat_lng_coords:
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
    else:
        # No coordinates after all attempts: keep the row, mark it as missing.
        latitude = None
        longitude = None

    d.append([postal_code, latitude, longitude])


In [15]:
# Turn the collected [postal_code, latitude, longitude] rows into a DataFrame.
# NOTE(review): 'Longtitude' looks like a misspelling of 'Longitude'; it is
# kept as-is because the recorded outputs use this column name.
coordinate_columns = ['Postcode', 'Latitude', 'Longtitude']
geocoordinates = pd.DataFrame(data=d, columns=coordinate_columns)
geocoordinates.head()

Unnamed: 0,Postcode,Latitude,Longtitude
0,M1B,43.811525,-79.195517
1,M1C,43.785665,-79.158725
2,M1E,43.765815,-79.175193
3,M1G,43.768369,-79.21759
4,M1H,43.769688,-79.23944


In [16]:
# Combine the postcode table with its coordinates into the final data set.
# The two frames share only the 'Postcode' column, so the join key and the
# (default) inner join are spelled out explicitly here.
postcodes_final = postcodes.merge(geocoordinates, on='Postcode', how='inner')
postcodes_final.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longtitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944
