# Simple Webscraping Example with Beautiful Soup

In [1]:
# Import all neccessary libraries

from bs4 import BeautifulSoup
import urllib.request
import pandas as pd

In [2]:
# Assign the URL to a variable

url = "https://www12.statcan.gc.ca/census-recensement/2021/dp-pd/prof/details/page.cfm?Lang=E&SearchText=M1C&DGUIDlist=2021A0011M1C&GENDERlist=1,2,3&STATISTIClist=1,4&HEADERlist=0"


# use the urlopen function to open the webpage
html = urllib.request.urlopen(url)

# show object html
html

<http.client.HTTPResponse at 0x7f1526f27160>

In [3]:
# Create the BeautifulSoup object

html_to_parse = BeautifulSoup(html, "html.parser")

In [4]:
# create a list of tables.  There is only 1 table in this webpage

tables = html_to_parse.find_all("table")
print(f"Number of tables found: {len(tables)}")

Number of tables found: 1


In [5]:
# Create list of all the <th> tags in the table that has the title "2021A0011M1C - Population, 2021 - Counts - Total"

td = tables[0].find(attrs={"title":"2021A0011M1C - Population, 2021 - Counts - Total"})

In [6]:
td

<td class="text-right text-nowrap" headers="rh1 r1 geo2021A0011M1C geo2021A0011M1Cstat1 geo2021A0011M1Cstat1gen1" title="2021A0011M1C - Population, 2021 - Counts - Total"> 35,642</td>

In [7]:
# convert to float
float(td.text.replace(",", ""))

35642.0

## Create a script that will look up from a list of Postal codes

In [8]:
import urllib.parse as urlparse
from urllib.parse import urlencode

In [9]:
# A list of postal code from the previous part

postal = ['M3A', 'M4A', 'M5A', 'M6A', 'M7A','M5E', 'M4E', 'M6E', 'M5G', 'M6G','M2H','M3H','M4H','M5H','M6H',
'M1J','M2J', 'M3J',
       'M4J',
       'M5J',
       'M5K',
       'M6K',
       'M1L',
       'M2L',
       'M3L',
       'M4L',
       'M5L',
       'M6L',
       'M9L',
       'M1M',
       'M2M',
       'M3M',
       'M4M',
       'M5M',
       'M6M',
       'M9M',
       'M1N',
       'M2N',
       'M3N',
       'M4N',
       'M5N',
       'M6N',  'M9N',
       'M1P',
       'M2P', 
       'M4P',
       'M5P',
       'M6P',
       'M9P',
       'M1R',
       'M2R', 
       'M4R',
       'M5R',
       'M6R',
       'M7R',
       'M9R',
       'M1S',  'M4S',
       'M5S',
       'M6S', 
       'M1T',
       'M4T',
       'M5T',
       'M1V',
       'M4V',
       'M5V',
       'M8V',
       'M9V',
       'M1W',
       'M4W',
       'M5W',
       'M8W',
       'M9W',
       'M1X', 
       'M4X',
       'M5X',
       'M8X',
       'M7Y',
       'M8Y',
       'M8Z',]


In [10]:
# Creating Empty DataFrame and Storing it in variable df

df = pd.DataFrame(columns = ['postal_code', 'data', 'value'])

In [15]:
# Loop through each postal code


for i in postal:
    url = "https://www12.statcan.gc.ca/census-recensement/2021/dp-pd/prof/details/page.cfm?Lang=E"
    params = {
        'SearchText': i,
        'DGUIDlist': '2021A0011'+i
    }

    # this part switches up the postal code parameter in the url
    url_parts = list(urlparse.urlparse(url))
    query = dict(urlparse.parse_qsl(url_parts[4]))
    query.update(params)
    
    url_parts[4] = urlencode(query)
    query = urlparse.urlunparse(url_parts)
    
    # the following code is similar to the above
    html = urllib.request.urlopen(query)
    html_to_parse = BeautifulSoup(html, "html.parser")
    tables = html_to_parse.find_all("table")
    print(f"Number of tables found: {len(tables)}")
    
    
    tables = html_to_parse.find_all("table")
    if len(tables) == 0:
        print("No tables found on this page.")
        continue  # Skip to the next iteration

    
    # change the title to find the data you want
    title = (f"2021A0011{i} - Population, 2021 - Counts - Total")
    td = tables[0].find(attrs={"title":title})
    print(td)
    df.loc[len(df.index)] = [i, title,  float(td.text.replace(",", ""))]

Number of tables found: 1
<td class="text-right text-nowrap" headers="rh1 r1 geo2021A0011M3A geo2021A0011M3Astat1 geo2021A0011M3Astat1gen1" title="2021A0011M3A - Population, 2021 - Counts - Total"> 34,361</td>
Number of tables found: 1
<td class="text-right text-nowrap" headers="rh1 r1 geo2021A0011M4A geo2021A0011M4Astat1 geo2021A0011M4Astat1gen1" title="2021A0011M4A - Population, 2021 - Counts - Total"> 14,589</td>
Number of tables found: 1
<td class="text-right text-nowrap" headers="rh1 r1 geo2021A0011M5A geo2021A0011M5Astat1 geo2021A0011M5Astat1gen1" title="2021A0011M5A - Population, 2021 - Counts - Total"> 48,978</td>
Number of tables found: 1
<td class="text-right text-nowrap" headers="rh1 r1 geo2021A0011M6A geo2021A0011M6Astat1 geo2021A0011M6Astat1gen1" title="2021A0011M6A - Population, 2021 - Counts - Total"> 22,380</td>
Number of tables found: 1
<td class="text-right text-nowrap" headers="rh1 r1 geo2021A0011M7A geo2021A0011M7Astat1 geo2021A0011M7Astat1gen1" title="2021A0011M7A 

In [16]:
df

Unnamed: 0,postal_code,data,value
0,M3A,"2021A0011M3A - Population, 2021 - Counts - Total",34361.0
1,M4A,"2021A0011M4A - Population, 2021 - Counts - Total",14589.0
2,M5A,"2021A0011M5A - Population, 2021 - Counts - Total",48978.0
3,M6A,"2021A0011M6A - Population, 2021 - Counts - Total",22380.0
4,M7A,"2021A0011M7A - Population, 2021 - Counts - Total",5.0
...,...,...,...
89,M1X,"2021A0011M1X - Population, 2021 - Counts - Total",14810.0
90,M4X,"2021A0011M4X - Population, 2021 - Counts - Total",19896.0
91,M8X,"2021A0011M8X - Population, 2021 - Counts - Total",10624.0
92,M8Y,"2021A0011M8Y - Population, 2021 - Counts - Total",21986.0


In [17]:
# Now you can export this to a CSV file for further analysis or visulization
df.to_csv('postal.csv')