In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Collect the URL of every statement detail page from the paginated catalogue search.
#
# There are 6167 statements at 50 results per page, i.e. ceil(6167 / 50) = 124 result pages.
# Page numbers are 1-based and `range` excludes its stop value, so the stop must be
# NUM_PAGES + 1 for the final page to be crawled.
NUM_STATEMENTS = 6167
RESULTS_PER_PAGE = 50
NUM_PAGES = -(-NUM_STATEMENTS // RESULTS_PER_PAGE)  # ceiling division -> 124
CATALOGUE_BASE_URL = "https://www.publicsafety.gc.ca/cnt/rsrcs/lbrr/ctlg/"
SEARCH_REQUEST_TIMEOUT_SECONDS = 30  # requests has no default timeout and would otherwise wait forever

search_urls = [
    f"{CATALOGUE_BASE_URL}rslts-en.aspx?l=7&nb={RESULTS_PER_PAGE}&pn={page}"
    for page in range(1, NUM_PAGES + 1)
]

# Get the urls for each statement page
statement_page_urls = []
for url in search_urls:
    # A single failing results page (connection error, timeout, HTTP 4xx/5xx) is reported
    # and skipped so that the rest of the crawl still completes.
    try:
        response = requests.get(url, timeout=SEARCH_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("Failed to retrieve the statement pages on page: ", url, exc)
        continue

    # Name the parser explicitly so the result does not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    # For every table row, take its first anchor that carries an href attribute.
    for row in soup.find_all("tr"):
        link_tag = row.find("a", href=True)
        if link_tag:
            statement_page_urls.append(CATALOGUE_BASE_URL + link_tag["href"])

print(f"Found {len(statement_page_urls)} statement pages to crawl.")

In [None]:
# Get the title and statement pdf url for each statement page.
# `data` is column-oriented: the two lists are appended to together, so entry i of
# "StatementName" always belongs to entry i of "StatementURL".
data = {
    "StatementName": [],
    "StatementURL": [],
}

for url in statement_page_urls:
    # Report and skip pages that cannot be fetched (connection error, timeout, HTTP 4xx/5xx)
    # instead of letting one bad request abort the whole crawl.
    try:
        response = requests.get(url, timeout=30)  # requests never times out by default
        response.raise_for_status()
    except requests.RequestException as exc:
        print("Failed to retrieve the title and statement url for page: ", url, exc)
        continue

    # Name the parser explicitly so the result does not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    # Use the last <h2> whose text contains "(2024)" as the title; it stays None when
    # no heading matches, and the row is still recorded with a missing name.
    title = None
    for heading in soup.find_all("h2"):
        heading_text = heading.text.strip()
        if "(2024)" in heading_text:
            title = heading_text

    # French statement pages label this link "Accès en ligne", so nothing matches here;
    # those pages are intentionally skipped.
    link_tag = soup.find("a", string="Online access", href=True)
    if link_tag is None:
        print("Failed to retrieve the title and statement url for page: ", url)
        continue

    data["StatementName"].append(title)
    data["StatementURL"].append(link_tag["href"])

In [None]:
# Turn the column-oriented `data` dict into a table (one column per key),
# show the first rows as a sanity check, then write the table to disk.
df = pd.DataFrame(data)
print(df.head())
df.to_csv(path_or_buf="Canadian_Statement_Metadata.csv")