In [None]:
# Standard imports
import os
from pathlib import Path

import numpy as np
import pandas as pd

# For web scraping
import requests
from bs4 import BeautifulSoup


In [None]:
#  Function to get text from webpage using BeautifulSoup
def safe_get_text(soup, selector, index):
    """Return the text of the ``index``-th element matching a CSS selector.

    Args:
        soup: BeautifulSoup document or tag to search within (any object
            exposing ``select``).
        selector: CSS selector string, e.g. ``'p'`` or ``'h3.h5'``.
        index: Zero-based position of the wanted match, in the order
            ``select`` returns them.

    Returns:
        ``get_text(strip=True)`` of the chosen element, or the string
        ``"Not specified"`` when there are not more than ``index`` matches.
    """
    # select() understands CSS selectors. find_all() interprets its argument
    # as a tag *name*, so a selector such as 'h3.h5' would never match anything.
    elements = soup.select(selector)
    if len(elements) > index:
        return elements[index].get_text(strip=True)
    return "Not specified"

# This URL is where all government news releases, statements, and advisories are stored.
base_url = "https://www.canada.ca/en/news/advanced-news-search/news-results.html"

# These parameters display all news releases, statements, and advisories from 2015 to present.
# The 'idx' parameter is the offset of the first result on a page and is used to iterate through the pages.
params = {
    'start': '2015-01-01',
    'end': '',
    'idx': 0,
    '_': '1704247183169'  # NOTE(review): looks like a cache-busting timestamp copied from the browser - confirm
}

# Paging configuration.
TOTAL_RESULTS = 66590   # upper bound for 'idx'; presumably the result count when this was written - TODO confirm
PAGE_SIZE = 10          # step between consecutive 'idx' values
REQUEST_TIMEOUT = 30    # seconds; without a timeout a stalled connection blocks the loop forever

# List to store article data (one [title, link, date, department, type, summary] row per article)
data = []

# A single Session reuses the underlying connection across the thousands of page requests.
with requests.Session() as session:
    # Loop through the pages
    for idx in range(0, TOTAL_RESULTS, PAGE_SIZE):
        params['idx'] = idx  # Update the 'idx' parameter
        # Print the offset of the current page to keep track of progress
        print(f'Range: {idx}')

        # A network error on one page must not abort the whole scrape: report it and move on.
        try:
            response = session.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Failed to retrieve the webpage for idx={idx}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve the webpage for idx={idx}")
            continue

        # Use BeautifulSoup to parse the HTML
        soup = BeautifulSoup(response.content, 'html.parser')
        # Named page_articles so the tag list is not confused with a DataFrame called 'articles'.
        page_articles = soup.find_all('article', class_='item')

        # Loop through the articles on the page and pull out key information
        for article in page_articles:
            # Look the heading up once; title and link both live inside it.
            # find() with class_ is used because tag-name lookups do not
            # understand CSS selector strings such as 'h3.h5'.
            heading = article.find('h3', class_='h5')
            anchor = heading.find('a', href=True) if heading is not None else None
            title = heading.get_text(strip=True) if heading is not None else "Not specified"
            link = anchor['href'] if anchor is not None else "No link"

            # .get() avoids a KeyError when a <time> tag has no 'datetime' attribute.
            time_tag = article.find('time')
            date = time_tag.get('datetime', "No date") if time_tag is not None else "No date"

            # The first <p> is split on '|'; the 2nd and 3rd pieces are used as department and article type.
            additional_info = safe_get_text(article, 'p', 0).split('|')
            department = additional_info[1].strip() if len(additional_info) >= 2 else "Not specified"
            article_type = additional_info[2].strip() if len(additional_info) > 2 else "Not specified"
            summary = safe_get_text(article, 'p', 1)

            # Append the fields to the list
            data.append([title, link, date, department, article_type, summary])

In [None]:
# Turn the scraped rows into a table with one named column per extracted field
articles = pd.DataFrame(
    data=data,
    columns=[
        'Title',
        'Link',
        'Date',
        'Department',
        'Article Type',
        'Summary',
    ],
)


In [None]:
# Save the DataFrame to a CSV file.
# The destination defaults to the original location, but can be redirected by setting the
# GOV_ARTICLES_CSV environment variable so the notebook is not tied to one user's home folder.
output_csv = Path(os.environ.get(
    'GOV_ARTICLES_CSV',
    '/Users/paulhershaw/brainstation_course/project_folder/stone/data/gov_articles_URLs.csv',
))
# Create the target folder up front so a long scrape is not followed by a failed save.
output_csv.parent.mkdir(parents=True, exist_ok=True)
articles.to_csv(output_csv, index=False)

