### This notebook is for scraping article text based on the csv with URLs.

This is very much a work in progress. I am having issues getting the location, title and text.

In [None]:
#Import libraries
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import requests
import pycountry

## Explore HTML
Below we set up a simple tool to explore the HTML of articles. I am going back and forth with issues articles to refine my scrapper(s).

In [74]:
# Save the URL of the webpage we want to scrape to a variable
url = 'https://www.canada.ca/en/food-inspection-agency/news/2023/09/government-of-canada-invests-in-capacity-for-responding-to-avian-influenza-outbreaks-in-british-columbia.html'

# Turn the undecoded content into a Beautiful Soup object and assign it to a variable
# A Beautiful Soup object represents the HTML content of the response

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

#Use this print statement to see the HTML content of the webpage and refine the scraping below. 
soup

<!DOCTYPE html>

<!--[if lt IE 9]>
<html class="no-js lt-ie9" dir="ltr" lang="en" xmlns="http://www.w3.org/1999/xhtml">
<![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" dir="ltr" lang="en" xmlns="http://www.w3.org/1999/xhtml">
<!--<![endif]-->
<head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta charset="utf-8"/>
<title>Government of Canada invests in capacity for responding to avian influenza outbreaks in British Columbia - Canada.ca</title>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="http://purl.org/dc/terms/" rel="schema.dcterms"/>
<link href="https://www.canada.ca/en/food-inspection-agency/news/2023/09/government-of-canada-invests-in-capacity-for-responding-to-avian-influenza-outbreaks-in-british-columbia.html" rel="canonical"/>
<link href="https://www.canada.ca/en/food-inspection-agency/news/2023/09/government-of-canada-invests-in-capacity-for-responding-to-avian-influenza-outbreaks-in-british-columbia.html" hreflang="en" 

## Scrapper 1 : Detailed Locations
Below is a tool to pull out useful information from the URLs. In this version, I am trying to get Province and City in the location field, but having difficulty.

In [67]:
# Load the CSV file with the URLs to scrape
csv_path = '/Users/paulhershaw/brainstation_course/project_folder/stone/data/articles_temp_copy.csv'  # Update this to your CSV file path
df = pd.read_csv(csv_path)

# Define a function to scrape data from each URL
def scrape_data(url, article_type):
    #Media advisories are lower value communications, as they are always followed by a news release.
    if article_type.lower() == 'media advisories':
        return url, "Skipping Media Advisory", "Skipping Media Advisory"

    #Given the large number of URLs, I am using a Try and except statement to catch any errors and continue the loop.
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        canonical_link = soup.find('link', rel='canonical')['href']

        # Check for 'Backgrounder' articles, which will not have location information.
        if article_type.lower() == 'backgrounders':
            cmp_text = soup.find('div', class_='cmp-text')
            news_release_text = ' '.join([p.get_text() for p in cmp_text.find_all('p')]) if cmp_text else "No text found"
            
            # For backgrounders, the location is always 'Location not found'
            location = "Location not found"
        else:
            # Existing logic for other article types - this is rough, and not really working yet.
            cmp_text_div = soup.find('div', class_='cmp-text')
            location = find_location(cmp_text_div.get_text() if cmp_text_div else "")

            news_release_text_elements = cmp_text_div.find_all('p')[1:] if cmp_text_div else []
            news_release_text = ' '.join([p.get_text() for p in news_release_text_elements])

        return canonical_link, location, news_release_text
    except Exception as e:
        print(f"Error scraping URL {url}: {e}")
        return None, None, None

 
def find_location(text):
    # List of Canadian provinces and territories - trying to be extensive here to pick up all possible variations.
    provinces = ["Alberta", "British Columbia", "Manitoba", "New Brunswick", "Newfoundland and Labrador", 
                 "Nova Scotia", "Ontario", "Prince Edward Island", "Quebec", "Saskatchewan", "Northwest Territories", 
                 "Nunavut", "Yukon", "AB", "BC", "MB", "NB", "NL", "NS", "ON", "PE", "QC", "SK", "NT", "NU", "YT", "B.C.","P.E.I.","N.S.","N.B.","Que.","Sask."]

    # List of countries 
    countries = [country.name for country in pycountry.countries]

    # Combine the lists
    locations = provinces + countries

    # Search for these locations in the text
    for location in locations:
        if location in text:
            # Find the text before the first comma - this is not working right now.
            location_start = text.find(location)
            if location_start != -1:
                preceding_text = text[:location_start].split()
                if preceding_text:
                    city = preceding_text[-1]  # Get the last word before the location
                    return f"{city}, {location}"
    
    return "Location not found"



# Loop through each URL and add results to the DataFrame. 
for index, row in df.iterrows():
    print(f"Scraping URL: {row['Link']}")
    canonical_link, location, news_release_text = scrape_data(row['Link'], row['Article Type'])

    df.at[index, 'Canonical Link'] = canonical_link
    df.at[index, 'Location'] = location
    df.at[index, 'News Release Text'] = news_release_text

    print(f"Canonical Link: {canonical_link}")
    print(f"Location: {location}")
    print(f"News Release Text: {news_release_text}\n")


print("Scraping completed. Updated CSV saved.")


Scraping URL: https://www.canada.ca/en/correctional-service/news/2024/01/death-of-an-inmate-from-bath-institution.html
Canonical Link: https://www.canada.ca/en/correctional-service/news/2024/01/death-of-an-inmate-from-bath-institution.html
Location: Kingston,, Ontario
News Release Text: On January 2, 2024, Ian Smith, an inmate from Bath Institution, died while in our custody. At the time of death, the inmate had been serving an indeterminate sentence, which commenced on January 23, 2020, for aggravated assault and breach of a long-term supervision order. The inmate’s next of kin have been notified. As in all cases involving the death of an inmate, the Correctional Service of Canada (CSC) will review the circumstances. CSC policy requires that the police and the coroner be notified. CSC WebsiteCommissioner’s Directive: Death of an InmateDeaths in custodyBath Institution -30-

Scraping URL: https://www.canada.ca/en/department-national-defence/news/2023/09/defence-minister-bill-blair-meet

## Scrapper 2 : Locations - Province or Country Only
Below is a tool to pull out useful information from the URLs. In this version, I am trying to get Province or Country in the location field. This didnt solve the problem, and I would prefer to have province and city.

In [84]:
# Define a function to scrape data from each URL
def scrape_data(url, article_type):
    if article_type.lower() == 'media advisories':
        return url, "Skipping Media Advisory", "Skipping Media Advisory", "Title not available"

    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        canonical_link = soup.find('link', rel='canonical')['href']

        # Extract the title from the HTML content
        title = soup.find('title').get_text() if soup.find('title') else "Title not found"

        # Check for 'Backgrounder' articles
        if article_type.lower() == 'backgrounders':
            cmp_text = soup.find('div', class_='cmp-text')
            news_release_text = ' '.join([p.get_text() for p in cmp_text.find_all('p')]) if cmp_text else "No text found"
            
            # For backgrounders, the location is always 'Location not found'
            location = "Location not found"
        else:
            # Existing logic for other article types
            cmp_text_div = soup.find('div', class_='cmp-text')
            location = find_location(cmp_text_div.get_text() if cmp_text_div else "")

            news_release_text_elements = cmp_text_div.find_all('p')[1:] if cmp_text_div else []
            news_release_text = ' '.join([p.get_text() for p in news_release_text_elements])

        return canonical_link, location, news_release_text, title
    except Exception as e:
        print(f"Error scraping URL {url}: {e}")
        return None, None, None, "Error fetching title"

def find_location(text):
    # List of Canadian provinces and territories
    provinces = ["Alberta", "British Columbia", "Manitoba", "New Brunswick", "Newfoundland and Labrador", 
                 "Nova Scotia", "Ontario", "Prince Edward Island", "Quebec", "Saskatchewan", "Northwest Territories", 
                 "Nunavut", "Yukon", "AB", "BC", "MB", "NB", "NL", "NS", "ON", "PE", "QC", "SK", "NT", "NU", "YT", "B.C.", "P.E.I.", "N.S.", "N.B.", "Que.", "Sask."]

    # List of countries 
    countries = [country.name for country in pycountry.countries]

    # Combine the lists
    locations = provinces + countries

    # Search for these locations in the text
    for location in locations:
        if location in text:
            return location
    
    return "Location not found"


# Load the CSV file
csv_path = '/Users/paulhershaw/brainstation_course/project_folder/stone/data/gov_articles_URLs.csv'  # Update this to your CSV file path
df = pd.read_csv(csv_path)
df = df.sample(n=500, random_state=42)

# Loop through each URL in the DataFrame
for index, row in df.iterrows():
    print(f"Scraping URL: {row['Link']}")
    canonical_link, location, news_release_text, title = scrape_data(row['Link'], row['Article Type'])

    df.at[index, 'Title'] = title  
    df.at[index, 'Canonical Link'] = canonical_link
    df.at[index, 'Location'] = location
    df.at[index, 'News Release Text'] = news_release_text
    
    print(f"Title: {title}\n")  
    print(f"Canonical Link: {canonical_link}")
    print(f"Location: {location}")
    print(f"News Release Text: {news_release_text}")
    

print("Scraping completed. Updated CSV saved.")


Scraping URL: https://www.canada.ca/en/global-affairs/news/2019/09/canadian-delegation-to-attend-74th-session-of-united-nations-general-assembly.html
Title: Canadian delegation to attend 74th session of United Nations General Assembly - Canada.ca

Canonical Link: https://www.canada.ca/en/global-affairs/news/2019/09/canadian-delegation-to-attend-74th-session-of-united-nations-general-assembly.html
Location: Ontario
News Release Text: Global Affairs Canada today announced that Deputy Minister of Foreign Affairs Marta Morgan will lead a Canadian delegation to New York from September 23 to 27, 2019 to attend the 74th session of the United Nations General Assembly. Deputy Minister Morgan will join Ambassador and Permanent Representative of Canada to the United Nations in New York Marc-André Blanchard and be accompanied by several high-level representatives, including the Right Honourable Jean Chrétien, the Right Honourable Joe Clark, former Quebec premier Jean Charest, and Senator Peter Boe

In [85]:
df.to_csv('/Users/paulhershaw/brainstation_course/project_folder/stone/data/gov_articles_text.csv', index=False)