In [None]:
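# One-time setup: uncomment the two lines below and run this cell to download the Reuters corpus (accessed via nltk.corpus.reuters)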
# import nltk
# nltk.download("reuters")

In [None]:
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import json
from newsapi import NewsApiClient
from nltk.corpus import reuters
import os
import pandas as pd
from pathlib import Path
import requests

# Load variables from a .env file into the environment so the os.getenv()
# lookups for the API keys in the cells below can find them.
load_dotenv()

In [None]:
# News API client, authenticated with the key taken from the
# NEWS_API_KEY environment variable (None when the variable is unset).
news_api_key = os.environ.get("NEWS_API_KEY")
newsapi = NewsApiClient(api_key=news_api_key)

def get_newsapi_articles(q, language="en", page_size=100):
    """
    Return the list of articles for specified query terms.

    Args:
        q: Query terms forwarded to the News API client's `get_everything`.
        language: Language code forwarded to the API (defaults to "en").
        page_size: Number of results requested (defaults to 100).

    Returns:
        The value stored under the "articles" key of the API response.
    """

    data = newsapi.get_everything(q=q, language=language, page_size=page_size)
    articles = data["articles"]
    # The message used to end with a stray second quote after the query.
    print(f"Found {len(articles)} articles for '{q}'.")

    return articles

In [None]:
# Bing News Search settings: subscription key (read from the BING_API_KEY
# environment variable), the request header carrying it, and the endpoint.
# NOTE(review): Microsoft has announced retirement of the Bing Search APIs --
# confirm this endpoint is still being served before relying on it.
bing_subscription_key = os.environ.get("BING_API_KEY")
headers = {
    "Ocp-Apim-Subscription-Key": bing_subscription_key,
}
bing_search_url = "https://api.bing.microsoft.com/v7.0/news/search"

def get_bing_articles(q):
    """
    Query the Bing news search endpoint and return the matching articles.

    The request asks for up to 100 results with HTML text formatting and
    text decorations enabled. An HTTP error status raises via
    `raise_for_status()`. The returned list is the "value" entry of the
    JSON response.
    """

    query_params = dict(q=q, textDecorations=True, textFormat="HTML", count=100)

    reply = requests.get(bing_search_url, headers=headers, params=query_params)
    reply.raise_for_status()

    found = reply.json()["value"]
    print(f"{len(found)} articles found")
    return found

In [None]:
# Guardian News API: key (read from the GUARDIAN_API_KEY environment variable)
# and endpoints. Both endpoints use HTTPS because the API key is sent as a
# query parameter and must not travel unencrypted.
guardian_api_key = os.getenv("GUARDIAN_API_KEY")
guardian_search_url = "https://content.guardianapis.com/search"
guardian_tags_url = "https://content.guardianapis.com/tags"

# Guardian tag IDs grouped by topic of interest.
relevant_tags = {
    "weapons_defense": [
        "science/weaponstechnology",
        "world/chemical-weapons",
        "us-news/us-military",
        "world/drones",
    ],
    "human_rights": [
        "global-development/human-rights",
        "society/equality-and-human-rights-commission-ehrc",
        "sustainable-business/business-human-rights",
    ],
    "animal_rights": [
        "science/animal-experimentation",
        "world/animal-welfare",
    ],
    "environment": [
        "environment/climate-change",
        "carbonreduction/carbonreduction",
        "environment/carbon-emissions",
        "environment/carbon-offset-projects",
        "environment/carbonfootprints",
        "environment/renewableenergy",
        "environment/sustainable-development",
        "global-development/sustainable-development-goals",
        "social-progress-imperative/social-progress-imperative",
    ],
}

def get_guardian_articles(q, section_id = None, tag_id = None):
    """
    Return the list of articles for specified query terms, optionally
    restricted to a section ID and/or a tag ID.

    Args:
        q: Query terms; pass None to search without query terms.
        section_id: Optional section to restrict the search to.
        tag_id: Optional tag to restrict the search to.

    Returns:
        The "results" list of the API response. Each result additionally
        gets a "text" key holding its body with the HTML markup removed
        (an empty string when the result carries no body).

    Raises:
        requests.HTTPError: when the API answers with an error status.
    """

    params = {
        "from-date": "2016-01-01",
        "show-tags": "all",
        "page-size": 200,
        "api-key": guardian_api_key,
        "show-fields": "body",
    }
    # The search endpoint filters by section through the "section" parameter;
    # "sectionId" is only the name of a field in the response, so sending it
    # as a filter had no effect. Filters that were not supplied are left out.
    optional = {"q": q, "section": section_id, "tag": tag_id}
    params.update({key: value for key, value in optional.items() if value is not None})

    response = requests.get(guardian_search_url, params=params)
    response.raise_for_status()
    results = response.json()["response"]["results"]

    print(f"{len(results)} articles found")

    for article in results:
        # Tolerate results without a body instead of failing on the lookup.
        html_text = (article.get("fields") or {}).get("body") or ""
        article["text"] = BeautifulSoup(html_text, "html.parser").get_text()

    return results


def get_guardian_articles_by_tag(tag_id):
    """
    Return the list of articles for specified tag ID.

    Args:
        tag_id: Tag to restrict the search to.

    Returns:
        The list returned by `get_guardian_articles` for a search with no
        query terms and the given tag.
    """

    # A tag-only search is the same request as get_guardian_articles without
    # query terms, so delegate instead of duplicating the request and the
    # HTML-stripping logic. Passing None as the query means no "q" parameter
    # is sent.
    return get_guardian_articles(None, tag_id=tag_id)


def get_paged_guardian_tags(page, tag_type = "keyword", page_size = 1000):
    """
    Fetch one page of tags from the Guardian tags endpoint.

    Returns the "response" object of the JSON reply for the requested page,
    tag type and page size. An HTTP error status raises via
    `raise_for_status()`.
    """

    query = {
        "page-size": page_size,
        "page": page,
        "type": tag_type,
        "api-key": guardian_api_key,
    }

    reply = requests.get(guardian_tags_url, params=query)
    reply.raise_for_status()

    return reply.json()["response"]


def get_guardian_tags(tag_type = "keyword"):
    """
    Return the list of all tags of the given type.

    Args:
        tag_type: Tag type forwarded to `get_paged_guardian_tags`.

    Returns:
        The concatenated "results" of every page, in page order.
    """

    page_size = 1000

    # Probe with a single-item page to learn how many tags exist. The probe
    # must use the same tag type as the real requests, otherwise the total
    # (and hence the page count) belongs to a different set of tags.
    probe = get_paged_guardian_tags(1, tag_type=tag_type, page_size=1)

    # Ceiling division: a partially filled last page still counts as a page.
    pages = -(-probe["total"] // page_size)

    tags = []
    # Pages are numbered from 1 and the last page is included.
    for page in range(1, pages + 1):
        print(f"Retrieving tags of page {page}")
        response = get_paged_guardian_tags(page, tag_type=tag_type, page_size=page_size)
        tags.extend(response["results"])

    return tags


def export_guardian_tags(filepath = "data/guardian_tags.csv"):
    """
    Export Guardian keyword tags to CSV.

    Args:
        filepath: Destination of the CSV file (defaults to
            "data/guardian_tags.csv"). Missing parent directories are
            created.

    The file contains the columns id, sectionId, sectionName, webTitle and
    apiUrl, without the DataFrame index.
    """

    columns = ["id", "sectionId", "sectionName", "webTitle", "apiUrl"]

    tags_df = pd.DataFrame(get_guardian_tags())

    tags_filepath = Path(filepath)
    # Writing fails if the target directory does not exist yet.
    tags_filepath.parent.mkdir(parents=True, exist_ok=True)

    # reindex keeps the export working when a column is absent from the
    # retrieved tags (including an empty tag list); absent columns are
    # written as empty fields.
    tags_df.reindex(columns=columns).to_csv(tags_filepath, index=False)

In [None]:
# Get Guardian articles for the query terms "Tesla" with section ID
# "environment", load them into a DataFrame and preview the first rows.
articles = get_guardian_articles("Tesla", section_id="environment")
articles_df = pd.DataFrame(articles)
articles_df.head()

In [None]:
# Get Guardian articles for the query terms "Tesla" with tag ID
# "environment/climate-change", load them into a DataFrame and preview the
# first rows.
# NOTE: this rebinds `articles` and `articles_df` from the previous cell.
articles = get_guardian_articles("Tesla", tag_id="environment/climate-change")
articles_df = pd.DataFrame(articles)
articles_df.head()