In [11]:
# Download Reuters corpus
# import nltk
# nltk.download("reuters")

In [12]:
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import json
from newsapi import NewsApiClient
from nltk.corpus import reuters
import os
import pandas as pd
from pathlib import Path
import requests

load_dotenv()

True

In [13]:
# Initialize News API
news_api_key = os.getenv("NEWS_API_KEY")
newsapi = NewsApiClient(api_key=news_api_key)

def get_newsapi_articles(q):
    """
    Return the list of articles for specified query terms.
    """

    data = newsapi.get_everything(q=q, language="en", page_size=100)
    articles = data["articles"]
    print(f"Found {len(articles)} articles for '{q}''.")

    return articles

In [14]:
# Initialize Bing News API
bing_subscription_key = os.getenv("BING_API_KEY")
headers = {"Ocp-Apim-Subscription-Key" : bing_subscription_key}
bing_search_url = "https://api.bing.microsoft.com/v7.0/news/search"

def get_bing_articles(q):
    """
    Return the list of articles for specified query terms.
    """

    params = {"q": q, "textDecorations": True, "textFormat": "HTML", "count": 100}

    response = requests.get(bing_search_url, headers=headers, params=params)
    response.raise_for_status()
    articles = response.json()["value"]

    print(f"{len(articles)} articles found")

    return articles

In [41]:
# Guardian News API
guardian_api_key = os.getenv("GUARDIAN_API_KEY")
guardian_search_url = "https://content.guardianapis.com/search"
guardian_tags_url = "http://content.guardianapis.com/tags"

relevant_tags = {
    "weapons_defense": ["science/weaponstechnology", "world/chemical-weapons", "us-news/us-military", "world/drones"],
    "human_rights": ["global-development/human-rights", "society/equality-and-human-rights-commission-ehrc", "sustainable-business/business-human-rights"],
    "animal_rights": ["science/animal-experimentation", "world/animal-welfare"],
    "environment": ["environment/climate-change", "carbonreduction/carbonreduction", "environment/carbon-emissions", "environment/carbon-offset-projects", "environment/carbonfootprints", "environment/renewableenergy", "environment/sustainable-development", "global-development/sustainable-development-goals", "social-progress-imperative/social-progress-imperative"]
}

def get_guardian_articles(q, section_id = None, tag_id = None):
    """
    Return the list of articles for specified query terms and optional tag ID.
    """

    params = {"q": q, "from-date": "2016-01-01", "show-tags": "all", "page-size": 200, "api-key": guardian_api_key, "show-fields": "body", "tag": tag_id, "sectionId": section_id}

    response = requests.get(guardian_search_url, params=params)
    response.raise_for_status()
    response_json = response.json()["response"]
    results = response_json["results"]

    print(f"{len(results)} articles found")

    for article in results:
        html_text = article["fields"]["body"]
        soup = BeautifulSoup(html_text, 'html.parser')
        article["text"] = soup.get_text()

    return results


def get_guardian_articles_by_tag(tag_id):
    """
    Return the list of articles for specified tag ID.
    """

    params = {"show-tags": "all", "from-date": "2016-01-01", "page-size": 200, "api-key": guardian_api_key, "tag": tag_id, "show-fields": "body"}

    response = requests.get(guardian_search_url, params=params)
    response.raise_for_status()
    response_json = response.json()["response"]
    results = response_json["results"]

    print(f"{len(results)} articles found")

    for article in results:
        html_text = article["fields"]["body"]
        soup = BeautifulSoup(html_text, 'html.parser')
        article["text"] = soup.get_text()

    return results


def get_paged_guardian_tags(page, tag_type = "keyword", page_size = 1000):
    """
    Return the list of tags for specified page.
    """

    params = {"page-size": page_size, "page": page, "type": tag_type, "api-key": guardian_api_key}

    response = requests.get(guardian_tags_url, params=params)
    response.raise_for_status()
    response_json = response.json()["response"]

    return response_json


def get_guardian_tags(tag_type = "keyword"):
    """
    Return the list of tags.
    """

    page = 1
    pages = None
    page_size = 1000
    tags = []

    response = get_paged_guardian_tags(page, page_size=1)
    pages = response["total"] / page_size

    while page < pages:
        print(f"Retrieving tags of page {page}")
        response = get_paged_guardian_tags(page, tag_type=tag_type, page_size=page_size)
        tags = tags + response["results"]
        page = page + 1

    return tags


def export_guardian_tags():
    """
    Export Guardian keyword tags to CSV.
    """

    tags = get_guardian_tags()
    tags_df = pd.DataFrame(tags)
    tags_filepath = Path("data/guardian_tags.csv")
    tags_df[["id", "sectionId", "sectionName", "webTitle", "apiUrl"]].to_csv(tags_filepath, index=None)

In [42]:
# Get articles for query terms and section ID
articles = get_guardian_articles("Tesla", section_id="environment")
articles_df = pd.DataFrame(articles)
articles_df.head()

200 articles found


Unnamed: 0,apiUrl,fields,id,isHosted,pillarId,pillarName,sectionId,sectionName,tags,text,type,webPublicationDate,webTitle,webUrl
0,https://content.guardianapis.com/business/2021...,{'body': '<p>A sharp decline in Tesla’s share ...,business/2021/mar/05/tesla-share-price-market-...,False,pillar/news,News,business,Business,"[{'id': 'business/stock-markets', 'type': 'key...",A sharp decline in Tesla’s share price has wip...,article,2021-03-05T17:41:58Z,Tesla share price plunge knocks $267bn off mar...,https://www.theguardian.com/business/2021/mar/...
1,https://content.guardianapis.com/technology/20...,{'body': '<p>Tesla shares fell in after-hours ...,technology/2021/jan/28/tesla-shares-fall-despi...,False,pillar/news,News,technology,Technology,"[{'id': 'technology/tesla', 'type': 'keyword',...",Tesla shares fell in after-hours trading after...,article,2021-01-28T09:30:30Z,Tesla shares fall despite electric carmaker's ...,https://www.theguardian.com/technology/2021/ja...
2,https://content.guardianapis.com/business/2021...,{'body': '<p>Elon Musk has made so much money ...,business/2021/jan/24/tesla-investment-reaps-29...,False,pillar/news,News,business,Business,[{'id': 'business/scottishmortgageinvestmenttr...,Elon Musk has made so much money from Tesla th...,article,2021-01-24T17:53:19Z,Tesla investment reaps $29bn profit for Edinbu...,https://www.theguardian.com/business/2021/jan/...
3,https://content.guardianapis.com/technology/20...,"{'body': '<p>Tesla, the electric car company r...",technology/2021/feb/08/tesla-bitcoin-price-new...,False,pillar/news,News,technology,Technology,"[{'id': 'technology/bitcoin', 'type': 'keyword...","Tesla, the electric car company run by the wor...",article,2021-02-08T13:53:55Z,"Tesla buys $1.5bn in bitcoin, pushing price to...",https://www.theguardian.com/technology/2021/fe...
4,https://content.guardianapis.com/technology/20...,{'body': '<p>Tesla is to raise up to $5bn sell...,technology/2020/dec/08/tesla-shares-p-500-inde...,False,pillar/news,News,technology,Technology,"[{'id': 'technology/tesla', 'type': 'keyword',...",Tesla is to raise up to $5bn selling new share...,article,2020-12-08T15:09:32Z,Tesla to raise another $5bn by selling shares,https://www.theguardian.com/technology/2020/de...


In [43]:
# Get articles for query terms and tag ID
articles = get_guardian_articles("Tesla", tag_id="environment/climate-change")
articles_df = pd.DataFrame(articles)
articles_df.head()

99 articles found


Unnamed: 0,apiUrl,fields,id,isHosted,pillarId,pillarName,sectionId,sectionName,tags,text,type,webPublicationDate,webTitle,webUrl
0,https://content.guardianapis.com/technology/20...,{'body': '<p>One of the biggest investors in T...,technology/2020/nov/06/tesla-investor-defends-...,False,pillar/news,News,technology,Technology,"[{'id': 'technology/tesla', 'type': 'keyword',...",One of the biggest investors in Tesla has defe...,article,2020-11-06T12:24:55Z,Tesla investor defends electric carmaker's soa...,https://www.theguardian.com/technology/2020/no...
1,https://content.guardianapis.com/business/nils...,"{'body': '<p>Right on cue, here comes a £4bn s...",business/nils-pratley-on-finance/2020/nov/18/a...,False,pillar/news,News,business,Business,"[{'id': 'business/nils-pratley-on-finance', 't...","Right on cue, here comes a £4bn stock market l...",article,2020-11-18T19:40:43Z,A £4bn UK 'mini Tesla' choosing to list stock ...,https://www.theguardian.com/business/nils-prat...
2,https://content.guardianapis.com/australia-new...,{'body': '<p>A major business lobby group and ...,australia-news/2020/dec/18/business-lobby-grou...,False,pillar/news,News,australia-news,Australia news,"[{'id': 'australia-news/zali-steggall', 'type'...",A major business lobby group and corporations ...,article,2020-12-17T16:30:04Z,Business lobby group and corporations back Zal...,https://www.theguardian.com/australia-news/202...
3,https://content.guardianapis.com/environment/2...,{'body': '<p>The Tesla co-founder Elon Musk ha...,environment/2021/feb/08/elon-musk-pledges-100m...,False,pillar/news,News,environment,Environment,[{'id': 'environment/carbon-capture-and-storag...,The Tesla co-founder Elon Musk has offered a $...,article,2021-02-08T13:45:36Z,Elon Musk pledges $100m to carbon capture contest,https://www.theguardian.com/environment/2021/f...
4,https://content.guardianapis.com/business/2020...,{'body': '<p>Carmakers talk a good game. Even ...,business/2020/nov/08/bentley-is-leading-the-ch...,False,pillar/news,News,business,Business,"[{'id': 'business/automotive-industry', 'type'...",Carmakers talk a good game. Even the biggest p...,article,2020-11-08T07:00:32Z,Bentley is leading the charge to batteries. Br...,https://www.theguardian.com/business/2020/nov/...
