In [None]:
# !pip install googlenewsdecoder
# !pip install pygooglenews

In [1]:
from pygooglenews import GoogleNews
import pprint
from itertools import islice
from googlenewsdecoder import gnewsdecoder

We are going to define here the search term and start and end dates for our query as well as the language and country of the GoogleNews class.

In [5]:
search_term = 'Sudan'
start_date = '2024-01-01'
end_date = '2024-12-30'

language = 'en'
country = 'US' # bust be the isocode of the country

In [6]:
gn = GoogleNews(lang = language, country=country)

In [7]:
query = gn.search(query = search_term , from_ = start_date , to_ = end_date)  # Get news articles for a specific query (e.g., conflict in Harare)

print('TOTAL NUMBER OF ARTICLES:' , len(query['entries']))  # Get the number of articles returned

# Print the news articles referring to the query
for i, article in enumerate(query['entries'][:5]):
    print(f"Article {i+1}:")
    print(f"Title: {article.title}")
    print(f"Link: {article.link}")
    print(f"Published: {article.published}")
    print(f"Source: {article.source}")
    print()

TOTAL NUMBER OF ARTICLES: 100
Article 1:
Title: Survivors of Sudan’s brutal war have been forgotten - nrc.no
Link: https://news.google.com/rss/articles/CBMib0FVX3lxTE1zWElDaEZTQkJjNGFNUzk5bWNXRjNpVjhOaDU4b1ZvVEhUaVlqc3JXWVJET2tHMkhwOVh0S2JrXzRVWmNDa2VMSmhNY3Z0SjdKWXV4U1ZvcmVuSVlZOXBhRDVhOXV6UF9PaVVUR2J0bw?oc=5
Published: Wed, 14 Feb 2024 08:00:00 GMT
Source: {'href': 'https://www.nrc.no', 'title': 'nrc.no'}

Article 2:
Title: Sudan humanitarian crisis has catastrophic impact for women and girls, with two-fold increase of gender-based violence - UN Women
Link: https://news.google.com/rss/articles/CBMiiwJBVV95cUxOcTlTTngwdU95OXo0V3gtUDV3VjdQMXpNb2Rpb2k1dExfNThnTjJoRmtNc0VpNjVQNk92Y01acFl2MER5cHZTNS1KUlBLYkYzQlBCd1N0R1FOd29mUDNNQlU4cllra2U0cDNZenFucjdGcDFSTFdUazR1cjd5dEdaWDRJYVdGWkprUHFWZ3MzU3o4Z080MngxclE1YktzUVhEQXRXRWV3YndVWkxOd3JEV094Smh6TWJBNkV1X3o4cjRxSkEzb296YngtZGJQTERteURvc1FfTXdFMHZnWEx1QVdfS2RNUEllQmJTMnVWUnV3LUlSMnE1dE55dXhYMjJvdWIzZFFIVjd4X2M?oc=5
Published: Fri, 27 Sep 2024 

It extracts 100 articles regardless of the date interval, so we have to work around that. Maybe make a dataset per day

In [8]:
source_urls = [article.link for article in query['entries']]

def decode_urls(source_urls):
    interval_time = 1  # interval is optional, default is None
    #proxy = "http://user:pass@localhost:8080" # proxy is optional, default is None

    results = []

    for url in source_urls:
        try:
            decoded_url = gnewsdecoder(url, 
                                       interval=interval_time, 
                                       # proxy=proxy
                                       )
            if decoded_url.get("status"):
                clean_url = decoded_url["decoded_url"]
                #print("Decoded URL:", clean_url)
                results.append(clean_url)
            else:
                print("Error:", decoded_url["message"])
        except Exception as e:
            print(f"Error occurred: {e}")

    return results

decoded_urls = decode_urls(source_urls)

Decoded URL: https://www.nrc.no/news/2024/february/chad-refugees-from-sudan
Decoded URL: https://www.unwomen.org/en/news-stories/press-release/2024/09/sudan-humanitarian-crisis-has-catastrophic-impact-for-women-and-girls-with-two-fold-increase-of-gender-based-violence
Decoded URL: https://www.spf.org/iina/en/articles/sakane_11.html
Decoded URL: https://www.wfp.org/stories/famine-sudan-not-too-late-reverse-hunger-tide
Decoded URL: https://www.amnesty.org/en/latest/news/2024/07/sudan-constant-flow-of-arms-fuelling-relentless-civilian-suffering-in-conflict-new-investigation/
Decoded URL: https://timep.org/2024/09/19/internet-in-conflict-sudans-battle-for-connection/
Decoded URL: https://news.un.org/en/story/2024/05/1150416
Decoded URL: https://medicine.yale.edu/news-article/yales-humanitarian-research-lab-responds-as-violence-escalates-in-sudan/
Decoded URL: https://www.doctorswithoutborders.org/latest/south-sudan-msf-calls-urgent-action-cholera-outbreak-rapidly-spreads
Decoded URL: https

This process is kinda slow -- took exactly 3min for 100 articles. 

In [9]:
decoded_urls = list(set(decoded_urls))  # Remove duplicates
print(f"Number of unique decoded URLs: {len(decoded_urls)}")

Number of unique decoded URLs: 100


# 2. Extracting the articles' content

In [11]:
# !pip install trafilatura

Collecting trafilatura
  Using cached trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting courlan>=1.3.2 (from trafilatura)
  Using cached courlan-1.3.2-py3-none-any.whl.metadata (17 kB)
Collecting htmldate>=1.9.2 (from trafilatura)
  Using cached htmldate-1.9.3-py3-none-any.whl.metadata (10 kB)
Collecting justext>=3.0.1 (from trafilatura)
  Using cached justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting lxml>=5.3.0 (from trafilatura)
  Downloading lxml-5.4.0-cp312-cp312-win_amd64.whl.metadata (3.6 kB)
Collecting babel>=2.16.0 (from courlan>=1.3.2->trafilatura)
  Using cached babel-2.17.0-py3-none-any.whl.metadata (2.0 kB)
Collecting tld>=0.13 (from courlan>=1.3.2->trafilatura)
  Downloading tld-0.13.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting lxml_html_clean (from lxml[html_clean]>=4.4.2->justext>=3.0.1->trafilatura)
  Using cached lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Using cached trafilatura-2.0.0-py3-none-any.whl (132 kB)
Using cach

In [17]:
import httpx
import trafilatura
import polars as pl
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import polars as pl

In [18]:
# Fetch function
def fetch_article(url, timeout=15):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/114.0.0.0 Safari/537.36"
    }
    try:
        with httpx.Client(headers=headers, follow_redirects=True, timeout=timeout) as client:
            response = client.get(url)
            if response.status_code == 200:
                html = response.text
                article_text = trafilatura.extract(html)
                metadata = trafilatura.extract_metadata(html)

                title = getattr(metadata, "title", None) if metadata else None
                if title is None and isinstance(metadata, dict):
                    title = metadata.get("title")

                if article_text:
                    return {"url": url, "title": title, "full_text": article_text}
    except Exception:
        pass
    return {"url": url, "title": None, "full_text": None}

# run concurrently and add results to a list
results = []

with ThreadPoolExecutor(max_workers=32) as executor:
    futures = {executor.submit(fetch_article, url): url for url in decoded_urls}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching", ncols=80):
        results.append(future.result())

# add to our dataframe

df = pl.DataFrame(results)
df = df.unique(subset=["url"])


Fetching: 100%|███████████████████████████████| 100/100 [00:09<00:00, 10.13it/s]


In [19]:
df.head()

url,title,full_text
str,str,str
"""https://www.cgdev.org/blog/wor…","""The World Is Failing Sudan""","""I don’t know if the rhetoric-t…"
"""https://www.csis.org/analysis/…","""Sudan’s Humanitarian Crisis: W…","""Sudan’s Humanitarian Crisis: W…"
"""https://usun.usmission.gov/rem…","""Remarks at a UN Security Counc…","""Ambassador Robert Wood Alterna…"
"""https://www.crisisgroup.org/af…","""Halting the Catastrophic Battl…","""A man stands by as a fire rage…"
"""https://www.nytimes.com/2024/0…",,
