In [1]:
# !pip install googlenewsdecoder
# !pip install pygooglenews

In [3]:
from pygooglenews import GoogleNews
import pprint
from itertools import islice
from googlenewsdecoder import gnewsdecoder
from datetime import datetime, timedelta
import polars as pl
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import os
import random
import httpx
import trafilatura
import time

import importlib
import sys
# Directory that contains the pipeline packages. Set the GRAPHRAG_PIPELINE_DIR
# environment variable to point at a different checkout; the default keeps the
# original author's local path so existing setups continue to work.
PIPELINE_DIR = os.environ.get(
    'GRAPHRAG_PIPELINE_DIR',
    'c:/Users/blanc/OneDrive/Desktop/DSDM/3-TFM/repo/UN_Conflict_Report/graphrag_pipeline/pipeline',
)

# Guard against piling up duplicate entries when this cell is re-run.
if PIPELINE_DIR not in sys.path:
    sys.path.append(PIPELINE_DIR)

# The package name starts with a digit, so it is not a valid identifier for a
# plain `import` statement; importlib takes the dotted name as a string instead.
google_news_ingestion = importlib.import_module('01_data_ingestion.google_news_ingestion')
GoogleNewsIngestor = google_news_ingestion.GoogleNewsIngestor


# 1. Google News Query
Here we define the search term and the start and end dates for our query, as well as the language and country used to initialize the GoogleNews class.

In [None]:
# Query windows, each a (start_date, end_date) pair of 'YYYY-MM-DD' strings.
# NOTE(review): ONE_MONTH currently covers 2025-06-10 .. 2025-06-27 (17 days),
# not a full calendar month - confirm this is intended.
ONE_DAY, ONE_MONTH, ONE_YEAR = (
    ('2024-01-01', '2024-01-02'),
    ('2025-06-10', '2025-06-27'),
    ('2023-01-01', '2023-12-31'),
)


In [10]:
# Search term handed to the ingestor together with the date window.
# NOTE(review): the "-South Sudan" part presumably excludes South Sudan
# results via the search engine's minus operator - confirm how the ingestor
# passes the term through.
search_query = 'Sudan -South Sudan'
gn_ingestor = GoogleNewsIngestor(search_query, ONE_MONTH)

In [11]:
# Run the query configured on the ingestor, then print its summary.
# Both methods come from the imported pipeline module (GoogleNewsIngestor);
# their implementation is not part of this notebook.
gn_ingestor.execute_google_news_query()
gn_ingestor.print_query_summary()

Fetching batches:   0%|          | 0/6 [00:00<?, ?it/s]

Fetching batches: 100%|██████████| 6/6 [00:01<00:00,  4.97it/s]


------------------------------------------------------
TOTAL NUMBER OF ARTICLES: 337
Average articles per day: 19
Batches that were expanded to 1-day queries (0):
Batches that failed (0)
------------------------------------------------------
shape: (5, 4)
┌────────────────────────────┬───────────────────────────┬──────────────────┬──────────────────────┐
│ title                      ┆ google_link               ┆ published        ┆ source               │
│ ---                        ┆ ---                       ┆ ---              ┆ ---                  │
│ str                        ┆ str                       ┆ str              ┆ str                  │
╞════════════════════════════╪═══════════════════════════╪══════════════════╪══════════════════════╡
│ 10 dead in Iranian attack  ┆ https://news.google.com/r ┆ Sun, 15 Jun 2025 ┆ lokmattimes.com      │
│ on c…                      ┆ ss/ar…                    ┆ 07:00:00 GMT     ┆                      │
│ Sudan - WFP Sudan Market   ┆ https:

In [None]:
# Add an `id` column ("GN_<PREFIX><5 digits>") and a parsed `date` column to
# the first result chunk, then drop the raw `published` string.
# NOTE(review): only df_1 is processed here - confirm whether the other
# chunks on the ingestor need the same treatment.
df = gn_ingestor.df_1

country = "Sudan -South Sudan"
prefix = country.replace(" ", "")[:3].upper()  # e.g., "SUD"

# Draw one distinct number per row from a fixed pool.
# The pool upper bound (99999, exclusive) is kept exactly as before so the
# generated IDs stay identical to the ones produced by earlier runs.
n_rows = df.height
id_pool = range(10000, 99999)
if n_rows > len(id_pool):
    raise ValueError(
        f"Cannot assign unique 5-digit ids to {n_rows} rows; only {len(id_pool)} are available"
    )

# A dedicated Random instance seeded with 42 yields the same sequence as
# `random.seed(42); random.sample(...)` but leaves the global RNG untouched,
# so other code relying on `random` is not silently reseeded by this cell.
# NOTE(review): with a fixed seed every run draws the same sequence, so ids
# are unique within this frame only, not across separately ingested frames.
id_rng = random.Random(42)
unique_digits = id_rng.sample(id_pool, n_rows)

df = df.with_columns(
    [
        # Explicit string dtype keeps the schema stable even for an empty frame.
        pl.Series(
            name="id",
            values=[f"GN_{prefix}{num}" for num in unique_digits],
            dtype=pl.Utf8,
        ),
        # strict=False: values that do not match the format become null
        # instead of raising.
        pl.col("published").str.strptime(pl.Date, "%a, %d %b %Y %H:%M:%S %Z", strict=False).alias("date")
    ]
).drop("published")

df.head()

title,google_link,source,id,date
str,str,str,str,date
"""10 dead in Iranian attack on c…","""https://news.google.com/rss/ar…","""lokmattimes.com""","""GN_SUD93810""",2025-06-15
"""Sudan - WFP Sudan Market Monit…","""https://news.google.com/rss/ar…","""ReliefWeb""","""GN_SUD24592""",2025-06-19
"""Sudan Strongly Condemns Attack…","""https://news.google.com/rss/ar…","""MSN""","""GN_SUD13278""",2025-06-23
"""UAE Profiting from Sudan’s Cha…","""https://news.google.com/rss/ar…","""وطن. يغرد خارج السرب""","""GN_SUD46048""",2025-06-25
"""QC relief mission to address S…","""https://news.google.com/rss/ar…","""Qatar Tribune""","""GN_SUD42098""",2025-06-22


In [6]:
# Decode the Google News links (base64 variant) and fetch the articles.
# NOTE(review): the saved output of this cell shows the decode progress bars
# completing but "Fetching: 0it", i.e. no article was fetched in that run -
# confirm the decoded URLs are actually reaching the fetch step.
gn_ingestor.decode_urls_and_fetch_articles_base64()

decoding URLs (base64): 100%|██████████| 500/500 [00:00<00:00, 105767.20it/s]
Fetching: 0it [00:00, ?it/s]
decoding URLs (base64): 100%|██████████| 500/500 [00:00<00:00, 176751.12it/s]
Fetching: 0it [00:00, ?it/s]
decoding URLs (base64): 100%|██████████| 113/113 [00:00<?, ?it/s]
Fetching: 0it [00:00, ?it/s]


In [7]:
# gn_ingestor.decode_urls_and_fetch_articles()

In [8]:
# Print, per result frame (df_1, df_2, ...), how many articles were fetched
# and how many of those are non-empty.
gn_ingestor.print_urls_and_texts_summary()

DataFrame: df_1
Number of correctly fetched articles (non-null): 0 / 1
Number of non-empty articles: 0 / 1
DataFrame: df_2
Number of correctly fetched articles (non-null): 0 / 1
Number of non-empty articles: 0 / 1
DataFrame: df_3
Number of correctly fetched articles (non-null): 0 / 1
Number of non-empty articles: 0 / 1


In [None]:
# Stack the first two result chunks into a single frame.
# NOTE(review): the ingestor summary printed earlier also lists a df_3 -
# confirm whether it should be concatenated here as well.
june_25_df = pl.concat([gn_ingestor.df_1, gn_ingestor.df_2])

In [None]:
# DataFrame: df_1
# Number of correctly fetched articles (non-null): 400 / 500
# Number of non-empty articles: 400 / 500
# DataFrame: df_2
# Number of correctly fetched articles (non-null): 311 / 391
# Number of non-empty articles: 311 / 391

In [13]:
# NOTE(review): `april_23_df` is not assigned in any cell shown in this
# notebook, so this line depends on leftover kernel state and will raise
# NameError on a fresh "Restart & Run All" - define or load it explicitly.
april_23_df.head()

title,google_link,published,source,decoded_url,full_text
str,str,str,str,str,str
"""WHO says ‘huge biological risk…","""https://news.google.com/rss/ar…","""Tue, 25 Apr 2023 07:00:00 GMT""","""Al Jazeera""","""https://www.aljazeera.com/news…","""WHO says ‘huge biological risk…"
"""Sudan violence: Ballymena man …","""https://news.google.com/rss/ar…","""Mon, 24 Apr 2023 07:00:00 GMT""","""BBC""","""https://www.bbc.co.uk/news/art…","""Ballymena man among those caug…"
"""UNESCO/EU project supports Sou…","""https://news.google.com/rss/ar…","""Thu, 20 Apr 2023 07:00:00 GMT""","""UNESCO""","""https://www.unesco.org/en/arti…",
"""Sudan's Badr Airlines launches…","""https://news.google.com/rss/ar…","""Sun, 02 Apr 2023 07:00:00 GMT""","""NewVision.co.ug""","""https://www.newvision.co.ug/ca…",
"""South Sudan Needs $1.7 Billion…","""https://news.google.com/rss/ar…","""Fri, 07 Apr 2023 07:00:00 GMT""","""theheritagetimes.com""","""https://www.theheritagetimes.c…","""By Ebi Kesiena South Sudan’s s…"


For a 1-month interval:
* Number of articles: ~800-1200 articles
* Execution times:
    * Google News query: 1-3 seconds
    * URL decoding + article fetching (done in the same step): ~15min

For now, the query language and country are set to English and the United States. This is the country from which the articles come. The GoogleNews class also needs to be initialized with a value for these two parameters.

While the country choice can have implications, most widely recognised news sources (BBC, Al Jazeera...) as well as UN sites that post news are available in the US and in English. The search language cannot be set to English for some countries, which is why we do not use the country we are searching about as the query country.

In the long term, consider how to overcome the implications of the query country choice. Maybe concatenate results from top_news in different query countries? That may be redundant, though.

# Filtering Sources

In [6]:
# Saved Google News results live in ../data/google_news relative to the
# directory the notebook is run from.
google_news_dir = Path.cwd().parent / 'data' / 'google_news'

# The May 2025 Sudan query was stored as three parquet chunks.
chunk_files = (
    'google_news_Sudan_2025-05-01_2025-06-01_chunk1.parquet',
    'google_news_Sudan_2025-05-01_2025-06-01_chunk2.parquet',
    'google_news_Sudan_2025-05-01_2025-06-01_chunk3.parquet',
)
df_1_path, df_2_path, df_3_path = (google_news_dir / file_name for file_name in chunk_files)

# Read each chunk, then stack them in chunk order into one frame.
df_1 = pl.read_parquet(df_1_path)
df_2 = pl.read_parquet(df_2_path)
df_3 = pl.read_parquet(df_3_path)
df = pl.concat([df_1, df_2, df_3])

df.head(3)

title,google_link,source,id,date,decoded_url,full_text
str,str,str,str,date,str,str
"""South Sudan president replaces…","""https://news.google.com/rss/ar…","""Sudan Tribune""","""GN_SUD66148""",2025-05-01,"""https://sudantribune.com/artic…",
"""US Issues New 'Do Not Travel' …","""https://news.google.com/rss/ar…","""Travel Noire""","""GN_SUD17331""",2025-05-01,"""https://travelnoire.com/do-not…","""The United States Department o…"
"""Op-Ed: As U.S. ‘America First’…","""https://news.google.com/rss/ar…","""OkayAfrica""","""GN_SUD26728""",2025-05-01,"""https://www.okayafrica.com/afr…","""Op-Ed: As U.S. ‘America First’…"


In [9]:
# Count articles per source, most frequent first.
source_counts = df['source'].value_counts().sort('count', descending=True)

# tbl_rows=-1 lifts the row limit so every source is printed; print() is used
# because the setting only applies inside the `with` block.
with pl.Config(tbl_rows=-1):
    print(source_counts)

shape: (453, 2)
┌─────────────────────────────────┬───────┐
│ source                          ┆ count │
│ ---                             ┆ ---   │
│ str                             ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ Sudan Tribune                   ┆ 85    │
│ ReliefWeb                       ┆ 56    │
│ Radio Tamazuj                   ┆ 32    │
│ Anadolu Ajansı                  ┆ 24    │
│ TRT Global                      ┆ 24    │
│ Al Jazeera                      ┆ 22    │
│ Radio Dabanga                   ┆ 21    │
│ allAfrica.com                   ┆ 17    │
│ Reuters                         ┆ 16    │
│ The New Arab                    ┆ 16    │
│ Middle East Monitor             ┆ 15    │
│ News Central TV                 ┆ 15    │
│ UN News                         ┆ 14    │
│ paanluelwel.com                 ┆ 13    │
│ France 24                       ┆ 13    │
│ Arab News                       ┆ 12    │
│ BBC                             ┆ 12    │
│ Yahoo         

In [None]:
# Names and domains of the news sources considered trustworthy.
# Several spellings are listed per outlet; presumably these are matched
# against the `source` column, so the exact spellings seen in the data
# ('allAfrica.com', 'UN News') are included alongside the original variants.
trusted_sources = [
    'Sudan Tribune', 'سودان تريبيون', 'sudantribune.net', # MEDIUM CREDIBILITY but MOSTLY FACTUAL 
    'AllAfrica', 'allafrica.com', 'allAfrica.com',
    'Reuters', 'reuters.com',
    'The New Arab', 'newarab.com',
    'France 24', 'France24', 'france24.com',
    'BBC', 'bbc.com', 'BBC News', 'BBC Arabic', 'بي بي سي عربي',
    'Yahoo', 'Yahoo News',
    'AP News', 'Associated Press', 'apnews.com',
    # The trailing comma matters: without it Python joins the adjacent string
    # literals 'CNN' and 'UN NEWS' into the single entry 'CNNUN NEWS'.
    'CNN',
    # UN SOURCES
    'UN NEWS', 'UN News', 'un.org/news',
    'OCHA', 'unocha.org',
    'Unicef', 'unicef.org',
]



SyntaxError: incomplete input (3475688130.py, line 1)

* Al-Jazeera - factual reporting: MIXED (???)
* Middle East Monitor - factual reporting: MIXED (???)