In [1]:
# !pip install lxml_html_clean
# !pip install trafilatura


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting trafilatura
  Downloading trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting courlan>=1.3.2 (from trafilatura)
  Downloading courlan-1.3.2-py3-none-any.whl.metadata (17 kB)
Collecting htmldate>=1.9.2 (from trafilatura)
  Downloading htmldate-1.9.4-py3-none-any.whl.metadata (10 kB)
Collecting justext>=3.0.1 (from trafilatura)
  Downloading justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting tld>=0.13 (from courlan>=1.3.2->trafilatura)
  Downloading tld-0.13.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting dateparser>=1.1.2 (from htmldate>=1.9.2->trafilatura)
  Downloading dateparser-1.2.2-py3-none-any.whl.metadata (29 kB)
Collecting tzlocal>=0.2 (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura)
  Down

In [9]:
import requests
import zipfile
import io
import pandas as pd
import trafilatura

file_url = "http://data.gdeltproject.org/gkg/20251104.gkg.csv.zip"


print("Downloading GDELT file...")
response = requests.get(file_url)
response.raise_for_status()

with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    inner_filename = z.namelist()[0]
    with z.open(inner_filename) as f:
        df = pd.read_csv(f, sep="\t", header=None, low_memory=False)

print(f"Loaded {len(df):,} rows and {df.shape[1]} columns")

Downloading GDELT file...
Loaded 96,247 rows and 11 columns


In [18]:
theme = 'politic'
western_country_codes = [
    'US','CA','GB','UK','IE','FR','DE','GM','NL','BE','LU','CH','AT','DK','SE','NO','FI',
    'IT','ES','PT','IS','AU','NZ','GR','CY','IL'
]


# GDELT v2 GKG schema:
themes_col = 3          # "Themes"
locations_col = 4       # "Locations"
source_urls_col = df.shape[1] - 1  # Usually last column

# -----------------------------
# 1. Filter rows that contain the chosen theme (case-insensitive)
# -----------------------------
theme_filter = df[themes_col].astype(str).str.contains(theme, case=False, na=False)

# -----------------------------
# 2. Define Western world country codes
# -----------------------------

western_pattern = '|'.join([f'#{code}#' for code in western_country_codes])

# -----------------------------
# 3. Filter for rows whose location column contains a Western country code
# -----------------------------
location_filter = df[locations_col].astype(str).str.contains(western_pattern, case=False, na=False)

# -----------------------------
# 4. Combine filters
# -----------------------------
df_filtered = df[theme_filter & location_filter]

print(f"Found {len(df_filtered):,} rows with '{theme}' in themes and Western locations")

# -----------------------------
# 5. Extract URLs
# -----------------------------
raw_urls = df_filtered[source_urls_col].dropna().astype(str)

urls = []
for u in raw_urls:
    urls.extend(u.split(';'))

urls = [u.strip() for u in urls if u.strip()]
unique_urls = list(set(urls))

print(f"Found {len(unique_urls):,} unique URLs for theme '{theme}' in Western countries")

# -----------------------------
# 6. Show sample URLs
# -----------------------------
print("\nSample URLs:")
for url in unique_urls[:20]:
    print(url)


Found 27,564 rows with 'politic' in themes and Western locations
Found 27,564 unique URLs for theme 'politic' in Western countries

Sample URLs:
https://www.yahoo.com/news/articles/former-state-senator-discards-republican-124022599.html
https://www.algemeiner.com/2025/11/04/israel-jewish-groups-remember-former-us-vice-president-dick-cheney-great-friend-steadfast-supporter/
https://wlap.iheart.com/content/2025-11-04-texas-governor-wants-a-100-tariff-on-people-moving-from-nyc-to-texas/
https://massachusettsnewswire.com/evp-of-information-technology-at-aces-quality-management-ben-mahan-named-housingwire-2025-tech-trendsetter-70953/
https://www.hulldailymail.co.uk/news/hull-east-yorkshire-news/hull-prepares-arrival-electric-buses-10619943
http://www.newjerseytelegraph.com/news/278673650/american-critical-minerals-announces-appointment-of-dean-pekeski-as-senior-advisor-to-the-company
https://www.mesabitribune.com/news/business/75-million-to-be-invested-in-u-s-steel-s-alabama-plant-to-advanc

In [3]:
# !pip install newspaper3k


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
from newspaper import Article
import pandas as pd

# Use the NYT URLs you already have
nyt_urls = [
    "https://www.nytimes.com/athletic/6718447/2025/10/21/newcastle-technical-director-gopaladesikan/",
    "https://www.nytimes.com/athletic/6732846/2025/10/21/bundesliga-briefing-harry-kane-yussuf-poulsen-tifo-leipzig/",
    # add more URLs or use your unique_nyt_urls list
]

results = []

for url in nyt_urls:
    try:
        article = Article(url)
        article.download()
        article.parse()  # parses HTML and extracts text/title
        results.append({
            "url": url,
            "title": article.title
        })
    except Exception as e:
        results.append({
            "url": url,
            "title": None,
            "error": str(e)
        })

# Convert to DataFrame
df_titles = pd.DataFrame(results)

print(df_titles.head(10))


                                                 url  \
0  https://www.nytimes.com/athletic/6718447/2025/...   
1  https://www.nytimes.com/athletic/6732846/2025/...   

                                               title  
0  Newcastle’s new technical director: A former M...  
1  Bundesliga Briefing: All-time performance from...  
