In [1]:
# !pip install lxml_html_clean
# !pip install trafilatura


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting trafilatura
  Downloading trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting courlan>=1.3.2 (from trafilatura)
  Downloading courlan-1.3.2-py3-none-any.whl.metadata (17 kB)
Collecting htmldate>=1.9.2 (from trafilatura)
  Downloading htmldate-1.9.4-py3-none-any.whl.metadata (10 kB)
Collecting justext>=3.0.1 (from trafilatura)
  Downloading justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting tld>=0.13 (from courlan>=1.3.2->trafilatura)
  Downloading tld-0.13.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting dateparser>=1.1.2 (from htmldate>=1.9.2->trafilatura)
  Downloading dateparser-1.2.2-py3-none-any.whl.metadata (29 kB)
Collecting tzlocal>=0.2 (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura)
  Down

In [9]:
import requests
import zipfile
import io
import pandas as pd
import trafilatura

file_url = "http://data.gdeltproject.org/gkg/20251104.gkg.csv.zip"


print("Downloading GDELT file...")
response = requests.get(file_url)
response.raise_for_status()

with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    inner_filename = z.namelist()[0]
    with z.open(inner_filename) as f:
        df = pd.read_csv(f, sep="\t", header=None, low_memory=False)

print(f"Loaded {len(df):,} rows and {df.shape[1]} columns")

Downloading GDELT file...
Loaded 96,247 rows and 11 columns


In [83]:
theme = 'politic'
western_country_codes = [
    'US','CA','GB','UK','IE','FR','DE','GM','NL','BE','LU','CH','AT','DK','SE','NO','FI',
    'IT','ES','PT','IS','AU','NZ','GR','CY','IL'
]

# Major Western media outlets
major_sources = [
    'bbc.com', 'bbc.co.uk', 'reuters.com', 'nytimes.com', 'washingtonpost.com',
    'cnn.com', 'foxnews.com', 'theguardian.com', 'economist.com', 'dw.com', 'lemonde.fr'
]

major_sources = [
    'cnn.com', 'dailymail.com', 'theguardian.com', 'washingtonexaminer.com', 'newsweek.com', 'breitbart.com'
]
# major_sources = ['breitbart']



# -----------------------------
# GDELT v2 GKG schema:
# -----------------------------
themes_col = 3          # "Themes"
locations_col = 4       # "Locations"
source_col = 9         # "Source" (adjust if your column index differs)
source_urls_col = df.shape[1] - 1  # Usually last column
date_col = 0


# -----------------------------
# 1. Filter rows that contain the chosen theme (case-insensitive)
# -----------------------------
theme_filter = df[themes_col].astype(str).str.contains(theme, case=False, na=False)

# -----------------------------
# 2. Filter for Western countries based on location column
# -----------------------------
western_pattern = '|'.join([f'#{code}#' for code in western_country_codes])
location_filter = df[locations_col].astype(str).str.contains(western_pattern, case=False, na=False)

# -----------------------------
# 3. Filter for major Western media outlets
# -----------------------------
df[source_col] = df[source_col].astype(str).str.lower()
source_filter = df[source_col].apply(lambda s: any(src in s for src in major_sources))

# -----------------------------
# 4. Combine all filters
# -----------------------------
df_filtered = df[theme_filter & location_filter & source_filter]

print(f"Found {len(df_filtered):,} rows with '{theme}' in themes, Western locations, and major Western sources")

# -----------------------------
# 5. Extract URLs
# -----------------------------
raw_urls = df_filtered[source_urls_col].dropna().astype(str)

urls = []
for u in raw_urls:
    urls.extend(u.split(';'))

urls = [u.strip() for u in urls if u.strip()]
unique_urls = list(set(urls))

print(f"Found {len(unique_urls):,} unique URLs for theme '{theme}' in Western countries and sources")

# -----------------------------
# 6. Show sample URLs
# -----------------------------
print("\nSample URLs:")
for url in unique_urls[:20]:
    print(url)


Found 404 rows with 'politic' in themes, Western locations, and major Western sources
Found 404 unique URLs for theme 'politic' in Western countries and sources

Sample URLs:
https://www.washingtonexaminer.com/news/white-house/3873973/trump-renews-push-end-filibuster-coming-elections/
https://us.cnn.com/politics/state-redistricting-maps-vis
https://www.cnn.com/2025/11/04/world/2-french-nationals-freed-from-iranian-prison-latam-intl
https://www.cnn.com/travel/japan-expensive-rice-kinmemai-premium-intl-hnk-dst
https://www.breitbart.com/politics/2025/11/03/leftist-canada-pm-mark-carney-elected-to-challenge-trump-apologizes-to-trump/
https://us.cnn.com/2025/11/04/us/fbi-investigation-michigan-terrorist-plot
https://www.breitbart.com/clips/2025/11/03/exclusive-beat-china-rep-max-miller-teases-transformational-crypto-legislation/
https://us.cnn.com/2025/11/04/middleeast/major-general-tomer-yerushalmi-israel-intl
https://www.washingtonexaminer.com/opinion/beltway-confidential/3873991/earle-se

In [71]:
# !pip install newspaper3k

In [None]:
from newspaper import Article
import pandas as pd

In [88]:
# Use the NYT URLs you already have
nyt_urls = unique_urls


results = []

for url in unique_urls:
    try:
        article = Article(url)
        article.download()
        article.parse()  # parses HTML and extracts text/title
        results.append({
            "url": url,
            "title": article.title
        })
    except Exception as e:
        results.append({
            "url": url,
            "title": None,
            "error": str(e)
        })

# Convert to DataFrame
df_titles = pd.DataFrame(results)

print(df_titles.head(10))


                                                 url  \
0  https://www.washingtonexaminer.com/news/white-...   
1  https://us.cnn.com/politics/state-redistrictin...   
2  https://www.cnn.com/2025/11/04/world/2-french-...   
3  https://www.cnn.com/travel/japan-expensive-ric...   
4  https://www.breitbart.com/politics/2025/11/03/...   
5  https://us.cnn.com/2025/11/04/us/fbi-investiga...   
6  https://www.breitbart.com/clips/2025/11/03/exc...   
7  https://us.cnn.com/2025/11/04/middleeast/major...   
8  https://www.washingtonexaminer.com/opinion/bel...   
9  https://www.washingtonexaminer.com/news/white-...   

                                               title error  
0  Trump renews push to end filibuster, citing co...   NaN  
1  Tracking state congressional redistricting eff...   NaN  
2  Two French nationals freed from Iranian prison...   NaN  
3  Kinmemai Premium: The story behind the world’s...   NaN  
4  Leftist Canada PM Mark Carney, Elected to Chal...   NaN  
5  Key moments th

In [92]:
df_titles

Unnamed: 0,url,title,error
0,https://www.washingtonexaminer.com/news/white-...,"Trump renews push to end filibuster, citing co...",
1,https://us.cnn.com/politics/state-redistrictin...,Tracking state congressional redistricting eff...,
2,https://www.cnn.com/2025/11/04/world/2-french-...,Two French nationals freed from Iranian prison...,
3,https://www.cnn.com/travel/japan-expensive-ric...,Kinmemai Premium: The story behind the world’s...,
4,https://www.breitbart.com/politics/2025/11/03/...,"Leftist Canada PM Mark Carney, Elected to Chal...",
...,...,...,...
399,https://www.breitbart.com/politics/2025/11/04/...,GOP Healthcare Leaders: Dems Blocked Measure t...,
400,https://www.newsweek.com/government-shutdown-2...,Top Republican Gives Update on Government Shut...,
401,https://www.theguardian.com/environment/2025/n...,Ofwat letting water firms charge twice to tack...,
402,https://www.cnn.com/2025/11/04/politics/watch-...,What to watch for as the Supreme Court reviews...,


In [90]:
df_titles['url']

0      https://www.washingtonexaminer.com/news/white-...
1      https://us.cnn.com/politics/state-redistrictin...
2      https://www.cnn.com/2025/11/04/world/2-french-...
3      https://www.cnn.com/travel/japan-expensive-ric...
4      https://www.breitbart.com/politics/2025/11/03/...
                             ...                        
399    https://www.breitbart.com/politics/2025/11/04/...
400    https://www.newsweek.com/government-shutdown-2...
401    https://www.theguardian.com/environment/2025/n...
402    https://www.cnn.com/2025/11/04/politics/watch-...
403    https://www.washingtonexaminer.com/news/campai...
Name: url, Length: 404, dtype: object

In [91]:
df_titles[df_titles['title'].isna()]

Unnamed: 0,url,title,error
107,https://us.cnn.com/cnn-underscored/gifts/best-...,,Article `download()` failed with 403 Client Er...
139,https://www.cnn.com/cnn-underscored/gifts/best...,,Article `download()` failed with 403 Client Er...
313,https://www.newsweek.com/who-is-kim-young-nam-...,,Article `download()` failed with HTTPSConnecti...
337,https://www.newsweek.com/when-november-snap-be...,,Article `download()` failed with HTTPSConnecti...


In [60]:
df_titles[df_titles['title'].isna()]['url']

3      https://www.foxnews.com/media/pelosi-rants-tru...
5      https://www.foxnews.com/politics/virginia-gop-...
6      https://www.forbes.com/councils/forbestechcoun...
8      https://www.foxnews.com/politics/ice-reveals-s...
10     https://www.foxnews.com/politics/your-guide-to...
                             ...                        
411    https://www.forbes.com/sites/stevetengler/2025...
413    https://www.foxnews.com/media/marjorie-taylor-...
417    https://www.foxnews.com/media/mamdani-tells-ms...
418    https://www.foxnews.com/politics/white-house-s...
419    https://www.foxnews.com/politics/what-results-...
Name: url, Length: 149, dtype: object

In [61]:
df_titles[df_titles['title'].isna()]['error'].unique()

array(['Article `download()` failed with 404 Client Error: Not Found for url: https://www.foxnews.com/media/pelosi-rants-trump-vile-creature-worst-thing-face-earth on URL https://www.foxnews.com/media/pelosi-rants-trump-vile-creature-worst-thing-face-earth',
       'Article `download()` failed with 404 Client Error: Not Found for url: https://www.foxnews.com/politics/virginia-gop-attorney-general-promotes-splitting-vote-snubbing-fellow-republican-earle-sears on URL https://www.foxnews.com/politics/virginia-gop-attorney-general-promotes-splitting-vote-snubbing-fellow-republican-earle-sears',
       'Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/councils/forbestechcouncil/2025/11/04/the-ai-divide-overinvesting-in-tools-underinvesting-in-people/ on URL https://www.forbes.com/councils/forbestechcouncil/2025/11/04/the-ai-divide-overinvesting-in-tools-underinvesting-in-people/',
       'Article `download()` failed with 404 Clien