# US Attention Data -- Exploration Notebook

**Wikipedia pageviews, Google Trends, and GDELT event data tracking global attention in 2025**

Author: [Luke Steuber](https://lukesteuber.com) | Bluesky: [@lukesteuber.com](https://bsky.app/profile/lukesteuber.com)

Dataset: [lukeslp/us-attention-data](https://huggingface.co/datasets/lukeslp/us-attention-data)

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from datetime import datetime

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11


def load_json(filename):
    """Read one JSON file from the working directory and return the parsed object.

    The encoding is passed explicitly: without it, open() decodes with the
    platform's locale encoding, which is not UTF-8 everywhere and can fail or
    mis-decode non-ASCII text in the data files.
    """
    with open(filename, encoding='utf-8') as f:
        return json.load(f)


# Load all datasets
trends = load_json('trends_data.json')
wiki_pageviews = load_json('wikipedia_pageviews.json')
gdelt_weekly = load_json('gdelt_weekly_events.json')
unified = load_json('unified_data.json')
weekly_trends = load_json('weekly_trends.json')
wiki_trending = load_json('wikipedia_trending.json')
events = load_json('events_unified.json')
attention_timeline = load_json('weekly_attention_timeline.json')

# Quick sanity check: one size per dataset, using the container each later cell reads.
print("Loaded datasets:")
print(f"  Trends terms:          {len(trends.get('terms', {}))}")
print(f"  Wikipedia countries:   {len(wiki_pageviews.get('countries', {}))}")
print(f"  GDELT weekly events:   {len(gdelt_weekly.get('weekly_events', []))}")
print(f"  Unified countries:     {len(unified.get('countries', []))}")
print(f"  Weekly trends:         {len(weekly_trends.get('weeks', []))}")
print(f"  Wikipedia trending:    {len(wiki_trending)}")
print(f"  Unified events:        {len(events.get('events', []))}")
print(f"  Attention timeline:    {len(attention_timeline.get('weekly_timeline', []))}")

## Most Viewed Wikipedia Articles

Which articles drew the most attention across all tracked countries?

In [None]:
# Rank articles by total pageviews and chart the 20 largest.
df_trending = pd.DataFrame(wiki_trending)
top_20 = df_trending.nlargest(20, 'total_views')
top_views = top_20['total_views'].values

fig, ax = plt.subplots(figsize=(14, 8))
bars = ax.barh(range(len(top_20)), top_views, color='#1565C0')
ax.set_yticks(range(len(top_20)))
ax.set_yticklabels(top_20['title'].values)
ax.set_xlabel('Total Pageviews')
ax.set_title('Top 20 Most Viewed Wikipedia Articles (2025)')
ax.invert_yaxis()  # largest bar at the top
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{x/1e6:.1f}M'))

# bar_label pads in points rather than data units, so the gap between a bar
# and its label stays the same whatever the magnitude of the view counts.
ax.bar_label(bars, labels=[f'{val/1e6:.1f}M' for val in top_views],
             padding=4, fontsize=9)
ax.margins(x=0.08)  # leave room on the right so the longest bar's label is not clipped

plt.tight_layout()
plt.show()

print(f"Total trending articles tracked: {len(wiki_trending)}")
print(f"Highest daily peak: {df_trending['peak_views'].max():,} views")

## Google Trends -- Search Interest Over Time

What were people searching for throughout 2025?

In [None]:
# Weekly trends analysis
weeks_data = weekly_trends.get('weeks', [])
df_weeks = pd.DataFrame(weeks_data)

fig, axes = plt.subplots(2, 1, figsize=(16, 10))

# Parsed week start dates form the shared x axis of both panels.
df_weeks['start_date_parsed'] = pd.to_datetime(df_weeks['start_date'])

# One entry per panel: (column, y label, title, line colour, marker).
panel_specs = [
    ('global_intensity', 'Global Intensity',
     'Google Trends -- Global Search Intensity by Week (2025)', '#E91E63', 'o'),
    ('us_intensity', 'US Intensity',
     'Google Trends -- US Search Intensity by Week', '#2196F3', 's'),
]
for panel_ax, (column, y_label, panel_title, line_color, marker_shape) in zip(axes, panel_specs):
    # Values that cannot be parsed as numbers become NaN and show up as gaps.
    intensity = pd.to_numeric(df_weeks[column], errors='coerce')
    panel_ax.plot(df_weeks['start_date_parsed'], intensity,
                  color=line_color, linewidth=2, marker=marker_shape, markersize=4)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', rotation=45)

# Only the bottom panel carries the x-axis label.
axes[1].set_xlabel('Week')

plt.tight_layout()
plt.show()

# Show top US searches
print("\nTop US searches by week (sample):")
for row in df_weeks.head(10).itertuples():
    print(f"  Week {row.week}: {row.top_search_us}")

## GDELT Events Timeline

The GDELT Project monitors world events from news sources globally.

In [None]:
# GDELT weekly event volume
gdelt_weeks = gdelt_weekly.get('weekly_events', [])

# One bar per week: its start date and how many events are listed for it.
week_dates = [w['week_start'] for w in gdelt_weeks]
event_counts = [len(w.get('events', [])) for w in gdelt_weeks]

fig, ax = plt.subplots(figsize=(16, 6))
dates = pd.to_datetime(week_dates)
ax.bar(dates, event_counts, width=5, color='#FF5722', alpha=0.8)
ax.set_xlabel('Week')
ax.set_ylabel('Number of Events')
ax.set_title('GDELT -- Weekly Event Volume (2025)')
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

total_events = sum(event_counts)
print(f"Total GDELT events tracked: {total_events:,}")
# 'weekly_events' defaults to an empty list above, so guard the division
# instead of raising ZeroDivisionError when no weeks are present.
if gdelt_weeks:
    print(f"Average events per week: {total_events / len(gdelt_weeks):.1f}")
else:
    print("Average events per week: n/a (no weekly data)")

## Unified Events -- Impact Analysis

Major events tracked across all data sources, scored by impact.

In [None]:
df_events = pd.DataFrame(events.get('events', []))

# Coerce the scores to numbers once and reuse the result for both the
# histogram and the ranking. nlargest() rejects object-dtype columns, so
# ranking the raw column would fail on exactly the inputs the histogram's
# coercion is there to tolerate.
impact_scores = pd.to_numeric(df_events['impact_score'], errors='coerce')

# Impact score distribution
fig, axes = plt.subplots(1, 2, figsize=(16, 7))

axes[0].hist(impact_scores.dropna(),
             bins=20, color='#9C27B0', edgecolor='white', alpha=0.8)
axes[0].set_xlabel('Impact Score')
axes[0].set_ylabel('Number of Events')
axes[0].set_title('Distribution of Event Impact Scores')

# Sentiment direction
sentiment = df_events['sentiment_direction'].value_counts()
colors_sent = {'negative': '#F44336', 'positive': '#4CAF50', 'neutral': '#9E9E9E', 'mixed': '#FF9800'}
# Labels without an assigned colour fall back to a neutral blue-grey.
axes[1].bar(sentiment.index, sentiment.values,
            color=[colors_sent.get(s, '#607D8B') for s in sentiment.index])
axes[1].set_ylabel('Number of Events')
axes[1].set_title('Events by Sentiment Direction')

plt.tight_layout()
plt.show()

# Top impact events
top_events = (
    df_events
    .assign(impact_score=impact_scores)
    .nlargest(10, 'impact_score')
    [['name', 'date', 'impact_score', 'sentiment_direction']]
)
print("\nTop 10 Highest Impact Events:")
for _, row in top_events.iterrows():
    print(f"  [{row['impact_score']}] {row['date']}: {row['name']} ({row['sentiment_direction']})")

## Country-Level Attention Comparison

How does sentiment toward the US vary across countries in the unified dataset? The chart below follows the sentiment timeline of a few key countries.

In [None]:
# Unified country data
country_list = unified.get('countries', [])
df_unified_countries = pd.DataFrame(country_list)

# Distinct regions, sorted so the printed list is the same on every run
# (iterating a set of strings is not order-stable between interpreter runs).
# Missing regions are dropped so the join below cannot fail on a non-string.
if 'region' in df_unified_countries.columns:
    unique_regions = sorted(str(r) for r in df_unified_countries['region'].dropna().unique())
else:
    unique_regions = []

# Sentiment timelines for a few key countries
highlight_countries = ['US', 'GB', 'FR', 'DE', 'CA', 'MX']
MAX_POINTS = 52  # plot at most the first 52 timeline entries per country

fig, ax = plt.subplots(figsize=(16, 7))
for c_data in country_list:
    if c_data.get('code') not in highlight_countries:
        continue
    timeline = c_data.get('sentiment_timeline', [])
    if not timeline:
        continue
    points = timeline[:MAX_POINTS]
    # Entries may name their fields either way; prefer 'date'/'score' and
    # fall back to 'week_start'/'sentiment'.
    dates = [t.get('date', t.get('week_start', '')) for t in points]
    scores = [t.get('score', t.get('sentiment', 0)) for t in points]
    # NOTE: empty or missing scores are drawn as 0, i.e. on the neutral line.
    ax.plot(pd.to_datetime(dates), [float(s) if s else 0 for s in scores],
            label=c_data.get('name', c_data['code']),
            linewidth=2, marker='o', markersize=3)

ax.set_xlabel('Date')
ax.set_ylabel('Sentiment Score')
ax.set_title('Sentiment Toward US -- Key Countries Over Time')
if ax.lines:
    # Only build a legend when at least one country was actually plotted.
    ax.legend(loc='best')
ax.tick_params(axis='x', rotation=45)
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

print(f"Countries tracked: {len(country_list)}")
print(f"Regions: {', '.join(unique_regions)}")

## Weekly Attention Timeline

Composite attention signal combining Wikipedia, Trends, and GDELT.

In [None]:
timeline = attention_timeline.get('weekly_timeline', [])


def _flatten_week(week):
    """Turn one weekly record into a flat dict of its numeric components.

    Numeric entries of week['components'] are copied under their own key.
    Entries that are dicts are expanded one level, keeping only their numeric
    values under '<component>_<subkey>'. Anything else is skipped.
    """
    flat = {'week_start': week['week_start']}
    for name, value in week.get('components', {}).items():
        if isinstance(value, (int, float)):
            flat[name] = value
        elif isinstance(value, dict):
            flat.update({
                f'{name}_{sub_name}': sub_value
                for sub_name, sub_value in value.items()
                if isinstance(sub_value, (int, float))
            })
    return flat


weeks_parsed = [_flatten_week(w) for w in timeline]

df_timeline = pd.DataFrame(weeks_parsed)
df_timeline['week_start'] = pd.to_datetime(df_timeline['week_start'])

# Plot available numeric columns
numeric_cols = [
    name
    for name, dtype in df_timeline.dtypes.items()
    if name != 'week_start' and dtype in ['float64', 'int64']
]

fig, ax = plt.subplots(figsize=(16, 7))
# Only the first five numeric components are drawn.
for col in numeric_cols[:5]:
    series_label = col.replace('_', ' ').title()
    ax.plot(df_timeline['week_start'], df_timeline[col],
            label=series_label, linewidth=1.5, alpha=0.8)

ax.set_xlabel('Week')
ax.set_ylabel('Score')
ax.set_title('Weekly Attention Components Over Time')
ax.legend(loc='best', fontsize=9)
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

print(f"Weeks tracked: {len(timeline)}")
print(f"Components available: {numeric_cols[:10]}")

## Summary Statistics

In [None]:
# Recap of the sizes of everything loaded at the top of the notebook.
rule = "=" * 60

# (label, collection) pairs for the rows that share the plain right-aligned
# count format; labels carry their own trailing padding.
count_rows = [
    ("GDELT weeks tracked:     ", gdelt_weekly.get('weekly_events', [])),
    ("Unified events:          ", events.get('events', [])),
    ("Countries tracked:       ", unified.get('countries', [])),
    ("Weekly trends:           ", weekly_trends.get('weeks', [])),
    ("Search terms tracked:    ", trends.get('terms', {})),
]

print(rule)
print("US ATTENTION DATA -- SUMMARY")
print(rule)
print(f"Timeframe:               {trends.get('timeframe', 'N/A')}")
print(f"Wikipedia trending:      {len(wiki_trending):>10,} articles")
for label, collection in count_rows:
    print(f"{label}{len(collection):>10}")
print()
print("Data sources:")
for src in unified.get('metadata', {}).get('data_sources', []):
    # A source is either a dict (name + description) or a bare value.
    if isinstance(src, dict):
        line = f"  -- {src.get('name', src)}: {src.get('description', '')}"
    else:
        line = f"  -- {src}"
    print(line)
print(rule)