# Shifting Agendas: The Guardian and the Death of Queen Elizabeth II

**Event-Driven Topic Shifts: A BERTopic Analysis of The Guardian Before and After the Death of Queen Elizabeth II**

This notebook performs topic modeling, alignment, sentiment and entity analyses to quantify how The Guardian's news agenda shifted around the death of Queen Elizabeth II (2020–2025 coverage).

In [None]:
# ==========================================
# 1. Setup & Imports
# ==========================================
import pandas as pd
import numpy as np
import re
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# NLP & ML Libraries
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

# Configuration
# NOTE: with no category or module filter this silences every warning for
# the rest of the session, not only the ones raised by the NLP libraries.
warnings.filterwarnings("ignore")
# Show up to 100 characters of each cell when DataFrames are displayed.
pd.set_option('display.max_colwidth', 100)

# Ensure NLTK resources are available
# nltk.data.find raises LookupError when the corpus is not installed locally;
# only in that case is the stop-word corpus downloaded (quietly).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

print("Libraries loaded successfully.")

In [None]:
# ==========================================
# 2. Utility Classes
# ==========================================

class TextAnalyzer:
    """Text preprocessing, corpus loading, and pre/post-event splitting.

    Every method is static; the class only serves as a namespace.
    """

    # Compiled once at class creation rather than once per headline.
    _URL_RE = re.compile(r"http\S+")
    _NON_LETTER_RE = re.compile(r"[^a-z\s]")
    _WHITESPACE_RE = re.compile(r"\s+")

    @staticmethod
    def clean_text(text):
        """Reduce *text* to lowercase ASCII letters separated by single spaces.

        Args:
            text: Value to clean. Non-string values are converted with
                ``str``. Missing scalars (``None``, ``NaN``, ``pd.NA``,
                ``NaT``) produce an empty string instead of the literal
                tokens ``"none"`` / ``"nan"``, which would otherwise enter
                the corpus as if they were real words.

        Returns:
            The cleaned string: lowercased, URLs removed, every character
            outside ``a-z`` replaced by a space, whitespace collapsed and
            trimmed.
        """
        if not isinstance(text, str):
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ""
            text = str(text)
        text = text.lower()
        text = TextAnalyzer._URL_RE.sub("", text)          # Remove URLs
        text = TextAnalyzer._NON_LETTER_RE.sub(" ", text)  # Keep only letters
        text = TextAnalyzer._WHITESPACE_RE.sub(" ", text)  # Collapse whitespace
        return text.strip()

    @staticmethod
    def load_and_filter_data(filepath, start_date, end_date):
        """Load a CSV, parse its dates, and keep rows inside a date range.

        The CSV must contain a ``date`` column and a ``title`` column.
        Rows whose date cannot be parsed are dropped, the remainder is
        sorted chronologically, and a ``clean_text`` column is derived from
        ``title`` with :meth:`clean_text` (missing titles become ``""``).

        Args:
            filepath: Path or buffer accepted by ``pandas.read_csv``.
            start_date: Inclusive lower bound, parsed as UTC.
            end_date: Inclusive upper bound, parsed as UTC. A date-only
                value resolves to midnight at the start of that day, so
                rows stamped later on that same day are excluded.

        Returns:
            A new DataFrame with the filtered rows and the added
            ``clean_text`` column.

        Raises:
            KeyError: If the ``date`` or ``title`` column is missing.
        """
        print(f"Loading data from {filepath}...")
        df = pd.read_csv(filepath)
        # Convert to timezone-aware (UTC) timestamps; unparseable values become NaT.
        df['date'] = pd.to_datetime(df['date'], utc=True, errors='coerce')
        df = df.dropna(subset=['date']).sort_values('date').reset_index(drop=True)

        start_ts = pd.to_datetime(start_date, utc=True)
        end_ts = pd.to_datetime(end_date, utc=True)
        df_filtered = df[df['date'].between(start_ts, end_ts)].copy()

        df_filtered['clean_text'] = df_filtered['title'].apply(TextAnalyzer.clean_text)
        return df_filtered

    @staticmethod
    def split_event_data(df, event_date, buffer_days=30, exclude_keywords=None):
        """Split *df* into pre-event and post-event subsets.

        Rows closer to the event than ``buffer_days`` (on either side) are
        discarded, and rows whose ``clean_text`` mentions any excluded
        keyword are removed from both subsets.

        Keywords are matched case-insensitively and only at the start of a
        word, so ``'lord'`` no longer removes "landlord" and ``'died'`` no
        longer removes "studied". Inflected forms such as "royals" or
        "deaths" are still matched by ``'royal'`` and ``'death'``.

        Args:
            df: DataFrame with a timezone-aware ``date`` column and a
                ``clean_text`` column.
            event_date: Date of the event, parsed as UTC.
            buffer_days: Width of the exclusion window on each side of the
                event, in days.
            exclude_keywords: Optional iterable of literal keywords; empty
                entries are ignored. ``None`` or an empty list disables
                keyword filtering.

        Returns:
            ``(pre_df, post_df)`` as independent copies that keep the
            original index labels.
        """
        event_dt = pd.to_datetime(event_date, utc=True)
        buffer = pd.Timedelta(days=buffer_days)

        # Both conditions are kept on each side so that a zero or negative
        # buffer can never move a row across the event date.
        pre_mask = (df['date'] < event_dt) & (df['date'] <= event_dt - buffer)
        post_mask = (df['date'] >= event_dt) & (df['date'] >= event_dt + buffer)

        keywords = [k for k in (exclude_keywords or []) if k]
        if keywords:
            # (?<!\w) anchors the match to the start of a word; the group is
            # non-capturing so pandas does not warn about match groups.
            pattern = r"(?<!\w)(?:" + "|".join(re.escape(k) for k in keywords) + ")"
            has_keyword = df['clean_text'].str.contains(
                pattern, case=False, na=False, regex=True
            )
            pre_mask &= ~has_keyword
            post_mask &= ~has_keyword

        return df[pre_mask].copy(), df[post_mask].copy()

class TopicModeler:
    """Train and score the LDA baseline and the BERTopic models.

    Every method is static; the class only serves as a namespace.
    """

    @staticmethod
    def train_lda(texts, n_topics=5):
        """Fit a bag-of-words LDA model.

        Args:
            texts: Iterable of document strings.
            n_topics: Number of LDA components.

        Returns:
            ``(lda, vectorizer)``: the fitted ``LatentDirichletAllocation``
            and the fitted ``CountVectorizer`` whose vocabulary indexes the
            columns of ``lda.components_``.
        """
        # Ignore terms present in more than 95% of documents or in fewer
        # than two, and drop English stop words.
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        dtm = vectorizer.fit_transform(texts)
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42, n_jobs=-1)
        lda.fit(dtm)
        return lda, vectorizer

    @staticmethod
    def train_bertopic(texts, n_topics=5, min_topic_size=15, stop_words='english'):
        """Fit a BERTopic model with MiniLM embeddings, UMAP and HDBSCAN.

        Args:
            texts: List of document strings.
            n_topics: Passed to BERTopic as ``nr_topics``.
            min_topic_size: Passed to HDBSCAN as ``min_cluster_size``.
            stop_words: Stop-word setting for the ``CountVectorizer`` that
                builds the topic keywords. Defaults to ``'english'`` so the
                keywords are filtered the same way as the LDA baseline's
                vocabulary; pass ``None`` to keep stop words.

        Returns:
            ``(topic_model, topics)``: the fitted BERTopic model and the
            per-document topic assignments from ``fit_transform``.
        """
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        # Configure UMAP for dimensionality reduction (seeded for repeatability)
        umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
        # Configure HDBSCAN for clustering
        hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
        # Stop words are removed only from the keyword representation; the
        # embeddings are still computed on the full text.
        vectorizer_model = CountVectorizer(stop_words=stop_words)

        topic_model = BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            nr_topics=n_topics,
            verbose=True
        )
        topics, _ = topic_model.fit_transform(texts)
        return topic_model, topics

    @staticmethod
    def compute_cv_coherence(topics_words, texts):
        """Compute the C_v coherence of a set of topics with Gensim.

        Args:
            topics_words: List of topics, each a list of keyword strings.
            texts: Tokenized reference corpus (list of token lists).

        Returns:
            The C_v coherence as a float, or ``nan`` when ``topics_words``
            is empty (there is nothing to score, so no dictionary is built).
        """
        if not topics_words:
            return float("nan")
        dictionary = Dictionary(texts)
        # Use processes=1 to avoid issues in some notebook environments
        cm = CoherenceModel(topics=topics_words, texts=texts, dictionary=dictionary, coherence='c_v', processes=1)
        return cm.get_coherence()

print("Utility classes defined successfully.")

In [None]:
# ==========================================
# 3. Main Analysis Pipeline
# ==========================================

# --- Configuration ---
DATA_PATH = 'guardian_news_std.csv'
START_DATE, END_DATE = '2020-01-01', '2025-12-31'
EVENT_DATE = '2022-09-08'  # Date of Queen Elizabeth II's death
BUFFER_DAYS = 30
EXCLUDE_KEYWORDS = [
    'lord', 'queen', 'elizabeth', 'monarchy', 'royal', 'king charles',
    'obituary', 'death', 'died', 'tribute', 'funeral'
]
# Upper bound on the number of documents modeled per period.
SAMPLE_SIZE = 20000

# --- Step 1: Load and Filter Data ---
full_df = TextAnalyzer.load_and_filter_data(DATA_PATH, START_DATE, END_DATE)

# --- Step 2: Split Data (Pre-Event vs Post-Event) ---
pre_df, post_df = TextAnalyzer.split_event_data(
    full_df,
    EVENT_DATE,
    buffer_days=BUFFER_DAYS,
    exclude_keywords=EXCLUDE_KEYWORDS,
)

# --- Step 3: Downsampling ---
# Each period is capped at SAMPLE_SIZE documents (seeded for repeatability)
# to keep training time manageable. A period smaller than the cap is kept
# whole, so the two samples are equal-sized only when both exceed the cap.
pre_sample, post_sample = (
    period.sample(n=min(len(period), SAMPLE_SIZE), random_state=42)
    for period in (pre_df, post_df)
)

keyword_preview = ', '.join(EXCLUDE_KEYWORDS[:3])
print(f"\nData Summary (After removing articles with keywords: {keyword_preview}...):")
print(f"Pre-Event Dataset:  {len(pre_sample)} documents")
print(f"Post-Event Dataset: {len(post_sample)} documents")

In [None]:
# ==========================================
# 4. Model Training & Evaluation
# ==========================================

# The cleaned headlines are the model input; their whitespace-split tokens
# are the reference corpus for the coherence calculation.
pre_texts, post_texts = (
    sample['clean_text'].tolist() for sample in (pre_sample, post_sample)
)
pre_tokens, post_tokens = (
    [doc.split() for doc in docs] for docs in (pre_texts, post_texts)
)

# --- A. Train LDA Models ---
print("\n--- Training LDA Models (Baseline) ---")
(lda_pre, vec_pre), (lda_post, vec_post) = (
    TopicModeler.train_lda(docs, n_topics=5) for docs in (pre_texts, post_texts)
)

def get_lda_top_words(model, feature_names, n_top_words=10):
    """Return the highest-weighted terms of every LDA topic.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per topic).
        feature_names: Vocabulary indexed by the columns of ``components_``.
        n_top_words: Number of terms to keep per topic.

    Returns:
        One list per topic, terms ordered from highest to lowest weight.
    """
    top_words = []
    for weights in model.components_:
        # argsort is ascending, so reverse it and keep the leading indices.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words.append([feature_names[idx] for idx in ranked])
    return top_words

# Top terms per LDA topic, looked up in each model's own vocabulary.
pre_vocab = vec_pre.get_feature_names_out()
post_vocab = vec_post.get_feature_names_out()
lda_pre_topics = get_lda_top_words(lda_pre, pre_vocab)
lda_post_topics = get_lda_top_words(lda_post, post_vocab)

# Compute LDA Coherence
print("Calculating LDA Coherence (C_v)...")
coh_lda_pre, coh_lda_post = (
    TopicModeler.compute_cv_coherence(topic_words, tokens)
    for topic_words, tokens in (
        (lda_pre_topics, pre_tokens),
        (lda_post_topics, post_tokens),
    )
)

# --- B. Train BERTopic Models ---
# Only the fitted model is kept; the per-document assignments are not used here.
print("\n--- Training BERTopic Models (Advanced) ---")
bert_pre = TopicModeler.train_bertopic(pre_texts, n_topics=5)[0]
bert_post = TopicModeler.train_bertopic(post_texts, n_topics=5)[0]

def get_bert_top_words(model, n_top_words=10):
    """Return the top keywords of every BERTopic topic except the outlier topic.

    Topic ids are read from ``model.get_topics()`` instead of being derived
    from the row count of ``get_topic_info()``, so the last topic is not
    lost when the model has no outlier topic (-1).

    Args:
        model: Object whose ``get_topics()`` returns a mapping from topic id
            to a list of ``(word, score)`` pairs.
        n_top_words: Maximum number of keywords kept per topic.

    Returns:
        One keyword list per topic, in ascending topic-id order. Empty
        placeholder words are dropped, and a topic left with no words is
        omitted because it cannot be scored or plotted.
    """
    topic_map = model.get_topics()  # fetched once, not on every iteration
    topics = []
    for topic_id in sorted(topic_map):
        if topic_id == -1:  # outlier bucket, not a real topic
            continue
        words = [word for word, _ in topic_map[topic_id] if word][:n_top_words]
        if words:
            topics.append(words)
    return topics

bert_pre_topics, bert_post_topics = (
    get_bert_top_words(fitted) for fitted in (bert_pre, bert_post)
)

# Compute BERTopic Coherence
print("Calculating BERTopic Coherence (C_v)...")
coh_bert_pre = TopicModeler.compute_cv_coherence(bert_pre_topics, pre_tokens)
coh_bert_post = TopicModeler.compute_cv_coherence(bert_post_topics, post_tokens)

# --- Display Summary ---
# One row per (model, period) pair, in the order the scores were computed.
results_df = pd.DataFrame(
    [
        ('LDA', 'Pre-Event', coh_lda_pre),
        ('LDA', 'Post-Event', coh_lda_post),
        ('BERTopic', 'Pre-Event', coh_bert_pre),
        ('BERTopic', 'Post-Event', coh_bert_post),
    ],
    columns=['Model', 'Period', 'Coherence (Cv)'],
)
print("\nModel Evaluation Summary:")
print(results_df)

In [None]:
# ==========================================
# 5. Visualization
# ==========================================

def plot_wordclouds(topics_words, title):
    """Draw one word cloud per topic, side by side in a single figure.

    The keywords are passed to the cloud as explicit, equally weighted
    frequencies. Joining them into a sentence and calling ``generate``
    would re-tokenize the text, apply WordCloud's own stop-word, plural
    and collocation handling, and fail when every keyword is filtered out.

    Args:
        topics_words: List of topics, each a list of keyword strings.
        title: Figure title; also used in the "no topics" message.

    Returns:
        None. Prints a message and returns early when ``topics_words`` is
        empty; otherwise shows the figure with ``plt.show()``.
    """
    n_topics = len(topics_words)
    if n_topics == 0:
        print(f"No topics found for {title}")
        return

    # Imported lazily: the optional dependency is only needed when there is
    # something to draw.
    from wordcloud import WordCloud

    # squeeze=False always yields a 2-D axes array, so a single topic needs
    # no special case.
    fig, axes = plt.subplots(1, n_topics, figsize=(20, 4), squeeze=False)
    fig.suptitle(title, fontsize=16)

    for i, (ax, words) in enumerate(zip(axes[0], topics_words)):
        frequencies = {word: 1 for word in words if word}
        if frequencies:
            wc = WordCloud(background_color='white', width=400, height=300)
            wc.generate_from_frequencies(frequencies)
            ax.imshow(wc, interpolation='bilinear')
        # A topic without usable words keeps its titled but empty panel.
        ax.set_title(f'Topic {i}')
        ax.axis('off')
    plt.show()

# Each entry: the progress message, then the (topics, figure title) pairs
# drawn for that model family, pre-event first.
for banner, figures in (
    ("Visualizing LDA Topics...", (
        (lda_pre_topics, "LDA Pre-Event Topics"),
        (lda_post_topics, "LDA Post-Event Topics"),
    )),
    ("Visualizing BERTopic Topics...", (
        (bert_pre_topics, "BERTopic Pre-Event Topics"),
        (bert_post_topics, "BERTopic Post-Event Topics"),
    )),
):
    print(banner)
    for topic_words, figure_title in figures:
        plot_wordclouds(topic_words, figure_title)

In [None]:
# ==========================================
# 6. Save Outputs
# ==========================================

output_dir = 'results_output'
os.makedirs(output_dir, exist_ok=True)

# Coherence table, one row per model/period, without the index column.
metrics_path = os.path.join(output_dir, 'coherence_metrics.csv')
results_df.to_csv(metrics_path, index=False)

# Topic keywords: each section is a heading line followed by the printed
# form of the keyword lists and a blank line.
keyword_sections = (
    ("LDA Pre-Event Topics:\n", lda_pre_topics),
    ("LDA Post-Event Topics:\n", lda_post_topics),
    ("BERTopic Pre-Event Topics:\n", bert_pre_topics),
    ("BERTopic Post-Event Topics:\n", bert_post_topics),
)
keywords_path = os.path.join(output_dir, 'topic_keywords.txt')
with open(keywords_path, 'w') as f:
    f.writelines(
        heading + str(topic_words) + "\n\n"
        for heading, topic_words in keyword_sections
    )

print(f"All results have been saved to the directory: {output_dir}")