# Topic Modeling with BERTopic on Parliamentary Speeches

This notebook implements a sophisticated topic modeling pipeline that:
1. **Embeds individual speeches** for semantic segmentation
2. **Segments speeches** using similarity-based boundary detection  
3. **Re-embeds aggregated segments** for better semantic representation
4. **Discovers topics** using BERTopic with custom clustering
5. **Generates readable topic names** using LLMs
6. **Tracks topic evolution** over time

## Key Approach - Dual Embedding Strategy:
- **First embedding**: Individual speeches using raw text (for segmentation)
- **Second embedding**: Concatenated segment texts (for topic modeling)
- **Why twice?** Re-embedding the concatenated segment text captures the coherence of the whole discourse, which averaging the individual speech embeddings would not
- **Raw text used throughout** for better semantic capture

In [14]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Locations of the two pickles: the filtered set used for topic modeling and
# the complete original set used for the final mapping
processed_data_path = r'data folder\data\AT_for_topic_modeling.pkl'
original_data_path = r'data folder\data\AT_original_complete.pkl'

# Load PROCESSED data for topic modeling
# NOTE(review): pickle files execute code on load; only read trusted files
AT_processed_df = pd.read_pickle(processed_data_path)
print(f"Loaded processed data: {AT_processed_df.shape}")

# Load ORIGINAL complete data for final mapping
AT_original_df = pd.read_pickle(original_data_path)
print(f"Loaded original complete data: {AT_original_df.shape}")

# Summarize how many speeches were dropped versus kept
_n_too_short = AT_original_df['Is_Too_Short'].sum()
_n_used = len(AT_processed_df)
print(f"\nFiltered out for topic modeling:")
print(f"  Too short: {_n_too_short}")
print(f"  Used for topic modeling: {_n_used}")

# Verify we have the required columns for the pipeline
_processed_columns = list(AT_processed_df.columns)
print(f"\nProcessed data columns: {_processed_columns}")
print(f"Has 'Text' column: {'Text' in _processed_columns}")

Loaded processed data: (190633, 28)
Loaded original complete data: (231752, 27)

Filtered out for topic modeling:
  Too short: 41119
  Used for topic modeling: 190633

Processed data columns: ['Sitting_ID', 'Speech_ID', 'Title', 'Date', 'Body', 'Term', 'Session', 'Meeting', 'Sitting', 'Agenda', 'Subcorpus', 'Lang', 'Speaker_role', 'Speaker_MP', 'Speaker_minister', 'Speaker_party', 'Speaker_party_name', 'Party_status', 'Party_orientation', 'Speaker_ID', 'Speaker_name', 'Speaker_gender', 'Speaker_birth', 'Text', 'Word_Count', 'Is_Too_Short', 'Is_Filtered', 'Used_For_Topic_Modeling']
Has 'Text' column: True
Loaded original complete data: (231752, 27)

Filtered out for topic modeling:
  Too short: 41119
  Used for topic modeling: 190633

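Processed data columns: ['Sitting_ID', 'Speech_ID', 'Title', 'Date', 'Body', 'Term', 'Session', 'Meeting', 'Sitting', 'Agenda', 'Subcorpus', 'Lang', 'Speaker_role', 'Speaker_MP', 'Speaker_minister', 'Speaker_party', 'Speaker_party_name', 'Party_status', 'Party_orientation', 'Speaker_ID', 'Speaker_name', 'Speaker_gender', 'Speaker_birth', 'Text', 'Word_Count', 'Is_Too_Short', 'Is_Filtered', 'Used_For_Topic_Modeling']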

## Embedding and Segmentation Functions

These functions handle the dual-embedding approach:
1. **Speech-level embeddings** for similarity-based segmentation
2. **Segment-level embeddings** for final topic modeling

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import find_peaks
from sentence_transformers import SentenceTransformer
import torch
import time
import gc
from tqdm import tqdm

def load_embedding_model(model_name="nomic-ai/nomic-embed-text-v1.5", device=None):
    """
    Instantiate a SentenceTransformer on the requested (or auto-detected) device.

    Args:
        model_name (str): HuggingFace model name
        device (str): 'cuda', 'cpu', or None to pick CUDA when it is available

    Returns:
        SentenceTransformer: Loaded model

    Raises:
        Exception: Whatever the model constructor raised, re-raised after being printed
    """
    target_device = device
    if target_device is None:
        target_device = 'cuda' if torch.cuda.is_available() else 'cpu'

    print(f"Loading embedding model: {model_name} on {target_device}")
    load_started = time.time()

    # Constructor arguments shared by the CPU and GPU paths
    init_kwargs = {'device': target_device, 'trust_remote_code': True}

    try:
        if target_device == 'cpu':
            # CPU path: limit torch to 4 threads and request float32 weights
            torch.set_num_threads(4)
            init_kwargs['model_kwargs'] = {'torch_dtype': torch.float32}

        model = SentenceTransformer(model_name, **init_kwargs)
    except Exception as e:
        print(f"Error loading {model_name}: {e}")
        raise e

    print(f"Model loaded in {time.time() - load_started:.2f} seconds")
    return model

def generate_speech_embeddings_for_segmentation(df, text_column='Text', model_name="nomic-ai/nomic-embed-text-v1.5", batch_size=8):
    """
    FIRST EMBEDDING: Generate one embedding per speech for segmentation.

    Speeches are encoded one at a time from their raw text. Speeches whose
    character length exceeds the 99.9th percentile of the input are NOT encoded:
    they receive an all-zero vector and are flagged in 'Is_Extremely_Long'.
    When encoding the full text raises, the function falls back to
    (1) the mean of up to three 8000-character chunks (texts > 10000 characters),
    (2) the first 5000 characters, and finally (3) an all-zero vector.

    Args:
        df (pd.DataFrame): DataFrame with speeches
        text_column (str): Column containing text to embed (use 'Text' for raw)
        model_name (str): Embedding model to use
        batch_size (int): Batch size passed to every model.encode call

    Returns:
        pd.DataFrame: Copy of df with added 'Speech_Embeddings' and
        'Is_Extremely_Long' columns. An empty df is returned with both columns
        present and no model is loaded.
    """
    print("=" * 60)
    print("FIRST EMBEDDING: Individual speeches for segmentation")
    print("=" * 60)
    print(f"Generating embeddings for {len(df)} speeches using {model_name}")

    df_with_embeddings = df.copy()

    # Nothing to embed: skip model loading and the statistics that need >= 1 text
    if len(df) == 0:
        print("No speeches to embed; returning empty embedding columns")
        df_with_embeddings['Speech_Embeddings'] = pd.Series(
            [], index=df_with_embeddings.index, dtype=object)
        df_with_embeddings['Is_Extremely_Long'] = pd.Series(
            [], index=df_with_embeddings.index, dtype=bool)
        return df_with_embeddings

    # Load model
    model = load_embedding_model(model_name)
    embedding_dim = model.get_sentence_embedding_dimension()

    # Use FULL texts
    texts = df[text_column].astype(str).tolist()

    # Show text length statistics and identify extremely long texts
    text_lengths = [len(text) for text in texts]
    print(f"Text length statistics (characters):")
    print(f"  Min: {min(text_lengths)}, Max: {max(text_lengths)}, Mean: {np.mean(text_lengths):.0f}")

    # Calculate 99.9th percentile threshold to filter out top 0.1% longest speeches
    length_threshold = np.percentile(text_lengths, 99.9)
    extremely_long_mask = np.array(text_lengths) > length_threshold
    n_extremely_long = extremely_long_mask.sum()

    print(f"  99.9th percentile length: {length_threshold:.0f} characters")
    print(f"  Extremely long speeches (top 0.1%): {n_extremely_long}")
    print(f"  These will be assigned zero embeddings for memory safety")

    # How often each fallback was needed, reported once at the end so that
    # degraded embeddings do not go unnoticed
    fallback_counts = {'full_text_failed': 0, 'chunked': 0, 'truncated': 0, 'zero': 0}

    def encode(batch):
        """Encode a list of strings with the settings shared by all code paths."""
        return model.encode(
            batch,
            batch_size=batch_size,
            convert_to_tensor=False,
            normalize_embeddings=True,
            show_progress_bar=False
        )

    def embed_with_fallback(text):
        """Embed text with fallback strategies for very long texts."""
        # Skip extremely long texts (top 0.1%) - assign zero embedding
        if len(text) > length_threshold:
            return np.zeros(embedding_dim)

        try:
            # Try full text first
            return encode([text])[0]
        except Exception:
            # Broad on purpose: any encoder failure should degrade to the
            # fallbacks below instead of aborting the whole run
            fallback_counts['full_text_failed'] += 1

        # Fallback: chunking for very long texts
        if len(text) > 10000:
            chunks = []
            chunk_size = 8000
            for start in range(0, len(text), chunk_size):
                chunk = text[start:start + chunk_size]
                if len(chunk.strip()) > 100:
                    chunks.append(chunk)
                if len(chunks) >= 3:
                    break

            if chunks:
                try:
                    chunk_mean = np.mean(encode(chunks), axis=0)
                    fallback_counts['chunked'] += 1
                    return chunk_mean
                except Exception:
                    # Chunking failed too; fall through to truncation
                    chunks = []

        # Final fallback: truncate
        try:
            truncated_embedding = encode([text[:5000]])[0]
            fallback_counts['truncated'] += 1
            return truncated_embedding
        except Exception:
            fallback_counts['zero'] += 1
            return np.zeros(embedding_dim)

    # Generate embeddings with progress bar
    print("Generating speech-level embeddings...")
    start_time = time.time()
    embeddings = []

    # Create progress bar
    progress_bar = tqdm(enumerate(texts), total=len(texts), desc="Embedding speeches",
                       unit="speech", ncols=100,
                       bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')

    for i, text in progress_bar:
        # Memory cleanup every 100 speeches
        if i % 100 == 0 and i > 0:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        embeddings.append(embed_with_fallback(text))

        # Update progress bar with additional info
        if i % 10 == 0:  # Update less frequently to avoid slowing down
            # Elapsed time can be 0.0 on coarse clocks (e.g. when the first
            # speech is skipped), so guard the division
            elapsed = time.time() - start_time
            rate = (i + 1) / elapsed if elapsed > 0 else 0.0
            progress_bar.set_postfix({
                'Rate': f'{rate:.1f} sp/s',
                'Long': n_extremely_long
            })

    progress_bar.close()

    print(f"✓ Speech embeddings completed in {time.time() - start_time:.2f} seconds")
    # Report the shape without stacking every vector into one large array
    print(f"✓ Embedding shape: {(len(embeddings), embedding_dim)}")
    print(f"✓ Filtered out {n_extremely_long} extremely long speeches (assigned zero embeddings)")
    if fallback_counts['full_text_failed'] > 0:
        print(f"✓ Full-text encoding failed for {fallback_counts['full_text_failed']} speeches "
              f"(chunked: {fallback_counts['chunked']}, truncated: {fallback_counts['truncated']}, "
              f"zero vector: {fallback_counts['zero']})")

    # Final memory cleanup
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Add to dataframe
    df_with_embeddings['Speech_Embeddings'] = embeddings
    df_with_embeddings['Is_Extremely_Long'] = extremely_long_mask

    return df_with_embeddings

def calculate_windowed_similarity(embeddings_list, window_size=3):
    """
    Calculate cosine similarity between windowed embeddings.

    For every gap between consecutive embeddings, the mean of up to
    `window_size` embeddings before the gap is compared with the mean of up to
    `window_size` embeddings after it. Windows are clipped at the sequence ends.
    If either mean vector has zero norm the similarity for that gap is 0.

    Args:
        embeddings_list (list or np.ndarray): Sequence of equal-length embedding vectors
        window_size (int): Size of window for averaging

    Returns:
        np.array: Array of len(embeddings_list) - 1 similarity scores
        (empty when fewer than two embeddings are given)

    Raises:
        ValueError: If window_size is smaller than 1
    """
    if len(embeddings_list) < 2:
        return np.array([])
    if window_size < 1:
        raise ValueError("Window size must be at least 1.")

    # Stack once instead of converting every embedding again for every gap;
    # this also accepts a 2-D array as input
    matrix = np.asarray([np.asarray(e, dtype=float) for e in embeddings_list])
    num_utterances = matrix.shape[0]
    similarities = np.empty(num_utterances - 1, dtype=float)

    for g in range(num_utterances - 1):
        # Both windows always hold at least one vector because
        # 0 <= g <= num_utterances - 2
        mean_before = matrix[max(0, g - window_size + 1):g + 1].mean(axis=0)
        mean_after = matrix[g + 1:min(num_utterances, g + 1 + window_size)].mean(axis=0)

        norm_product = np.linalg.norm(mean_before) * np.linalg.norm(mean_after)
        if norm_product > 0:
            similarities[g] = np.dot(mean_before, mean_after) / norm_product
        else:
            # Cosine similarity is undefined for a zero vector; report 0
            similarities[g] = 0.0

    return similarities

def find_topic_boundaries(similarities, height_threshold=0.25, prominence_threshold=0.15, distance_threshold=5):
    """
    Find topic boundaries using peak detection on inverted similarity scores.

    Similarities are turned into a dissimilarity curve (1 - similarity, clipped
    at 0) and scipy's find_peaks selects the local maxima that satisfy the
    height, prominence and distance constraints.

    Args:
        similarities (np.array or list): Similarity scores, one per gap
        height_threshold (float): Minimum height for peaks in inverted similarities
        prominence_threshold (float): Minimum prominence for peaks
        distance_threshold (int): Minimum distance between peaks

    Returns:
        np.array: Integer indices (into similarities) of detected boundaries;
        an empty integer array when no scores are given
    """
    # Accept plain lists as well as arrays
    scores = np.asarray(similarities, dtype=float)
    if scores.size == 0:
        # Same integer dtype as the indices returned by find_peaks
        return np.array([], dtype=int)

    # Invert similarities to find valleys (topic boundaries)
    inverted_similarities = np.maximum(0, 1 - scores)

    # Find peaks in inverted similarities
    peaks, _ = find_peaks(
        inverted_similarities,
        height=height_threshold,
        prominence=prominence_threshold,
        distance=distance_threshold
    )

    return peaks

def segment_speeches_by_similarity(df, window_size=3, height_threshold=0.25, 
                                   prominence_threshold=0.15, distance_threshold=5):
    """
    Segment speeches within each sitting based on semantic similarity using speech embeddings.

    Speeches are processed per 'Sitting_ID' in the row order of df. A boundary
    detected at gap g means speech g + 1 of that sitting starts a new segment.
    Segment IDs are written back by row position, so the input does not have
    to be sorted by sitting and may carry any index.

    Args:
        df (pd.DataFrame): DataFrame with 'Sitting_ID' and 'Speech_Embeddings' columns
        window_size (int): Window size for similarity calculation
        height_threshold (float): Height threshold for boundary detection
        prominence_threshold (float): Prominence threshold for boundary detection
        distance_threshold (int): Distance threshold for boundary detection

    Returns:
        pd.DataFrame: Copy of df with added 'Segment_ID' column
        (formatted as '<Sitting_ID>_seg_<n>')
    """
    print(f"Segmenting speeches using similarity-based approach")
    print(f"Parameters: window_size={window_size}, height_threshold={height_threshold}")
    print(f"           prominence_threshold={prominence_threshold}, distance_threshold={distance_threshold}")

    df_segmented = df.copy()
    # One slot per row, filled by position. Appending IDs in groupby order and
    # assigning the list afterwards would mislabel rows whenever the frame is
    # not already sorted by Sitting_ID.
    segment_ids = np.empty(len(df_segmented), dtype=object)
    total_boundaries = 0

    # Maps each sitting to the positional row numbers of its speeches
    sitting_positions = df_segmented.groupby('Sitting_ID').indices

    # Process each sitting separately
    for sitting_id, positions in sitting_positions.items():
        n_speeches = len(positions)
        # Default: the whole sitting is one segment (also covers sittings
        # with fewer than two speeches)
        segment_numbers = np.zeros(n_speeches, dtype=int)

        if n_speeches >= 2:
            # Use the speech-level embeddings for segmentation
            embeddings_list = df_segmented['Speech_Embeddings'].iloc[positions].tolist()
            similarities = calculate_windowed_similarity(embeddings_list, window_size)

            if len(similarities) > 0:
                # Find boundaries
                boundaries = np.asarray(
                    find_topic_boundaries(
                        similarities, height_threshold, prominence_threshold, distance_threshold
                    ),
                    dtype=int
                )
                total_boundaries += len(boundaries)

                # Mark the first speech after each boundary gap, then count
                # how many segment starts precede every speech
                segment_starts = np.zeros(n_speeches, dtype=int)
                segment_starts[boundaries + 1] = 1
                segment_numbers = np.cumsum(segment_starts)

        for position, segment_number in zip(positions, segment_numbers):
            segment_ids[position] = f"{sitting_id}_seg_{segment_number}"

    df_segmented['Segment_ID'] = segment_ids

    # Print statistics
    total_segments = df_segmented['Segment_ID'].nunique()
    avg_segments_per_sitting = df_segmented.groupby('Sitting_ID')['Segment_ID'].nunique().mean()

    print(f"Segmentation complete!")
    print(f"Total boundaries detected: {total_boundaries}")
    print(f"Total segments created: {total_segments}")
    print(f"Average segments per sitting: {avg_segments_per_sitting:.2f}")

    return df_segmented

## Segment Aggregation and Re-embedding Functions

After creating segments, we aggregate the raw text and re-embed for better topic modeling representation.

In [None]:
def aggregate_segments(df, text_column='Text'):
    """
    Aggregate speeches into segments using FULL raw text for maximum semantic coherence.

    Rows are grouped by 'Segment_ID'. Per segment the texts are joined with a
    single space, 'Sitting_ID' and 'Date' take the first row's value,
    'Speaker_party' takes the most frequent value, 'Speaker_name' lists up to
    three distinct speakers separated by ' | ', and 'Word_Count' is summed.
    Missing texts and missing speaker names are skipped instead of raising.

    Args:
        df (pd.DataFrame): Speeches with 'Segment_ID', text_column, 'Sitting_ID',
            'Date', 'Speaker_party', 'Speaker_name' and 'Word_Count' columns
        text_column (str): Column holding the text to concatenate

    Returns:
        pd.DataFrame: One row per segment; the joined text is in 'Aggregated_Text'
    """
    print(f"Aggregating {len(df)} speeches into segments using FULL raw text...")

    # Check for extremely long speeches in segments
    if 'Is_Extremely_Long' in df.columns:
        n_extremely_long = df['Is_Extremely_Long'].sum()
        if n_extremely_long > 0:
            print(f"Note: {n_extremely_long} speeches were marked as extremely long (zero embeddings)")

    def join_texts(texts):
        """Concatenate ALL raw texts in a segment, ignoring missing values."""
        return ' '.join(texts.dropna().astype(str))

    def dominant_party(parties):
        """Most frequent party; first row's value when no mode exists."""
        modes = parties.mode()
        return modes.iloc[0] if not modes.empty else parties.iloc[0]

    def first_speakers(names):
        """Up to three distinct, non-missing speaker names in order of appearance."""
        return ' | '.join(names.dropna().astype(str).unique()[:3])

    # Aggregate by segment
    segment_agg = df.groupby('Segment_ID').agg({
        text_column: join_texts,
        'Sitting_ID': 'first',
        'Date': 'first',
        'Speaker_party': dominant_party,
        'Speaker_name': first_speakers,
        'Word_Count': 'sum'
    }).reset_index()

    # Rename aggregated text column
    segment_agg = segment_agg.rename(columns={text_column: 'Aggregated_Text'})

    print(f"Created {len(segment_agg)} segments")
    if segment_agg.empty:
        # min/max/percentile below need at least one segment
        return segment_agg

    # Show segment statistics
    segment_lengths = [len(text) for text in segment_agg['Aggregated_Text']]
    print(f"Segment length - Min: {min(segment_lengths)}, Max: {max(segment_lengths)}, Mean: {np.mean(segment_lengths):.0f}")

    # Check if any segments are now extremely long after aggregation.
    # The threshold is relative to this batch (its own 99.5th percentile).
    segment_threshold = np.percentile(segment_lengths, 99.5)  # Slightly less strict for segments
    very_long_segments = sum(1 for length in segment_lengths if length > segment_threshold)
    if very_long_segments > 0:
        print(f"Warning: {very_long_segments} segments are very long (>{segment_threshold:.0f} chars)")

    return segment_agg

def generate_segment_embeddings_for_topic_modeling(segment_df, model_name="BAAI/bge-m3"):
    """
    SECOND EMBEDDING: Generate embeddings for aggregated segments for topic modeling.

    Each value of 'Aggregated_Text' is encoded on its own. Segments longer than
    the 99.5th percentile (in characters) of the input, and segments whose
    encoding raises, receive an all-zero vector.

    Args:
        segment_df (pd.DataFrame): Segments with an 'Aggregated_Text' column
        model_name (str): Embedding model to use

    Returns:
        pd.DataFrame: Copy of segment_df with added 'Segment_Embeddings' column.
        An empty segment_df is returned with the column present and no model
        is loaded.
    """
    print("=" * 60)
    print("SECOND EMBEDDING: Aggregated segments for topic modeling")
    print("=" * 60)
    print(f"Generating embeddings for {len(segment_df)} segments using {model_name}")

    segment_df_with_embeddings = segment_df.copy()

    # Nothing to embed: the percentile below needs at least one segment
    if len(segment_df) == 0:
        print("No segments to embed; returning empty embedding column")
        segment_df_with_embeddings['Segment_Embeddings'] = pd.Series(
            [], index=segment_df_with_embeddings.index, dtype=object)
        return segment_df_with_embeddings

    # Load model
    model = load_embedding_model(model_name)
    embedding_dim = model.get_sentence_embedding_dimension()

    # Calculate segment length threshold (top 0.5% of segments)
    segment_lengths = [len(text) for text in segment_df['Aggregated_Text']]
    segment_threshold = np.percentile(segment_lengths, 99.5)
    print(f"Segment length threshold (99.5th percentile): {segment_threshold:.0f} characters")

    # Counts segments that end up with an all-zero vector
    # NOTE(review): zero vectors are passed on unchanged to whatever consumes
    # 'Segment_Embeddings' - confirm downstream handles them as intended
    zero_embedding_count = 0

    def embed_single_segment(text, index):
        """Embed a single segment using the FULL text with length checking."""
        nonlocal zero_embedding_count

        # Skip extremely long segments
        if len(text) > segment_threshold:
            zero_embedding_count += 1
            return np.zeros(embedding_dim)

        try:
            return model.encode(
                text,
                convert_to_tensor=False,
                normalize_embeddings=True,
                show_progress_bar=False
            )
        except Exception as e:
            print(f"Error embedding segment {index+1} (length: {len(text)} chars): {e}")
            zero_embedding_count += 1
            return np.zeros(embedding_dim)

    # Generate embeddings with progress bar
    print("Generating segment embeddings...")
    start_time = time.time()
    embeddings = []

    # Create progress bar for segments
    progress_bar = tqdm(enumerate(segment_df['Aggregated_Text']),
                       total=len(segment_df),
                       desc="Embedding segments",
                       unit="segment",
                       ncols=100,
                       bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')

    for i, text in progress_bar:
        if i % 10 == 0 and i > 0:
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        embeddings.append(embed_single_segment(text, i))

        # Update progress bar with rate info
        if i % 5 == 0:
            # Elapsed time can be 0.0 on coarse clocks (e.g. when the first
            # segment is skipped), so guard the division
            elapsed = time.time() - start_time
            rate = (i + 1) / elapsed if elapsed > 0 else 0.0
            progress_bar.set_postfix({
                'Rate': f'{rate:.1f} seg/s'
            })

    progress_bar.close()

    print(f"✓ Segment embeddings completed in {time.time() - start_time:.2f} seconds")
    if zero_embedding_count > 0:
        print(f"✓ {zero_embedding_count} segments were assigned zero embeddings (too long or failed to encode)")

    # Final memory cleanup
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Add to dataframe
    segment_df_with_embeddings['Segment_Embeddings'] = embeddings

    return segment_df_with_embeddings

## Topic Modeling Functions

These functions handle BERTopic training using the re-embedded segment representations.

In [17]:
from bertopic import BERTopic
from sklearn.cluster import KMeans
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
import openai
import os
from dotenv import load_dotenv
from umap import UMAP

def create_topic_model(clustering_method='kmeans', n_clusters=12, min_cluster_size=30, 
                       custom_stopwords=None, top_n_words=8, n_segments=None):
    """
    Create a BERTopic model with specified clustering method and validation.

    The model is configured for pre-computed embeddings (embedding_model=None).
    When n_segments is given, clustering and UMAP parameters are reduced so
    they do not exceed what that many samples allow.

    Args:
        clustering_method (str): 'kmeans' or 'hdbscan' (case-insensitive)
        n_clusters (int): Number of clusters for KMeans
        min_cluster_size (int): Minimum cluster size for HDBSCAN
        custom_stopwords (list): Stopwords used in addition to sklearn's English
            list; when None, a built-in parliamentary list is used
        top_n_words (int): Number of keywords per topic
        n_segments (int): Number of segments to validate against

    Returns:
        BERTopic: Configured BERTopic model

    Raises:
        ValueError: If clustering_method is neither 'kmeans' nor 'hdbscan'
    """
    # Validate first so a typo fails before any parameter adjustment is printed
    method = clustering_method.lower()
    if method not in ('kmeans', 'hdbscan'):
        raise ValueError("clustering_method must be 'kmeans' or 'hdbscan'")

    # Default custom stopwords for parliamentary speeches
    if custom_stopwords is None:
        custom_stopwords = [
            'mr', 'mrs', 'ms', 'madam', 'honourable', 'member', 'members', 'vp', 'sp', 'fp',
            'minister', 'speaker', 'deputy', 'president', 'chairman', 'chair', 'secretary',
            'motion', 'amendment', 'debate', 'question', 'order', 'point', 'procedure',
            'applause', 'thank', 'thanks', 'congratulations', 'welcome', 'session', 'meeting'
        ]

    # Combine with default English stopwords
    all_stopwords = list(text.ENGLISH_STOP_WORDS.union(custom_stopwords))

    # Validate data size and adjust parameters
    small_dataset = n_segments is not None and n_segments < 15
    if small_dataset:
        print(f"Warning: Only {n_segments} segments available. Adjusting model parameters for small dataset.")
        if method == 'kmeans':
            n_clusters = min(n_clusters, max(2, n_segments // 2))
            print(f"Adjusted n_clusters to {n_clusters}")
        else:
            min_cluster_size = min(min_cluster_size, max(2, n_segments // 3))
            print(f"Adjusted min_cluster_size to {min_cluster_size}")
    elif n_segments is not None and method == 'kmeans' and n_clusters > n_segments:
        # KMeans cannot form more clusters than there are samples
        n_clusters = n_segments
        print(f"Adjusted n_clusters to {n_clusters}")

    # Choose clustering model
    if method == 'kmeans':
        clustering_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        print(f"Using KMeans clustering with {n_clusters} clusters")
    else:
        clustering_model = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_cluster_size,
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=True
        )
        print(f"Using HDBSCAN clustering with min_cluster_size={min_cluster_size}")

    # Create vectorizer
    vectorizer = CountVectorizer(stop_words=all_stopwords)

    # Configure UMAP for small datasets
    if small_dataset:
        # Stay below the sample count, but never below UMAP's minimums
        # (n_neighbors >= 2, n_components >= 1)
        umap_model = UMAP(
            n_neighbors=max(2, min(5, n_segments - 1)),
            n_components=max(1, min(5, n_segments - 1)),
            metric='cosine',
            random_state=42
        )
        print(f"Using UMAP with adjusted parameters for small dataset")
    else:
        umap_model = UMAP(
            n_neighbors=15,
            n_components=5,
            metric='cosine',
            random_state=42
        )

    # Create BERTopic model
    topic_model = BERTopic(
        embedding_model=None,  # We provide pre-computed embeddings
        umap_model=umap_model,
        hdbscan_model=clustering_model,  # BERTopic's slot for any clustering model
        vectorizer_model=vectorizer,
        top_n_words=top_n_words,
        verbose=False
    )

    return topic_model

def train_topic_model(segment_df, clustering_method='kmeans', n_clusters=12):
    """
    Train BERTopic model on aggregated segments using segment embeddings.

    Args:
        segment_df (pd.DataFrame): DataFrame with 'Aggregated_Text' and
            'Segment_Embeddings' columns
        clustering_method (str): Clustering method to use
        n_clusters (int): Number of clusters for KMeans

    Returns:
        tuple: (topic_model, segment_df_with_topics, topic_info), or
        (None, None, None) when fewer than 5 segments are given or
        training raises
    """
    print(f"Training BERTopic model on {len(segment_df)} segments")

    # Validate minimum data requirements
    if len(segment_df) < 5:
        print(f"ERROR: Only {len(segment_df)} segments available. Need at least 5 segments for topic modeling.")
        print("Suggestions:")
        print("1. Use a larger test sample (try 200-500 speeches instead of 50)")
        print("2. Adjust segmentation parameters to create fewer, larger segments")
        print("3. Skip topic modeling for very small datasets")
        return None, None, None

    # Create model with validation
    topic_model = create_topic_model(
        clustering_method=clustering_method,
        n_clusters=n_clusters,
        n_segments=len(segment_df)
    )

    # Prepare data - use the segment embeddings (not speech embeddings)
    texts = segment_df['Aggregated_Text'].tolist()
    embeddings = np.array(segment_df['Segment_Embeddings'].tolist())

    print(f"Using segment embeddings with shape: {embeddings.shape}")

    # Train model
    print("Training BERTopic...")
    start_time = time.time()

    try:
        topics, probabilities = topic_model.fit_transform(texts, embeddings=embeddings)
        print(f"Training completed in {time.time() - start_time:.2f} seconds")
    except Exception as e:
        print(f"BERTopic training failed: {e}")
        print("This often happens with very small datasets. Try using more data.")
        return None, None, None

    # Add topics to dataframe
    segment_df_with_topics = segment_df.copy()
    segment_df_with_topics['Topic'] = topics

    # Get topic info
    topic_info = topic_model.get_topic_info()

    # Print statistics
    print(f"Discovered {len(topic_info)} topics")
    if clustering_method.lower() == 'hdbscan':
        # fit_transform returns the topics as a plain list, so convert before
        # comparing element-wise (list == -1 is a single bool)
        outliers = int(np.sum(np.asarray(topics) == -1))
        print(f"Outliers: {outliers} ({outliers/len(topics)*100:.1f}%)")

    return topic_model, segment_df_with_topics, topic_info

## Main Processing Pipeline

Updated pipeline implementing the dual-embedding approach correctly.

In [18]:
def run_dual_embedding_topic_pipeline(df, save_intermediate=True, data_folder='data folder/data/'):
    """
    Run the complete dual-embedding topic modeling pipeline.

    DUAL EMBEDDING STRATEGY:
    1. First embedding: Individual speeches using RAW TEXT (for segmentation)
    2. Second embedding: Aggregated segments using RAW TEXT (for topic modeling)

    Args:
        df (pd.DataFrame): Input dataframe with speeches. Must contain 'Text',
            'Sitting_ID', 'Speaker_ID', 'Date', 'Speaker_party', 'Speaker_name'
            and 'Word_Count'.
        save_intermediate (bool): Whether to save intermediate and final results
        data_folder (str): Folder to save results (created if missing)

    Returns:
        dict: Dictionary containing all results. If topic model training does
        not produce a model, the dict holds only the results up to that step
        ('topic_model', 'segments_with_topics' and 'topic_info' are None).

    Raises:
        ValueError: If df lacks any required column
    """
    results = {}

    # Verify every column any later step reads, so a missing column fails here
    # rather than after the embedding step has already run
    required_cols = ['Text', 'Sitting_ID', 'Speaker_ID',
                     'Date', 'Speaker_party', 'Speaker_name', 'Word_Count']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"Missing required columns: {missing_cols}")
        print(f"Available columns: {list(df.columns)}")
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Make sure the output folder exists before any result is written
    if save_intermediate and data_folder:
        os.makedirs(data_folder, exist_ok=True)

    def output_path(filename):
        """Path of filename inside data_folder."""
        return os.path.join(data_folder, filename)

    print("=== DUAL EMBEDDING PIPELINE USING RAW TEXT ===")
    print(f"Input data shape: {df.shape}")
    print(f"Using RAW text for both embeddings (better semantic quality)")

    # Step 1: Generate speech embeddings for segmentation (FIRST EMBEDDING - RAW TEXT)
    print("\n=== STEP 1: FIRST EMBEDDING - Individual Speeches (Raw Text) for Segmentation ===")
    df_with_speech_embeddings = generate_speech_embeddings_for_segmentation(
        df,
        text_column='Text',  # Use RAW text from 'Text' column
        model_name="nomic-ai/nomic-embed-text-v1.5",
        batch_size=8
    )
    results['df_with_speech_embeddings'] = df_with_speech_embeddings

    if save_intermediate:
        speech_embeddings_path = output_path('AT_with_speech_embeddings_raw.pkl')
        df_with_speech_embeddings.to_pickle(speech_embeddings_path)
        print(f"Saved speech embeddings (raw text) to {speech_embeddings_path}")

    # Step 2: Segment speeches using speech embeddings
    print("\n=== STEP 2: Segmenting Speeches Using Speech Embeddings ===")
    df_segmented = segment_speeches_by_similarity(
        df_with_speech_embeddings,
        window_size=3,
        height_threshold=0.25,
        prominence_threshold=0.15,
        distance_threshold=5
    )
    results['df_segmented'] = df_segmented

    # Step 3: Aggregate segments (concatenate RAW text)
    print("\n=== STEP 3: Aggregating Segments (Raw Text Concatenation) ===")
    segments_df = aggregate_segments(df_segmented, text_column='Text')  # Use RAW text
    results['segments_df'] = segments_df

    # Step 4: Generate segment embeddings for topic modeling (SECOND EMBEDDING - RAW TEXT)
    print("\n=== STEP 4: SECOND EMBEDDING - Aggregated Segments (Raw Text) for Topic Modeling ===")
    segments_with_embeddings = generate_segment_embeddings_for_topic_modeling(
        segments_df,
        model_name="BAAI/bge-m3"
    )
    results['segments_with_embeddings'] = segments_with_embeddings

    if save_intermediate:
        segment_embeddings_path = output_path('AT_segments_with_embeddings_raw.pkl')
        segments_with_embeddings.to_pickle(segment_embeddings_path)
        print(f"Saved segment embeddings (raw text) to {segment_embeddings_path}")

    # Step 5: Train topic model using segment embeddings
    print("\n=== STEP 5: Training Topic Model Using Segment Embeddings ===")
    topic_model, segments_with_topics, topic_info = train_topic_model(
        segments_with_embeddings,
        clustering_method='kmeans',
        n_clusters=12
    )
    results['topic_model'] = topic_model
    results['segments_with_topics'] = segments_with_topics
    results['topic_info'] = topic_info

    # train_topic_model signals failure with (None, None, None); every later
    # step needs its outputs, so stop here and keep what was computed
    if topic_model is None or segments_with_topics is None or topic_info is None:
        print("Topic model training did not produce a model; returning intermediate results only.")
        return results

    # Step 6: Generate topic names using LLM
    print("\n=== STEP 6: Generating Topic Names ===")
    try:
        topic_info_with_names = generate_topic_names_with_llm(topic_info)
    except Exception as e:
        # Best effort: naming is optional, fall back to generic labels
        print(f"LLM naming failed: {e}. Using default names.")
        topic_info_with_names = topic_info.copy()
        topic_info_with_names['LLM_Name'] = topic_info_with_names['Topic'].map(
            lambda topic_id: f"Topic {topic_id}")

    results['topic_info_with_names'] = topic_info_with_names

    # Step 7: Map topics back to original speeches
    print("\n=== STEP 7: Mapping Topics Back to Individual Speeches ===")

    # Create segment-to-topic mapping
    segment_topic_map = segments_with_topics[['Segment_ID', 'Topic']].copy()

    # Map topics back to individual speeches
    df_with_topics = df_segmented.merge(segment_topic_map, on='Segment_ID', how='left')

    # Create topic name mapping
    topic_name_map = dict(zip(topic_info_with_names['Topic'], topic_info_with_names['LLM_Name']))

    # Add readable topic names
    df_with_topics['Topic_Name'] = df_with_topics['Topic'].map(topic_name_map)

    results['df_with_topics'] = df_with_topics
    results['topic_name_map'] = topic_name_map

    # Check merge success
    missing_topics = df_with_topics['Topic'].isna().sum()
    if missing_topics > 0:
        print(f"Warning: {missing_topics} speeches could not be mapped to topics")
    else:
        print("✓ All speeches successfully mapped to topics!")

    # Final save
    if save_intermediate:
        # Clean up speech embeddings before saving final result to save space
        df_final_clean = df_with_topics.drop(columns=['Speech_Embeddings'], errors='ignore')
        df_final_clean.to_pickle(output_path('AT_with_topics_final.pkl'))
        topic_info_with_names.to_csv(output_path('topic_info_with_names.csv'), index=False)
        print(f"Saved final results to {data_folder}")

    print("\n" + "=" * 70)
    print("DUAL EMBEDDING PIPELINE COMPLETE - USING RAW TEXT")
    print("=" * 70)
    print(f"✓ Processed {len(df)} speeches into {len(segments_df)} segments")
    print(f"✓ Used RAW text for both speech-level and segment-level embeddings")
    print(f"✓ Discovered {len(topic_info)} topics using segment embeddings")
    print(f"✓ Successfully mapped {len(df_with_topics) - missing_topics} speeches to topics")
    print(f"✓ Embedding approach: Raw Speech Text → Segmentation → Raw Segment Text → Topics")

    return results

# Run the dual-embedding pipeline directly on full dataset
print("Starting topic modeling pipeline on full dataset...")

# Bind `results` before the run so that:
#   * downstream cells can always test `results is not None` instead of
#     hitting a NameError when the run fails or is interrupted, and
#   * a stale `results` left over from an earlier successful run is never
#     mistaken for the output of this (failed) run.
results = None

try:
    # Run on full dataset
    results = run_dual_embedding_topic_pipeline(AT_processed_df)

except Exception as e:
    # Best-effort notebook cell: report the failure and keep the kernel usable.
    # KeyboardInterrupt is not an Exception subclass and still propagates,
    # leaving `results` as None.
    print(f"Error in pipeline: {e}")
    import traceback
    traceback.print_exc()

Starting topic modeling pipeline on full dataset...
=== DUAL EMBEDDING PIPELINE USING RAW TEXT ===
Input data shape: (190633, 28)
Using RAW text for both embeddings (better semantic quality)

=== STEP 1: FIRST EMBEDDING - Individual Speeches (Raw Text) for Segmentation ===
FIRST EMBEDDING: Individual speeches for segmentation
Generating embeddings for 190633 speeches using nomic-ai/nomic-embed-text-v1.5
Loading embedding model: nomic-ai/nomic-embed-text-v1.5 on cpu


<All keys matched successfully>


Model loaded in 6.23 seconds
Text length statistics (characters):
  Min: 40, Max: 238216, Mean: 2057
  99.9th percentile length: 21750 characters
  Extremely long speeches (top 0.1%): 191
  These will be assigned zero embeddings for memory safety
Generating speech-level embeddings...
Starting embedding generation...


KeyboardInterrupt: 

In [None]:
# Validate the pipeline output; bail out with guidance when no usable
# `results` dict is available from the pipeline cell.
if not ('results' in locals() and results is not None):
    print(
        "Pipeline did not complete successfully. Results not available for validation.",
        "This usually happens when:",
        "1. The test dataset is too small (creates too few segments)",
        "2. Segmentation parameters are too restrictive",
        "3. Memory or model loading issues",
        "\nTry running with more speeches in the test set or adjust segmentation parameters.",
        sep="\n",
    )
else:
    # Pull the three frames the rest of the notebook works with
    df_final = results['df_with_topics']
    topic_info = results['topic_info_with_names']
    segments_df = results['segments_with_topics']

    print(
        "=" * 70,
        "DUAL EMBEDDING PIPELINE VALIDATION - RAW TEXT APPROACH",
        "=" * 70,
        sep="\n",
    )

    # First embedding pass: one row per speech
    print("✓ FIRST EMBEDDING: Individual speeches using RAW text for segmentation")
    print(f"  - Speech embeddings shape: {len(results['df_with_speech_embeddings'])}")
    print(f"  - Used RAW text for better semantic boundary detection")

    # Second embedding pass: one row per aggregated segment
    print("\n✓ SECOND EMBEDDING: Aggregated segments using RAW text for topic modeling")
    print(f"  - Segment embeddings shape: {len(results['segments_with_embeddings'])}")
    print(f"  - Used concatenated RAW text for better topic representation")

    # List every discovered topic, leaving out the -1 (outlier) row if present
    print(f"\n✓ TOPICS DISCOVERED:")
    for _, row in topic_info.iterrows():
        if row['Topic'] == -1:
            continue
        print(f"  Topic {row['Topic']}: {row['LLM_Name']} ({row['Count']} segments)")

    # Spot-check the first segment for signs that raw (unprocessed) text was used
    sample_segment = segments_df.iloc[0]['Aggregated_Text']
    print(f"\n✓ RAW TEXT VERIFICATION (sample segment):")
    print(f"  Contains punctuation: {'.' in sample_segment}")
    print(f"  Contains capitals: {any(c.isupper() for c in sample_segment)}")
    print(f"  Contains common words: {'the' in sample_segment.lower()}")
    print(f"  Contains parliamentary formalities: {any(marker in sample_segment for marker in ('Mr.', 'Speaker'))}")
    print(f"  Length: {len(sample_segment)} chars")
    print(f"  Sample: '{sample_segment[:150]}...'")

    print(
        "\n" + "=" * 70,
        "RAW TEXT DUAL EMBEDDING APPROACH SUCCESSFULLY IMPLEMENTED!",
        "=" * 70,
        sep="\n",
    )

NameError: name 'results' is not defined

In [None]:
def map_topics_back_to_original(original_df, processed_df_with_topics, segment_df_with_topics):
    """
    Map topics back to the original complete dataframe, including filtered speeches.

    Assignment happens in three passes:
      1. Direct: speeches whose 'Speech_ID' appears in ``processed_df_with_topics``
         receive that row's 'Topic_Name'.
      2. Contextual: speeches flagged 'Is_Too_Short' that still have no topic
         receive the most common topic among the topic-bearing speeches of the
         same 'Sitting_ID' (ties resolved by taking the first value returned by
         ``Series.mode()``).
      3. Fallback: remaining short speeches become 'Procedural'; anything else
         still without a topic becomes 'Unclassified'.

    Args:
        original_df: Complete original dataframe. Must contain 'Is_Too_Short' and
            'Sitting_ID', plus 'Speech_ID' when direct mapping is used. It is
            copied, not modified.
        processed_df_with_topics: Processed dataframe with topic assignments
            ('Speech_ID' and 'Topic_Name' columns).
        segment_df_with_topics: Segment-level dataframe with topics. Accepted for
            interface compatibility; not read by this function.

    Returns:
        pd.DataFrame: Copy of ``original_df`` with 'Topic' and
        'Topic_Assignment_Method' columns added.
    """
    print("Mapping topics back to original complete dataframe...")

    # Start with original dataframe
    result_df = original_df.copy()
    result_df['Topic'] = None
    result_df['Topic_Assignment_Method'] = None

    # NOTE: all assignments below use positional boolean arrays and raw value
    # arrays, so they do not rely on index labels being unique.

    # 1. Direct mapping for speeches that went through topic modeling
    print("1. Direct mapping for processed speeches...")

    if 'Speech_ID' in processed_df_with_topics.columns:
        # Use Speech_ID for mapping
        speech_topic_map = processed_df_with_topics.set_index('Speech_ID')['Topic_Name'].to_dict()
        direct_mask = result_df['Speech_ID'].isin(list(speech_topic_map)).to_numpy(dtype=bool)
        if direct_mask.any():
            direct_topics = result_df['Speech_ID'].map(speech_topic_map).to_numpy()
            result_df.loc[direct_mask, 'Topic'] = direct_topics[direct_mask]
            result_df.loc[direct_mask, 'Topic_Assignment_Method'] = 'Direct'
        direct_assigned = direct_mask.sum()
    else:
        # No key to join on: nothing can be assigned directly
        direct_assigned = 0
        print("Warning: No Speech_ID found for direct mapping")

    print(f"   Directly assigned: {direct_assigned}")

    # 2. Contextual assignment for short speeches
    print("2. Contextual assignment for short speeches...")
    short_speeches = (result_df['Is_Too_Short'] & result_df['Topic'].isna()).to_numpy(dtype=bool)

    # Compute the dominant topic once per sitting instead of rescanning the
    # whole frame for every short speech. Contextual assignments only ever add
    # more of a sitting's current mode, so the per-sitting mode computed up
    # front is the value every short speech of that sitting would receive.
    has_topic = result_df['Topic'].notna().to_numpy(dtype=bool)
    sitting_mode = {
        sitting_id: topics.mode().iloc[0]
        for sitting_id, topics in result_df.loc[has_topic].groupby('Sitting_ID', sort=False)['Topic']
    }

    contextual_assigned = 0
    if sitting_mode and short_speeches.any():
        contextual_topics = result_df['Sitting_ID'].map(sitting_mode)
        contextual_mask = short_speeches & contextual_topics.notna().to_numpy(dtype=bool)
        if contextual_mask.any():
            result_df.loc[contextual_mask, 'Topic'] = contextual_topics.to_numpy()[contextual_mask]
            result_df.loc[contextual_mask, 'Topic_Assignment_Method'] = 'Contextual'
        contextual_assigned = int(contextual_mask.sum())

    print(f"   Contextually assigned: {contextual_assigned}")

    # 3. Handle remaining unassigned speeches
    print("3. Handling remaining unassigned speeches...")

    # Short speeches without context
    short_unassigned = (result_df['Is_Too_Short'] & result_df['Topic'].isna()).to_numpy(dtype=bool)
    result_df.loc[short_unassigned, 'Topic'] = 'Procedural'
    result_df.loc[short_unassigned, 'Topic_Assignment_Method'] = 'Procedural'

    # Any other unassigned (including directly-mapped speeches whose topic name
    # was missing); evaluated after the 'Procedural' fill on purpose
    still_unassigned = result_df['Topic'].isna().to_numpy(dtype=bool)
    result_df.loc[still_unassigned, 'Topic'] = 'Unclassified'
    result_df.loc[still_unassigned, 'Topic_Assignment_Method'] = 'Unclassified'

    print(f"   Short speeches marked as 'Procedural': {short_unassigned.sum()}")
    print(f"   Other unassigned marked as 'Unclassified': {still_unassigned.sum()}")

    # Summary
    print(f"\nFinal topic assignment summary:")
    assignment_counts = result_df['Topic_Assignment_Method'].value_counts()
    for method, count in assignment_counts.items():
        print(f"   {method}: {count}")

    return result_df

# End-to-end wrapper: run the topic modeling pipeline, then map topics back onto the complete original dataframe
def run_complete_topic_pipeline(processed_df, original_df, save_results=True,
                                output_path='data folder/data/AT_complete_with_topics.pkl'):
    """
    Complete pipeline that processes topics and maps back to original data.

    Args:
        processed_df: Dataframe handed to run_dual_embedding_topic_pipeline
            (called with save_intermediate=True).
        original_df: Complete original dataframe handed to
            map_topics_back_to_original.
        save_results: When True, pickle the final dataframe to ``output_path``.
        output_path: Destination file for the pickled final dataframe. Defaults
            to the location this function has always written to.

    Returns:
        tuple: ``(final_df, results)``, where ``results`` is whatever the topic
        modeling pipeline returned; ``(None, None)`` when it returned None.
    """
    # Run existing topic modeling pipeline
    results = run_dual_embedding_topic_pipeline(processed_df, save_intermediate=True)

    if results is None:
        print("Topic modeling pipeline failed. Cannot proceed with mapping.")
        return None, None

    # Map back to original complete dataframe
    final_df = map_topics_back_to_original(
        original_df,
        results['df_with_topics'],
        results['segments_with_topics']
    )

    if save_results:
        final_df.to_pickle(output_path)
        # Report the real destination so a custom output_path is not misreported
        print(f"Saved complete dataframe with topics to '{output_path}'")

    return final_df, results

# Gate the end-to-end run on a usable `results` from the earlier pipeline cell
if not ('results' in locals() and results is not None):
    print(
        "\nSkipping complete pipeline run due to test failure.",
        "Increase test sample size or check data quality first.",
        sep="\n",
    )
else:
    print(
        "\n" + "=" * 70,
        "RUNNING COMPLETE PIPELINE ON FULL DATASET",
        "=" * 70,
        sep="\n",
    )
    AT_final, topic_results = run_complete_topic_pipeline(AT_processed_df, AT_original_df)