# Parliamentary Speech Processing Pipeline - Multi-Language Support

This notebook implements a complete pipeline for processing parliamentary speeches in multiple languages:

1. **Setup & Configuration** - Google Colab setup and imports
2. **Core Functions** - Embedding and segmentation functions (language-agnostic)
3. **Data Processing** - Configurable pipeline for any language
4. **Language-Specific Execution** - Run pipeline for English, German, or other languages

## Key Features:
- **Multi-language support**: English, German (easily extensible)
- **Dual embedding strategy**: Speech-level + Segment-level
- **Smart segmentation**: Agenda-aware boundary detection
- **GPU optimization**: A100 optimized with checkpointing

In [None]:
# === GOOGLE COLAB SETUP ===
from google.colab import drive
drive.mount('/content/drive')

!pip install sentence-transformers bertopic umap-learn hdbscan tqdm openai python-dotenv

import torch
import pandas as pd
import numpy as np
import warnings
import os
import gc
import random
from tqdm import tqdm
import pickle
import time
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Quiet library warnings and let pandas show every column when displaying frames.
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

# Pick the default embedding batch size from the detected hardware.
if not torch.cuda.is_available():
    print("No GPU detected - will use CPU (slower)")
    DEFAULT_BATCH_SIZE = 8  # CPU batch size
else:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {gpu_name} | Memory: {gpu_memory:.1f} GB")

    # The batch size is chosen from the device name reported by CUDA.
    if 'A100' in gpu_name:
        DEFAULT_BATCH_SIZE = 124  # A100 optimized
        print(f"🚀 A100 detected: Using optimized batch size {DEFAULT_BATCH_SIZE}")
    elif any(tag in gpu_name for tag in ('V100', 'T4')):
        DEFAULT_BATCH_SIZE = 64   # V100/T4 optimized
        print(f"⚡ {gpu_name} detected: Using batch size {DEFAULT_BATCH_SIZE}")
    else:
        DEFAULT_BATCH_SIZE = 32   # Conservative default
        print(f"🔧 Generic GPU detected: Using conservative batch size {DEFAULT_BATCH_SIZE}")

    # Enable the cuDNN autotuner and allow TF32 for matmul and cuDNN kernels.
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

# Reproducibility: seed every random number generator used by the notebook.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"✅ Setup complete! Default batch size: {DEFAULT_BATCH_SIZE}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CUDA available: False
No GPU detected - will use CPU (slower)
Setup complete! ✓


In [None]:
# === CONFIGURATION ===
# Folder (on the mounted Drive) holding the input pickles and all pipeline outputs.
data_folder = '/content/drive/MyDrive/thesis data/'

# Per-language settings:
#   file             - input pickle, relative to data_folder
#   chairperson_role - Speaker_role value that identifies the chair
#   agenda_keywords  - lower-case substrings searched in chair speeches,
#                      grouped by how strongly they signal a new agenda item
LANGUAGE_CONFIG = dict(
    english=dict(
        file='AT_en.pkl',
        chairperson_role='Chairperson',
        agenda_keywords=dict(
            strong=['agenda item'],
            medium=['agenda'],
            weak=['item', 'point'],
        ),
    ),
    german=dict(
        file='AT_german.pkl',
        chairperson_role='PräsidentIn',
        agenda_keywords=dict(
            strong=['tagesordnungspunkt', 'punkt der tagesordnung'],
            medium=['tagesordnung', 'verhandlung'],
            weak=['behandlung'],
        ),
    ),
)

print("📋 Available languages:", list(LANGUAGE_CONFIG.keys()))

In [None]:
# === CORE FUNCTIONS (UNIFIED) ===

def load_and_verify_data(language):
    """Read the pickled dataset configured for ``language`` and sanity-check it.

    Args:
        language: Key into ``LANGUAGE_CONFIG`` (a KeyError is raised for an
            unknown key).

    Returns:
        The DataFrame unpickled from ``data_folder`` + the configured file name.

    Raises:
        ValueError: If the dataset has no ``Text_ID`` column.
    """
    lang_cfg = LANGUAGE_CONFIG[language]
    chair_role = lang_cfg['chairperson_role']

    # NOTE: read_pickle executes pickle code; only load files you produced yourself.
    df = pd.read_pickle(f"{data_folder}{lang_cfg['file']}")
    print(f"✅ Loaded {language} dataset: {df.shape}")

    # Text_ID identifies the sitting and is mandatory for segmentation.
    if 'Text_ID' not in df.columns:
        raise ValueError(f"Text_ID column not found in {language} dataset")

    # Speaker_role is optional; when present, report whether the chair role occurs.
    if 'Speaker_role' in df.columns:
        role_counts = df['Speaker_role'].value_counts()
        if chair_role not in role_counts.index:
            print(f"⚠️ '{chair_role}' not found in Speaker_role")
        else:
            print(f"✅ Found '{chair_role}': {role_counts[chair_role]:,} speeches")

    print(f"📊 {len(df):,} speeches across {df['Text_ID'].nunique():,} sittings")
    return df

def generate_embeddings(df, text_column='Text', model_name="BAAI/bge-m3", batch_size=None, checkpoint_freq=10000):
    """Embed every row of ``df[text_column]`` and return a copy with the vectors.

    Texts of at most 8192 tokens are encoded in batches; longer texts are
    delegated to ``embed_long_text``. Progress is checkpointed to Drive so an
    interrupted run can resume.

    Args:
        df: DataFrame holding the speeches.
        text_column: Column with the text to embed (values are cast to ``str``).
        model_name: SentenceTransformer model identifier.
        batch_size: Texts per encode call; ``None`` uses ``DEFAULT_BATCH_SIZE``.
        checkpoint_freq: Write a checkpoint after at least this many newly
            processed texts. A falsy or non-positive value disables checkpoints.

    Returns:
        A copy of ``df`` with a ``Speech_Embeddings`` column (one vector per row).

    Notes:
        * The checkpoint file is shared by every call, so it stores a small
          fingerprint (text count, model, column, hash of first/last text). A
          checkpoint whose fingerprint or internal lengths do not match the
          current input is ignored instead of being resumed.
        * Checkpoints are counted in processed texts, so they fire for any
          batch size and any resume offset.
        * Checkpoints are written to a temporary file and renamed, so a crash
          mid-write cannot leave a truncated checkpoint behind.
    """
    import hashlib  # local import: only needed for the checkpoint fingerprint

    # Use dynamic batch size if not specified
    if batch_size is None:
        batch_size = DEFAULT_BATCH_SIZE

    print("=" * 60)
    print("GENERATING SPEECH EMBEDDINGS")
    print("=" * 60)
    print(f"Using batch size: {batch_size}")

    max_tokens = 8192  # texts above this many tokens are chunked

    # Setup checkpointing
    checkpoint_dir = '/content/drive/MyDrive/checkpoints/'
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = f'{checkpoint_dir}speech_embeddings_checkpoint.pkl'
    checkpoint_tmp_path = f'{checkpoint_path}.tmp'

    texts = df[text_column].astype(str).values
    n_texts = len(texts)

    # Fingerprint of this job, used to reject checkpoints left by another dataset/model.
    edge_sample = '\x1f'.join([texts[0], texts[-1]]) if n_texts else ''
    fingerprint = {
        'n_texts': n_texts,
        'model_name': model_name,
        'text_column': text_column,
        'edge_hash': hashlib.md5(edge_sample.encode('utf-8', errors='replace')).hexdigest(),
    }

    # Try to load an existing checkpoint, accepting it only if it fits this job.
    start_idx = 0
    embeddings = []
    if os.path.exists(checkpoint_path):
        checkpoint_data = None
        try:
            # Checkpoints are written by this function; never point this at untrusted files.
            with open(checkpoint_path, 'rb') as f:
                checkpoint_data = pickle.load(f)
        except (EOFError, pickle.UnpicklingError, OSError) as exc:
            print(f"⚠️ Could not read checkpoint ({exc}); starting from scratch")

        if isinstance(checkpoint_data, dict):
            saved_embeddings = checkpoint_data.get('embeddings', [])
            resume_idx = checkpoint_data.get('last_processed_idx', -1) + 1
            saved_fingerprint = checkpoint_data.get('fingerprint')  # absent in legacy checkpoints
            lengths_ok = len(saved_embeddings) == resume_idx and 0 <= resume_idx <= n_texts
            fingerprint_ok = saved_fingerprint is None or saved_fingerprint == fingerprint
            if lengths_ok and fingerprint_ok:
                start_idx, embeddings = resume_idx, list(saved_embeddings)
                print(f"📂 Resuming from checkpoint at index {start_idx}")
            else:
                print("⚠️ Checkpoint does not match the current data/model; starting from scratch")

    # Load model
    model = SentenceTransformer(model_name, device="cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        model.half()
    tokenizer = model.tokenizer
    model.max_seq_length = max_tokens

    checkpointing_enabled = bool(checkpoint_freq) and checkpoint_freq > 0
    texts_since_checkpoint = 0
    checkpoints_written = 0

    with tqdm(total=n_texts, initial=start_idx, desc="🚀 Embedding", unit="speech") as pbar:
        for i in range(start_idx, n_texts, batch_size):
            batch_texts = texts[i:i+batch_size]
            batch_embeddings = [None] * len(batch_texts)

            # Split the batch: short texts are encoded together, long ones are chunked.
            short_texts = []
            short_indices = []
            for j, text in enumerate(batch_texts):
                token_count = len(tokenizer.encode(text, add_special_tokens=False))
                if token_count <= max_tokens:
                    short_texts.append(text)
                    short_indices.append(j)
                else:
                    batch_embeddings[j] = embed_long_text(text, model, tokenizer)

            if short_texts:
                short_embeddings = model.encode(short_texts,
                                                batch_size=min(batch_size, len(short_texts)),
                                                convert_to_tensor=False, show_progress_bar=False)
                for j, emb in zip(short_indices, short_embeddings):
                    batch_embeddings[j] = emb

            embeddings.extend(batch_embeddings)
            pbar.update(len(batch_texts))

            done = i + len(batch_texts)
            texts_since_checkpoint += len(batch_texts)

            # Checkpoint periodically (skipped at the very end: the file is removed below anyway).
            if checkpointing_enabled and texts_since_checkpoint >= checkpoint_freq and done < n_texts:
                with open(checkpoint_tmp_path, 'wb') as f:
                    pickle.dump({'embeddings': embeddings,
                                 'last_processed_idx': done - 1,
                                 'fingerprint': fingerprint}, f)
                os.replace(checkpoint_tmp_path, checkpoint_path)  # atomic swap
                texts_since_checkpoint = 0
                checkpoints_written += 1

                if checkpoints_written % 4 == 0:
                    print(f"\n💾 Progress: {done:,}/{n_texts:,}")

                # Memory management every second checkpoint
                if checkpoints_written % 2 == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    gc.collect()

    # Cleanup: the run completed, so the checkpoint (and any leftover temp file) is obsolete.
    for stale_path in (checkpoint_path, checkpoint_tmp_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    df_result = df.copy()
    df_result['Speech_Embeddings'] = embeddings
    return df_result

def embed_long_text(text, model, tokenizer, chunk_size=4096, overlap=1024):
    """Embed a text of arbitrary length by averaging overlapping token chunks.

    Args:
        text: The text to embed.
        model: Object with a SentenceTransformer-style ``encode`` method.
        tokenizer: Object with ``encode(text, add_special_tokens=False)`` and
            ``decode(ids, skip_special_tokens=True)``.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by consecutive chunks.

    Returns:
        A 1-D numpy array: the embedding of ``text`` itself when it fits in a
        single chunk, otherwise the unweighted mean of the chunk embeddings.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not in
            ``[0, chunk_size)``.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError("chunk_size must be positive and 0 <= overlap < chunk_size")

    token_ids = tokenizer.encode(text, add_special_tokens=False)

    # Texts that fit in one chunk (including empty ones) are encoded directly.
    # This avoids a lossy decode round trip and the NaN produced by averaging
    # an empty list of chunks.
    if len(token_ids) <= chunk_size:
        single = model.encode([text], batch_size=1, convert_to_tensor=False, show_progress_bar=False)
        return np.asarray(single[0])

    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(token_ids), stride):
        end = min(start + chunk_size, len(token_ids))
        chunks.append(tokenizer.decode(token_ids[start:end], skip_special_tokens=True))
        # Stop once a chunk reaches the end of the text: any further start
        # would yield a tail already fully contained in this chunk.
        if end == len(token_ids):
            break

    chunk_embeddings = model.encode(chunks, batch_size=32, convert_to_tensor=False, show_progress_bar=False)
    return np.mean(chunk_embeddings, axis=0)

def segment_speeches(df, language, window_size=5, min_segment_size=3):
    """Split every sitting into consecutive segments of speeches.

    Boundaries are proposed by four signals and then ranked:
      * agenda announcements by the language-specific chairperson role,
      * drops in similarity between adjacent windows of speech embeddings,
      * drops in similarity between neighbouring speeches,
      * large changes in the smoothed embedding displacement (needs scipy).

    Throughout this function a boundary ``b`` means "speech ``b`` (0-based
    position inside the sitting) is the FIRST speech of a new segment". A
    boundary is only accepted when both sides keep at least
    ``min_segment_size`` speeches.

    Args:
        df: DataFrame with ``Text_ID`` (sitting id), ``Text`` and
            ``Speech_Embeddings`` columns; ``Speaker_role`` is optional.
        language: Key into ``LANGUAGE_CONFIG``.
        window_size: Number of speeches per window for the windowed similarity.
        min_segment_size: Minimum number of speeches on each side of a boundary.

    Returns:
        ``(df, segmentation_metrics)``. ``df`` is the input frame, modified in
        place with a ``Segment_ID`` column; ``segmentation_metrics`` is a list
        with one dict per sitting.

    Notes:
        Segment IDs are written back by row position, so the rows of a sitting
        do not have to be contiguous in ``df``.
    """
    # The gradient signal is optional: without scipy it is skipped, with a notice.
    try:
        from scipy.ndimage import uniform_filter1d
    except ImportError:
        uniform_filter1d = None
        print("⚠️ scipy not available - gradient signal disabled")

    config = LANGUAGE_CONFIG[language]
    chair_role = config['chairperson_role']
    keywords = config['agenda_keywords']

    print(f"🏛️ {language.upper()} Parliamentary Segmentation")
    print(f"🔍 Looking for '{config['chairperson_role']}' with keywords: {config['agenda_keywords']}")

    sitting_column = 'Text_ID'
    has_roles = 'Speaker_role' in df.columns

    # One slot per row of df, filled by row position so labels cannot get misaligned.
    segment_ids = np.empty(len(df), dtype=object)
    segmentation_metrics = []

    def single_segment_metrics(sitting_id, sitting_length):
        """Metrics entry for a sitting that is kept as one segment."""
        return {
            'sitting_id': sitting_id,
            'sitting_length': sitting_length,
            'num_segments': 1,
            'avg_segment_size': sitting_length,
            'boundaries_found': 0,
            'agenda_boundaries': 0
        }

    # Group row positions by sitting in one pass (first-appearance order, NaN ids kept).
    row_positions = pd.Series(np.arange(len(df)))
    sitting_groups = row_positions.groupby(df[sitting_column].to_numpy(), sort=False, dropna=False)
    print(f"🔄 Processing {sitting_groups.ngroups} sittings...")

    for sitting_id, position_series in tqdm(sitting_groups, total=sitting_groups.ngroups,
                                            desc=f"Segmenting {language} sittings", unit="sitting"):
        positions = position_series.to_numpy()
        group = df.iloc[positions]
        sitting_length = len(group)

        def is_valid_boundary(b):
            # Both the segment ending before b and the one starting at b must be large enough.
            return min_segment_size <= b <= sitting_length - min_segment_size

        if sitting_length < min_segment_size:
            # Very small sitting - one segment
            segment_ids[positions] = f"{sitting_id}_seg_0"
            segmentation_metrics.append(single_segment_metrics(sitting_id, sitting_length))
            continue

        embeddings = np.array(group['Speech_Embeddings'].tolist())

        # More aggressive target segments for finer granularity
        if sitting_length < 30:
            target_segments = max(2, min(5, sitting_length // 8))
            threshold_percentile = 35
        elif sitting_length < 100:
            target_segments = max(3, min(12, sitting_length // 12))
            threshold_percentile = 40
        else:
            target_segments = max(5, min(20, sitting_length // 15))
            threshold_percentile = 45

        # === LANGUAGE-SPECIFIC CHAIRPERSON AGENDA DETECTION ===
        agenda_signals = [0] * sitting_length
        agenda_boundaries = set()

        if has_roles:
            roles = group['Speaker_role'].to_numpy()
            speech_texts = group['Text'].to_numpy()

            for i in range(sitting_length):
                role = roles[i]
                if pd.isna(role) or role != chair_role:
                    continue

                text = str(speech_texts[i]).lower()
                if any(keyword in text for keyword in keywords['strong']):
                    agenda_score = 1.0  # Strongest signal
                elif any(keyword in text for keyword in keywords['medium']):
                    agenda_score = 0.7  # Strong signal
                elif any(keyword in text for keyword in keywords['weak']):
                    agenda_score = 0.5  # Medium signal
                elif i == 0:  # First speech by chairperson (session start)
                    agenda_score = 0.3  # Mild signal
                else:
                    agenda_score = 0

                agenda_signals[i] = agenda_score

                # The announcing speech itself opens the new segment.
                if agenda_score >= 0.7 and is_valid_boundary(i):
                    agenda_boundaries.add(i)

        # === MULTI-SCALE SIMILARITY ANALYSIS ===
        similarity_signals = {}

        # 1. Primary windowed similarity: entry i compares speeches [i, i+w) with [i+w, i+2w)
        similarities = []
        for i in range(len(embeddings) - window_size):
            window1 = np.mean(embeddings[i:i + window_size], axis=0)
            window2 = np.mean(embeddings[i + window_size:i + 2*window_size], axis=0)

            sim = cosine_similarity(
                window1.reshape(1, -1),
                window2.reshape(1, -1)
            )[0][0]
            similarities.append(sim)

        similarity_signals['primary'] = np.array(similarities)

        # Sitting too short for even one window comparison - keep it as one segment.
        if len(similarity_signals['primary']) == 0:
            segment_ids[positions] = f"{sitting_id}_seg_0"
            segmentation_metrics.append(single_segment_metrics(sitting_id, sitting_length))
            continue

        # 2. Point-to-point similarity: entry i compares speech i with speech i+1
        if len(embeddings) > 6:
            point_sims = []
            for i in range(len(embeddings) - 1):
                sim = cosine_similarity(
                    embeddings[i].reshape(1, -1),
                    embeddings[i + 1].reshape(1, -1)
                )[0][0]
                point_sims.append(sim)

            # Align with primary signal
            point_sims = np.array(point_sims)
            if len(point_sims) > len(similarities):
                point_sims = point_sims[:len(similarities)]
            elif len(point_sims) < len(similarities):
                padding = len(similarities) - len(point_sims)
                point_sims = np.pad(point_sims, (0, padding), mode='edge')

            similarity_signals['point'] = point_sims

        # 3. Gradient-based change detection: entry i describes the step from speech i to i+1
        if len(embeddings) > 10 and uniform_filter1d is not None:
            trajectory = np.array(
                [float(np.linalg.norm(embeddings[i] - embeddings[i - 1]))
                 for i in range(1, len(embeddings))],
                dtype=np.float64
            )
            if len(trajectory) > 3:
                smoothed = uniform_filter1d(trajectory, size=3)
                gradient = np.gradient(smoothed)

                # Align with similarities
                if len(gradient) > len(similarities):
                    gradient = gradient[:len(similarities)]
                elif len(gradient) < len(similarities):
                    padding = len(similarities) - len(gradient)
                    gradient = np.pad(gradient, (0, padding), mode='edge')

                similarity_signals['gradient'] = gradient

        # === BOUNDARY DETECTION ===
        # 1. Agenda boundaries (highest priority)
        candidate_boundaries = set(agenda_boundaries)

        # 2. Primary similarity drops: the second window starts the new segment
        primary_sims = similarity_signals['primary']
        threshold = np.percentile(primary_sims, threshold_percentile)
        for i in range(len(primary_sims)):
            boundary = i + window_size
            if primary_sims[i] < threshold and is_valid_boundary(boundary):
                candidate_boundaries.add(boundary)

        # 3. Point-to-point drops: a drop between i and i+1 starts a segment at i+1
        if 'point' in similarity_signals:
            point_signal = similarity_signals['point']
            point_threshold = np.percentile(point_signal, threshold_percentile - 10)
            for i in range(len(point_signal)):
                boundary = i + 1
                if point_signal[i] < point_threshold and is_valid_boundary(boundary):
                    candidate_boundaries.add(boundary)

        # 4. Gradient peaks: a peak at step i -> i+1 starts a segment at i+1
        if 'gradient' in similarity_signals:
            gradient = similarity_signals['gradient']
            gradient_threshold = np.percentile(np.abs(gradient), 75)
            for i in range(len(gradient)):
                boundary = i + 1
                if np.abs(gradient[i]) > gradient_threshold and is_valid_boundary(boundary):
                    candidate_boundaries.add(boundary)

        candidates = sorted(candidate_boundaries)

        # === BOUNDARY SELECTION WITH AGENDA PRIORITIZATION ===
        boundaries = []
        if candidates:
            if len(candidates) <= target_segments - 1:
                boundaries = candidates
            else:
                # Score candidates with agenda boost
                candidate_scores = []
                for c in candidates:
                    score = 0

                    # Agenda boost (highest priority)
                    if c < len(agenda_signals):
                        score += agenda_signals[c] * 5.0  # Very high weight for agenda

                    # Primary similarity score (window pair whose second window starts at c)
                    if 0 <= c - window_size < len(primary_sims):
                        score += (1 - primary_sims[c - window_size]) * 2.0

                    # Point and gradient scores refer to the step from c-1 to c
                    step = c - 1
                    if 'point' in similarity_signals and 0 <= step < len(similarity_signals['point']):
                        score += (1 - similarity_signals['point'][step]) * 1.5
                    if 'gradient' in similarity_signals and 0 <= step < len(similarity_signals['gradient']):
                        score += np.abs(similarity_signals['gradient'][step]) * 1.0

                    candidate_scores.append((c, score))

                # Select top scoring boundaries
                candidate_scores.sort(key=lambda x: x[1], reverse=True)
                boundaries = sorted([c for c, _ in candidate_scores[:target_segments-1]])

        # === BOUNDARY VALIDATION ===
        # Walk the boundaries in position order and drop any that would create a too-short segment.
        validated_boundaries = []
        for boundary in boundaries:
            if not validated_boundaries or (boundary - validated_boundaries[-1]) >= min_segment_size:
                validated_boundaries.append(boundary)

        boundaries = validated_boundaries
        boundary_set = set(boundaries)

        # Assign segment IDs: a new segment starts AT each boundary position.
        current_segment = 0
        sitting_segments = []
        for i in range(sitting_length):
            if i > 0 and i in boundary_set:
                current_segment += 1
            sitting_segments.append(f"{sitting_id}_seg_{current_segment}")

        segment_ids[positions] = np.array(sitting_segments, dtype=object)

        # Store metrics
        num_segments = len(set(sitting_segments))
        agenda_bound_count = len([b for b in boundaries if b in agenda_boundaries])

        segmentation_metrics.append({
            'sitting_id': sitting_id,
            'sitting_length': sitting_length,
            'num_segments': num_segments,
            'avg_segment_size': sitting_length / num_segments,
            'boundaries_found': len(boundaries),
            'agenda_boundaries': agenda_bound_count,
            'target_segments': target_segments,
            'candidate_boundaries': len(candidates),
            'signals_used': len(similarity_signals) + 1  # +1 for agenda signals
        })

    df['Segment_ID'] = segment_ids
    return df, segmentation_metrics

def generate_segment_embeddings(df, text_column='Text', segment_id_column='Segment_ID', batch_size=None,
                                model_name="BAAI/bge-m3"):
    """Embed each segment as the concatenation of its speeches.

    All texts sharing a segment id are joined with ``' [SEP] '`` and embedded
    through ``embed_long_text``; the resulting vector is attached to every row
    of that segment.

    Args:
        df: DataFrame with one row per speech.
        text_column: Column holding the speech text (values are cast to ``str``).
        segment_id_column: Column holding the segment id of each speech.
        batch_size: Kept for interface compatibility and only echoed in the
            log; ``None`` uses ``DEFAULT_BATCH_SIZE``. Segments are encoded
            one at a time by ``embed_long_text``.
        model_name: SentenceTransformer model identifier. Defaults to the model
            that was previously hard-coded here; pass the same value as for
            ``generate_embeddings`` to keep both embedding levels consistent.

    Returns:
        A copy of ``df`` with a ``Segment_Embeddings`` column.
    """
    # Use dynamic batch size if not specified
    if batch_size is None:
        batch_size = DEFAULT_BATCH_SIZE

    print("=" * 60)
    print("SEGMENT EMBEDDINGS: Concatenated segment texts")
    print("=" * 60)
    print(f"Using batch size: {batch_size}")

    # Create segment texts by concatenating speeches within each segment
    segment_ids = []
    segment_texts = []
    for segment_id, group in df.groupby(segment_id_column):
        segment_ids.append(segment_id)
        segment_texts.append(' [SEP] '.join(group[text_column].astype(str).values))

    print(f"Processing {len(segment_texts)} segments...")

    # Load the embedding model (FP16 on GPU)
    model = SentenceTransformer(model_name, device="cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        model.half()
    tokenizer = model.tokenizer

    # Each segment goes through embed_long_text; the progress bar advances per segment.
    segment_embedding_map = {}
    with tqdm(total=len(segment_texts), desc="🚀 Embedding segments", unit="segment") as pbar:
        for segment_id, text in zip(segment_ids, segment_texts):
            segment_embedding_map[segment_id] = embed_long_text(text, model, tokenizer)
            pbar.update(1)

    # Map embeddings back to original dataframe
    df_result = df.copy()
    df_result['Segment_Embeddings'] = df_result[segment_id_column].map(segment_embedding_map)

    print(f"✅ Segment embeddings generated for {len(segment_ids)} unique segments")
    return df_result

def run_complete_pipeline(language):
    """Run (or resume) the full pipeline for one language and return the final frame.

    Stages and their pickles under ``data_folder``:
      1. speech embeddings  -> ``AT_{language}_with_speech_embeddings.pkl``
      2. segmentation       -> ``AT_{language}_segmented_with_speech_embeddings.pkl``
      3. segment embeddings -> ``AT_{language}_final.pkl``

    The most advanced existing pickle decides where the run resumes; when the
    final pickle exists it is simply loaded and returned.

    Args:
        language: Key into ``LANGUAGE_CONFIG``.

    Returns:
        DataFrame produced by ``generate_segment_embeddings`` (or loaded from
        the final pickle).
    """
    print(f"\n🚀 Starting {language.upper()} Pipeline")
    print("=" * 60)
    print(f"🎯 Using batch size: {DEFAULT_BATCH_SIZE}")

    # Output file of each stage
    prefix = f"{data_folder}AT_{language}"
    speech_checkpoint_path = f"{prefix}_with_speech_embeddings.pkl"
    segmentation_checkpoint_path = f"{prefix}_segmented_with_speech_embeddings.pkl"
    final_path = f"{prefix}_final.pkl"

    # Stage 3 already done: nothing to compute.
    if os.path.exists(final_path):
        print(f"🎯 FINAL RESULT EXISTS: Loading {final_path}")
        df_final = pd.read_pickle(final_path)
        print(f"✅ Loaded final result: {df_final.shape}")
        print(f"🎯 Segments: {df_final['Segment_ID'].nunique()}")
        return df_final

    if os.path.exists(segmentation_checkpoint_path):
        # Stage 2 already done: resume directly at segment embeddings.
        print(f"🔄 SEGMENTATION CHECKPOINT FOUND: Loading {segmentation_checkpoint_path}")
        df_segmented = pd.read_pickle(segmentation_checkpoint_path)
        print(f"✅ Loaded segmented data: {df_segmented.shape}")
        print(f"📊 Continuing from segment embeddings...")
    else:
        if os.path.exists(speech_checkpoint_path):
            # Stage 1 already done: resume at segmentation.
            print(f"🔄 SPEECH EMBEDDINGS CHECKPOINT FOUND: Loading {speech_checkpoint_path}")
            df_with_embeddings = pd.read_pickle(speech_checkpoint_path)
            print(f"✅ Loaded speech embeddings: {df_with_embeddings.shape}")
            print(f"📊 Continuing from segmentation...")
        else:
            # Nothing cached: start from the raw dataset.
            print("📥 Loading data and generating speech embeddings...")
            raw_df = load_and_verify_data(language)
            df_with_embeddings = generate_embeddings(raw_df)  # Uses DEFAULT_BATCH_SIZE

            df_with_embeddings.to_pickle(speech_checkpoint_path)
            print(f"💾 SPEECH CHECKPOINT: {speech_checkpoint_path}")

        # Stage 2: segment speeches and report summary statistics.
        df_segmented, seg_metrics = segment_speeches(df_with_embeddings, language)
        metrics_df = pd.DataFrame(seg_metrics)

        print(f"\n✅ {language.upper()} segmentation complete!")
        print(f"📊 Results:")
        print(f"  • Total speeches processed: {len(df_segmented):,}")
        print(f"  • Unique segments created: {df_segmented['Segment_ID'].nunique():,}")
        print(f"  • Average speeches per segment: {len(df_segmented) / df_segmented['Segment_ID'].nunique():.1f}")
        print(f"  • Average segments per sitting: {metrics_df['num_segments'].mean():.1f}")
        print(f"  • Agenda boundaries used: {metrics_df['agenda_boundaries'].sum()}")

        df_segmented.to_pickle(segmentation_checkpoint_path)
        print(f"💾 SEGMENTATION CHECKPOINT: {segmentation_checkpoint_path}")

        print(f"\n🔄 Generating segment embeddings for {language.upper()}...")

    # Stage 3: segment embeddings, then the final save (shared by both resume paths).
    df_final = generate_segment_embeddings(df_segmented)  # Uses DEFAULT_BATCH_SIZE
    df_final.to_pickle(final_path)
    print(f"💾 FINAL: {final_path}")

    return df_final

In [None]:
# Languages to run; each entry must be a key of LANGUAGE_CONFIG ('german' or 'english').
languages_to_process = ['german']  # Run German or english

# Final DataFrame per language; a language that fails is reported and left out.
results = {}
for language in languages_to_process:
    try:
        print(f"\n{'='*80}")
        print(f"PROCESSING: {language.upper()}")
        print(f"{'='*80}")

        results[language] = run_complete_pipeline(language)
    except Exception as e:
        # Keep going with the remaining languages, but show the full traceback.
        print(f"❌ Error processing {language}: {e}")
        import traceback
        traceback.print_exc()
    else:
        print(f"✅ {language.upper()} COMPLETED!")

print(f"\n🎉 Pipeline completed for: {list(results.keys())}")