# Parliamentary Speech Embeddings - Speech-Level Processing

This notebook implements the first stage of parliamentary speech processing:

1. **Setup & Configuration** - Google Colab setup and imports
2. **Data Loading & Verification** - Load and verify parliamentary data
3. **Speech Embeddings Generation** - Generate BGE-m3 embeddings for individual speeches
4. **Data Saving** - Save processed data with speech embeddings

## Key Features:
- **Multi-parliament support**: Austrian, Croatian, British parliaments
- **Multi-language support**: English, German, Croatian
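- **GPU optimization**: Batch size is selected per GPU type (A100, V100/T4, other, or CPU fallback), with periodic checkpointing so interrupted runs can resume
- **Robust processing**: Speeches longer than the model's maximum sequence length are split into overlapping chunks whose embeddings are averaged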

## Output:
Saves datasets with speech embeddings that can be used for segmentation in the next stage.

In [None]:
# === GOOGLE COLAB SETUP ===
# Mount Google Drive at /content/drive; the data and checkpoint paths used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# IPython shell magic (not plain Python): install the third-party packages
# into the current Colab runtime.
!pip install tqdm python-dotenv sentence-transformers

import torch
import pandas as pd
import numpy as np
import warnings
import random
import os
import gc
import pickle
from tqdm import tqdm
from sentence_transformers import SentenceTransformer


# Quiet noisy library warnings and show every DataFrame column when printing.
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

# Pick a default embedding batch size suited to the available hardware.
if not torch.cuda.is_available():
    print("No GPU detected - will use CPU (slower)")
    DEFAULT_BATCH_SIZE = 8  # CPU batch size
else:
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is reported in bytes; convert to GiB for display.
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {gpu_name} | Memory: {gpu_memory:.1f} GB")

    # Batch size depends on the GPU model named by the driver.
    if 'A100' in gpu_name:
        DEFAULT_BATCH_SIZE = 64  # A100 optimized
        print(f"🚀 A100 detected: Using optimized batch size {DEFAULT_BATCH_SIZE}")
    elif any(model_tag in gpu_name for model_tag in ('V100', 'T4')):
        DEFAULT_BATCH_SIZE = 64   # V100/T4 optimized
        print(f"⚡ {gpu_name} detected: Using batch size {DEFAULT_BATCH_SIZE}")
    else:
        DEFAULT_BATCH_SIZE = 32   # Conservative default
        print(f"🔧 Generic GPU detected: Using conservative batch size {DEFAULT_BATCH_SIZE}")

    # Enable cuDNN autotuning and TF32 kernels for matmul and cuDNN ops.
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

# Reproducibility: seed every random number generator used by the notebook.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"✅ Setup complete! Default batch size: {DEFAULT_BATCH_SIZE}")

In [None]:
# === ENHANCED CONFIGURATION ===
# Folder on the mounted Drive that holds the input pickles and receives the outputs.
data_folder = '/content/drive/MyDrive/thesis data/'

# Registry of supported datasets: parliament -> language -> settings.
# Each settings dict names the pickle file inside `data_folder` and the
# Speaker_role value that marks the chairperson in that dataset.
PARLIAMENT_CONFIG = {
    'austrian': {
        'english': dict(file='AT_en.pkl', chairperson_role='Chairperson'),
        'german': dict(file='AT_german.pkl', chairperson_role='PräsidentIn'),
    },
    'croatian': {
        'english': dict(file='CRO_en.pkl', chairperson_role='Chairperson'),
        'croatian': dict(file='CRO_hr.pkl', chairperson_role='Predsjedavajući'),
    },
    'british': {
        'english': dict(file='GB_en.pkl', chairperson_role='Chairperson'),
    },
}

def list_available_options():
    """Print every configured parliament/language pair with its file and chairperson role."""
    print("📋 Available Processing Options:")
    print("=" * 50)

    for parliament_name in PARLIAMENT_CONFIG:
        print(f"\n🏛️ {parliament_name.upper()} Parliament:")
        for language_name, settings in PARLIAMENT_CONFIG[parliament_name].items():
            print(f"  • {language_name.capitalize()}: {settings['file']}")
            print(f"    - Chairperson role: '{settings['chairperson_role']}'")

def get_config(parliament, language):
    """Look up the settings dict for one parliament/language pair.

    Args:
        parliament: Top-level key of PARLIAMENT_CONFIG.
        language: Language key registered under that parliament.

    Returns:
        The settings dict stored in PARLIAMENT_CONFIG for the pair.

    Raises:
        ValueError: If the parliament is unknown, or the language is not
            registered for that parliament.
    """
    languages = PARLIAMENT_CONFIG.get(parliament)
    if languages is None:
        raise ValueError(f"Parliament '{parliament}' not supported. Available: {list(PARLIAMENT_CONFIG.keys())}")

    settings = languages.get(language)
    if settings is None:
        available_langs = list(languages.keys())
        raise ValueError(f"Language '{language}' not available for {parliament} parliament. Available: {available_langs}")

    return settings

# Show what can be processed, followed by a few example selections.
list_available_options()

print("\n🔧 Usage examples:")
print("\n".join([
    "  • Austrian Parliament in German: parliament='austrian', language='german'",
    "  • Croatian Parliament in Croatian: parliament='croatian', language='croatian'",
    "  • British Parliament in English: parliament='british', language='english'",
]))

In [None]:
# === DATA VERIFICATION FUNCTION ===
def verify_parliament_data(parliament, language):
    """Load one parliament/language dataset and print a structural report.

    The report covers the frame's shape and columns, presence of the
    required columns, the ten most frequent speaker roles, whether the
    configured chairperson role occurs anywhere in the data, and
    session-level counts derived from Text_ID.

    Args:
        parliament: Parliament key accepted by get_config.
        language: Language key accepted by get_config for that parliament.

    Returns:
        The loaded DataFrame, or None if loading or inspecting the file
        failed (the error is printed rather than raised, because callers
        test the result against None).

    Raises:
        ValueError: Propagated from get_config for an unsupported pair.
    """
    config = get_config(parliament, language)
    data_path = f"{data_folder}{config['file']}"
    chair_role = config['chairperson_role']

    print(f"📊 {parliament.upper()} Parliament - {language.upper()} Dataset Verification:")
    print("=" * 60)

    try:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        df = pd.read_pickle(data_path)
        n_speeches = len(df)
        print(f"✅ Loaded: {df.shape}")
        print(f"📁 File: {config['file']}")
        print(f"📋 Columns: {list(df.columns)}")

        # Check required columns
        required_cols = ['Text_ID', 'Text']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"⚠️ Missing required columns: {missing_cols}")
        else:
            print("✅ All required columns present")

        # Check Speaker_role column and chairperson role
        if 'Speaker_role' in df.columns:
            # Keep the full counts for the chairperson lookup; only the
            # display is limited to the ten most frequent roles.
            role_counts = df['Speaker_role'].value_counts()
            top_roles = role_counts.head(10)

            print(f"\n🎭 Speaker roles found:")
            for role, count in top_roles.items():
                percentage = count / n_speeches * 100
                marker = "👑" if role == chair_role else "  "
                print(f"  {marker} {role}: {count:,} ({percentage:.1f}%)")

            # Verify chairperson role exists anywhere in the data, not just
            # among the ten roles shown above.
            if chair_role in role_counts.index:
                chair_count = role_counts[chair_role]
                chair_pct = chair_count / n_speeches * 100
                print(f"\n✅ Chairperson role '{chair_role}': {chair_count:,} ({chair_pct:.1f}%)")
            else:
                print(f"\n⚠️ Chairperson role '{chair_role}' not found!")
                print(f"Available roles: {list(top_roles.index)}")
        else:
            print("⚠️ No 'Speaker_role' column found")

        # Session statistics need Text_ID; its absence was already reported
        # above, so skip quietly instead of raising KeyError.
        if 'Text_ID' in df.columns:
            n_sessions = df['Text_ID'].nunique()
            print(f"\n🏛️ Sessions: {n_sessions:,} unique sessions")
            print(f"📝 Total speeches: {n_speeches:,}")
            # An empty frame has zero sessions; avoid dividing by zero.
            if n_sessions:
                print(f"📊 Average speeches per session: {n_speeches / n_sessions:.1f}")

            # Sample Text_ID format
            if n_speeches > 0:
                print(f"\n📝 Sample Text_ID: {df['Text_ID'].iloc[0]}")

        return df

    except Exception as e:
        # Deliberately broad: this is a best-effort diagnostic and callers
        # rely on a None return rather than an exception.
        print(f"❌ Error loading data: {e}")
        return None

# Print copy-paste hints for running the verification on each dataset.
print("\n".join([
    "🔍 Data Verification Examples:",
    "Uncomment the lines below to verify specific parliament/language combinations:\n",
    "# verify_parliament_data('austrian', 'english')",
    "# verify_parliament_data('croatian', 'croatian')",
    "# verify_parliament_data('british', 'english')",
]))

In [None]:
# === SPEECH EMBEDDINGS FUNCTIONS ===

def load_and_verify_data(parliament, language):
    """Read the configured pickle for a parliament/language pair and sanity-check it.

    Prints the frame's shape, whether the configured chairperson role
    appears in Speaker_role (when that column exists), and the number of
    speeches and sessions.

    Args:
        parliament: Parliament key accepted by get_config.
        language: Language key accepted by get_config for that parliament.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If the pair is unsupported (from get_config) or the
            frame has no Text_ID column.
    """
    settings = get_config(parliament, language)
    df = pd.read_pickle(f"{data_folder}{settings['file']}")
    print(f"✅ Loaded {parliament} parliament ({language}): {df.shape}")

    # Text_ID identifies the session each speech belongs to; it is mandatory.
    if 'Text_ID' not in df.columns:
        raise ValueError(f"Text_ID column not found in {parliament} {language} dataset")

    chair_role = settings['chairperson_role']
    if 'Speaker_role' in df.columns:
        role_counts = df['Speaker_role'].value_counts()
        if chair_role not in role_counts.index:
            print(f"⚠️ '{chair_role}' not found in Speaker_role")
            print(f"Available roles: {list(role_counts.index[:5])}")
        else:
            print(f"✅ Found '{chair_role}': {role_counts[chair_role]:,} speeches")

    n_sessions = df['Text_ID'].nunique()
    print(f"📊 {len(df):,} speeches across {n_sessions:,} sessions")
    return df

def embed_long_text(text, model, tokenizer, chunk_size=4096, overlap=1024, batch_size=32):
    """Embed a text that exceeds the model's context by averaging chunk embeddings.

    The text is tokenized without special tokens, split into windows of
    `chunk_size` tokens that overlap by `overlap` tokens, each window is
    decoded back to text and encoded, and the unweighted mean of the
    chunk embeddings is returned.

    Args:
        text: The text to embed.
        model: Object exposing `encode(list_of_str, batch_size=...,
            convert_to_tensor=..., show_progress_bar=...)`.
        tokenizer: Object exposing `encode(text, add_special_tokens=...)`
            and `decode(ids, skip_special_tokens=...)`.
        chunk_size: Window length in tokens.
        overlap: Number of tokens shared by consecutive windows.
        batch_size: Batch size passed to `model.encode` for the chunks.

    Returns:
        The mean of the chunk embeddings along axis 0.

    Raises:
        ValueError: If `chunk_size` is not positive or `overlap` is not in
            the range [0, chunk_size).
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            f"Need chunk_size > 0 and 0 <= overlap < chunk_size, got "
            f"chunk_size={chunk_size}, overlap={overlap}"
        )

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    n_tokens = len(token_ids)

    # Nothing to chunk: encode the text as-is rather than averaging an
    # empty list of embeddings.
    if n_tokens == 0:
        return model.encode([text], batch_size=1, convert_to_tensor=False, show_progress_bar=False)[0]

    chunks = []
    for start in range(0, n_tokens, chunk_size - overlap):
        end = min(start + chunk_size, n_tokens)
        chunks.append(tokenizer.decode(token_ids[start:end], skip_special_tokens=True))
        # Once a window reaches the end of the text, any further window would
        # lie entirely inside this one's overlap and would skew the mean.
        if end == n_tokens:
            break

    chunk_embeddings = model.encode(chunks, batch_size=batch_size, convert_to_tensor=False, show_progress_bar=False)
    return np.mean(chunk_embeddings, axis=0)

def _load_embedding_checkpoint(checkpoint_path, expected_meta, total):
    """Return (start_idx, embeddings) from a usable checkpoint, else (0, [])."""
    if not os.path.exists(checkpoint_path):
        return 0, []

    try:
        # NOTE: unpickling can execute arbitrary code; the checkpoint is
        # assumed to be a file previously written by this notebook.
        with open(checkpoint_path, 'rb') as f:
            checkpoint_data = pickle.load(f)
        embeddings = checkpoint_data['embeddings']
        start_idx = checkpoint_data['last_processed_idx'] + 1
    except Exception as e:
        # A truncated or foreign file must not abort the run.
        print(f"⚠️ Ignoring unreadable checkpoint ({e}); starting from scratch")
        return 0, []

    # Checkpoints written before metadata was recorded lack these keys; for
    # those only the internal length consistency can be checked.
    mismatched = [key for key, value in expected_meta.items()
                  if key in checkpoint_data and checkpoint_data[key] != value]
    if mismatched or len(embeddings) != start_idx or start_idx > total:
        print("⚠️ Checkpoint does not match this dataset/model; starting from scratch")
        return 0, []

    print(f"📂 Resuming from checkpoint at index {start_idx}")
    return start_idx, embeddings


def _save_embedding_checkpoint(checkpoint_path, payload):
    """Write the checkpoint to a temp file, then swap it into place."""
    tmp_path = f"{checkpoint_path}.tmp"
    with open(tmp_path, 'wb') as f:
        pickle.dump(payload, f)
    # Replacing in one step keeps the previous checkpoint intact if the
    # process dies while the new one is being written.
    os.replace(tmp_path, checkpoint_path)


def generate_speech_embeddings(df, text_column='Text', model_name="BAAI/bge-m3", batch_size=None,
                               checkpoint_freq=10000, checkpoint_dir='/content/drive/MyDrive/checkpoints/'):
    """Generate BGE-m3 embeddings for individual speeches with GPU optimization.

    Texts of at most 8192 tokens (counted without special tokens) are
    encoded in batches; longer texts go through embed_long_text. Progress
    is checkpointed to disk so an interrupted run can resume, and the
    checkpoint is deleted once every text has been embedded.

    Args:
        df: DataFrame holding the speeches.
        text_column: Column containing the text; values are cast to str.
        model_name: SentenceTransformer model identifier.
        batch_size: Texts per batch; DEFAULT_BATCH_SIZE when None.
        checkpoint_freq: Write a checkpoint after at least this many newly
            processed texts. A falsy or non-positive value disables
            checkpoint writes.
        checkpoint_dir: Directory holding the checkpoint file.

    Returns:
        A copy of `df` with a 'Speech_Embeddings' column, one embedding
        per row.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    # Use dynamic batch size if not specified
    if batch_size is None:
        batch_size = DEFAULT_BATCH_SIZE
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    max_tokens = 8192

    print("=" * 60)
    print("GENERATING SPEECH EMBEDDINGS")
    print("=" * 60)
    print(f"Using batch size: {batch_size}")

    texts = df[text_column].astype(str).values
    total = len(texts)

    # Setup checkpointing. The metadata ties a checkpoint to this dataset
    # size, model and column so a leftover file from another run is rejected.
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, 'speech_embeddings_checkpoint.pkl')
    checkpoint_meta = {'n_texts': total, 'model_name': model_name, 'text_column': text_column}
    start_idx, embeddings = _load_embedding_checkpoint(checkpoint_path, checkpoint_meta, total)

    # Load model (half precision on GPU)
    model = SentenceTransformer(model_name, device="cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        model.half()
    tokenizer = model.tokenizer
    model.max_seq_length = max_tokens

    checkpointing = bool(checkpoint_freq) and checkpoint_freq > 0
    last_checkpoint_at = start_idx
    checkpoints_written = 0

    with tqdm(total=total, initial=start_idx, desc="🚀 Embedding", unit="speech") as pbar:
        for i in range(start_idx, total, batch_size):
            batch_texts = texts[i:i + batch_size]

            # One slot per text keeps results in input order no matter which
            # path (batched or chunked) produced them.
            batch_embeddings = [None] * len(batch_texts)
            short_texts = []
            short_positions = []

            for j, text in enumerate(batch_texts):
                token_count = len(tokenizer.encode(text, add_special_tokens=False))
                if token_count <= max_tokens:
                    short_texts.append(text)
                    short_positions.append(j)
                else:
                    # Handle long text with chunking
                    batch_embeddings[j] = embed_long_text(text, model, tokenizer)

            # Batch process short texts
            if short_texts:
                short_embeddings = model.encode(short_texts, batch_size=min(batch_size, len(short_texts)),
                                                convert_to_tensor=False, show_progress_bar=False)
                for position, emb in zip(short_positions, short_embeddings):
                    batch_embeddings[position] = emb

            embeddings.extend(batch_embeddings)
            pbar.update(len(batch_texts))

            # Checkpoint by count of texts processed since the last write, so
            # the cadence does not depend on batch_size dividing checkpoint_freq.
            processed = i + len(batch_texts)
            if checkpointing and processed - last_checkpoint_at >= checkpoint_freq:
                payload = dict(checkpoint_meta, embeddings=embeddings, last_processed_idx=processed - 1)
                _save_embedding_checkpoint(checkpoint_path, payload)
                last_checkpoint_at = processed
                checkpoints_written += 1

                if checkpoints_written % 4 == 0:
                    print(f"\n💾 Progress: {processed:,}/{total:,}")

                # Memory management
                if checkpoints_written % 2 == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    gc.collect()

    # Cleanup checkpoint
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)

    df_result = df.copy()
    df_result['Speech_Embeddings'] = embeddings
    return df_result

def process_speech_embeddings_pipeline(parliament, language):
    """Produce the speech-embedding dataset for one parliament/language pair.

    If the output pickle already exists in `data_folder` it is loaded and
    returned unchanged. Otherwise the raw data is loaded, embedded, and
    written to that path.

    Args:
        parliament: Parliament key accepted by get_config.
        language: Language key accepted by get_config for that parliament.

    Returns:
        The DataFrame including speech embeddings (freshly computed or
        loaded from the existing output file).
    """
    print(f"\n🚀 Starting Speech Embeddings Pipeline")
    print(f"Parliament: {parliament.upper()}")
    print(f"Language: {language.upper()}")
    print("=" * 70)
    print(f"🎯 Using batch size: {DEFAULT_BATCH_SIZE}")

    output_path = f"{data_folder}{parliament}_{language}_with_speech_embeddings.pkl"

    if not os.path.exists(output_path):
        # Nothing cached yet: run load -> embed -> save.
        print("📥 Loading raw data...")
        raw_df = load_and_verify_data(parliament, language)

        print("🔄 Generating speech embeddings...")
        embedded_df = generate_speech_embeddings(raw_df)

        print("💾 Saving speech embeddings...")
        embedded_df.to_pickle(output_path)

        print(f"\n✅ SPEECH EMBEDDINGS PIPELINE COMPLETED!")
        print(f"📊 Final dataset: {embedded_df.shape}")
        print(f"💾 Saved to: {output_path}")
        print(f"📈 Ready for segmentation pipeline!")
        return embedded_df

    # A previous run already produced the output; reuse it.
    print(f"🎯 SPEECH EMBEDDINGS ALREADY EXIST: {output_path}")
    cached_df = pd.read_pickle(output_path)
    print(f"✅ Loaded existing result: {cached_df.shape}")
    return cached_df

In [None]:
# === PROCESSING CONFIGURATION ===
# Select the dataset to embed by editing the two constants below.
#   austrian -> 'english' | 'german'
#   croatian -> 'english' | 'croatian'
#   british  -> 'english'
PARLIAMENT_TO_PROCESS = 'british'
LANGUAGE_TO_PROCESS = 'english'

print("🎯 SPEECH EMBEDDINGS PROCESSING CONFIGURATION")
print("=" * 50)
print(f"Parliament: {PARLIAMENT_TO_PROCESS}")
print(f"Language: {LANGUAGE_TO_PROCESS}")

# Validate the selection, then inspect the data file it points to.
try:
    config = get_config(PARLIAMENT_TO_PROCESS, LANGUAGE_TO_PROCESS)
    print("✅ Configuration valid!")
    print(f"📁 Input file: {config['file']}")
    print(f"👑 Chairperson role: {config['chairperson_role']}")

    print("\n📊 Data verification:")
    verify_result = verify_parliament_data(PARLIAMENT_TO_PROCESS, LANGUAGE_TO_PROCESS)

    # verify_parliament_data returns None when the file could not be read.
    if verify_result is None:
        print("❌ Data verification failed. Please check file path and data structure.")
    else:
        print(f"\n🚀 Ready to process {PARLIAMENT_TO_PROCESS} parliament in {LANGUAGE_TO_PROCESS}!")
        print("💡 Run the next cell to start processing.")

except Exception as err:
    print(f"❌ Configuration error: {err}")
    print("\n🔧 Available options:")
    list_available_options()

In [None]:
# === EXECUTE SPEECH EMBEDDINGS PROCESSING IN SEQUENCE ===
import time
import traceback

print("🚀 STARTING SEQUENTIAL SPEECH EMBEDDINGS PROCESSING")
print("=" * 60)

# Datasets to embed, in order: Croatian (Croatian) then British (English).
processing_queue = [
    ('croatian', 'croatian'),
    ('british', 'english'),
]

# Per-dataset outcome, keyed "<parliament>_<language>"; failed runs get no entry.
results = {}
total_start_time = pd.Timestamp.now()

for i, (parliament, language) in enumerate(processing_queue, 1):
    try:
        print("\n" + "=" * 80)
        print(f"PROCESSING {i}/{len(processing_queue)}: {parliament.upper()} Parliament in {language.upper()}")
        print("=" * 80)

        start_time = pd.Timestamp.now()
        result = process_speech_embeddings_pipeline(parliament, language)
        end_time = pd.Timestamp.now()
        processing_time = end_time - start_time

        results[f"{parliament}_{language}"] = dict(
            data=result,
            processing_time=processing_time,
            shape=result.shape,
            speeches_count=len(result),
        )

        print(f"✅ {parliament.upper()} {language.upper()} SPEECH EMBEDDINGS COMPLETED!")
        print(f"⏱️ Processing time: {processing_time}")
        print(f"📊 Dataset shape: {result.shape}")

        # Release cached GPU memory before the next dataset.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()

        # Brief pause between processing
        time.sleep(2)

    except Exception as e:
        # One failing dataset must not stop the rest of the queue.
        print(f"❌ Error processing {parliament} {language}: {e}")
        traceback.print_exc()

total_end_time = pd.Timestamp.now()
total_processing_time = total_end_time - total_start_time

print("\n🎉 SEQUENTIAL SPEECH EMBEDDINGS PROCESSING COMPLETED!")
print("=" * 60)
print(f"⏱️ Total processing time: {total_processing_time}")
print(f"📊 Successfully processed: {list(results.keys())}")

for key, info in results.items():
    print(f"  • {key}: {info['shape']} ({info['speeches_count']:,} speeches) - {info['processing_time']}")

print("\n💾 Output files created:")
for parliament, language in processing_queue:
    if f"{parliament}_{language}" in results:
        print(f"  • {parliament}_{language}_with_speech_embeddings.pkl")

print("\n➡️ Ready for segmentation and segment embeddings pipeline!")

In [None]:
# === PROCESSING SUMMARY AND VERIFICATION ===

def verify_sequential_results(queue=None):
    """Check the saved speech-embedding files and print a summary of each.

    For every (parliament, language) pair the output pickle in
    `data_folder` is loaded and its shape, session count, embedding
    dimensions and share of chairperson speeches are reported. Problems
    are printed, never raised.

    Args:
        queue: Iterable of (parliament, language) pairs to check. Defaults
            to the Croatian (Croatian) and British (English) datasets.
    """
    if queue is None:
        queue = [('croatian', 'croatian'), ('british', 'english')]

    print("🔍 VERIFYING SEQUENTIAL PROCESSING RESULTS")
    print("=" * 50)

    for parliament, language in queue:
        output_path = f"{data_folder}{parliament}_{language}_with_speech_embeddings.pkl"

        print(f"\n📋 {parliament.upper()} Parliament ({language.upper()}):")

        if not os.path.exists(output_path):
            print(f"  ❌ File not found: {output_path}")
            continue

        try:
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            df = pd.read_pickle(output_path)
        except Exception as e:
            print(f"  ❌ Error loading file: {e}")
            continue

        # Inspection is kept separate from loading so that a structural
        # problem is not misreported as a load failure.
        try:
            n_speeches = len(df)
            print(f"  ✅ File exists: {output_path}")
            print(f"  📊 Shape: {df.shape}")
            print(f"  🔢 Speeches: {n_speeches:,}")

            if 'Text_ID' in df.columns:
                print(f"  🏛️ Sessions: {df['Text_ID'].nunique():,}")
            else:
                print(f"  ⚠️ Text_ID column missing")

            if 'Speech_Embeddings' not in df.columns:
                print(f"  ❌ Speech_Embeddings column missing")
            elif n_speeches == 0:
                # No first row to read dimensions from.
                print(f"  ⚠️ Speech_Embeddings column present but dataset is empty")
            else:
                embedding_shape = np.shape(df['Speech_Embeddings'].iloc[0])
                print(f"  🎯 Embedding dimensions: {embedding_shape}")
                print(f"  ✅ Speech embeddings successfully generated")

            if 'Speaker_role' in df.columns:
                config = get_config(parliament, language)
                chairperson_count = int((df['Speaker_role'] == config['chairperson_role']).sum())
                # Guard the percentage against an empty frame.
                chairperson_pct = chairperson_count / n_speeches * 100 if n_speeches else 0.0
                print(f"  👑 Chairperson speeches: {chairperson_count:,} ({chairperson_pct:.1f}%)")

        except Exception as e:
            print(f"  ❌ Error inspecting file: {e}")

# Check the saved outputs, then point to the follow-up stage.
verify_sequential_results()

print("\n💡 Next Steps:")
print("\n".join([
    "1. These files are ready for segmentation processing",
    "2. The segmentation pipeline will identify chairperson segments",
    "3. Generate segment-level embeddings for clustering analysis",
]))