# Parliamentary Speech Segmentation & Segment Embeddings

This notebook implements the second stage of parliamentary speech processing:

1. **Setup & Configuration** - Google Colab setup and imports
2. **Data Loading** - Load datasets with pre-computed speech embeddings
3. **Segmentation** - Parliament-aware speech segmentation using embeddings and keywords
4. **Segment Embeddings** - Generate embeddings for concatenated segment texts
5. **Final Processing** - Save complete datasets ready for topic modeling

## Key Features:
- **Smart Segmentation**: Agenda-aware boundary detection with parliament-specific keywords
- **Multi-parliament support**: Austrian, Croatian, British parliaments with optimized settings
- **Segment Embeddings**: Concatenate speeches within segments and embed the combined text
- **GPU optimization**: Efficient processing with checkpointing

## Input:
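Requires the datasets produced by the first processing stage (speech embeddings), stored in the data folder as `{parliament}_{language}_with_speech_embeddings.pkl` and containing at least the columns `Text_ID`, `Text` and `Speech_Embeddings`.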

## Output:
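Complete datasets with both speech and segment embeddings (columns `Speech_Embeddings`, `Segment_ID` and `Segment_Embeddings`), saved as `{parliament}_{language}_final.pkl` and ready for analysis.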

In [None]:
# === GOOGLE COLAB SETUP ===
from google.colab import drive
drive.mount('/content/drive')

!pip install sentence-transformers bertopic umap-learn hdbscan tqdm openai python-dotenv scipy

import torch
import pandas as pd
import numpy as np
import warnings
import os
import gc
import random
from tqdm import tqdm
import pickle
import time
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Show every DataFrame column and silence library warnings for cleaner cell output
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')

# Pick a default embedding batch size that suits the detected hardware
if not torch.cuda.is_available():
    print("No GPU detected - will use CPU (slower)")
    DEFAULT_BATCH_SIZE = 8  # CPU batch size
else:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {gpu_name} | Memory: {gpu_memory:.1f} GB")

    # The batch size is keyed on the GPU model name reported by CUDA
    if 'A100' in gpu_name:
        DEFAULT_BATCH_SIZE = 124  # A100 optimized
        print(f"🚀 A100 detected: Using optimized batch size {DEFAULT_BATCH_SIZE}")
    elif any(model in gpu_name for model in ('V100', 'T4')):
        DEFAULT_BATCH_SIZE = 64   # V100/T4 optimized
        print(f"⚡ {gpu_name} detected: Using batch size {DEFAULT_BATCH_SIZE}")
    else:
        DEFAULT_BATCH_SIZE = 32   # Conservative default
        print(f"🔧 Generic GPU detected: Using conservative batch size {DEFAULT_BATCH_SIZE}")

    # Enable cuDNN autotuning and allow TF32 for matmul / cuDNN kernels
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

# Reproducibility: seed every random number generator used by the notebook
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"✅ Setup complete! Default batch size: {DEFAULT_BATCH_SIZE}")

In [None]:
# === ENHANCED CONFIGURATION ===
# Root folder on the mounted Drive; input, checkpoint and final pickle paths are
# built by prefixing file names with this string, so keep the trailing slash.
data_folder = '/content/drive/MyDrive/thesis data/'

# Enhanced Parliament and Language configurations with keywords
#
# Layout: PARLIAMENT_CONFIG[parliament][language] -> settings dict with
#   'file'             : source file name; in the code visible in this notebook it is
#                        only displayed by list_available_options(), while loading
#                        uses the derived "{parliament}_{language}_with_speech_embeddings.pkl"
#   'chairperson_role' : value of the Speaker_role column that marks the chair
#   'agenda_keywords'  : lowercase keywords looked up in the chair's lowercased text
#
# Keyword tiers as consumed by segment_speeches(): a 'strong' hit scores 1.0,
# 'medium' 0.7 and 'weak' 0.5; only scores >= 0.7 create an agenda boundary.
# A list entry may bundle several alternatives separated by '|'. The entry is
# split on '|' and each part is matched as a plain substring (not as a regex).
PARLIAMENT_CONFIG = {
    'austrian': {
        'english': {
            'file': 'AT_en.pkl',
            'chairperson_role': 'Chairperson',
            'agenda_keywords': {
                'strong': ['agenda item'],
                'medium': ['agenda'],
                'weak': ['item', 'point']
            }
        },
        'german': {
            'file': 'AT_german.pkl',
            'chairperson_role': 'PräsidentIn',
            'agenda_keywords': {
                'strong': ['tagesordnungspunkt', 'punkt der tagesordnung'],
                'medium': ['tagesordnung', 'verhandlung'],
                'weak': ['behandlung']
            }
        }
    },
    'croatian': {
        'english': {
            'file': 'CRO_en.pkl',
            'chairperson_role': 'Chairperson',
            'agenda_keywords': {
                'strong': ['agenda item'],
                'medium': ['agenda'],
                'weak': ['item', 'point']
            }
        },
        'croatian': {
            'file': 'CRO_hr.pkl',
            'chairperson_role': 'Predsjedavajući',
            'agenda_keywords': {
                # Based on segmentation_analysis findings - optimized pattern
                # (pipe-delimited: each list below holds ONE entry with several alternatives)
                'strong': ['riječ|sljedeći|točka|prelazimo|glasovanje'],
                'medium': ['hvala|molim|sada|nastavljamo'],
                'weak': ['otvaramo|zatvaramo|počinje|red']
            }
        }
    },
    'british': {
        'english': {
            'file': 'GB_en.pkl',
            'chairperson_role': 'Speaker',
            'agenda_keywords': {
                # British parliamentary terminology
                # (pipe-delimited: each list below holds ONE entry with several alternatives)
                'strong': ['order paper|business of the house|next business'],
                'medium': ['honourable member|right honourable|speaker|chair'],
                'weak': ['question|division|debate|motion']
            }
        }
    }
}

def list_available_options():
    """Print every supported parliament/language combination.

    For each entry of ``PARLIAMENT_CONFIG`` the source file name, the
    chairperson role label and the number of agenda keyword patterns are
    shown. The pattern count covers all three tiers (strong, medium and
    weak), so it agrees with the count printed by the configuration
    validation cell. A pipe-delimited entry counts as one pattern.
    """
    print("📋 Available Processing Options:")
    print("=" * 50)

    for parliament, languages in PARLIAMENT_CONFIG.items():
        print(f"\n🏛️ {parliament.upper()} Parliament:")
        for language, config in languages.items():
            keyword_tiers = config['agenda_keywords']
            # Count list entries across every tier (previously 'weak' was left out)
            pattern_count = sum(
                len(keyword_tiers.get(tier, ()))
                for tier in ('strong', 'medium', 'weak')
            )
            print(f"  • {language.capitalize()}: {config['file']}")
            print(f"    - Chairperson role: '{config['chairperson_role']}'")
            print(f"    - Keywords: {pattern_count} patterns")

def get_config(parliament, language):
    """Look up the settings dict for one parliament/language pair.

    Args:
        parliament: Top-level key of ``PARLIAMENT_CONFIG``.
        language: Language key below that parliament.

    Returns:
        The settings dict stored in ``PARLIAMENT_CONFIG`` (not a copy).

    Raises:
        ValueError: If the parliament is unknown, or the language is not
            configured for that parliament. The message lists the valid keys.
    """
    languages = PARLIAMENT_CONFIG.get(parliament)
    if languages is None:
        raise ValueError(f"Parliament '{parliament}' not supported. Available: {list(PARLIAMENT_CONFIG)}")

    settings = languages.get(language)
    if settings is None:
        available_langs = list(languages)
        raise ValueError(f"Language '{language}' not available for {parliament} parliament. Available: {available_langs}")

    return settings

# Show the supported combinations, then a few example argument pairs
list_available_options()

print(
    "\n🔧 Usage examples:",
    "  • Austrian Parliament in German: parliament='austrian', language='german'",
    "  • Croatian Parliament in Croatian: parliament='croatian', language='croatian'",
    "  • British Parliament in English: parliament='british', language='english'",
    sep="\n",
)

In [None]:
# === DATA LOADING FUNCTIONS ===

def load_speech_embeddings_data(parliament, language):
    """Load the dataset with pre-computed speech embeddings.

    The file is expected at
    ``{data_folder}{parliament}_{language}_with_speech_embeddings.pkl``.

    Args:
        parliament: Parliament key (see ``PARLIAMENT_CONFIG``).
        language: Language key for that parliament.

    Returns:
        The loaded DataFrame, unchanged.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
        ValueError: If ``Speech_Embeddings`` or ``Text_ID`` is missing, or the
            parliament/language pair is not configured.
    """
    # Define input path from speech embeddings stage
    input_path = f"{data_folder}{parliament}_{language}_with_speech_embeddings.pkl"

    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Speech embeddings file not found: {input_path}")

    # NOTE: unpickling executes arbitrary code - only load files you produced yourself
    df = pd.read_pickle(input_path)
    print(f"✅ Loaded {parliament} parliament ({language}) with speech embeddings: {df.shape}")

    # Verify the columns the segmentation stage relies on
    for required_column in ('Speech_Embeddings', 'Text_ID'):
        if required_column not in df.columns:
            raise ValueError(f"{required_column} column not found in {input_path}")

    config = get_config(parliament, language)
    chair_role = config['chairperson_role']
    if 'Speaker_role' in df.columns:
        role_counts = df['Speaker_role'].value_counts()
        if chair_role in role_counts.index:
            print(f"✅ Found '{chair_role}': {role_counts[chair_role]:,} speeches")
        else:
            print(f"⚠️ '{chair_role}' not found in Speaker_role")

    print(f"📊 {len(df):,} speeches across {df['Text_ID'].nunique():,} sessions")
    if len(df) > 0:
        # Positional access: the index is not guaranteed to contain the label 0
        # (e.g. after filtering), so `df[col][0]` could raise KeyError.
        print(f"🔢 Speech embedding shape: {df['Speech_Embeddings'].iloc[0].shape}")
    else:
        print("⚠️ Dataset is empty - no speech embedding to inspect")

    return df

def verify_speech_embeddings_data(parliament, language):
    """Check that the speech-embeddings file exists and is usable.

    Prints a short report and never raises: any problem while loading or
    inspecting the file is reported and turned into ``False``.

    Args:
        parliament: Parliament key used in the file name.
        language: Language key used in the file name.

    Returns:
        True when the file loads, contains the ``Text_ID``, ``Text`` and
        ``Speech_Embeddings`` columns and has at least one row; False otherwise.
    """
    input_path = f"{data_folder}{parliament}_{language}_with_speech_embeddings.pkl"

    print(f"📊 Verifying Speech Embeddings Data:")
    print(f"Path: {input_path}")
    print("=" * 60)

    if not os.path.exists(input_path):
        print(f"❌ File not found: {input_path}")
        print(f"💡 Please run the speech embeddings processing pipeline first.")
        return False

    try:
        # NOTE: unpickling executes arbitrary code - only load files you produced yourself
        df = pd.read_pickle(input_path)
        print(f"✅ File exists and loadable: {df.shape}")

        # Check for required columns
        required_cols = ['Text_ID', 'Text', 'Speech_Embeddings']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            print(f"❌ Missing required columns: {missing_cols}")
            return False

        print(f"✅ All required columns present")

        if len(df) == 0:
            print(f"❌ Dataset contains no rows")
            return False

        # Positional access: a label lookup with [0] fails whenever the index
        # does not contain 0, which used to mark valid files as invalid.
        print(f"🔢 Speech embeddings shape: {df['Speech_Embeddings'].iloc[0].shape}")
        return True

    except Exception as e:
        # Verification is a best-effort report; surface the problem instead of raising
        print(f"❌ Error loading file: {e}")
        return False

# Example calls, printed as a reminder (they are not executed here)
print(
    "🔍 Data Verification Examples:",
    "# verify_speech_embeddings_data('croatian', 'croatian')",
    "# verify_speech_embeddings_data('austrian', 'english')",
    sep="\n",
)

In [None]:
# === SEGMENTATION FUNCTIONS ===

def _segmentation_targets(parliament, sitting_length):
    """Return (target_segments, threshold_percentile) for a sitting of this length."""
    if parliament == 'british':
        # British Parliament tends to have longer sessions
        if sitting_length < 40:
            return max(2, min(6, sitting_length // 10)), 40
        if sitting_length < 150:
            return max(4, min(15, sitting_length // 15)), 45
        return max(6, min(25, sitting_length // 20)), 50

    # Austrian/Croatian logic
    if sitting_length < 30:
        return max(2, min(5, sitting_length // 8)), 35
    if sitting_length < 100:
        return max(3, min(12, sitting_length // 12)), 40
    return max(5, min(20, sitting_length // 15)), 45


def _agenda_scores(group, chair_role, keyword_tiers):
    """Score every speech of a sitting by how strongly the chair signals an agenda change."""
    if 'Speaker_role' not in group.columns:
        return [0] * len(group)

    roles = group['Speaker_role'].tolist()
    texts = None  # fetched lazily: only chairperson speeches are inspected
    scores = []
    for position, role in enumerate(roles):
        score = 0
        if pd.notna(role) and role == chair_role:
            if texts is None:
                texts = group['Text'].tolist()
            text = str(texts[position]).lower()

            # Plain substring matching; the first tier with a hit wins
            for tier_score, keywords in keyword_tiers:
                if any(keyword in text for keyword in keywords):
                    score = tier_score
                    break

            # Special handling for session start
            if score == 0 and position == 0:
                score = 0.3
        scores.append(score)
    return scores


def _align_signal(signal, length):
    """Truncate or edge-pad a 1-D signal so it has exactly `length` entries."""
    if len(signal) > length:
        return signal[:length]
    if len(signal) < length:
        return np.pad(signal, (0, length - len(signal)), mode='edge')
    return signal


def _similarity_signals(embeddings, window_size, smooth):
    """Compute the windowed, point-to-point and gradient signals for one sitting."""
    n = len(embeddings)

    # Primary signal: similarity between the mean embeddings of adjacent windows
    primary = np.array([
        cosine_similarity(
            np.mean(embeddings[i:i + window_size], axis=0).reshape(1, -1),
            np.mean(embeddings[i + window_size:i + 2 * window_size], axis=0).reshape(1, -1),
        )[0][0]
        for i in range(n - window_size)
    ])
    signals = {'primary': primary}
    if len(primary) == 0:
        # The caller falls back to a single segment; the other signals are not needed
        return signals

    # Point-to-point similarity between consecutive speeches
    if n > 6:
        point = np.array([
            cosine_similarity(
                embeddings[i].reshape(1, -1),
                embeddings[i + 1].reshape(1, -1),
            )[0][0]
            for i in range(n - 1)
        ])
        signals['point'] = _align_signal(point, len(primary))

    # Gradient-based change detection on the smoothed step sizes
    if n > 10 and smooth is not None:
        trajectory = np.array(
            [float(np.linalg.norm(embeddings[i] - embeddings[i - 1])) for i in range(1, n)],
            dtype=np.float64,
        )
        if len(trajectory) > 3:
            gradient = np.gradient(smooth(trajectory, size=3))
            signals['gradient'] = _align_signal(gradient, len(primary))

    return signals


def _candidate_boundaries(signals, agenda_boundaries, sitting_length, window_size,
                          min_segment_size, threshold_percentile):
    """Collect the sorted candidate boundary positions proposed by all signals."""
    candidates = set(agenda_boundaries)  # agenda boundaries have the highest priority

    # Drops in the windowed similarity
    primary = signals['primary']
    positions = np.arange(len(primary))
    threshold = np.percentile(primary, threshold_percentile)
    hits = (
        (primary < threshold)
        & (positions >= min_segment_size)
        & ((sitting_length - positions - window_size) >= min_segment_size)
    )
    candidates.update((positions[hits] + window_size).tolist())

    # Drops in the point-to-point similarity (stricter percentile)
    if 'point' in signals:
        point = signals['point']
        positions = np.arange(len(point))
        point_threshold = np.percentile(point, threshold_percentile - 10)
        hits = (
            (point < point_threshold)
            & (positions >= min_segment_size)
            & ((sitting_length - positions) >= min_segment_size)
        )
        candidates.update(positions[hits].tolist())

    # Large changes of the smoothed trajectory gradient
    if 'gradient' in signals:
        magnitude = np.abs(signals['gradient'])
        positions = np.arange(len(magnitude))
        gradient_threshold = np.percentile(magnitude, 75)
        hits = (
            (magnitude > gradient_threshold)
            & (positions >= min_segment_size)
            & ((sitting_length - positions) >= min_segment_size)
        )
        candidates.update(positions[hits].tolist())

    return sorted(candidates)


def _select_boundaries(candidates, target_segments, agenda_signals, signals,
                       window_size, agenda_boost):
    """Keep at most target_segments - 1 candidates, preferring the highest scores."""
    if len(candidates) <= target_segments - 1:
        return list(candidates)

    primary = signals['primary']
    point = signals.get('point')
    gradient = signals.get('gradient')

    candidate_scores = []
    for c in candidates:
        score = 0
        if c < len(agenda_signals):
            score += agenda_signals[c] * agenda_boost
        if 0 <= c - window_size < len(primary):
            score += (1 - primary[c - window_size]) * 2.0
        if point is not None and c < len(point):
            score += (1 - point[c]) * 1.5
        if gradient is not None and c < len(gradient):
            score += np.abs(gradient[c]) * 1.0
        candidate_scores.append((c, score))

    # Stable sort: equally scored candidates keep their positional order
    candidate_scores.sort(key=lambda item: item[1], reverse=True)
    return sorted(c for c, _ in candidate_scores[:target_segments - 1])


def _single_segment_metrics(sitting_id, sitting_length):
    """Metrics entry for a sitting that is kept as one segment."""
    return {
        'sitting_id': sitting_id,
        'sitting_length': sitting_length,
        'num_segments': 1,
        'avg_segment_size': sitting_length,
        'boundaries_found': 0,
        'agenda_boundaries': 0
    }


def segment_speeches(df, parliament, language, window_size=5, min_segment_size=3):
    """Enhanced parliament-aware segmentation using speech embeddings and keywords.

    Every sitting (rows sharing a ``Text_ID``) is split into segments. Boundary
    candidates come from chairperson speeches containing agenda keywords,
    drops in windowed and point-to-point cosine similarity, and large changes
    in the embedding trajectory. If there are more candidates than the
    length-dependent target allows, the highest-scoring ones are kept.

    Segment IDs are written back by row position, so the rows of a sitting do
    not have to be contiguous in ``df``.

    Args:
        df: DataFrame with ``Text_ID`` and ``Speech_Embeddings`` columns;
            ``Speaker_role`` and ``Text`` are used for agenda detection when
            present. The frame is modified in place (a ``Segment_ID`` column
            is added) and also returned.
        parliament: Parliament key; selects thresholds and agenda boost.
        language: Language key; together with ``parliament`` selects the config.
        window_size: Number of speeches per window for the windowed similarity.
        min_segment_size: Minimum distance of a boundary from the sitting's
            edges and from the previously accepted boundary.

    Returns:
        Tuple ``(df, segmentation_metrics)`` where the second item is a list
        with one metrics dict per sitting.

    Raises:
        ValueError: If the configuration lookup fails or ``Text_ID`` contains
            missing values.
    """
    config = get_config(parliament, language)

    print(f"🏛️ {parliament.upper()} Parliament ({language.upper()}) - Enhanced Segmentation")
    print(f"🔍 Chairperson: '{config['chairperson_role']}'")
    print(f"🔧 Keywords: {config['agenda_keywords']}")

    sitting_column = 'Text_ID'

    # Rows without a sitting id cannot be grouped; fail early with a clear message
    # instead of a length-mismatch error after all sittings were processed.
    if df[sitting_column].isna().any():
        raise ValueError(f"{sitting_column} contains missing values; cannot assign speeches to sessions")

    # The gradient signal is optional: without scipy it is skipped, but visibly so
    try:
        from scipy.ndimage import uniform_filter1d
    except ImportError:
        uniform_filter1d = None
        print("⚠️ scipy not available - gradient signal disabled")

    # Flatten pipe-delimited entries once; tiers are checked strongest first
    agenda_keywords = config['agenda_keywords']
    keyword_tiers = [
        (tier_score, [kw for entry in agenda_keywords[tier] for kw in entry.split('|')])
        for tier, tier_score in (('strong', 1.0), ('medium', 0.7), ('weak', 0.5))
    ]

    if parliament == 'british':
        agenda_boost = 4.0
    elif parliament == 'croatian':
        agenda_boost = 5.0
    else:
        agenda_boost = 3.0

    # Row positions of every sitting, in order of first appearance. Both unique()
    # and factorize() report values in order of appearance, so they line up.
    unique_sittings = df[sitting_column].unique()
    sitting_codes, _ = pd.factorize(df[sitting_column])
    row_order = np.argsort(sitting_codes, kind='stable')
    rows_per_sitting = np.bincount(sitting_codes, minlength=len(unique_sittings))
    position_groups = np.split(row_order, np.cumsum(rows_per_sitting)[:-1])
    print(f"🔄 Processing {len(unique_sittings)} sessions...")

    segment_id_by_row = np.empty(len(df), dtype=object)
    segmentation_metrics = []

    progress = tqdm(unique_sittings, desc=f"Segmenting {parliament} {language}", unit="session")
    for sitting_id, positions in zip(progress, position_groups):
        group = df.iloc[positions]
        sitting_length = len(group)

        if sitting_length < min_segment_size:
            # Very small sitting - one segment
            segment_id_by_row[positions] = f"{sitting_id}_seg_0"
            segmentation_metrics.append(_single_segment_metrics(sitting_id, sitting_length))
            continue

        embeddings = np.array(group['Speech_Embeddings'].tolist())
        target_segments, threshold_percentile = _segmentation_targets(parliament, sitting_length)

        # === ENHANCED CHAIRPERSON AGENDA DETECTION ===
        agenda_signals = _agenda_scores(group, config['chairperson_role'], keyword_tiers)
        agenda_boundaries = {
            i for i, score in enumerate(agenda_signals)
            if score >= 0.7 and i >= min_segment_size and (sitting_length - i) >= min_segment_size
        }

        # === SIMILARITY ANALYSIS ===
        similarity_signals = _similarity_signals(embeddings, window_size, uniform_filter1d)

        if len(similarity_signals['primary']) == 0:
            # Sitting too short for even one window comparison - one segment
            segment_id_by_row[positions] = f"{sitting_id}_seg_0"
            segmentation_metrics.append(_single_segment_metrics(sitting_id, sitting_length))
            continue

        # === BOUNDARY DETECTION ===
        candidates = _candidate_boundaries(
            similarity_signals, agenda_boundaries, sitting_length,
            window_size, min_segment_size, threshold_percentile,
        )

        # === BOUNDARY SELECTION ===
        boundaries = _select_boundaries(
            candidates, target_segments, agenda_signals,
            similarity_signals, window_size, agenda_boost,
        )

        # === BOUNDARY VALIDATION ===
        # Drop boundaries that would leave a segment shorter than min_segment_size
        validated_boundaries = []
        for boundary in boundaries:
            if not validated_boundaries or (boundary - validated_boundaries[-1]) >= min_segment_size:
                validated_boundaries.append(boundary)
        boundaries = validated_boundaries

        # Assign segment IDs
        # NOTE(review): a boundary index b is treated here as the LAST speech of
        # the segment it closes (the next segment starts at b + 1). Agenda and
        # windowed-similarity candidates look like they mark the FIRST speech of
        # the new topic instead - confirm which convention is intended.
        boundary_set = set(boundaries)
        current_segment = 0
        sitting_segments = []
        for i in range(sitting_length):
            if i > 0 and (i - 1) in boundary_set:
                current_segment += 1
            sitting_segments.append(f"{sitting_id}_seg_{current_segment}")

        segment_id_by_row[positions] = sitting_segments

        # Store metrics
        num_segments = len(set(sitting_segments))
        agenda_bound_count = len([b for b in boundaries if b in agenda_boundaries])

        segmentation_metrics.append({
            'sitting_id': sitting_id,
            'sitting_length': sitting_length,
            'num_segments': num_segments,
            'avg_segment_size': sitting_length / num_segments,
            'boundaries_found': len(boundaries),
            'agenda_boundaries': agenda_bound_count,
            'target_segments': target_segments,
            'candidate_boundaries': len(candidates),
            'signals_used': len(similarity_signals) + 1,
            'parliament': parliament,
            'language': language
        })

    # At the end, add Segment_ID column to the dataframe (in place, aligned by row position)
    df['Segment_ID'] = segment_id_by_row
    return df, segmentation_metrics

# === ADD SEGMENT_ID UTILITY FUNCTION ===

def add_segment_id_to_dataframe(df, parliament, language):
    """Run the segmentation with default settings and return the labelled frame.

    Thin convenience wrapper around ``segment_speeches`` that discards the
    per-sitting metrics and prints a two-line summary.
    """
    print(f"🔧 Adding Segment_ID to {parliament} {language} dataframe...")

    segmented, _metrics = segment_speeches(df, parliament, language)
    segment_total = segmented['Segment_ID'].nunique()

    print(f"✅ Added Segment_ID column: {segment_total:,} unique segments")
    print(f"📊 Average speeches per segment: {len(segmented) / segment_total:.1f}")

    return segmented

In [None]:
# === SEGMENT EMBEDDINGS FUNCTIONS ===

def embed_long_text(text, model, tokenizer, max_tokens=4096, overlap=1024, batch_size=32):
    """Embed a text of arbitrary length by averaging overlapping token windows.

    The text is tokenized, cut into windows of at most ``max_tokens`` tokens
    that start every ``max_tokens - overlap`` tokens, each window is decoded
    back to text and encoded, and the window embeddings are averaged
    (unweighted).

    Args:
        text: Text to embed.
        model: Object with a SentenceTransformer-style ``encode`` method.
        tokenizer: Object with ``encode(text, add_special_tokens=False)`` and
            ``decode(ids, skip_special_tokens=True)``.
        max_tokens: Maximum number of tokens per window.
        overlap: Number of tokens shared by consecutive windows.
        batch_size: Batch size passed to ``model.encode``.

    Returns:
        The mean of the window embeddings along axis 0.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``overlap`` is not in
            ``[0, max_tokens)``.
    """
    if max_tokens <= 0 or not 0 <= overlap < max_tokens:
        raise ValueError(
            f"Need max_tokens > 0 and 0 <= overlap < max_tokens, got max_tokens={max_tokens}, overlap={overlap}"
        )

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    total_tokens = len(token_ids)
    stride = max_tokens - overlap

    chunks = []
    for start in range(0, total_tokens, stride):
        end = min(start + max_tokens, total_tokens)
        chunks.append(tokenizer.decode(token_ids[start:end], skip_special_tokens=True))
        if end == total_tokens:
            # The window already reaches the end of the text. Any further window
            # would only repeat tokens that are fully covered and bias the mean.
            break

    if not chunks:
        # Empty (or untokenizable) text: embed it as-is so a proper vector is
        # returned instead of the NaN produced by averaging zero embeddings.
        chunks = [text]

    chunk_embeddings = model.encode(
        chunks, batch_size=batch_size, convert_to_tensor=False, show_progress_bar=False
    )
    return np.mean(chunk_embeddings, axis=0)

def generate_segment_embeddings(df, text_column='Text', segment_id_column='Segment_ID', batch_size=None):
    """Embed each segment's concatenated text and attach it to every row.

    The speeches of a segment are joined with ``' [SEP] '`` in row order,
    embedded with the ``BAAI/bge-m3`` SentenceTransformer via
    ``embed_long_text``, and the resulting vector is mapped onto all rows of
    that segment in a new ``Segment_Embeddings`` column.

    Args:
        df: DataFrame holding the text and segment id columns.
        text_column: Column with the speech text.
        segment_id_column: Column with the segment identifier.
        batch_size: Number of segments processed per progress-bar step;
            defaults to ``DEFAULT_BATCH_SIZE``. Each segment is still embedded
            on its own, so this does not change the encoder's batch size.

    Returns:
        A copy of ``df`` with the ``Segment_Embeddings`` column added.
    """
    step = DEFAULT_BATCH_SIZE if batch_size is None else batch_size

    banner = "=" * 60
    print(banner)
    print("SEGMENT EMBEDDINGS: Concatenated segment texts")
    print(banner)
    print(f"Using batch size: {step}")

    # One concatenated text per segment (groupby iterates segment ids in sorted order)
    segment_ids, segment_texts = [], []
    for seg_id, seg_rows in df.groupby(segment_id_column):
        segment_ids.append(seg_id)
        segment_texts.append(' [SEP] '.join(seg_rows[text_column].astype(str).values))

    print(f"Processing {len(segment_texts)} segments...")

    # Load the encoder; on GPU it is switched to half precision
    use_gpu = torch.cuda.is_available()
    model = SentenceTransformer("BAAI/bge-m3", device="cuda" if use_gpu else "cpu")
    if use_gpu:
        model.half()
    tokenizer = model.tokenizer

    segment_embeddings = []
    with tqdm(total=len(segment_texts), desc="🚀 Embedding segments", unit="segment") as pbar:
        for offset in range(0, len(segment_texts), step):
            window = segment_texts[offset:offset + step]
            # Long segments are chunked and averaged inside embed_long_text
            segment_embeddings.extend(embed_long_text(text, model, tokenizer) for text in window)
            pbar.update(len(window))

    # Broadcast every segment's vector to all of its rows
    embedding_by_segment = dict(zip(segment_ids, segment_embeddings))
    df_result = df.copy()
    df_result['Segment_Embeddings'] = df_result[segment_id_column].map(embedding_by_segment)

    print(f"✅ Segment embeddings generated for {len(segment_ids)} unique segments")
    return df_result

In [None]:
# === COMPLETE SEGMENTATION PIPELINE ===

def run_segmentation_pipeline(parliament, language):
    """Run segmentation and segment embedding for one parliament/language pair.

    The pipeline resumes from the most advanced artefact found on disk:

    1. If ``{parliament}_{language}_final.pkl`` exists it is loaded and returned.
    2. Else, if the segmentation checkpoint exists, only the segment embeddings
       are computed.
    3. Otherwise the speech-embedding dataset is loaded, segmented and saved
       as a checkpoint before the segment embeddings are computed.

    Returns:
        The final DataFrame, which is also written to the final pickle path.
    """
    print(f"\n🚀 Starting Segmentation & Segment Embeddings Pipeline")
    print(f"Parliament: {parliament.upper()}")
    print(f"Language: {language.upper()}")
    print("=" * 70)
    print(f"🎯 Using batch size: {DEFAULT_BATCH_SIZE}")

    # Artefact locations
    path_stem = f"{data_folder}{parliament}_{language}"
    segmentation_checkpoint_path = f"{path_stem}_segmented_with_speech_embeddings.pkl"
    final_path = f"{path_stem}_final.pkl"

    # Nothing to do when the final result is already on disk
    if os.path.exists(final_path):
        print(f"🎯 FINAL RESULT EXISTS: Loading {final_path}")
        df_final = pd.read_pickle(final_path)
        print(f"✅ Loaded final result: {df_final.shape}")
        print(f"🎯 Segments: {df_final['Segment_ID'].nunique()}")
        return df_final

    if os.path.exists(segmentation_checkpoint_path):
        # Resume: segmentation was finished in an earlier run
        print(f"🔄 SEGMENTATION CHECKPOINT FOUND: Loading {segmentation_checkpoint_path}")
        df_segmented = pd.read_pickle(segmentation_checkpoint_path)
        print(f"✅ Loaded segmented data: {df_segmented.shape}")
        print(f"📊 Continuing from segment embeddings...")
    else:
        # Fresh run: load, segment, report and checkpoint
        print("📥 Loading data with speech embeddings...")
        df_with_embeddings = load_speech_embeddings_data(parliament, language)

        print("🔄 Performing segmentation...")
        df_segmented, seg_metrics = segment_speeches(df_with_embeddings, parliament, language)

        metrics_df = pd.DataFrame(seg_metrics)
        speech_total = len(df_segmented)
        segment_total = df_segmented['Segment_ID'].nunique()
        print(f"\n✅ {parliament.upper()} {language.upper()} segmentation complete!")
        print(f"📊 Results:")
        print(f"  • Total speeches processed: {speech_total:,}")
        print(f"  • Unique segments created: {segment_total:,}")
        print(f"  • Average speeches per segment: {speech_total / segment_total:.1f}")
        print(f"  • Average segments per session: {metrics_df['num_segments'].mean():.1f}")
        print(f"  • Agenda boundaries used: {metrics_df['agenda_boundaries'].sum()}")
        print(f"  • Total boundaries found: {metrics_df['boundaries_found'].sum()}")

        df_segmented.to_pickle(segmentation_checkpoint_path)
        print(f"💾 SEGMENTATION CHECKPOINT: {segmentation_checkpoint_path}")

        print(f"\n🔄 Generating segment embeddings for {parliament.upper()} {language.upper()}...")

    # Shared tail: embed the segments and persist the final dataset
    df_final = generate_segment_embeddings(df_segmented)
    df_final.to_pickle(final_path)
    print(f"💾 FINAL: {final_path}")

    return df_final

In [None]:
# === PROCESSING CONFIGURATION ===
# Select the parliament/language pair handled by the cells below.
#
# Valid combinations:
#   'austrian' -> 'english', 'german'
#   'croatian' -> 'english', 'croatian'
#   'british'  -> 'english' (only option)
PARLIAMENT_TO_PROCESS = 'croatian'
LANGUAGE_TO_PROCESS = 'croatian'

print("🎯 SEGMENTATION & SEGMENT EMBEDDINGS CONFIGURATION")
print("=" * 50)
print(f"Parliament: {PARLIAMENT_TO_PROCESS}")
print(f"Language: {LANGUAGE_TO_PROCESS}")

# Validate the selection and make sure the stage-one output is available.
# Everything stays inside the try so any failure ends with the option list.
try:
    config = get_config(PARLIAMENT_TO_PROCESS, LANGUAGE_TO_PROCESS)
    print("✅ Configuration valid!")
    print(f"👑 Chairperson role: {config['chairperson_role']}")
    print(f"🔧 Keyword patterns: {sum(len(config['agenda_keywords'][tier]) for tier in ('strong', 'medium', 'weak'))}")

    # Verify input data exists
    print("\n📊 Verifying input data:")
    if not verify_speech_embeddings_data(PARLIAMENT_TO_PROCESS, LANGUAGE_TO_PROCESS):
        print("❌ Input data verification failed.")
        print("💡 Please run speech embeddings processing first.")
    else:
        print(f"\n🚀 Ready to process {PARLIAMENT_TO_PROCESS} parliament in {LANGUAGE_TO_PROCESS}!")
        print("💡 Run the next cell to start processing.")

except Exception as err:
    print(f"❌ Configuration error: {err}")
    print("\n🔧 Available options:")
    list_available_options()

In [None]:
# === EXECUTE SEGMENTATION PIPELINE ===
# Runs the pipeline for the configured pair and prints a summary of the result.
print(f"🚀 STARTING SEGMENTATION & SEGMENT EMBEDDINGS PROCESSING")
print(f"=" * 50)
print(f"Parliament: {PARLIAMENT_TO_PROCESS}")
print(f"Language: {LANGUAGE_TO_PROCESS}")

try:
    # Run the complete segmentation pipeline
    result = run_segmentation_pipeline(PARLIAMENT_TO_PROCESS, LANGUAGE_TO_PROCESS)

    print(f"\n🎉 SEGMENTATION & SEGMENT EMBEDDINGS COMPLETED!")
    print(f"=" * 50)
    print(f"📊 Final dataset: {result.shape}")
    print(f"🎯 Segments created: {result['Segment_ID'].nunique():,}")
    print(f"📝 Average speeches per segment: {len(result) / result['Segment_ID'].nunique():.1f}")
    # Positional access: the index is not guaranteed to contain the label 0, and a
    # KeyError here would be reported below as a processing failure even though
    # the pipeline itself succeeded.
    print(f"🔢 Speech embedding shape: {result['Speech_Embeddings'].iloc[0].shape}")
    print(f"🔢 Segment embedding shape: {result['Segment_Embeddings'].iloc[0].shape}")

    if 'Speaker_role' in result.columns:
        config = get_config(PARLIAMENT_TO_PROCESS, LANGUAGE_TO_PROCESS)
        chairperson_count = len(result[result['Speaker_role'] == config['chairperson_role']])
        chairperson_pct = chairperson_count / len(result) * 100
        print(f"👑 Chairperson speeches: {chairperson_count:,} ({chairperson_pct:.1f}%)")

    print(f"\n💾 Final output: {PARLIAMENT_TO_PROCESS}_{LANGUAGE_TO_PROCESS}_final.pkl")
    print(f"🎉 Ready for topic modeling and analysis!")

except Exception as e:
    # Top-level notebook boundary: report the full traceback and give hints
    print(f"❌ Error during processing: {e}")
    import traceback
    traceback.print_exc()

    print(f"\n🔧 Troubleshooting tips:")
    print(f"1. Ensure speech embeddings were processed first")
    print(f"2. Check sufficient GPU memory and disk space")
    print(f"3. Verify parliament and language configuration")
    print(f"4. Try restarting runtime if memory issues occur")

In [None]:
# === BATCH PROCESSING OPTION ===
# The example below is disabled by wrapping it in a triple-quoted string: remove the
# surrounding triple quotes and edit the queue to process multiple parliament/language combinations

"""
# Example: Process multiple combinations in sequence
processing_queue = [
    ('austrian', 'english'),
    ('austrian', 'german'), 
    ('croatian', 'english'),
    ('croatian', 'croatian'),
    ('british', 'english')
]

results = {}
for parliament, language in processing_queue:
    try:
        print(f"\n{'='*80}")
        print(f"SEGMENTATION PROCESSING: {parliament.upper()} Parliament in {language.upper()}")
        print(f"{'='*80}")
        
        result = run_segmentation_pipeline(parliament, language)
        results[f"{parliament}_{language}"] = result
        
        print(f"✅ {parliament.upper()} {language.upper()} SEGMENTATION COMPLETED!")
        
        # Clear GPU memory between runs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
        
    except Exception as e:
        print(f"❌ Error processing {parliament} {language}: {e}")
        import traceback
        traceback.print_exc()
        continue

print(f"\n🎉 Batch segmentation processing completed!")
print(f"📊 Successfully processed: {list(results.keys())}")
"""

# Closing notes: how to enable batch processing and where this notebook sits in the pipeline
print(
    "💡 Batch processing option available above.",
    "Uncomment and run to process multiple parliament/language combinations.",
    "\n🎯 Processing Pipeline Summary:",
    "1. 📝 Speech Embeddings Notebook → Calculates speech-level embeddings",
    "2. 🎯 Segmentation Notebook (this one) → Segments + segment embeddings",
    "3. 📊 Analysis → Topic modeling and comparative analysis",
    sep="\n",
)