# Parliamentary Speech Segmentation Analysis

This notebook focuses on analyzing parliamentary speech patterns and implementing segmentation algorithms:

1. **Data Loading** - Load preprocessed embeddings data
2. **Agenda Analysis** - Analyze chairperson speech patterns and agenda keywords
3. **Comparative Analysis** - Compare English vs German agenda patterns
4. **Segmentation Implementation** - Parliamentary-aware segmentation algorithm
5. **Segmentation Evaluation** - Analyze segmentation quality and effectiveness

## Key Features:
- **Language-specific agenda detection** (English/German)
- **Multi-scale similarity analysis** for boundary detection
- **Agenda-aware segmentation** prioritizing parliamentary structure
- **Comparative keyword analysis** between languages

In [None]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
# NOTE: this silences every warning for the rest of the session, including
# pandas deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None

# Load the data with embeddings.
# The path is assembled from its components so it resolves on every OS; a
# literal backslash-separated string is only a valid path on Windows.
# SECURITY: unpickling can execute arbitrary code - only load trusted files.
embeddings_path = Path("data folder") / "data" / "AT_with_embeddings_final.pkl"
embeddings = pd.read_pickle(embeddings_path)

print(f"✅ Loaded data: {embeddings.shape}")
print(f"Columns: {list(embeddings.columns)}")

In [None]:
# === DATA OVERVIEW ===
# Prints row counts, embedding shapes, missing-value counts and the
# distribution of speeches per sitting for the loaded `embeddings` frame.
speech_count = len(embeddings)
segment_count = embeddings['Segment_ID'].nunique()

print("📊 Data Overview:")
print(f"  • Total speeches: {speech_count:,}")
if speech_count:
    # .iloc[0] reads the first row by position; plain [0] is a label lookup
    # and raises KeyError when the index does not contain the label 0.
    print(f"  • Speech embedding shape: {embeddings['Speech_Embeddings'].iloc[0].shape}")
    print(f"  • Segment embedding shape: {embeddings['Segment_Embeddings'].iloc[0].shape}")
print(f"  • Unique segments: {segment_count:,}")
# Guard the ratio: nunique() is 0 when the frame is empty or all IDs are missing
speeches_per_segment = speech_count / segment_count if segment_count else 0.0
print(f"  • Average speeches per segment: {speeches_per_segment:.1f}")

# Check for missing values
print("\n🔍 Missing values:")
print(f"  • Segment_ID: {embeddings['Segment_ID'].isna().sum()}")
print(f"  • Speech_Embeddings: {embeddings['Speech_Embeddings'].isna().sum()}")
print(f"  • Segment_Embeddings: {embeddings['Segment_Embeddings'].isna().sum()}")

# Check sitting length distribution
sitting_lengths = embeddings.groupby('Sitting_ID').size()
print("\n📈 Sitting length distribution:")
print(f"  • Min speeches per sitting: {sitting_lengths.min()}")
print(f"  • Max speeches per sitting: {sitting_lengths.max()}")
print(f"  • Average speeches per sitting: {sitting_lengths.mean():.1f}")
print(f"  • Sittings with <50 speeches: {(sitting_lengths < 50).sum()}")
print(f"  • Sittings with >200 speeches: {(sitting_lengths > 200).sum()}")

In [None]:
# === COMPREHENSIVE CHAIRPERSON AND AGENDA ANALYSIS ===
# Counts how many chairperson speeches mention each English agenda phrase and
# reports how far apart those speeches are in the frame.


def _share(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return part / whole * 100 if whole else 0.0


all_speech_count = len(embeddings)

print("📊 Overall Speech Statistics:")
print(f"  • Total speeches in dataset: {all_speech_count:,}")

# Chairperson speeches (mask computed once and reused below)
is_chairperson = embeddings['Speaker_role'] == 'Chairperson'
chairperson_total = embeddings[is_chairperson]
print(f"  • Total chairperson speeches: {len(chairperson_total):,}")
print(f"  • Chairperson percentage: {_share(len(chairperson_total), all_speech_count):.1f}%")

print("\n📋 Agenda-related Speech Analysis:")

# Various agenda patterns.
# regex=False: the phrases are literal text, not regular expressions.
# na=False: a missing Text counts as "no match" instead of putting NaN into
# the boolean mask.
agenda_patterns = {
    phrase: embeddings['Text'].str.contains(phrase, case=False, regex=False, na=False)
    for phrase in ('agenda', 'agenda item', 'next agenda', 'next agenda item')
}

print("\nChairperson speeches only:")
# Count for chairperson speeches only
for pattern_name, pattern_mask in agenda_patterns.items():
    count = int((is_chairperson & pattern_mask).sum())
    percentage_of_chairperson = _share(count, len(chairperson_total))
    percentage_of_total = _share(count, all_speech_count)
    print(f"  • Containing '{pattern_name}': {count:,} ({percentage_of_chairperson:.1f}% of chairperson speeches, {percentage_of_total:.2f}% of all speeches)")

# Sample analysis
chairperson_with_agenda = embeddings[is_chairperson & agenda_patterns['agenda']]

print("\n📈 Summary of speeches:")
if len(chairperson_with_agenda) > 1:
    # NOTE(review): index labels are reported as row numbers, which is only
    # meaningful if `embeddings` has a default integer index - confirm.
    first_idx = chairperson_with_agenda.index[0]
    last_idx = chairperson_with_agenda.index[-1]
    total_span = last_idx - first_idx
    # The enclosing check guarantees at least two speeches, so the divisor is >= 1
    avg_gap = total_span / (len(chairperson_with_agenda) - 1)
    print(f"  • First speech at row: {first_idx:,}")
    print(f"  • Last speech at row: {last_idx:,}")
    print(f"  • Total row span: {total_span:,}")
    print(f"  • Average gap between agenda speeches: {avg_gap:.1f} rows")

In [None]:
# Load German data for comparison.
# The path is assembled from its components so it resolves on every OS; a
# literal backslash-separated string is only a valid path on Windows.
# SECURITY: unpickling can execute arbitrary code - only load trusted files.
AT_german = pd.read_pickle(Path("data folder") / "data" / "AT_german.pkl")

In [None]:
# === GERMAN AGENDA ANALYSIS ===
# Verifies that AT_german really holds German text, then counts how many
# PräsidentIn speeches mention each German agenda phrase.

print("🇩🇪 GERMAN AGENDA ANALYSIS")
print("=" * 50)

# Function words used to tell the two languages apart
german_indicators = ['der', 'die', 'das', 'und', 'ist', 'sie', 'haben', 'werden', 'mit', 'tagesordnung']
english_indicators = ['the', 'and', 'is', 'they', 'have', 'will', 'with', 'agenda']
# Number of non-missing speeches inspected for the language check
language_sample_size = 50

is_german = False
if 'Text' not in AT_german.columns:
    print("❌ No 'Text' column found")
else:
    # Compare whole words, not substrings: a substring test counts 'is' inside
    # "this" or 'der' inside "under". Several speeches are sampled so a single
    # short or missing first row cannot decide (or crash) the check.
    sample_tokens = (
        AT_german['Text']
        .dropna()
        .astype(str)
        .head(language_sample_size)
        .str.lower()
        .str.findall(r'\w+')
        .explode()
        .dropna()
    )
    german_count = int(sample_tokens.isin(german_indicators).sum())
    english_count = int(sample_tokens.isin(english_indicators).sum())

    if sample_tokens.empty:
        print("❌ No text available for language detection")
    elif german_count > english_count:
        print("✅ German text detected - proceeding with German agenda analysis")
        is_german = True
    else:
        print("ℹ️ English text detected - German analysis will show zero results")

if is_german:
    print("📊 Overall Speech Statistics:")
    print(f"  • Total speeches in dataset: {len(AT_german):,}")

    # German chairperson speeches (PräsidentIn); mask computed once and reused
    is_president = AT_german['Speaker_role'] == 'PräsidentIn'
    chairperson_total_de = AT_german[is_president]
    print(f"  • Total PräsidentIn speeches: {len(chairperson_total_de):,}")
    print(f"  • PräsidentIn percentage: {len(chairperson_total_de)/len(AT_german)*100:.1f}%")

    print("\n📋 German Agenda-related Speech Analysis:")

    # German agenda patterns.
    # regex=False: the phrases are literal text; na=False: a missing Text
    # counts as "no match" instead of putting NaN into the boolean mask.
    agenda_patterns_de = {
        phrase: AT_german['Text'].str.contains(phrase, case=False, regex=False, na=False)
        for phrase in (
            'tagesordnung',
            'tagesordnungspunkt',
            'punkt der tagesordnung',
            'nächster tagesordnungspunkt',
            'behandlung',
            'verhandlung',
            'punkt',
        )
    }

    print("PräsidentIn speeches only:")
    # Count for chairperson speeches only
    for pattern_name, pattern_mask in agenda_patterns_de.items():
        count = int((is_president & pattern_mask).sum())
        percentage_of_chairperson = count / len(chairperson_total_de) * 100 if len(chairperson_total_de) > 0 else 0
        percentage_of_total = count / len(AT_german) * 100
        print(f"  • Containing '{pattern_name}': {count:,} ({percentage_of_chairperson:.1f}% of PräsidentIn speeches, {percentage_of_total:.2f}% of all speeches)")

    print("\n📈 Summary of German agenda speeches:")
    chairperson_with_tagesordnung = AT_german[is_president & agenda_patterns_de['tagesordnung']]
    # Always report the total so the summary header is never left empty
    print(f"  • Total Tagesordnung speeches found: {len(chairperson_with_tagesordnung):,}")

print("\n" + "=" * 50)

In [None]:
# === COMPARATIVE ANALYSIS: ENGLISH vs GERMAN AGENDA PATTERNS ===
# Prints a side-by-side summary of keyword hit counts. The figures are a
# hard-coded snapshot; nothing in this cell recomputes them from the data.


def _stats(count, pct_chair, pct_total):
    """Bundle one keyword's figures under the keys used by this cell."""
    return {'count': count, 'pct_chair': pct_chair, 'pct_total': pct_total}


_rule = "=" * 70

print("🔍 COMPARATIVE ANALYSIS: English vs German Agenda Detection")
print(_rule)

# Based on analysis results - comparing the patterns
english_results = {
    'agenda': _stats(11613, 9.3, 5.01),
    'agenda item': _stats(5039, 4.0, 2.17),
    'next agenda': _stats(0, 0.0, 0.00),
    'next agenda item': _stats(0, 0.0, 0.00),
}

german_results = {
    'tagesordnung': _stats(11779, 9.4, 5.08),
    'tagesordnungspunkt': _stats(2691, 2.2, 1.16),
    'punkt der tagesordnung': _stats(3261, 2.6, 1.41),
    'nächster tagesordnungspunkt': _stats(0, 0.0, 0.00),
    'behandlung': _stats(2978, 2.4, 1.28),
    'verhandlung': _stats(9529, 7.6, 4.11),
    'punkt': _stats(13883, 11.1, 5.99),
}

# The two keywords treated as direct translations of each other
eng_agenda = english_results['agenda']
ger_tagesordnung = german_results['tagesordnung']
count_delta = ger_tagesordnung['count'] - eng_agenda['count']
share_delta = ger_tagesordnung['pct_chair'] - eng_agenda['pct_chair']

print("📊 Key Findings:")
print("\n1. DIRECT EQUIVALENTS:")
for label, stats in (("'agenda' (EN)", eng_agenda), ("'tagesordnung' (DE)", ger_tagesordnung)):
    print(f"   • {label}: {stats['count']:,} speeches ({stats['pct_chair']:.1f}% of chairperson)")
print(f"   • Difference: {count_delta:+,} speeches ({share_delta:+.1f}%)")

# Static recommendation text, emitted one line per print call
for line in (
    "\n4. SEGMENTATION RECOMMENDATIONS:",
    "   🎯 STRONG signals (agenda boundaries):",
    "      • German: 'tagesordnungspunkt', 'punkt der tagesordnung'",
    "      • English: 'agenda item'",
    "\n   🎯 MEDIUM signals:",
    "      • German: 'tagesordnung', 'verhandlung'",
    "      • English: 'agenda'",
    "\n   ⚠️  WEAK signals (use carefully):",
    "      • German: 'behandlung' (removed 'punkt' - too common)",
    "      • English: limited weak signals",
):
    print(line)

print("\n" + _rule)

## Parliamentary Segmentation Implementation

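This section defines a segmentation algorithm that takes parliamentary structure into account by treating chairperson agenda announcements as candidate segment boundaries. Note that the agenda detection in the function below matches only the English speaker role and keywords (`Chairperson`, "agenda item", "agenda"); the German patterns from the comparative analysis above are not wired into it.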

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Minimum agenda score at which a chairperson speech may open a new segment
_AGENDA_BOUNDARY_SCORE = 0.7


def _agenda_scores(group):
    """Return one agenda-signal score per speech of a sitting, in row order.

    Only speeches whose Speaker_role is 'Chairperson' can score: 1.0 when the
    text mentions 'agenda item', 0.7 when it mentions 'agenda', 0.3 when it is
    the first speech of the sitting, otherwise 0.0.
    """
    scores = []
    for position, (role, text) in enumerate(zip(group['Speaker_role'], group['Text'])):
        score = 0.0
        if role == 'Chairperson':
            lowered = str(text).lower()
            if 'agenda item' in lowered:
                score = 1.0
            elif 'agenda' in lowered:
                score = 0.7
            elif position == 0:
                score = 0.3
        scores.append(score)
    return scores


def _gap_similarities(vectors, candidates, window_sizes):
    """Map each candidate position to the similarity across that gap.

    For position p the speeches before p and the speeches from p onwards are
    compared: for every window width w, the summed embeddings of the (up to) w
    speeches on each side are compared by cosine similarity, and the values
    are averaged over all widths. Low values indicate a topic change.
    """
    total = len(vectors)
    # Prefix sums give any window sum as a difference of two rows
    prefix = np.vstack([np.zeros((1, vectors.shape[1])), np.cumsum(vectors, axis=0)])
    similarities = {}
    for position in candidates:
        per_scale = []
        for width in window_sizes:
            left = prefix[position] - prefix[max(0, position - width)]
            right = prefix[min(total, position + width)] - prefix[position]
            norm = np.linalg.norm(left) * np.linalg.norm(right)
            # A zero vector has no direction; treat it as "no similarity"
            per_scale.append(float(left @ right) / norm if norm > 0 else 0.0)
        similarities[position] = sum(per_scale) / len(per_scale)
    return similarities


def _select_boundaries(candidates, agenda_signals, gap_similarity, min_gap,
                       target_segments, threshold_percentile):
    """Choose segment start positions for one sitting, sorted ascending.

    Agenda announcements are taken first (strongest signal first). Remaining
    slots up to ``target_segments - 1`` boundaries are filled with the gaps of
    lowest similarity, restricted to gaps at or below the given percentile.
    Every accepted boundary keeps at least ``min_gap`` speeches between itself
    and all previously accepted boundaries.
    """
    chosen = []

    def keeps_spacing(position):
        return all(abs(position - kept) >= min_gap for kept in chosen)

    agenda_first = sorted(
        (p for p in candidates if agenda_signals[p] >= _AGENDA_BOUNDARY_SCORE),
        key=lambda p: (-agenda_signals[p], p),
    )
    for position in agenda_first:
        if keeps_spacing(position):
            chosen.append(position)

    cutoff = np.percentile([gap_similarity[p] for p in candidates], threshold_percentile)
    weakest_first = sorted(
        (p for p in candidates if gap_similarity[p] <= cutoff),
        key=lambda p: (gap_similarity[p], p),
    )
    for position in weakest_first:
        if len(chosen) >= target_segments - 1:
            break
        if keeps_spacing(position):
            chosen.append(position)

    return sorted(chosen)


def parliamentary_segment_speeches(df, window_size=5, min_segment_size=3):
    """
    Parliamentary segmentation with multi-scale analysis and chairperson agenda detection.

    Each sitting (rows sharing a Sitting_ID) is split into consecutive
    segments labelled ``"<sitting_id>_seg_<k>"``. Boundaries come from two
    sources: chairperson speeches that mention the agenda, and positions where
    the embeddings before and after differ most.

    NOTE(review): the original cell contained only placeholder comments for
    the similarity analysis and the boundary selection, which left the segment
    labels undefined and the function unable to run. The two stages
    implemented here (`_gap_similarities`, `_select_boundaries`) are a
    straightforward reconstruction from the variables the original set up -
    confirm the scales and the selection rule against the intended design.

    Args:
        df: DataFrame with the columns 'Sitting_ID', 'Speaker_role', 'Text'
            and 'Speech_Embeddings' (one equal-length vector per row).
        window_size: Base number of speeches compared on each side of a gap;
            half and double this width are evaluated as well.
        min_segment_size: Minimum number of speeches between a boundary and
            the sitting edges or another boundary. Sittings shorter than twice
            this value stay a single segment.

    Returns:
        Tuple ``(df, segmentation_metrics)``. ``df`` is the input frame, which
        is modified in place by writing the 'Segment_ID' column.
        ``segmentation_metrics`` is a list with one dict per sitting holding
        sitting_id, sitting_length, num_segments, avg_segment_size,
        boundaries_found and agenda_boundaries. Rows whose Sitting_ID is
        missing belong to no sitting and receive a Segment_ID of None.
    """
    # Labels are written by row position, so they stay aligned with df even
    # when the rows of one sitting are not stored contiguously.
    segment_ids = np.empty(len(df), dtype=object)
    segmentation_metrics = []

    # One pass to collect the row positions of every sitting, then restore
    # first-appearance order so the metrics follow the order of the data.
    sittings = sorted(
        df.groupby('Sitting_ID').indices.items(),
        key=lambda item: item[1].min(),
    )
    print(f"🔄 Processing {len(sittings)} sittings...")

    min_gap = max(1, min_segment_size)
    window_sizes = sorted({max(1, window_size // 2), max(1, window_size), max(1, window_size * 2)})

    for sitting_id, positions in tqdm(sittings, desc="Segmenting sittings", unit="sitting"):
        positions = np.sort(positions)
        group = df.iloc[positions]
        sitting_length = len(group)

        boundaries = []
        agenda_boundaries = set()
        # Positions where a segment may start while leaving min_gap speeches
        # on both sides; empty for sittings too short to split.
        candidates = range(min_gap, sitting_length - min_gap + 1)

        if len(candidates) > 0:
            speech_vectors = np.asarray(group['Speech_Embeddings'].tolist(), dtype=float)

            # Flexible target_segments formula
            target_segments = max(2, int(np.ceil(sitting_length / 25)))
            threshold_percentile = 40

            # === CHAIRPERSON AGENDA DETECTION ===
            agenda_signals = _agenda_scores(group)
            agenda_boundaries = {
                p for p in candidates if agenda_signals[p] >= _AGENDA_BOUNDARY_SCORE
            }

            # === MULTI-SCALE SIMILARITY ANALYSIS ===
            gap_similarity = _gap_similarities(speech_vectors, candidates, window_sizes)

            # === BOUNDARY DETECTION AND ASSIGNMENT ===
            boundaries = _select_boundaries(
                candidates, agenda_signals, gap_similarity,
                min_gap, target_segments, threshold_percentile,
            )

        # Segment number of a speech = number of boundaries at or before it
        segment_numbers = np.searchsorted(
            np.asarray(boundaries, dtype=int), np.arange(sitting_length), side='right'
        )
        segment_ids[positions] = [f"{sitting_id}_seg_{k}" for k in segment_numbers]

        # Store metrics
        num_segments = len(boundaries) + 1
        segmentation_metrics.append({
            'sitting_id': sitting_id,
            'sitting_length': sitting_length,
            'num_segments': num_segments,
            'avg_segment_size': sitting_length / num_segments,
            'boundaries_found': len(boundaries),
            'agenda_boundaries': sum(1 for b in boundaries if b in agenda_boundaries),
        })

    df['Segment_ID'] = segment_ids
    return df, segmentation_metrics

print("✅ Parliamentary segmentation function loaded")

In [None]:
# === APPLY PARLIAMENTARY SEGMENTATION ===
# Runs the segmentation on all speeches not flagged as too short and prints
# summary statistics. Ratios fall back to 0.0 when their denominator is zero,
# so an empty selection produces a report instead of a ZeroDivisionError.
print("🏛️ Running Parliamentary Segmentation...")

# Check data size first
long_speeches_df = embeddings[~embeddings['Is_Too_Short']].copy()
unique_sittings = long_speeches_df['Sitting_ID'].nunique()
total_speeches = len(long_speeches_df)
speeches_per_sitting = total_speeches / unique_sittings if unique_sittings else 0.0

print("📊 Data to process:")
print(f"  • Unique sittings: {unique_sittings:,}")
print(f"  • Total speeches: {total_speeches:,}")
print(f"  • Average speeches per sitting: {speeches_per_sitting:.1f}")

# Run parliamentary segmentation
segmented_df, seg_metrics = parliamentary_segment_speeches(
    long_speeches_df,
    window_size=5,
    min_segment_size=3
)

created_segments = segmented_df['Segment_ID'].nunique()
speeches_per_segment = len(segmented_df) / created_segments if created_segments else 0.0

print("\n✅ Parliamentary segmentation complete!")
print("📊 Results:")
print(f"  • Total speeches processed: {len(segmented_df):,}")
print(f"  • Unique segments created: {created_segments:,}")
print(f"  • Average speeches per segment: {speeches_per_segment:.1f}")

# Convert metrics to DataFrame for analysis
metrics_df = pd.DataFrame(seg_metrics)

print("\n📈 Segmentation Quality Overview:")
if metrics_df.empty:
    # An empty metrics list yields a frame without columns; indexing it
    # by column name would raise KeyError.
    print("  • No sittings were segmented")
else:
    print(f"  • Average segments per sitting: {metrics_df['num_segments'].mean():.1f}")
    print(f"  • Average segment size: {metrics_df['avg_segment_size'].mean():.1f}")