In [13]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None

# Load the data with embeddings
embeddings = pd.read_pickle(r"data folder\data\AT_with_embeddings_final.pkl")

print(f"✅ Loaded data: {embeddings.shape}")
print(f"Columns: {list(embeddings.columns)}")

✅ Loaded data: (231752, 30)
Columns: ['Sitting_ID', 'Speech_ID', 'Title', 'Date', 'Body', 'Term', 'Session', 'Meeting', 'Sitting', 'Agenda', 'Subcorpus', 'Lang', 'Speaker_role', 'Speaker_MP', 'Speaker_minister', 'Speaker_party', 'Speaker_party_name', 'Party_status', 'Party_orientation', 'Speaker_ID', 'Speaker_name', 'Speaker_gender', 'Speaker_birth', 'Text', 'Word_Count', 'Is_Too_Short', 'Is_Filtered', 'Speech_Embeddings', 'Segment_ID', 'Segment_Embeddings']


In [14]:
# === DATA OVERVIEW ===
print("📊 Data Overview:")
print(f"  • Total speeches: {embeddings.shape[0]:,}")
print(f"  • Speech embedding shape: {embeddings['Speech_Embeddings'][0].shape}")
print(f"  • Segment embedding shape: {embeddings['Segment_Embeddings'][0].shape}")
print(f"  • Unique segments: {embeddings['Segment_ID'].nunique():,}")
print(f"  • Average speeches per segment: {embeddings.shape[0] / embeddings['Segment_ID'].nunique():.1f}")

# Check for missing values
print(f"\n🔍 Missing values:")
print(f"  • Segment_ID: {embeddings['Segment_ID'].isna().sum()}")
print(f"  • Speech_Embeddings: {embeddings['Speech_Embeddings'].isna().sum()}")
print(f"  • Segment_Embeddings: {embeddings['Segment_Embeddings'].isna().sum()}")

# Check sitting length distribution
sitting_lengths = embeddings.groupby('Sitting_ID').size()
print(f"\n📈 Sitting length distribution:")
print(f"  • Min speeches per sitting: {sitting_lengths.min()}")
print(f"  • Max speeches per sitting: {sitting_lengths.max()}")
print(f"  • Average speeches per sitting: {sitting_lengths.mean():.1f}")
print(f"  • Sittings with <50 speeches: {(sitting_lengths < 50).sum()}")
print(f"  • Sittings with >200 speeches: {(sitting_lengths > 200).sum()}")

📊 Data Overview:
  • Total speeches: 231,752
  • Speech embedding shape: (1024,)
  • Segment embedding shape: (1024,)
  • Unique segments: 5,728
  • Average speeches per segment: 40.5

🔍 Missing values:
  • Segment_ID: 0
  • Speech_Embeddings: 41119
  • Segment_Embeddings: 0

📈 Sitting length distribution:
  • Min speeches per sitting: 1
  • Max speeches per sitting: 1378
  • Average speeches per sitting: 189.8
  • Sittings with <50 speeches: 474
  • Sittings with >200 speeches: 542

📈 Sitting length distribution:
  • Min speeches per sitting: 1
  • Max speeches per sitting: 1378
  • Average speeches per sitting: 189.8
  • Sittings with <50 speeches: 474
  • Sittings with >200 speeches: 542


## Enhanced Segmentation with Domain Knowledge

Improved segmentation considering that Austrian parliament sittings can vary from 1-10 agendas (average 5-6).
We'll implement adaptive segmentation that adjusts to sitting length and uses domain knowledge.

In [15]:
# === ENHANCED SEGMENTATION FUNCTIONS ===
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import find_peaks
from sklearn.metrics import silhouette_score
from tqdm import tqdm
import numpy as np

def calculate_adaptive_windowed_similarity(embeddings_list, base_window=3, speaker_roles=None):
    """Enhanced similarity calculation with adaptive windowing."""
    if len(embeddings_list) < 2:
        return np.array([])
    
    num_utterances = len(embeddings_list)
    similarities = []
    
    for g in range(num_utterances - 1):
        # Adaptive window based on speaker role
        if speaker_roles and g < len(speaker_roles):
            window_size = max(1, base_window - 1) if speaker_roles[g] == 'chairperson' else base_window
        else:
            window_size = base_window
            
        # Calculate windows
        start_before = max(0, g - window_size + 1)
        end_before = g + 1
        window_before = embeddings_list[start_before:end_before]
        
        start_after = g + 1
        end_after = min(num_utterances, g + 1 + window_size)
        window_after = embeddings_list[start_after:end_after]
        
        if not window_before or not window_after:
            similarities.append(0)
            continue
            
        # Calculate similarity
        mean_before = np.mean([np.asarray(e) for e in window_before], axis=0)
        mean_after = np.mean([np.asarray(e) for e in window_after], axis=0)
        sim = cosine_similarity(mean_before.reshape(1, -1), mean_after.reshape(1, -1))[0][0]
        similarities.append(sim)
        
    return np.array(similarities)

def find_adaptive_boundaries(similarities, sitting_length, min_segments=1, max_segments=10):
    """
    Find boundaries with adaptive target based on sitting length.
    Short sittings (1-2 agendas), medium (3-8), long (9-10+).
    """
    if len(similarities) == 0:
        return np.array([])
    
    # Adaptive target based on sitting length
    if sitting_length < 50:
        target_segments = max(1, min(3, sitting_length // 15))  # 1-3 segments for short sittings
        height_range = (0.2, 0.4)
        prominence_range = (0.15, 0.3)
    elif sitting_length < 150:
        target_segments = max(3, min(7, sitting_length // 25))  # 3-7 segments for medium sittings  
        height_range = (0.15, 0.35)
        prominence_range = (0.1, 0.25)
    else:
        target_segments = max(5, min(10, sitting_length // 30))  # 5-10 segments for long sittings
        height_range = (0.1, 0.3)
        prominence_range = (0.08, 0.2)
    
    target_segments = max(min_segments, min(target_segments, max_segments))
    
    inverted_similarities = np.maximum(0, 1 - similarities)
    best_boundaries = np.array([])
    best_score = float('inf')
    
    # Grid search for optimal parameters
    for height in np.linspace(height_range[0], height_range[1], 4):
        for prominence in np.linspace(prominence_range[0], prominence_range[1], 4):
            for distance in [2, 3, 5, 8]:
                peaks, _ = find_peaks(
                    inverted_similarities,
                    height=height,
                    prominence=prominence,
                    distance=distance
                )
                
                num_segments = len(peaks) + 1
                segment_score = abs(num_segments - target_segments)
                
                if segment_score < best_score:
                    best_score = segment_score;
                    best_boundaries = peaks;
                    
    return best_boundaries

def calculate_segmentation_coherence(embeddings_list, segment_boundaries):
    """Calculate coherence metrics for segmentation quality."""
    if len(embeddings_list) < 4 or len(segment_boundaries) == 0:
        return {'intra_similarity': 0, 'inter_similarity': 0, 'silhouette': 0, 'coherence_ratio': 0}
    
    # Create segment labels
    segment_labels = np.zeros(len(embeddings_list))
    current_segment = 0
    
    for i in range(len(embeddings_list)):
        if i > 0 and (i - 1) in segment_boundaries:
            current_segment += 1
        segment_labels[i] = current_segment
    
    embeddings_array = np.array([np.asarray(e) for e in embeddings_list])
    
    # Intra-segment similarity (higher is better)
    intra_similarities = []
    for seg_id in np.unique(segment_labels):
        seg_embeddings = embeddings_array[segment_labels == seg_id]
        if len(seg_embeddings) > 1:
            seg_sim_matrix = cosine_similarity(seg_embeddings)
            mask = np.triu(np.ones_like(seg_sim_matrix), k=1) == 1
            intra_similarities.append(seg_sim_matrix[mask].mean())
    
    avg_intra = np.mean(intra_similarities) if intra_similarities else 0
    
    # Inter-segment similarity (lower is better)
    inter_similarities = []
    unique_segments = np.unique(segment_labels)
    for i in range(len(unique_segments)):
        for j in range(i + 1, len(unique_segments)):
            seg1 = embeddings_array[segment_labels == unique_segments[i]]
            seg2 = embeddings_array[segment_labels == unique_segments[j]]
            
            if len(seg1) > 0 and len(seg2) > 0:
                seg1_center = np.mean(seg1, axis=0)
                seg2_center = np.mean(seg2, axis=0)
                inter_sim = cosine_similarity(seg1_center.reshape(1, -1), 
                                            seg2_center.reshape(1, -1))[0][0]
                inter_similarities.append(inter_sim)
    
    avg_inter = np.mean(inter_similarities) if inter_similarities else 0
    
    # Silhouette score
    silhouette = 0
    if len(unique_segments) > 1 and len(embeddings_array) > len(unique_segments):
        try:
            silhouette = silhouette_score(embeddings_array, segment_labels, metric='cosine')
        except:
            silhouette = 0
    
    return {
        'intra_similarity': avg_intra,
        'inter_similarity': avg_inter,
        'silhouette': silhouette,
        'coherence_ratio': avg_intra - avg_inter if avg_inter > 0 else 0
    }

print("✓ Enhanced segmentation functions loaded")

✓ Enhanced segmentation functions loaded


In [16]:
# === ADAPTIVE SEGMENTATION IMPLEMENTATION ===

def segment_speeches_adaptive(df, base_window=3):
    """
    Adaptive segmentation that adjusts to sitting length (1-10 agendas expected).
    """
    print("🏛️ Running adaptive segmentation for Austrian Parliament")
    print("📊 Targeting 1-10 segments per sitting based on length")
    
    df_segmented = df.copy()
    segment_ids = []
    all_coherence_metrics = []
    boundary_stats = []
    
    sittings = list(df_segmented.groupby('Sitting_ID'))
    
    for sitting_id, group in tqdm(sittings, desc="🔍 Adaptive segmenting", unit="sitting"):
        sitting_length = len(group)
        
        if sitting_length < 3:  # Very short sittings
            segment_ids.extend([f"{sitting_id}_seg_0"] * len(group))
            boundary_stats.append({
                'sitting_id': sitting_id,
                'sitting_length': sitting_length,
                'boundaries_found': 0,
                'segments_created': 1,
                'category': 'very_short'
            })
            continue
        
        # Prepare speaker information
        speaker_roles = []
        for _, row in group.iterrows():
            role = 'chairperson' if not row.get('Speaker_MP', True) else 'mp'
            speaker_roles.append(role)
        
        embeddings_list = group['Speech_Embeddings'].tolist()
        
        # Calculate similarities with speaker awareness
        similarities = calculate_adaptive_windowed_similarity(
            embeddings_list, 
            base_window=base_window,
            speaker_roles=speaker_roles
        )
        
        if len(similarities) == 0:
            segment_ids.extend([f"{sitting_id}_seg_0"] * len(group))
            continue
        
        # Find adaptive boundaries
        boundaries = find_adaptive_boundaries(similarities, sitting_length)
        
        # Calculate coherence metrics
        coherence = calculate_segmentation_coherence(embeddings_list, boundaries)
        all_coherence_metrics.append(coherence)
        
        # Assign segment IDs
        current_segment = 0
        sitting_segment_ids = []
        
        for i in range(len(group)):
            if i > 0 and (i - 1) in boundaries:
                current_segment += 1
            sitting_segment_ids.append(f"{sitting_id}_seg_{current_segment}")
        
        segment_ids.extend(sitting_segment_ids)
        
        # Categorize sitting by length
        if sitting_length < 50:
            category = 'short'
        elif sitting_length < 150:
            category = 'medium'
        else:
            category = 'long'
            
        boundary_stats.append({
            'sitting_id': sitting_id,
            'sitting_length': sitting_length,
            'boundaries_found': len(boundaries),
            'segments_created': len(set(sitting_segment_ids)),
            'category': category
        })
    
    df_segmented['Segment_ID_Enhanced'] = segment_ids
    
    # Print comprehensive statistics
    stats_df = pd.DataFrame(boundary_stats)
    
    print(f"\n✅ Adaptive segmentation complete!")
    print(f"📊 Overall Statistics:")
    print(f"  • Total segments: {df_segmented['Segment_ID_Enhanced'].nunique()}")
    print(f"  • Avg segments per sitting: {stats_df['segments_created'].mean():.2f}")
    print(f"  • Coherence ratio: {np.mean([m['coherence_ratio'] for m in all_coherence_metrics]):.3f}")
    
    print(f"\n📈 By Sitting Category:")
    for category in ['short', 'medium', 'long']:
        cat_stats = stats_df[stats_df['category'] == category]
        if len(cat_stats) > 0:
            print(f"  • {category.capitalize()} sittings (<50, 50-150, >150 speeches):")
            print(f"    - Count: {len(cat_stats)}")
            print(f"    - Avg segments: {cat_stats['segments_created'].mean():.1f}")
            print(f"    - Avg length: {cat_stats['sitting_length'].mean():.0f} speeches")
    
    return df_segmented, all_coherence_metrics, boundary_stats

# Apply adaptive segmentation to long speeches only
print("🚀 Running adaptive segmentation...")
long_speeches_df = embeddings[~embeddings['Is_Too_Short']].copy()
segmented_df, coherence_metrics, boundary_stats = segment_speeches_adaptive(long_speeches_df)

🚀 Running adaptive segmentation...
🏛️ Running adaptive segmentation for Austrian Parliament
📊 Targeting 1-10 segments per sitting based on length
🏛️ Running adaptive segmentation for Austrian Parliament
📊 Targeting 1-10 segments per sitting based on length


🔍 Adaptive segmenting: 100%|██████████| 1221/1221 [03:00<00:00,  6.75sitting/s]




✅ Adaptive segmentation complete!
📊 Overall Statistics:
  • Total segments: 5952
  • Avg segments per sitting: 4.87
  • Coherence ratio: -0.202

📈 By Sitting Category:
  • Short sittings (<50, 50-150, >150 speeches):
    - Count: 454
    - Avg segments: 1.0
    - Avg length: 5 speeches
  • Medium sittings (<50, 50-150, >150 speeches):
    - Count: 180
    - Avg segments: 3.4
    - Avg length: 83 speeches
  • Long sittings (<50, 50-150, >150 speeches):
    - Count: 556
    - Avg segments: 8.7
    - Avg length: 311 speeches


## BERTopic Configuration and Training

Configure BERTopic with Austrian parliament-specific stop words and optimal parameters.

In [17]:
# === BERTOPIC SETUP ===
from bertopic import BERTopic
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from umap import UMAP
import openai
import os
from dotenv import load_dotenv

# Austrian parliament-specific stop words
custom_stopwords = [
    'mr', 'mrs', 'ms', 'dr', 'madam', 'honourable', 'member', 'members', 'vp', 'sp', 'fp', 
    'minister', 'speaker', 'deputy', 'president', 'chairman', 'chair', 'schilling', 
    'secretary', 'lord', 'lady', 'question', 'order', 'point', 'debate', 'motion', 'amendment',
    'congratulations', 'congratulate', 'thanks', 'thank', 'say', 'one', 'want', 'know', 'think', 
    'believe', 'see', 'go', 'come', 'give', 'take', 'people', 'federal', 'government', 'austria', 
    'austrian', 'committee', 'call', 'said', 'already', 'please', 'request', 'proceed', 'reading',
    'course', 'welcome', 'council', 'open', 'written', 'contain', 'items', 'item', 'yes', 'no', 
    'following', 'next', 'speech', 'year', 'years', 'state', 'also', 'would', 'like', 'may', 'must', 
    'upon', 'indeed', 'session', 'meeting', 'report', 'commission', 'behalf', 'gentleman', 'gentlemen', 
    'ladies', 'applause', 'group', 'colleague', 'colleagues', 'issue', 'issues', 'chancellor', 'court', 
    'ask', 'answer', 'reply', 'regard', 'regarding', 'regards', 'respect', 'respectfully', 'sign', 
    'shall', 'procedure', 'declare', 'hear', 'minutes', 'speaking', 'close', 'abg', 'mag', 'orf', 'wait'
]

all_stopwords = list(ENGLISH_STOP_WORDS) + custom_stopwords

# Use enhanced segmentation for topic modeling
segment_texts = segmented_df.groupby('Segment_ID_Enhanced')['Text'].apply(lambda x: ' '.join(x)).tolist()
segment_embeddings = np.array(segmented_df.groupby('Segment_ID_Enhanced')['Segment_Embeddings'].first().tolist())

print(f"📊 Topic modeling data prepared:")
print(f"  • Segments for modeling: {len(segment_texts)}")
print(f"  • Embedding dimension: {segment_embeddings.shape[1]}")

# Configure BERTopic
vectorizer_model = CountVectorizer(
    stop_words=all_stopwords,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    max_features=1000
)

n_clusters = 15
umap_model = UMAP(n_neighbors=15, n_components=5, metric='cosine', random_state=42)
clustering_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)

topic_model = BERTopic(
    embedding_model=None,
    umap_model=umap_model,
    hdbscan_model=clustering_model,
    vectorizer_model=vectorizer_model,
    nr_topics=n_clusters,
    min_topic_size=15,
    verbose=True
)

print("🚀 Training BERTopic model...")
topics, probs = topic_model.fit_transform(segment_texts, embeddings=segment_embeddings)

topic_info = topic_model.get_topic_info()
print(f"\n🎯 Topics created: {len(topic_info[topic_info['Topic'] != -1])}")

2025-09-19 12:26:04,081 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


📊 Topic modeling data prepared:
  • Segments for modeling: 5952
  • Embedding dimension: 1024
🚀 Training BERTopic model...


2025-09-19 12:26:24,933 - BERTopic - Dimensionality - Completed ✓
2025-09-19 12:26:24,935 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-19 12:26:24,935 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-19 12:26:25,089 - BERTopic - Cluster - Completed ✓
2025-09-19 12:26:25,090 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-09-19 12:26:25,089 - BERTopic - Cluster - Completed ✓
2025-09-19 12:26:25,090 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-09-19 12:29:50,017 - BERTopic - Representation - Completed ✓
2025-09-19 12:29:50,179 - BERTopic - Topic reduction - Reducing number of topics
2025-09-19 12:29:50,184 - BERTopic - Topic reduction - Number of topics (15) is equal or higher than the clustered topics(15).
2025-09-19 12:29:50,186 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-19 12:29:50,017 - BERTopic - Representatio


🎯 Topics created: 15


In [18]:
# === LLM TOPIC NAME GENERATION ===

# Load environment variables
dotenv_path = os.path.join(os.pardir, '.env')
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path)
    print(f"✅ Loaded .env file from: {dotenv_path}")
else:
    print("⚠️ .env file not found. Ensure OPENAI_API_KEY is set in environment.")

def generate_topic_name_with_llm(keywords_list):
    """Generate concise topic names using OpenAI API."""
    if not isinstance(keywords_list, list) or not keywords_list:
        return ""
    
    keyword_string = ', '.join(keywords_list)
    prompt = f"""Keywords: {keyword_string}

Generate a concise topic name (1-3 words, not counting 'and') for Austrian parliamentary debate using these keywords.
Give slightly more weight to earlier keywords. Avoid words: reform, security, allocation, strategy.
Output only the name."""

    try:
        if not os.getenv('OPENAI_API_KEY'):
            print("Error: OPENAI_API_KEY not set.")
            return ""
        
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an assistant skilled at summarizing Austrian parliamentary topics from keywords."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.5,
            max_tokens=25
        )
        return response.choices[0].message.content.strip().replace('"', '').replace('\n', ' ').strip()
    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        return ""

# Generate LLM names
print("📝 Generating topic names with LLM...")
topic_info['LLM_Name'] = topic_info.apply(
    lambda row: generate_topic_name_with_llm(row['Representation']) if row['Topic'] != -1 else "Outliers",
    axis=1
)

print("\n🎯 Topic Summary:")
print(topic_info[['Topic', 'Count', 'Name', 'LLM_Name']].head(10))

✅ Loaded .env file from: ..\.env
📝 Generating topic names with LLM...

🎯 Topic Summary:
   Topic  Count                                               Name  \
0      0    438  0_removed national_jakob schwarz_gabriela schw...   
1      1    343                1_euroteam_leikam_lassing_bhmdorfer   
2      2    283  2_road pricing_driving licence_semmering base_...   
3      3    368           3_elga_nonsmokers_pharmacies_common care   
4      4    403   4_esm_debt brake_financial transaction_euro zone   
5      5    537                  5_ceta_euratom_antiatomic_temelin   
6      6    452  6_childcare allowance_child rearing_rearing_mi...   
7      7    423  7_vaccinated_corona crisis_intrusion belakovic...   
8      8    418  8_schwarzenberger_feurstein_active labour_semp...   
9      9    482         9_custom fonts_fonts_volksanwalt_addiction   

                                 LLM_Name  
0        Parliamentary Leadership Changes  
1              Euroteam and Liberal Forum  
2  Transp

## Temporal Analysis and Visualization

Analyze how topics evolve over time and create comprehensive visualizations.

In [None]:
# === TEMPORAL ANALYSIS ===
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import math

print("📅 Preparing temporal analysis...")

# Map topics back to original dataframe - FIXED
# Create a mapping from segment ID to topic ID
segment_topic_map = pd.DataFrame({
    'Segment_ID_Enhanced': segmented_df['Segment_ID_Enhanced'].unique(),
    'Topic_ID': topics
})

# Merge with segmented dataframe
embeddings_with_topics = segmented_df.merge(
    segment_topic_map, 
    on='Segment_ID_Enhanced', 
    how='left'
)

# Prepare temporal data
embeddings_with_topics['Timestamp'] = pd.to_datetime(embeddings_with_topics['Date'], errors='coerce')
embeddings_with_topics.dropna(subset=['Timestamp', 'Topic_ID'], inplace=True)
embeddings_with_topics['Year'] = embeddings_with_topics['Timestamp'].dt.year

# Create topics over time
docs_for_temporal = embeddings_with_topics['Text'].tolist()
topics_for_temporal = embeddings_with_topics['Topic_ID'].tolist()
timestamps_for_temporal = embeddings_with_topics['Year'].tolist()

print("🕐 Computing topics over time...")
topics_over_time = topic_model.topics_over_time(
    docs=docs_for_temporal,
    topics=topics_for_temporal,
    timestamps=timestamps_for_temporal,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20
)

print(f"✅ Temporal analysis complete! Prepared {len(docs_for_temporal)} documents")

📅 Preparing temporal analysis...


ValueError: cannot insert Segment_ID_Enhanced, already exists

In [None]:
# === COMPREHENSIVE VISUALIZATIONS ===

# Create topic name mapping
topic_id_to_name_map = pd.Series(topic_info.LLM_Name.values, index=topic_info.Topic).to_dict()

print("📊 Creating visualizations...")

# 1. Topic evolution over time
valid_topics = sorted([t for t in topics_over_time['Topic'].unique() if t != -1])
speech_counts = embeddings_with_topics.groupby('Topic_ID').size()

cols = 3
rows = math.ceil(len(valid_topics) / cols) if valid_topics else 1

subplot_titles = []
for topic_id in valid_topics:
    topic_name = topic_id_to_name_map.get(topic_id, f"Topic {topic_id}")
    speech_count = speech_counts.get(topic_id, 0)
    subplot_titles.append(f"{topic_name}<br>(Speeches: {speech_count})")

fig_evolution = make_subplots(
    rows=rows, cols=cols,
    subplot_titles=subplot_titles,
    shared_xaxes=True,
    vertical_spacing=0.15
)

for i, topic_id in enumerate(valid_topics):
    row = (i // cols) + 1
    col = (i % cols) + 1
    
    topic_data = topics_over_time[topics_over_time['Topic'] == topic_id]
    
    fig_evolution.add_trace(
        go.Scatter(
            x=topic_data['Timestamp'], 
            y=topic_data['Frequency'],
            mode='lines+markers',
            name=f"Topic {topic_id}",
            line=dict(width=2),
            marker=dict(size=4)
        ),
        row=row, col=col
    )

fig_evolution.update_layout(
    title_text="Topic Evolution Over Time in Austrian Parliament",
    height=350 * rows + 100,
    showlegend=False
)

for i in range(1, rows + 1):
    for j in range(1, cols + 1):
        fig_evolution.update_xaxes(title_text="Year", row=i, col=j)
        fig_evolution.update_yaxes(title_text="Frequency", row=i, col=j)

fig_evolution.show()

# 2. Topic distribution
fig_dist = px.bar(
    topic_info[topic_info['Topic'] != -1].sort_values('Count', ascending=True),
    x='Count', y='LLM_Name',
    orientation='h',
    title='Topic Distribution by Segment Count',
    labels={'Count': 'Number of Segments', 'LLM_Name': 'Topic'}
)
fig_dist.update_layout(height=max(400, len(valid_topics) * 30))
fig_dist.show()

print("🎨 Visualizations complete!")

## Model Evaluation and Quality Metrics

Comprehensive assessment of segmentation and topic modeling quality.

In [None]:
# === QUALITY METRICS ===

print("📊 Computing comprehensive quality metrics...")

# 1. Segmentation quality
avg_coherence = np.mean([m['coherence_ratio'] for m in coherence_metrics if m['coherence_ratio'] > 0])
avg_silhouette = np.mean([m['silhouette'] for m in coherence_metrics if m['silhouette'] > 0])

# 2. Topic modeling quality
avg_segments_per_topic = topic_info[topic_info['Topic'] != -1]['Count'].mean()
topic_size_std = topic_info[topic_info['Topic'] != -1]['Count'].std()

# 3. Segmentation statistics by sitting category
boundary_stats_df = pd.DataFrame(boundary_stats)
category_stats = boundary_stats_df.groupby('category').agg({
    'segments_created': ['mean', 'std'],
    'sitting_length': ['mean', 'count']
}).round(2)

# 4. Overall statistics
total_speeches = len(embeddings)
total_segments = segmented_df['Segment_ID_Enhanced'].nunique()
avg_speeches_per_segment = total_speeches / total_segments

# 5. Temporal coverage
year_range = embeddings_with_topics['Year'].max() - embeddings_with_topics['Year'].min()
topics_per_year = embeddings_with_topics.groupby('Year')['Topic_ID'].nunique().mean()

print("\n📈 Segmentation Quality:")
print(f"  • Average coherence ratio: {avg_coherence:.3f}")
print(f"  • Average silhouette score: {avg_silhouette:.3f}")
print(f"  • Total segments created: {total_segments:,}")
print(f"  • Average speeches per segment: {avg_speeches_per_segment:.1f}")

print(f"\n🏛️ Sitting Categories Performance:")
for category in ['short', 'medium', 'long']:
    if category in category_stats.index:
        stats = category_stats.loc[category]
        print(f"  • {category.capitalize()} sittings:")
        print(f"    - Count: {stats[('sitting_length', 'count')]}")
        print(f"    - Avg segments: {stats[('segments_created', 'mean')]:.1f} ± {stats[('segments_created', 'std')]:.1f}")
        print(f"    - Avg length: {stats[('sitting_length', 'mean')]:.0f} speeches")

print(f"\n🎯 Topic Modeling Quality:")
print(f"  • Topics created: {len(topic_info[topic_info['Topic'] != -1])}")
print(f"  • Average segments per topic: {avg_segments_per_topic:.1f}")
print(f"  • Topic size standard deviation: {topic_size_std:.1f}")
print(f"  • Temporal coverage: {year_range} years")
print(f"  • Average topics per year: {topics_per_year:.1f}")

# 6. Quality assessment
outlier_count = len(embeddings_with_topics[embeddings_with_topics['Topic_ID'] == -1])
outlier_percentage = (outlier_count / total_speeches) * 100
print(f"  • Outlier percentage: {outlier_percentage:.1f}%")

print(f"\n🔍 Quality Assessment:")
if avg_coherence > 0.15:
    print("  ✅ Excellent segmentation coherence")
elif avg_coherence > 0.08:
    print("  ⚠️ Good segmentation coherence")
else:
    print("  ❌ Poor segmentation coherence - consider parameter adjustment")

if outlier_percentage < 10:
    print("  ✅ Good topic clustering quality")
elif outlier_percentage < 20:
    print("  ⚠️ Moderate topic clustering quality")
else:
    print("  ❌ High outlier rate - consider adjusting clustering parameters")

# Expected vs actual segments per sitting
expected_range = (3, 8)  # Expected 3-8 segments for most sittings
actual_avg = boundary_stats_df['segments_created'].mean()
if expected_range[0] <= actual_avg <= expected_range[1]:
    print(f"  ✅ Segment count in expected range: {actual_avg:.1f}")
else:
    print(f"  ⚠️ Segment count outside expected range: {actual_avg:.1f} (expected: {expected_range[0]}-{expected_range[1]})")

In [None]:
# === SAVE RESULTS (OPTIONAL) ===
# Uncomment to save enhanced results

# print("💾 Saving enhanced results...")
# embeddings_final = embeddings_with_topics.copy()
# embeddings_final['Topic_Name'] = embeddings_final['Topic_ID'].map(topic_id_to_name_map)
# embeddings_final.to_pickle('data folder/data/AT_with_enhanced_topics.pkl')
# topic_info.to_pickle('data folder/data/topic_info_enhanced.pkl')
# print("✅ Results saved!")

print("\n🎉 Analysis complete!")
print("📋 Summary:")
print(f"  • {total_segments:,} segments from {len(boundary_stats):,} sittings")
print(f"  • {len(valid_topics)} meaningful topics identified")
print(f"  • Coherence score: {avg_coherence:.3f}")
print(f"  • Temporal span: {year_range} years")