In [1]:
import sys
sys.path.insert(0, '..')

import polars as pl
from pathlib import Path

# Load preprocessed data
data_file = Path('../data/processed/df_sample_split_preprocessed_topic.parquet')
df = pl.read_parquet(data_file)

print(f"Preprocessed data loaded!")
print(f"Shape: {df.shape}")
print(f"\nColumn names: {df.columns}")
print(f"\nData types:")
print(df.schema)
print(f"\nFirst few rows:")
print(df.head())

Preprocessed data loaded!
Shape: (412, 23)

Column names: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'tokens', 'token_count', 'tokens_no_stopwords', 'token_count_no_stopwords', 'tokens_clean', 'token_count_clean', 'tokens_lemma', 'token_count_lemma']

Data types:
Schema([('id', Int64), ('session', Int64), ('electoralTerm', Int64), ('firstName', String), ('lastName', String), ('politicianId', Int64), ('speechContent', String), ('factionId', Int64), ('documentUrl', String), ('positionShort', String), ('positionLong', String), ('date', String), ('speech_length', Int64), ('paragraph_number', Int64), ('paragraph_length', Int64), ('tokens', List(String)), ('token_count', UInt32), ('tokens_no_stopwords', List(String)), ('token_count_no_stopwords', UInt32), ('tokens_clean', List(String)), ('token_count_clean', UInt32), (

In [2]:
# Join tokens back into full strings using native Polars (much faster than map_elements)
import polars as pl

# Use native Polars list.join() instead of slow map_elements
df = df.with_columns(
    pl.col('tokens_lemma').list.join(' ').alias('text_lemmatized')
)

# Also create versions from other token types for comparison
df = df.with_columns(
    pl.col('tokens_clean').list.join(' ').alias('text_clean')
)

df = df.with_columns(
    pl.col('tokens_no_stopwords').list.join(' ').alias('text_no_stopwords')
)

print("Tokens joined into full strings!")
print(f"\nDataframe shape: {df.shape}")
print(f"New columns: {df.columns}")

print(f"\nSample text from lemmatized tokens (first paragraph):")
print(f"  {df['text_lemmatized'][0]}")

Tokens joined into full strings!

Dataframe shape: (412, 26)
New columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'tokens', 'token_count', 'tokens_no_stopwords', 'token_count_no_stopwords', 'tokens_clean', 'token_count_clean', 'tokens_lemma', 'token_count_lemma', 'text_lemmatized', 'text_clean', 'text_no_stopwords']

Sample text from lemmatized tokens (first paragraph):
  Frau Präsidentin Dame Herr Herr


In [3]:
# TF-IDF Vectorization of lemmatized texts
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

print("TF-IDF Vectorization")
print("Converting lemmatized texts to TF-IDF vectors...")

# Initialize TfidfVectorizer with German-specific parameters
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,          # Limit vocabulary to top 1000 features
    min_df=2,                   # Minimum document frequency
    max_df=0.8,                 # Maximum document frequency (80% of docs)
    ngram_range=(1, 2),         # Use unigrams and bigrams
    sublinear_tf=True,          # Apply sublinear term frequency scaling
    norm='l2'                   # L2 normalization
)

# Convert lemmatized texts to TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_lemmatized'])

print(f"\nTF-IDF Vectorization complete!")
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"  Samples (documents): {tfidf_matrix.shape[0]}")
print(f"  Features (vocabulary): {tfidf_matrix.shape[1]}")
print(f"  Sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%")

# Get feature names (vocabulary)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
print(f"\nVocabulary size: {len(feature_names)}")
print(f"Sample features: {feature_names[:20]}")

# Convert sparse matrix to dense for inspection
tfidf_dense = tfidf_matrix.toarray()

# Show top TF-IDF terms for first document
print(f"\nTop 10 TF-IDF terms for first document:")
top_indices = tfidf_dense[0].argsort()[-10:][::-1]
for idx in top_indices:
    print(f"  {feature_names[idx]}: {tfidf_dense[0, idx]:.4f}")

# Store TF-IDF matrix and vectorizer for later use
print(f"\nTF-IDF vectorizer and matrix ready for topic modeling!")

TF-IDF Vectorization
Converting lemmatized texts to TF-IDF vectors...

TF-IDF Vectorization complete!
TF-IDF matrix shape: (412, 1000)
  Samples (documents): 412
  Features (vocabulary): 1000
  Sparsity: 98.49%

Vocabulary size: 1000
Sample features: ['abgeordneter' 'abgeordneter dr' 'abgeordneter frau' 'abs' 'abschließen'
 'abschließend' 'absehen' 'absolut' 'abstimmung' 'aktiv' 'aktuell' 'all'
 'allgemein' 'alt' 'alternative' 'amerikanisch' 'amnesty'
 'amnesty international' 'anbieten' 'anderer']

Top 10 TF-IDF terms for first document:
  präsidentin: 0.4746
  frau präsidentin: 0.4746
  herr: 0.4165
  frau: 0.4011
  dame herr: 0.3279
  dame: 0.3279
  überzeugung: 0.0000
  überweisungsvorschlag: 0.0000
  überprüfung: 0.0000
  ansatz: 0.0000

TF-IDF vectorizer and matrix ready for topic modeling!


In [4]:
# LDA Topic Modeling with optimized parameters
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

print("LDA Topic Modeling")
print("Training LDA model with 20 topics...")

# LDA requires CountVectorizer, not TF-IDF
# Create CountVectorizer with the specified parameters
count_vectorizer = CountVectorizer(
    max_features=1000,
    min_df=10,               # Minimum document frequency (seltene Wörter raus)
    max_df=0.90,            # Maximum document frequency (sehr häufige Wörter raus)
    ngram_range=(1, 2),     # Unigrams and bigrams
    stop_words='english'    # Basic English stopwords (additional filter)
)

# Fit and transform the texts
doc_term_matrix = count_vectorizer.fit_transform(df['text_lemmatized'])

print(f"\nDocument-Term Matrix created!")
print(f"Matrix shape: {doc_term_matrix.shape}")
print(f"  Documents: {doc_term_matrix.shape[0]}")
print(f"  Terms (vocabulary): {doc_term_matrix.shape[1]}")
print(f"  Sparsity: {(1 - doc_term_matrix.nnz / (doc_term_matrix.shape[0] * doc_term_matrix.shape[1])) * 100:.2f}%")

# Initialize and train LDA model
n_topics = 30
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    max_iter=30,
    learning_method='online',
    n_jobs=-1,              # Use all available cores
    verbose=1
)

print(f"\nTraining LDA model with {n_topics} topics...")
lda_model.fit(doc_term_matrix)

print(f"\nLDA Model Training Complete!")
print(f"Model parameters:")
print(f"  Number of topics: {n_topics}")
print(f"  Number of iterations: {lda_model.max_iter}")
print(f"  Perplexity: {lda_model.perplexity(doc_term_matrix):.4f}")
print(f"  Score: {lda_model.score(doc_term_matrix):.4f}")

# Get feature names
feature_names = np.array(count_vectorizer.get_feature_names_out())

# Display top terms for each topic
print(f"\n" + "="*80)
print("TOP 10 TERMS FOR EACH TOPIC")
print("="*80)

n_top_words = 10
for topic_idx, topic in enumerate(lda_model.components_):
    top_words_idx = topic.argsort()[-n_top_words:][::-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"\nTopic {topic_idx + 1:2d}: {', '.join(top_words)}")

print(f"\n" + "="*80)
print("LDA Model ready for analysis!")

LDA Topic Modeling
Training LDA model with 20 topics...

Document-Term Matrix created!
Matrix shape: (412, 147)
  Documents: 412
  Terms (vocabulary): 147
  Sparsity: 95.95%

Training LDA model with 30 topics...
iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_iter: 30
iteration: 5 of max_iter: 30
iteration: 6 of max_iter: 30
iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max_iter: 30
iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of 

In [5]:
%pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Coherence Score Berechnung
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

print("Coherence Score Berechnung")
print("Calculating coherence metrics for the LDA model...")

# Calculate Perplexity
perplexity = lda_model.perplexity(doc_term_matrix)
print(f"\nPerplexity Score: {perplexity:.4f}")
print(f"  (Interpretation: niedriger ist besser)")

# Calculate topic coherence manually
# Method: Measure similarity between top words in each topic
def calculate_topic_coherence(lda_model, doc_term_matrix, feature_names, top_n=10):
    """
    Calculate coherence of topics based on co-occurrence of top words
    """
    # Convert to dense for easier computation
    dtm_dense = doc_term_matrix.toarray()
    
    coherence_scores = []
    
    for topic_idx, topic in enumerate(lda_model.components_):
        # Get top words for this topic
        top_word_indices = topic.argsort()[-top_n:][::-1]
        
        # Calculate pairwise similarity between top words based on document co-occurrence
        word_vectors = dtm_dense[:, top_word_indices]
        
        # Calculate cosine similarity between word vectors
        if word_vectors.shape[1] > 1:
            similarity_matrix = cosine_similarity(word_vectors.T)
            # Get average similarity (excluding diagonal)
            np.fill_diagonal(similarity_matrix, 0)
            avg_similarity = similarity_matrix.sum() / (word_vectors.shape[1] * (word_vectors.shape[1] - 1))
            coherence_scores.append(avg_similarity)
    
    return np.mean(coherence_scores), coherence_scores

# Get feature names from count vectorizer
feature_names = np.array(count_vectorizer.get_feature_names_out())

# Calculate coherence
avg_coherence, topic_coherences = calculate_topic_coherence(lda_model, doc_term_matrix, feature_names)

print(f"\n" + "="*80)
print("COHERENCE SCORES")
print("="*80)
print(f"\nAverage Topic Coherence: {avg_coherence:.4f}")
print(f"  (Range: 0-1, höher ist besser. >0.5 ist akzeptabel)")

# Show individual topic coherence scores
print(f"\nCoherence scores by topic:")
for topic_idx, coherence in enumerate(topic_coherences):
    print(f"  Topic {topic_idx + 1:2d}: {coherence:.4f}")

# Calculate log-likelihood per document
log_likelihood = lda_model.score(doc_term_matrix)
print(f"\nLog-Likelihood per document: {log_likelihood / doc_term_matrix.shape[0]:.4f}")

print(f"\n" + "="*80)
print("MODEL EVALUATION SUMMARY")
print("="*80)
print(f"Number of Topics: 20")
print(f"Number of Documents: {doc_term_matrix.shape[0]}")
print(f"Vocabulary Size: {doc_term_matrix.shape[1]}")
print(f"\nPerplexity: {perplexity:.4f}")
print(f"Average Topic Coherence: {avg_coherence:.4f}")
print(f"Log-Likelihood: {log_likelihood:.4f}")
print(f"\nInterpretation: {'✓ Gutes Modell' if avg_coherence > 0.5 else '✗ Modell könnte verbessert werden'}")

Coherence Score Berechnung
Calculating coherence metrics for the LDA model...

Perplexity Score: 195.2157
  (Interpretation: niedriger ist besser)

COHERENCE SCORES

Average Topic Coherence: 0.2386
  (Range: 0-1, höher ist besser. >0.5 ist akzeptabel)

Coherence scores by topic:
  Topic  1: 0.1908
  Topic  2: 0.2021
  Topic  3: 0.1798
  Topic  4: 0.2491
  Topic  5: 0.3895
  Topic  6: 0.1731
  Topic  7: 0.2179
  Topic  8: 0.2304
  Topic  9: 0.1315
  Topic 10: 0.2246
  Topic 11: 0.2804
  Topic 12: 0.2359
  Topic 13: 0.2697
  Topic 14: 0.1731
  Topic 15: 0.1509
  Topic 16: 0.3098
  Topic 17: 0.2410
  Topic 18: 0.2432
  Topic 19: 0.1295
  Topic 20: 0.2375
  Topic 21: 0.3918
  Topic 22: 0.2812
  Topic 23: 0.2027
  Topic 24: 0.3143
  Topic 25: 0.3473
  Topic 26: 0.1716
  Topic 27: 0.1568
  Topic 28: 0.1710
  Topic 29: 0.2150
  Topic 30: 0.4466

Log-Likelihood per document: -45.2139

MODEL EVALUATION SUMMARY
Number of Topics: 20
Number of Documents: 412
Vocabulary Size: 147

Perplexity: 195.2

In [7]:
# Save LDA Topic Model Results to DataFrames using Polars (with all metadata)
import polars as pl
from pathlib import Path
import json
import numpy as np

print("="*80)
print("SAVING LDA TOPIC MODEL RESULTS")
print("="*80)

# 1. Get topic assignments for each document
print("\n1. Extracting topic assignments for each document...")
doc_topic_dist = lda_model.transform(doc_term_matrix)  # Shape: (n_docs, n_topics)

# Get dominant topic for each document
dominant_topics = doc_topic_dist.argmax(axis=1)
dominant_topic_prob = doc_topic_dist.max(axis=1)

# Get all metadata columns from original dataframe
metadata_cols = ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 
                 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 
                 'speech_length', 'paragraph_number', 'paragraph_length', 'speechContent']
df_metadata = df.select(metadata_cols)

# Create Polars dataframe with document-level topic assignments and all metadata
df_lda_topics_docs = df_metadata.with_columns(
    pl.Series('dominant_topic', dominant_topics.tolist()),
    pl.Series('dominant_topic_prob', dominant_topic_prob.tolist())
)

print(f"Document-topic assignments shape: {df_lda_topics_docs.shape}")
print(f"Columns: {df_lda_topics_docs.columns}")
print(f"\nFirst 10 rows:")
print(df_lda_topics_docs.head(10))

# 2. Get top terms for each topic
print(f"\n2. Extracting top terms for each topic...")
n_top_words = 15
topics_terms_list = []

for topic_idx, topic_components in enumerate(lda_model.components_):
    # Get indices of top words
    top_words_idx = topic_components.argsort()[-n_top_words:][::-1]
    
    # Get words and their weights
    top_words = feature_names[top_words_idx]
    top_weights = topic_components[top_words_idx]
    
    topics_terms_list.append({
        'topic': topic_idx,
        'top_terms': ', '.join(top_words),
        'top_terms_list': json.dumps(top_words.tolist()),
        'weights': json.dumps(top_weights.tolist())
    })

df_lda_topics_terms = pl.DataFrame(topics_terms_list)

print(f"Topics-terms dataframe shape: {df_lda_topics_terms.shape}")
print(f"\nFirst 5 topics:")
print(df_lda_topics_terms.select(['topic', 'top_terms']).head())

# 3. Create topic quality metrics dataframe
print(f"\n3. Creating topic quality metrics...")
df_lda_topic_metrics = pl.DataFrame({
    'topic': list(range(n_topics)),
    'coherence_score': topic_coherences
})

print(f"Topic metrics shape: {df_lda_topic_metrics.shape}")
print(f"\nTopic metrics:")
print(df_lda_topic_metrics.head())

# 4. Save all LDA results to parquet and csv
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Save document-topic assignments with all metadata
parquet_path_docs = output_dir / 'topic_document_assignments_lda.parquet'
csv_path_docs = output_dir / 'topic_document_assignments_lda.csv'

df_lda_topics_docs.write_parquet(parquet_path_docs)
df_lda_topics_docs.write_csv(csv_path_docs)

print(f"\n✓ LDA Document-topic assignments saved (with all metadata):")
print(f"  Parquet: {parquet_path_docs}")
print(f"  CSV: {csv_path_docs}")
print(f"  Total columns: {df_lda_topics_docs.width}")

# Save topic terms
parquet_path_terms = output_dir / 'topic_terms_lda.parquet'
csv_path_terms = output_dir / 'topic_terms_lda.csv'

df_lda_topics_terms.write_parquet(parquet_path_terms)
df_lda_topics_terms.select(['topic', 'top_terms']).write_csv(csv_path_terms)

print(f"\n✓ LDA Topic terms saved:")
print(f"  Parquet: {parquet_path_terms}")
print(f"  CSV: {csv_path_terms}")

# Save topic metrics
parquet_path_metrics = output_dir / 'topic_metrics_lda.parquet'
csv_path_metrics = output_dir / 'topic_metrics_lda.csv'

df_lda_topic_metrics.write_parquet(parquet_path_metrics)
df_lda_topic_metrics.write_csv(csv_path_metrics)

print(f"\n✓ LDA Topic metrics saved:")
print(f"  Parquet: {parquet_path_metrics}")
print(f"  CSV: {csv_path_metrics}")

print(f"\n" + "="*80)
print("LDA TOPIC MODEL RESULTS SAVED SUCCESSFULLY!")
print("="*80)

SAVING LDA TOPIC MODEL RESULTS

1. Extracting topic assignments for each document...
Document-topic assignments shape: (412, 17)
Columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'speechContent', 'dominant_topic', 'dominant_topic_prob']

First 10 rows:
shape: (10, 17)
┌────────┬─────────┬────────────┬───────────┬───┬────────────┬────────────┬────────────┬───────────┐
│ id     ┆ session ┆ electoralT ┆ firstName ┆ … ┆ paragraph_ ┆ speechCont ┆ dominant_t ┆ dominant_ │
│ ---    ┆ ---     ┆ erm        ┆ ---       ┆   ┆ length     ┆ ent        ┆ opic       ┆ topic_pro │
│ i64    ┆ i64     ┆ ---        ┆ str       ┆   ┆ ---        ┆ ---        ┆ ---        ┆ b         │
│        ┆         ┆ i64        ┆           ┆   ┆ i64        ┆ str        ┆ i64        ┆ ---       │
│        ┆         ┆            ┆           ┆   ┆            ┆   

# Alternative Topic Classification: ParlBERT-Topic-German

The LDA topic modeling results above showed limited coherence and interpretability. The topics were not well-defined and difficult to assign meaningful labels.

As an alternative, we use the **ParlBERT-Topic-German** model from HuggingFace ([chkla/parlbert-topic-german](https://huggingface.co/chkla/parlbert-topic-german)), which is a fine-tuned BERT model specifically trained for German parliamentary speech topic classification. This model classifies text into predefined political topic categories, providing more interpretable and consistent results for parliamentary speeches.

In [8]:
# Install transformers if not already installed
%pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [9]:
# ParlBERT-Topic-German Classification
from transformers import pipeline
from tqdm import tqdm
import polars as pl
from pathlib import Path

print("="*80)
print("PARLBERT-TOPIC-GERMAN CLASSIFICATION")
print("="*80)

# Initialize the classification pipeline
print("\nLoading ParlBERT-Topic-German model...")
pipeline_classification_topics = pipeline(
    "text-classification", 
    model="chkla/parlbert-topic-german", 
    return_all_scores=False
)
print("✓ Model loaded successfully!")

# Test the model with an example
test_text = "Das Sachgebiet Investive Ausgaben des Bundes Bundesfinanzminister Apel hat gemäß BMF Finanznachrichten vom 1. Januar erklärt, die Investitionsquote des Bundes sei in den letzten zehn Jahren nahezu konstant geblieben."
test_result = pipeline_classification_topics(test_text)
print(f"\nTest classification:")
print(f"  Text: {test_text[:80]}...")
print(f"  Predicted topic: {test_result[0]['label']}")
print(f"  Confidence: {test_result[0]['score']:.4f}")

  from .autonotebook import tqdm as notebook_tqdm


PARLBERT-TOPIC-GERMAN CLASSIFICATION

Loading ParlBERT-Topic-German model...


Device set to use cpu


✓ Model loaded successfully!

Test classification:
  Text: Das Sachgebiet Investive Ausgaben des Bundes Bundesfinanzminister Apel hat gemäß...
  Predicted topic: Macroeconomics
  Confidence: 0.9967


In [10]:
# Classify all documents using ParlBERT
print("\nClassifying all documents...")
print(f"Total documents to classify: {df.shape[0]}")

# Use the original speechContent for classification (not lemmatized)
# The model expects natural German text
texts = df['speechContent'].to_list()

# Classify in batches for efficiency
# Use truncation=True to let the tokenizer handle the 512 token limit properly
results = []
batch_size = 16  # Reduced batch size for stability

for i in tqdm(range(0, len(texts), batch_size), desc="Classifying"):
    batch_texts = texts[i:i+batch_size]
    # Handle None/empty texts
    batch_texts_clean = [text if text else "" for text in batch_texts]
    # Use truncation=True to properly handle the 512 token limit
    batch_results = pipeline_classification_topics(batch_texts_clean, truncation=True, max_length=512)
    results.extend(batch_results)

# Extract labels and scores
topic_labels = [r['label'] for r in results]
topic_scores = [r['score'] for r in results]

print(f"\n✓ Classification complete!")
print(f"  Documents classified: {len(results)}")


Classifying all documents...
Total documents to classify: 412


Classifying: 100%|██████████| 26/26 [01:31<00:00,  3.53s/it]


✓ Classification complete!
  Documents classified: 412





In [11]:
# Analyze topic distribution
print("\n" + "="*80)
print("TOPIC DISTRIBUTION ANALYSIS")
print("="*80)

# Get unique topics and their counts
topic_counts = pl.DataFrame({'topic_label': topic_labels}).group_by('topic_label').len().sort('len', descending=True)

print("\nTopic distribution:")
for row in topic_counts.iter_rows(named=True):
    percentage = (row['len'] / len(topic_labels)) * 100
    print(f"  {row['topic_label']:30s}: {row['len']:5d} documents ({percentage:5.2f}%)")

print(f"\nTotal unique topics: {topic_counts.shape[0]}")


TOPIC DISTRIBUTION ANALYSIS

Topic distribution:
  Government                    :   180 documents (43.69%)
  Civil                         :    86 documents (20.87%)
  International                 :    22 documents ( 5.34%)
  Law                           :    19 documents ( 4.61%)
  Macroeconomics                :    19 documents ( 4.61%)
  Labor                         :    16 documents ( 3.88%)
  Social                        :    15 documents ( 3.64%)
  Defense                       :    11 documents ( 2.67%)
  Agriculture                   :    10 documents ( 2.43%)
  Domestic                      :     8 documents ( 1.94%)
  Environment                   :     7 documents ( 1.70%)
  Technology                    :     5 documents ( 1.21%)
  Housing                       :     4 documents ( 0.97%)
  Health                        :     4 documents ( 0.97%)
  Education                     :     2 documents ( 0.49%)
  Energy                        :     2 documents ( 0.49%)
  Fore

In [12]:
# Save ParlBERT Classification Results to DataFrames
print("\n" + "="*80)
print("SAVING PARLBERT TOPIC CLASSIFICATION RESULTS")
print("="*80)

# Get all metadata columns from original dataframe
metadata_cols = ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 
                 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 
                 'speech_length', 'paragraph_number', 'paragraph_length', 'speechContent']
df_metadata = df.select(metadata_cols)

# Create numeric topic IDs for compatibility with existing analysis
unique_topics = sorted(list(set(topic_labels)))
topic_to_id = {topic: idx for idx, topic in enumerate(unique_topics)}
topic_ids = [topic_to_id[label] for label in topic_labels]

# Create Polars dataframe with document-level topic assignments and all metadata
df_topics_docs = df_metadata.with_columns(
    pl.Series('dominant_topic', topic_ids),
    pl.Series('dominant_topic_prob', topic_scores),
    pl.Series('topic_label', topic_labels)
)

print(f"\nDocument-topic assignments shape: {df_topics_docs.shape}")
print(f"Columns: {df_topics_docs.columns}")
print(f"\nFirst 10 rows (selected columns):")
print(df_topics_docs.select(['id', 'firstName', 'lastName', 'dominant_topic', 'topic_label', 'dominant_topic_prob']).head(10))

# Create topic terms/labels dataframe (mapping topic IDs to their labels)
df_topics_terms = pl.DataFrame({
    'topic': list(range(len(unique_topics))),
    'top_terms': unique_topics,  # For ParlBERT, the "top_terms" is the topic label itself
    'topic_label': unique_topics
})

print(f"\nTopics terms/labels dataframe shape: {df_topics_terms.shape}")
print(f"\nTopic mapping:")
print(df_topics_terms)

# Create topic metrics dataframe (confidence scores per topic)
df_topic_stats = df_topics_docs.group_by('dominant_topic').agg([
    pl.col('dominant_topic_prob').mean().alias('mean_confidence'),
    pl.col('dominant_topic_prob').std().alias('std_confidence'),
    pl.len().alias('document_count')
]).sort('dominant_topic')

# Join with topic labels
df_topic_metrics = df_topic_stats.join(
    df_topics_terms.select(['topic', 'topic_label']),
    left_on='dominant_topic',
    right_on='topic',
    how='left'
).select(['dominant_topic', 'topic_label', 'mean_confidence', 'std_confidence', 'document_count'])
df_topic_metrics = df_topic_metrics.rename({'dominant_topic': 'topic'})

print(f"\nTopic metrics shape: {df_topic_metrics.shape}")
print(f"\nTopic metrics:")
print(df_topic_metrics)


SAVING PARLBERT TOPIC CLASSIFICATION RESULTS

Document-topic assignments shape: (412, 18)
Columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'speechContent', 'dominant_topic', 'dominant_topic_prob', 'topic_label']

First 10 rows (selected columns):
shape: (10, 6)
┌────────┬───────────┬──────────┬────────────────┬─────────────┬─────────────────────┐
│ id     ┆ firstName ┆ lastName ┆ dominant_topic ┆ topic_label ┆ dominant_topic_prob │
│ ---    ┆ ---       ┆ ---      ┆ ---            ┆ ---         ┆ ---                 │
│ i64    ┆ str       ┆ str      ┆ i64            ┆ str         ┆ f64                 │
╞════════╪═══════════╪══════════╪════════════════╪═════════════╪═════════════════════╡
│ 738998 ┆ burkhard  ┆ lischka  ┆ 8              ┆ Government  ┆ 0.942619            │
│ 738998 ┆ burkhard  ┆ lischka  ┆ 8              ┆ Go

In [13]:
# Save all BERT results to parquet and csv
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Save document-topic assignments with all metadata (main result for merging)
parquet_path_docs = output_dir / 'topic_document_assignments_bert.parquet'
csv_path_docs = output_dir / 'topic_document_assignments_bert.csv'

df_topics_docs.write_parquet(parquet_path_docs)
df_topics_docs.write_csv(csv_path_docs)

print(f"\n✓ BERT Document-topic assignments saved (with all metadata):")
print(f"  Parquet: {parquet_path_docs}")
print(f"  CSV: {csv_path_docs}")
print(f"  Total columns: {df_topics_docs.width}")

# Save topic terms/labels
parquet_path_terms = output_dir / 'topic_terms_bert.parquet'
csv_path_terms = output_dir / 'topic_terms_bert.csv'

df_topics_terms.write_parquet(parquet_path_terms)
df_topics_terms.write_csv(csv_path_terms)

print(f"\n✓ BERT Topic terms/labels saved:")
print(f"  Parquet: {parquet_path_terms}")
print(f"  CSV: {csv_path_terms}")

# Save topic metrics
parquet_path_metrics = output_dir / 'topic_metrics_bert.parquet'
csv_path_metrics = output_dir / 'topic_metrics_bert.csv'

df_topic_metrics.write_parquet(parquet_path_metrics)
df_topic_metrics.write_csv(csv_path_metrics)

print(f"\n✓ BERT Topic metrics saved:")
print(f"  Parquet: {parquet_path_metrics}")
print(f"  CSV: {csv_path_metrics}")

print(f"\n" + "="*80)
print("PARLBERT TOPIC CLASSIFICATION RESULTS SAVED SUCCESSFULLY!")
print("="*80)


✓ BERT Document-topic assignments saved (with all metadata):
  Parquet: ..\data\processed\topic_document_assignments_bert.parquet
  CSV: ..\data\processed\topic_document_assignments_bert.csv
  Total columns: 18

✓ BERT Topic terms/labels saved:
  Parquet: ..\data\processed\topic_terms_bert.parquet
  CSV: ..\data\processed\topic_terms_bert.csv

✓ BERT Topic metrics saved:
  Parquet: ..\data\processed\topic_metrics_bert.parquet
  CSV: ..\data\processed\topic_metrics_bert.csv

PARLBERT TOPIC CLASSIFICATION RESULTS SAVED SUCCESSFULLY!
