In [1]:
import sys
sys.path.insert(0, '..')

import polars as pl
from pathlib import Path

# Load preprocessed data
data_file = Path('../data/processed/df_sample_split_preprocessed_topic.parquet')
df = pl.read_parquet(data_file)

print(f"Preprocessed data loaded!")
print(f"Shape: {df.shape}")
print(f"\nColumn names: {df.columns}")
print(f"\nData types:")
print(df.schema)
print(f"\nFirst few rows:")
print(df.head())

Preprocessed data loaded!
Shape: (412, 23)

Column names: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'tokens', 'token_count', 'tokens_no_stopwords', 'token_count_no_stopwords', 'tokens_clean', 'token_count_clean', 'tokens_lemma', 'token_count_lemma']

Data types:
Schema([('id', Int64), ('session', Int64), ('electoralTerm', Int64), ('firstName', String), ('lastName', String), ('politicianId', Int64), ('speechContent', String), ('factionId', Int64), ('documentUrl', String), ('positionShort', String), ('positionLong', String), ('date', String), ('speech_length', Int64), ('paragraph_number', Int64), ('paragraph_length', Int64), ('tokens', List(String)), ('token_count', UInt32), ('tokens_no_stopwords', List(String)), ('token_count_no_stopwords', UInt32), ('tokens_clean', List(String)), ('token_count_clean', UInt32), (

In [None]:
# Join tokens back into full strings
import polars as pl

# Create a function to join tokens
def join_tokens(tokens):
    """Join a list of tokens into a single string."""
    if tokens is None or len(tokens) == 0:
        return ""
    return ' '.join(tokens)

# Apply to create text from lemmatized tokens
df = df.with_columns(
    pl.col('tokens_lemma').map_elements(join_tokens, return_dtype=pl.Utf8).alias('text_lemmatized')
)

# Also create versions from other token types for comparison
df = df.with_columns(
    pl.col('tokens_clean').map_elements(join_tokens, return_dtype=pl.Utf8).alias('text_clean')
)

df = df.with_columns(
    pl.col('tokens_no_stopwords').map_elements(join_tokens, return_dtype=pl.Utf8).alias('text_no_stopwords')
)

print("Tokens joined into full strings!")
print(f"\nDataframe shape: {df.shape}")
print(f"New columns: {df.columns}")

print(f"\nSample text from lemmatized tokens (first paragraph):")
print(f"  {df['text_lemmatized'][0]}")



Tokens joined into full strings!

Dataframe shape: (412, 26)
New columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'tokens', 'token_count', 'tokens_no_stopwords', 'token_count_no_stopwords', 'tokens_clean', 'token_count_clean', 'tokens_lemma', 'token_count_lemma', 'text_lemmatized', 'text_clean', 'text_no_stopwords']

Sample text from lemmatized tokens (first paragraph):
  Frau Präsidentin Dame Herr Herr

Sample text from clean tokens (first paragraph):
  Frau Präsidentin Damen Herren Herr


In [5]:
# TF-IDF Vectorization of lemmatized texts
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

print("TF-IDF Vectorization")
print("Converting lemmatized texts to TF-IDF vectors...")

# Initialize TfidfVectorizer with German-specific parameters
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,          # Limit vocabulary to top 1000 features
    min_df=2,                   # Minimum document frequency
    max_df=0.8,                 # Maximum document frequency (80% of docs)
    ngram_range=(1, 2),         # Use unigrams and bigrams
    sublinear_tf=True,          # Apply sublinear term frequency scaling
    norm='l2'                   # L2 normalization
)

# Convert lemmatized texts to TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_lemmatized'])

print(f"\nTF-IDF Vectorization complete!")
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"  Samples (documents): {tfidf_matrix.shape[0]}")
print(f"  Features (vocabulary): {tfidf_matrix.shape[1]}")
print(f"  Sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%")

# Get feature names (vocabulary)
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
print(f"\nVocabulary size: {len(feature_names)}")
print(f"Sample features: {feature_names[:20]}")

# Convert sparse matrix to dense for inspection
tfidf_dense = tfidf_matrix.toarray()

# Show top TF-IDF terms for first document
print(f"\nTop 10 TF-IDF terms for first document:")
top_indices = tfidf_dense[0].argsort()[-10:][::-1]
for idx in top_indices:
    print(f"  {feature_names[idx]}: {tfidf_dense[0, idx]:.4f}")

# Store TF-IDF matrix and vectorizer for later use
print(f"\nTF-IDF vectorizer and matrix ready for topic modeling!")

TF-IDF Vectorization
Converting lemmatized texts to TF-IDF vectors...

TF-IDF Vectorization complete!
TF-IDF matrix shape: (412, 1000)
  Samples (documents): 412
  Features (vocabulary): 1000
  Sparsity: 98.49%

Vocabulary size: 1000
Sample features: ['abgeordneter' 'abgeordneter dr' 'abgeordneter frau' 'abs' 'abschließen'
 'abschließend' 'absehen' 'absolut' 'abstimmung' 'aktiv' 'aktuell' 'all'
 'allgemein' 'alt' 'alternative' 'amerikanisch' 'amnesty'
 'amnesty international' 'anbieten' 'anderer']

Top 10 TF-IDF terms for first document:
  präsidentin: 0.4746
  frau präsidentin: 0.4746
  herr: 0.4165
  frau: 0.4011
  dame herr: 0.3279
  dame: 0.3279
  überzeugung: 0.0000
  überweisungsvorschlag: 0.0000
  überprüfung: 0.0000
  ansatz: 0.0000

TF-IDF vectorizer and matrix ready for topic modeling!


In [17]:
# LDA Topic Modeling with optimized parameters
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

print("LDA Topic Modeling")
print("Training LDA model with 20 topics...")

# LDA requires CountVectorizer, not TF-IDF
# Create CountVectorizer with the specified parameters
count_vectorizer = CountVectorizer(
    max_features=1000,
    min_df=10,               # Minimum document frequency (seltene Wörter raus)
    max_df=0.90,            # Maximum document frequency (sehr häufige Wörter raus)
    ngram_range=(1, 2),     # Unigrams and bigrams
    stop_words='english'    # Basic English stopwords (additional filter)
)

# Fit and transform the texts
doc_term_matrix = count_vectorizer.fit_transform(df['text_lemmatized'])

print(f"\nDocument-Term Matrix created!")
print(f"Matrix shape: {doc_term_matrix.shape}")
print(f"  Documents: {doc_term_matrix.shape[0]}")
print(f"  Terms (vocabulary): {doc_term_matrix.shape[1]}")
print(f"  Sparsity: {(1 - doc_term_matrix.nnz / (doc_term_matrix.shape[0] * doc_term_matrix.shape[1])) * 100:.2f}%")

# Initialize and train LDA model
n_topics = 30
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    max_iter=30,
    learning_method='online',
    n_jobs=-1,              # Use all available cores
    verbose=1
)

print(f"\nTraining LDA model with {n_topics} topics...")
lda_model.fit(doc_term_matrix)

print(f"\nLDA Model Training Complete!")
print(f"Model parameters:")
print(f"  Number of topics: {n_topics}")
print(f"  Number of iterations: {lda_model.max_iter}")
print(f"  Perplexity: {lda_model.perplexity(doc_term_matrix):.4f}")
print(f"  Score: {lda_model.score(doc_term_matrix):.4f}")

# Get feature names
feature_names = np.array(count_vectorizer.get_feature_names_out())

# Display top terms for each topic
print(f"\n" + "="*80)
print("TOP 10 TERMS FOR EACH TOPIC")
print("="*80)

n_top_words = 10
for topic_idx, topic in enumerate(lda_model.components_):
    top_words_idx = topic.argsort()[-n_top_words:][::-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"\nTopic {topic_idx + 1:2d}: {', '.join(top_words)}")

print(f"\n" + "="*80)
print("LDA Model ready for analysis!")

LDA Topic Modeling
Training LDA model with 20 topics...

Document-Term Matrix created!
Matrix shape: (412, 147)
  Documents: 412
  Terms (vocabulary): 147
  Sparsity: 95.95%

Training LDA model with 30 topics...
iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_iter: 30
iteration: 5 of max_iter: 30
iteration: 6 of max_iter: 30
iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max_iter: 30
iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of 

In [13]:
%pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [18]:
# Coherence Score Berechnung
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

print("Coherence Score Berechnung")
print("Calculating coherence metrics for the LDA model...")

# Calculate Perplexity
perplexity = lda_model.perplexity(doc_term_matrix)
print(f"\nPerplexity Score: {perplexity:.4f}")
print(f"  (Interpretation: niedriger ist besser)")

# Calculate topic coherence manually
# Method: Measure similarity between top words in each topic
def calculate_topic_coherence(lda_model, doc_term_matrix, feature_names, top_n=10):
    """
    Calculate coherence of topics based on co-occurrence of top words
    """
    # Convert to dense for easier computation
    dtm_dense = doc_term_matrix.toarray()
    
    coherence_scores = []
    
    for topic_idx, topic in enumerate(lda_model.components_):
        # Get top words for this topic
        top_word_indices = topic.argsort()[-top_n:][::-1]
        
        # Calculate pairwise similarity between top words based on document co-occurrence
        word_vectors = dtm_dense[:, top_word_indices]
        
        # Calculate cosine similarity between word vectors
        if word_vectors.shape[1] > 1:
            similarity_matrix = cosine_similarity(word_vectors.T)
            # Get average similarity (excluding diagonal)
            np.fill_diagonal(similarity_matrix, 0)
            avg_similarity = similarity_matrix.sum() / (word_vectors.shape[1] * (word_vectors.shape[1] - 1))
            coherence_scores.append(avg_similarity)
    
    return np.mean(coherence_scores), coherence_scores

# Get feature names from count vectorizer
feature_names = np.array(count_vectorizer.get_feature_names_out())

# Calculate coherence
avg_coherence, topic_coherences = calculate_topic_coherence(lda_model, doc_term_matrix, feature_names)

print(f"\n" + "="*80)
print("COHERENCE SCORES")
print("="*80)
print(f"\nAverage Topic Coherence: {avg_coherence:.4f}")
print(f"  (Range: 0-1, höher ist besser. >0.5 ist akzeptabel)")

# Show individual topic coherence scores
print(f"\nCoherence scores by topic:")
for topic_idx, coherence in enumerate(topic_coherences):
    print(f"  Topic {topic_idx + 1:2d}: {coherence:.4f}")

# Calculate log-likelihood per document
log_likelihood = lda_model.score(doc_term_matrix)
print(f"\nLog-Likelihood per document: {log_likelihood / doc_term_matrix.shape[0]:.4f}")

print(f"\n" + "="*80)
print("MODEL EVALUATION SUMMARY")
print("="*80)
print(f"Number of Topics: 20")
print(f"Number of Documents: {doc_term_matrix.shape[0]}")
print(f"Vocabulary Size: {doc_term_matrix.shape[1]}")
print(f"\nPerplexity: {perplexity:.4f}")
print(f"Average Topic Coherence: {avg_coherence:.4f}")
print(f"Log-Likelihood: {log_likelihood:.4f}")
print(f"\nInterpretation: {'✓ Gutes Modell' if avg_coherence > 0.5 else '✗ Modell könnte verbessert werden'}")

Coherence Score Berechnung
Calculating coherence metrics for the LDA model...

Perplexity Score: 195.2157
  (Interpretation: niedriger ist besser)

COHERENCE SCORES

Average Topic Coherence: 0.2386
  (Range: 0-1, höher ist besser. >0.5 ist akzeptabel)

Coherence scores by topic:
  Topic  1: 0.1908
  Topic  2: 0.2021
  Topic  3: 0.1798
  Topic  4: 0.2491
  Topic  5: 0.3895
  Topic  6: 0.1731
  Topic  7: 0.2179
  Topic  8: 0.2304
  Topic  9: 0.1315
  Topic 10: 0.2246
  Topic 11: 0.2804
  Topic 12: 0.2359
  Topic 13: 0.2697
  Topic 14: 0.1731
  Topic 15: 0.1509
  Topic 16: 0.3098
  Topic 17: 0.2410
  Topic 18: 0.2432
  Topic 19: 0.1295
  Topic 20: 0.2375
  Topic 21: 0.3918
  Topic 22: 0.2812
  Topic 23: 0.2027
  Topic 24: 0.3143
  Topic 25: 0.3473
  Topic 26: 0.1716
  Topic 27: 0.1568
  Topic 28: 0.1710
  Topic 29: 0.2150
  Topic 30: 0.4466

Log-Likelihood per document: -45.2139

MODEL EVALUATION SUMMARY
Number of Topics: 20
Number of Documents: 412
Vocabulary Size: 147

Perplexity: 195.2