In [1]:
import sys
sys.path.insert(0, '..')

import polars as pl
from pathlib import Path
import json

print("Loading data files...")

# Define paths
processed_dir = Path('../data/processed')

# Load sentiment data
df_sentiment = pl.read_parquet(processed_dir / 'df_sample_sentiment.parquet')
print(f"✓ Loaded df_sample_sentiment.parquet: {df_sentiment.shape}")

# Load BERT topic data with document assignments (includes all metadata and topic_label)
df_topic = pl.read_parquet(processed_dir / 'topic_document_assignments_bert.parquet')
print(f"✓ Loaded topic_document_assignments_bert.parquet: {df_topic.shape}")

print(f"\nDataframes loaded successfully!")

Loading data files...
✓ Loaded df_sample_sentiment.parquet: (412, 18)
✓ Loaded topic_document_assignments_bert.parquet: (412, 18)

Dataframes loaded successfully!


In [2]:
# Create common ID from speech ID and paragraph ID
print("="*80)
print("CREATING COMMON IDENTIFIERS")
print("="*80)

# Check columns in both dataframes
print("\nSentiment dataframe columns:")
print(f"  {df_sentiment.columns}")

print("\nTopic dataframe columns:")
print(f"  {df_topic.columns}")

# Create common ID in sentiment dataframe
print("\n1. Adding common ID to sentiment dataframe...")
df_sentiment = df_sentiment.with_columns(
    pl.concat_str(
        pl.col('id').cast(pl.Utf8),
        pl.lit('_'),
        pl.col('paragraph_number').cast(pl.Utf8)
    ).alias('unique_id')
)
print(f"✓ Created unique_id in sentiment data")
print(f"  Shape: {df_sentiment.shape}")
print(f"  Sample IDs: {df_sentiment.select('unique_id').head(5)}")

# Create common ID in topic dataframe
print("\n2. Adding common ID to topic dataframe...")
df_topic = df_topic.with_columns(
    pl.concat_str(
        pl.col('id').cast(pl.Utf8),
        pl.lit('_'),
        pl.col('paragraph_number').cast(pl.Utf8)
    ).alias('unique_id')
)
print(f"✓ Created unique_id in topic data")
print(f"  Shape: {df_topic.shape}")
print(f"  Sample IDs: {df_topic.select('unique_id').head(5)}")

print(f"\n" + "="*80)
print("COMMON IDENTIFIERS CREATED SUCCESSFULLY!")
print("="*80)

CREATING COMMON IDENTIFIERS

Sentiment dataframe columns:
  ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'word_count', 'sentiment_class', 'sentiment_probabilities']

Topic dataframe columns:
  ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'speechContent', 'dominant_topic', 'dominant_topic_prob', 'topic_label']

1. Adding common ID to sentiment dataframe...
✓ Created unique_id in sentiment data
  Shape: (412, 19)
  Sample IDs: shape: (5, 1)
┌───────────┐
│ unique_id │
│ ---       │
│ str       │
╞═══════════╡
│ 738998_1  │
│ 738998_2  │
│ 738998_3  │
│ 738998_4  │
│ 738998_5  │
└───────────┘

2. Adding common ID to topic dataframe...
✓ Created unique_id in topic 

In [3]:
# Create final analysis dataframe with BERT topics
print("="*80)
print("CREATING FINAL ANALYSIS DATAFRAME")
print("="*80)

# Merge sentiment and topic data on unique_id
print("\n1. Merging sentiment and BERT topic data...")
df_merged = df_sentiment.join(
    df_topic.select(['unique_id', 'dominant_topic', 'dominant_topic_prob', 'topic_label', 
                     'firstName', 'lastName', 'factionId', 'date', 'speechContent']),
    on='unique_id',
    how='inner'
)

print(f"✓ Merged data shape: {df_merged.shape}")

# Create final dataframe with relevant columns
print("\n2. Creating final analysis dataframe...")

# Combine first and last name for speaker column
df_final = df_merged.with_columns([
    pl.concat_str([pl.col('firstName'), pl.lit(' '), pl.col('lastName')]).alias('speaker')
])

# Select and rename columns for final output
df_final = df_final.select([
    pl.col('date').alias('time'),
    pl.col('speaker'),
    pl.col('factionId').alias('party'),
    pl.col('speechContent').alias('speech'),
    pl.col('topic_label').alias('topic'),
    pl.col('dominant_topic_prob').alias('topic_confidence'),
    pl.col('sentiment_class').alias('sentiment'),
    pl.col('unique_id')
])

print(f"✓ Final dataframe created!")
print(f"  Shape: {df_final.shape}")
print(f"  Columns: {df_final.columns}")

print(f"\n" + "="*80)
print("FINAL DATAFRAME PREVIEW")
print("="*80)
print(df_final.head(10))

# Show topic distribution
print(f"\n" + "="*80)
print("BERT TOPIC DISTRIBUTION")
print("="*80)
topic_dist = df_final.group_by('topic').len().sort('len', descending=True)
for row in topic_dist.iter_rows(named=True):
    percentage = (row['len'] / df_final.shape[0]) * 100
    print(f"  {row['topic']:30s}: {row['len']:5d} speeches ({percentage:5.2f}%)")

# Save final dataframe
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

parquet_path = output_dir / 'df_final_analysis.parquet'
csv_path = output_dir / 'df_final_analysis.csv'

df_final.write_parquet(parquet_path)
df_final.write_csv(csv_path)

print(f"\n✓ Final dataframe saved:")
print(f"  Parquet: {parquet_path}")
print(f"  CSV: {csv_path}")

print(f"\n" + "="*80)
print("FINAL ANALYSIS DATAFRAME CREATED SUCCESSFULLY!")
print("="*80)

CREATING FINAL ANALYSIS DATAFRAME

1. Merging sentiment and BERT topic data...
✓ Merged data shape: (412, 27)

2. Creating final analysis dataframe...
✓ Final dataframe created!
  Shape: (412, 8)
  Columns: ['time', 'speaker', 'party', 'speech', 'topic', 'topic_confidence', 'sentiment', 'unique_id']

FINAL DATAFRAME PREVIEW
shape: (10, 8)
┌─────────────┬─────────────┬───────┬────────────┬────────────┬────────────┬───────────┬───────────┐
│ time        ┆ speaker     ┆ party ┆ speech     ┆ topic      ┆ topic_conf ┆ sentiment ┆ unique_id │
│ ---         ┆ ---         ┆ ---   ┆ ---        ┆ ---        ┆ idence     ┆ ---       ┆ ---       │
│ str         ┆ str         ┆ i64   ┆ str        ┆ str        ┆ ---        ┆ str       ┆ str       │
│             ┆             ┆       ┆            ┆            ┆ f64        ┆           ┆           │
╞═════════════╪═════════════╪═══════╪════════════╪════════════╪════════════╪═══════════╪═══════════╡
│ 2010-05-20T ┆ burkhard    ┆ 23    ┆ Frau Präsi ┆ Go