In [4]:
import sys
sys.path.insert(0, '..')

import polars as pl
from pathlib import Path
import json

print("Loading data files...")

# Directory that holds the processed artefacts read by this notebook
processed_dir = Path('../data/processed')

# Load the paragraph-level sentiment table (per the original note, it was
# produced on top of the LDA topic output)
df_sentiment = pl.read_parquet(processed_dir / 'df_sample_sentiment.parquet')
print(f"✓ Loaded df_sample_sentiment.parquet: {df_sentiment.shape}")

# No separate topic file is loaded: later cells read the topic and sentiment
# columns straight from this frame. Verify they are really present before
# announcing it, so a stale or wrong parquet fails here with a clear message
# instead of further down the notebook.
required_columns = ['dominant_topic', 'dominant_topic_prob', 'sentiment_class']
missing_columns = [name for name in required_columns if name not in df_sentiment.columns]
if missing_columns:
    raise ValueError(
        f"df_sample_sentiment.parquet is missing expected columns: {missing_columns}"
    )
print("✓ Sentiment data already contains topic assignments from LDA modeling")

print("\nDataframes loaded successfully!")

Loading data files...
✓ Loaded df_sample_sentiment.parquet: (5887, 19)
✓ Sentiment data already contains topic assignments from LDA modeling

Dataframes loaded successfully!


In [5]:
# Build a per-paragraph key of the form "<id>_<paragraph_number>"
print("=" * 80)
print("DATA PREPARATION")
print("=" * 80)

# List the columns available in the sentiment dataframe
print("\nSentiment dataframe columns:")
print(f"  {df_sentiment.columns}")

# Attach the key as a new string column named unique_id; both parts are cast
# to strings and joined with an underscore
print("\nAdding unique ID to sentiment dataframe...")
unique_id_expr = pl.concat_str(
    [pl.col('id').cast(pl.Utf8), pl.col('paragraph_number').cast(pl.Utf8)],
    separator='_',
)
df_sentiment = df_sentiment.with_columns(unique_id_expr.alias('unique_id'))
print("✓ Created unique_id in sentiment data")
print(f"  Shape: {df_sentiment.shape}")
print(f"  Sample IDs: {df_sentiment.select('unique_id').head(5)}")

print("\n" + "=" * 80)
print("DATA READY FOR FINAL PROCESSING!")
print("=" * 80)

DATA PREPARATION

Sentiment dataframe columns:
  ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'speechContent', 'dominant_topic', 'dominant_topic_prob', 'sentiment_class', 'sentiment_probabilities']

Adding unique ID to sentiment dataframe...
✓ Created unique_id in sentiment data
  Shape: (5887, 20)
  Sample IDs: shape: (5, 1)
┌───────────┐
│ unique_id │
│ ---       │
│ str       │
╞═══════════╡
│ 670669_17 │
│ 698691_3  │
│ 690973_6  │
│ 781601_24 │
│ 661773_2  │
└───────────┘

DATA READY FOR FINAL PROCESSING!


In [6]:
# Data quality check: report row / ID / column counts and null counts
print("=" * 80)
print("DATA QUALITY CHECK")
print("=" * 80)

# shape is (rows, columns), so one unpack covers both counts
n_rows, n_columns = df_sentiment.shape
print("\nSentiment data with LDA topics:")
print(f"  Total rows: {n_rows}")
print(f"  Unique IDs: {df_sentiment['unique_id'].n_unique()}")
print(f"  Columns: {n_columns}")

# Null counts for the columns the final dataframe is built from
print("\nMissing values in key columns:")
for key_column in ('sentiment_class', 'dominant_topic', 'speechContent', 'factionId'):
    print(f"  {key_column}: {df_sentiment[key_column].null_count()}")

print("\n✓ Data quality check complete!")

DATA QUALITY CHECK

Sentiment data with LDA topics:
  Total rows: 5887
  Unique IDs: 5887
  Columns: 20

Missing values in key columns:
  sentiment_class: 0
  dominant_topic: 0
  speechContent: 0
  factionId: 0

✓ Data quality check complete!


In [7]:
# Create final analysis dataframe with LDA topics, preview it, print the
# topic distribution and write the result to parquet and CSV
print("=" * 80)
print("CREATING FINAL ANALYSIS DATAFRAME")
print("=" * 80)

# No merge step: the sentiment frame is used as-is. df_merged is only an
# alias of df_sentiment.
print("\n1. Preparing final dataframe...")
df_merged = df_sentiment

print(f"✓ Data ready: {df_merged.shape}")

# Create final dataframe with relevant columns
print("\n2. Formatting final analysis dataframe...")

# speaker = "<firstName> <lastName>"
# NOTE(review): concat_str yields null when any input is null, so a missing
# first or last name gives a null speaker — confirm the name columns have no
# nulls (they are not part of the data quality check).
df_final = df_merged.with_columns(
    pl.concat_str([pl.col('firstName'), pl.lit(' '), pl.col('lastName')]).alias('speaker')
)

# Select and rename columns for final output
df_final = df_final.select([
    pl.col('date').alias('time'),
    pl.col('speaker'),
    pl.col('factionId').alias('party'),
    pl.col('speechContent').alias('speech'),
    pl.col('dominant_topic').cast(pl.Utf8).alias('topic'),  # LDA topic number, stored as a string
    pl.col('dominant_topic_prob').alias('topic_confidence'),
    pl.col('sentiment_class').alias('sentiment'),
    pl.col('unique_id'),
])

print("✓ Final dataframe created!")
print(f"  Shape: {df_final.shape}")
print(f"  Columns: {df_final.columns}")

print("\n" + "=" * 80)
print("FINAL DATAFRAME PREVIEW")
print("=" * 80)
print(df_final.head(10))

# Show topic distribution, most frequent topic first.
# group_by does not guarantee group order, so topics with equal counts could
# print in a different order on every run; the secondary sort on 'topic'
# (string order) makes the listing reproducible.
print("\n" + "=" * 80)
print("LDA TOPIC DISTRIBUTION")
print("=" * 80)
topic_dist = (
    df_final
    .group_by('topic')
    .len()
    .sort(['len', 'topic'], descending=[True, False])
)
total_rows = df_final.shape[0]  # loop-invariant denominator for the percentages
for row in topic_dist.iter_rows(named=True):
    percentage = (row['len'] / total_rows) * 100
    print(f"  Topic {row['topic']:3s}: {row['len']:5d} speeches ({percentage:5.2f}%)")

# Save final dataframe (directory is created if it does not exist yet)
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

parquet_path = output_dir / 'df_final_analysis.parquet'
csv_path = output_dir / 'df_final_analysis.csv'

df_final.write_parquet(parquet_path)
df_final.write_csv(csv_path)

print("\n✓ Final dataframe saved:")
print(f"  Parquet: {parquet_path}")
print(f"  CSV: {csv_path}")

print("\n" + "=" * 80)
print("FINAL ANALYSIS DATAFRAME CREATED SUCCESSFULLY!")
print("=" * 80)

CREATING FINAL ANALYSIS DATAFRAME

1. Preparing final dataframe...
✓ Data ready: (5887, 20)

2. Formatting final analysis dataframe...
✓ Final dataframe created!
  Shape: (5887, 8)
  Columns: ['time', 'speaker', 'party', 'speech', 'topic', 'topic_confidence', 'sentiment', 'unique_id']

FINAL DATAFRAME PREVIEW
shape: (10, 8)
┌──────────────┬──────────────┬───────┬──────────────┬───────┬─────────────┬───────────┬───────────┐
│ time         ┆ speaker      ┆ party ┆ speech       ┆ topic ┆ topic_confi ┆ sentiment ┆ unique_id │
│ ---          ┆ ---          ┆ ---   ┆ ---          ┆ ---   ┆ dence       ┆ ---       ┆ ---       │
│ str          ┆ str          ┆ i64   ┆ str          ┆ str   ┆ ---         ┆ str       ┆ str       │
│              ┆              ┆       ┆              ┆       ┆ f64         ┆           ┆           │
╞══════════════╪══════════════╪═══════╪══════════════╪═══════╪═════════════╪═══════════╪═══════════╡
│ 2005-01-20T0 ┆ Peter H      ┆ 4     ┆ der im Globa ┆ 9     ┆ 0.599