In [24]:
import sys
sys.path.insert(0, '..')

import polars as pl
from pathlib import Path
import json

# Announce the load step so the cell output shows progress.
print("Loading data files...")

# Directory that holds the processed parquet files, relative to the notebook.
processed_dir = Path('../data/processed')

# Resolve both input locations up front so the reads below are one-liners.
sentiment_path = processed_dir / 'df_sample_sentiment.parquet'
topic_path = processed_dir / 'topic_document_assignments.parquet'

# Sentiment predictions.
df_sentiment = pl.read_parquet(sentiment_path)
print(f"✓ Loaded df_sample_sentiment.parquet: {df_sentiment.shape}")

# Topic assignments per document (this file also carries the metadata columns).
df_topic = pl.read_parquet(topic_path)
print(f"✓ Loaded topic_document_assignments.parquet: {df_topic.shape}")

print("\nDataframes loaded successfully!")

Loading data files...
✓ Loaded df_sample_sentiment.parquet: (412, 18)
✓ Loaded topic_document_assignments.parquet: (412, 17)

Dataframes loaded successfully!


In [25]:
# Create common ID from speech ID and paragraph ID
def add_unique_id(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` with a string ``unique_id`` column ``"<id>_<paragraph_number>"``.

    Both source columns are cast to strings and joined with an underscore.
    An existing ``unique_id`` column is overwritten, so re-running the cell
    gives the same result.
    """
    return df.with_columns(
        pl.concat_str(
            [
                pl.col('id').cast(pl.Utf8),
                pl.col('paragraph_number').cast(pl.Utf8),
            ],
            separator='_',
        ).alias('unique_id')
    )


def report_unique_id(df: pl.DataFrame, label: str) -> None:
    """Print shape, sample IDs and key-quality warnings for ``df['unique_id']``.

    ``label`` is the human-readable name of the frame used in the messages.
    Nothing is raised: null or duplicated keys only produce a warning line.
    """
    ids = df.get_column('unique_id')
    print(f"✓ Created unique_id in {label} data")
    print(f"  Shape: {df.shape}")
    # Print a plain list; interpolating a DataFrame here would dump a
    # multi-line table straight after the label.
    print(f"  Sample IDs: {ids.head(5).to_list()}")

    null_ids = ids.null_count()
    if null_ids:
        print(f"  ⚠ {null_ids} row(s) have a null unique_id")

    # n_unique() counts distinct values, so the difference to the row count
    # is the number of rows that repeat an already-seen key.
    repeated_ids = df.height - ids.n_unique()
    if repeated_ids:
        print(f"  ⚠ {repeated_ids} row(s) repeat an existing unique_id; "
              "joining on unique_id will multiply rows")


print("="*80)
print("CREATING COMMON IDENTIFIERS")
print("="*80)

# Check columns in both dataframes
print("\nSentiment dataframe columns:")
print(f"  {df_sentiment.columns}")

print("\nTopic dataframe columns:")
print(f"  {df_topic.columns}")

# Create common ID in sentiment dataframe
print("\n1. Adding common ID to sentiment dataframe...")
df_sentiment = add_unique_id(df_sentiment)
report_unique_id(df_sentiment, 'sentiment')

# Create common ID in topic dataframe
print("\n2. Adding common ID to topic dataframe...")
df_topic = add_unique_id(df_topic)
report_unique_id(df_topic, 'topic')

print("\n" + "="*80)
print("COMMON IDENTIFIERS CREATED SUCCESSFULLY!")
print("="*80)

CREATING COMMON IDENTIFIERS

Sentiment dataframe columns:
  ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'word_count', 'sentiment_class', 'sentiment_probabilities']

Topic dataframe columns:
  ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'speechContent', 'dominant_topic', 'dominant_topic_prob']

1. Adding common ID to sentiment dataframe...
✓ Created unique_id in sentiment data
  Shape: (412, 19)
  Sample IDs: shape: (5, 1)
┌───────────┐
│ unique_id │
│ ---       │
│ str       │
╞═══════════╡
│ 738998_1  │
│ 738998_2  │
│ 738998_3  │
│ 738998_4  │
│ 738998_5  │
└───────────┘

2. Adding common ID to topic dataframe...
✓ Created unique_id in topic data
  Shape: (

In [27]:
# Calculate average sentiment per topic
print("="*80)
print("CALCULATING AVERAGE SENTIMENT PER TOPIC")
print("="*80)

# First, merge sentiment and topic data on unique_id.
# Only the topic columns are pulled in; everything else comes from df_sentiment.
print("\n1. Merging sentiment and topic data...")
df_merged = df_sentiment.join(
    df_topic.select(['unique_id', 'dominant_topic', 'dominant_topic_prob']),
    on='unique_id',
    how='inner'
)

print(f"✓ Merged data shape: {df_merged.shape}")
print(f"  Columns: {df_merged.columns}")

# An inner join drops rows without a match and multiplies rows on duplicated
# keys, so any change in row count is surfaced instead of passing silently.
if df_merged.height != df_sentiment.height:
    print(f"  ⚠ Join changed the row count: {df_sentiment.height} sentiment rows "
          f"→ {df_merged.height} merged rows (unmatched or duplicated unique_id)")

# Create sentiment score mapping
sentiment_score_map = {
    'positive': 1,
    'neutral': 0,
    'negative': -1
}

# Add sentiment score column
print("\n2. Creating sentiment score column...")

# Labels missing from the mapping are scored 0 below; list them so that
# fallback is visible in the output.
observed_labels = set(
    df_merged.get_column('sentiment_class').drop_nulls().unique().to_list()
)
unmapped_labels = sorted(observed_labels - set(sentiment_score_map))
if unmapped_labels:
    print(f"  ⚠ Labels without a score mapping (scored as 0): {unmapped_labels}")

# Build the lookup as a native when/then chain rather than a per-row Python
# callback. Branch order: null label -> null score (so it is ignored by the
# mean), mapped label -> its score, anything else -> 0.
sentiment_label = pl.col('sentiment_class')
score_expr = pl.when(sentiment_label.is_null()).then(pl.lit(None, dtype=pl.Int32))
for class_name, class_score in sentiment_score_map.items():
    score_expr = score_expr.when(sentiment_label == class_name).then(
        pl.lit(class_score, dtype=pl.Int32)
    )
df_merged = df_merged.with_columns(
    score_expr.otherwise(pl.lit(0, dtype=pl.Int32)).alias('sentiment_score')
)

print("✓ Added sentiment_score")

# Calculate average sentiment per topic: mean score, mean topic probability,
# number of rows, and how many rows fall into each sentiment class.
print("\n3. Calculating average sentiment metrics per topic...")
sentiment_by_topic = df_merged.group_by('dominant_topic').agg([
    pl.col('sentiment_score').mean().alias('avg_sentiment_score'),
    pl.col('dominant_topic_prob').mean().alias('avg_topic_probability'),
    pl.len().alias('paragraph_count'),
    (pl.col('sentiment_class') == 'positive').sum().alias('positive_count'),
    (pl.col('sentiment_class') == 'neutral').sum().alias('neutral_count'),
    (pl.col('sentiment_class') == 'negative').sum().alias('negative_count')
]).sort('dominant_topic')

print("\n✓ Sentiment by topic calculated!")
print(f"  Shape: {sentiment_by_topic.shape}")
print("\nAverage sentiment per topic:")
print(sentiment_by_topic)

# Save results in both parquet and CSV form.
output_dir = Path('../data/processed')
sentiment_by_topic_path = output_dir / 'sentiment_by_topic.parquet'
sentiment_by_topic.write_parquet(sentiment_by_topic_path)

sentiment_by_topic_csv = output_dir / 'sentiment_by_topic.csv'
sentiment_by_topic.write_csv(sentiment_by_topic_csv)

print("\n✓ Results saved:")
print(f"  Parquet: {sentiment_by_topic_path}")
print(f"  CSV: {sentiment_by_topic_csv}")

print("\n" + "="*80)
print("SENTIMENT BY TOPIC ANALYSIS COMPLETE!")
print("="*80)

CALCULATING AVERAGE SENTIMENT PER TOPIC

1. Merging sentiment and topic data...
✓ Merged data shape: (412, 21)
  Columns: ['id', 'session', 'electoralTerm', 'firstName', 'lastName', 'politicianId', 'speechContent', 'factionId', 'documentUrl', 'positionShort', 'positionLong', 'date', 'speech_length', 'paragraph_number', 'paragraph_length', 'word_count', 'sentiment_class', 'sentiment_probabilities', 'unique_id', 'dominant_topic', 'dominant_topic_prob']

2. Creating sentiment score column...
✓ Added sentiment_score

3. Calculating average sentiment metrics per topic...

✓ Sentiment by topic calculated!
  Shape: (23, 7)

Average sentiment per topic:
shape: (23, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ dominant_top ┆ avg_sentime ┆ avg_topic_p ┆ paragraph_c ┆ positive_co ┆ neutral_cou ┆ negative_co │
│ ic           ┆ nt_score    ┆ robability  ┆ ount        ┆ unt         ┆ nt          ┆ unt         │
│ ---          ┆ ---        