# Transformer Sentiment Analysis

This notebook runs the DistilBERT transformer model on the social media dataset.

**Note:** This notebook is optimized for GitHub Codespaces with GPU support.

In [None]:
# Import required libraries
import sys
sys.path.insert(0, '..')

import pandas as pd
from pathlib import Path
from src.pipeline import run_pipeline
from src.evaluate import classification_scores
import numpy as np

## 1. Download Dataset from Kaggle

First, you need to set up your Kaggle credentials in Codespaces.

Upload your `kaggle.json` to the `~/.kaggle/` directory.

In [None]:
# Run the preparation script to download and transform data
!python ../scripts/prepare_dataset.py

## 2. Load Dataset

In [None]:
# Read the processed dataset from its parquet file
data_path = Path('../data/processed/sentiments_clean.parquet')
df = pd.read_parquet(data_path)

# Gather the quick-look summaries first, then print them in one place
column_names = df.columns.tolist()
label_counts = df['label'].value_counts()

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {column_names}")
print("\nSentiment distribution:")
print(label_counts)

# Last expression of the cell: rich preview of the first rows
df.head()

## 3. Run Transformer Model

This will process all 200K+ records with DistilBERT.

In [None]:
%%time

# Run the transformer pipeline
input_path = Path('../data/processed/sentiments_clean.parquet')
output_path = Path('../outputs/transformer_scored.csv')

scored_df = run_pipeline(
    input_path=input_path,
    output_path=output_path,
    mode="transformer",
    text_column="text",
    label_column="label",
    timestamp_column="created_at"
)

print(f"\nScored {len(scored_df):,} records")

## 4. Evaluate Results

In [None]:
# Calculate metrics: compare the dataset's labels with the model's predictions
y_true = scored_df['label']
y_pred = scored_df['sentiment_label']

metrics = classification_scores(y_true, y_pred)

print("=" * 70)
print("TRANSFORMER MODEL EVALUATION")
print("=" * 70)
print(f"\nAccuracy:        {metrics.accuracy:.4f} ({metrics.accuracy*100:.2f}%)")
print(f"Macro Precision: {metrics.precision:.4f}")
print(f"Macro Recall:    {metrics.recall:.4f}")
print(f"Macro F1-Score:  {metrics.f1_macro:.4f} ({metrics.f1_macro*100:.2f}%)")

print("\n" + "-" * 70)
print("CONFUSION MATRIX")
print("-" * 70)
# NOTE(review): assumes metrics.confusion_matrix is 3x3 with classes ordered
# Negative, Neutral, Positive on both axes -- confirm against
# classification_scores.
class_names = ['Negative', 'Neutral', 'Positive']
cm = np.array(metrics.confusion_matrix)
print("\nRows=Actual, Cols=Predicted")
# Build the header with the same widths as the rows (9-char row label, then
# right-aligned 8-char columns) so each class name sits over its counts.
header_cells = " ".join(f"{name:>8}" for name in class_names)
print(f"{'':9} {header_cells}")
for i, name in enumerate(class_names):
    count_cells = " ".join(f"{cm[i, j]:8d}" for j in range(len(class_names)))
    print(f"{name:9} {count_cells}")

print("\n" + "=" * 70)
print("PROJECT GOAL ASSESSMENT")
print("=" * 70)
# Macro F1 the project is aiming for; everything below is measured against it
target_f1 = 0.80
if metrics.f1_macro >= target_f1:
    print(f"\n✓ SUCCESS: Target F1-score of {target_f1:.0%} ACHIEVED!")
else:
    # Report how far the current score is from the target, both as an
    # absolute gap and as a fraction of the target
    gap = target_f1 - metrics.f1_macro
    print(f"\n✗ Target F1-score of {target_f1:.0%} not yet reached")
    print(f"Current F1: {metrics.f1_macro:.4f} ({metrics.f1_macro*100:.2f}%)")
    print(f"Gap:        {gap:.4f} ({gap*100:.2f} percentage points)")
    print(f"Progress:   {(metrics.f1_macro/target_f1)*100:.1f}% of target")

## 5. Sample Predictions

In [None]:
# Show some example predictions.
# The fixed random_state makes the cell pick the same 10 rows on every run.
# .assign() returns a new frame, so the 'correct' flag is never written into
# a column subset of another frame (which can emit SettingWithCopyWarning).
sample_df = (
    scored_df
    .sample(10, random_state=42)[['text', 'label', 'sentiment_label', 'sentiment_confidence']]
    .assign(correct=lambda frame: frame['label'] == frame['sentiment_label'])
)

print("\nSample Predictions:")
print("=" * 100)
for _, row in sample_df.iterrows():
    status = "✓" if row['correct'] else "✗"
    text = row['text']
    # Append the ellipsis only when the text was actually cut at 80 characters
    preview = f"{text[:80]}..." if len(text) > 80 else text
    print(f"{status} Text: {preview}")
    print(f"  Actual: {row['label']:8s} | Predicted: {row['sentiment_label']:8s} | Confidence: {row['sentiment_confidence']:.3f}")
    print("-" * 100)

## 6. Save Results

Results are automatically saved to `outputs/transformer_scored.csv` when the pipeline runs in step 3; the cell below only prints the file locations.

In [None]:
# Report the output locations. The trend file name is derived from the
# scored CSV's location: same directory, same stem, with a "_trend.csv" suffix.
trend_path = output_path.parent / (output_path.stem + '_trend.csv')

print(f"✓ Results saved to: {output_path}")
print(f"✓ Trends saved to: {trend_path}")
print("\nYou can now download these files or visualize them in the Streamlit dashboard.")