# Model Results & Performance Analysis
## Sentiment Analysis Project

**Objective:** Comprehensive analysis of VADER model performance on test sets

---

In [None]:
# Import required libraries
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, classification_report, 
    accuracy_score, precision_recall_fscore_support,
    roc_curve, auc
)
import warnings
warnings.filterwarnings('ignore')

# Plot styling shared by every figure in this notebook.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 6),  # default size; cells pass figsize= to override
    'font.size': 11,
})

# The check mark was stored as mis-decoded UTF-8 ("âœ“"); restored to "✓".
print("✓ Libraries imported successfully")

## 1. Load Model Results

In [None]:
# Read the saved VADER predictions for both test sets
social_results = pd.read_csv('../outputs/test_social_vader.csv')
clothing_results = pd.read_csv('../outputs/test_clothing_vader.csv')

# Report how many predictions each file holds
print(
    f"Social Media Results: {len(social_results):,} predictions\n"
    f"Clothing Results: {len(clothing_results):,} predictions"
)

# Preview text, true label, predicted label and confidence; this stays the
# last expression so the notebook renders it as the cell output.
print("\nSample predictions (Social Media):")
social_results.loc[:, ['text', 'label', 'sentiment_label', 'sentiment_confidence']].head()

## 2. Overall Performance Metrics

In [None]:
# Calculate metrics for both datasets
def calculate_metrics(df, name, target=0.80):
    """Print and return the overall VADER metrics for one results frame.

    Args:
        df: DataFrame with a 'label' column (ground truth) and a
            'sentiment_label' column (model prediction).
        name: Dataset name shown in the printed header.
        target: Macro-F1 goal the score is compared against. Defaults to
            0.80, the value that was previously hard-coded.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and
        F1 are macro-averaged; undefined per-class scores count as 0
        (``zero_division=0``).
    """
    y_true = df['label']
    y_pred = df['sentiment_label']

    accuracy = accuracy_score(y_true, y_pred)
    # The fourth return value (support) is not needed here.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )

    rule = '=' * 70
    print(f"\n{rule}")
    print(f"{name} - VADER Performance")
    print(rule)
    print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f} ({f1*100:.2f}%)")
    print(rule)

    # Target assessment. The status glyphs were stored as mis-decoded UTF-8
    # ("âœ“" / "âœ—") and are restored to the intended check / cross marks.
    if f1 >= target:
        print(f"✓ Target F1-score of {target:.0%} ACHIEVED!")
    else:
        gap = target - f1
        progress = (f1 / target) * 100
        print(f"✗ Target F1-score of {target:.0%} not met")
        print(f"  Current: {f1:.4f} ({f1*100:.2f}%)")
        print(f"  Gap:     {gap:.4f} ({gap*100:.2f} percentage points)")
        print(f"  Progress: {progress:.1f}% of target")

    return accuracy, precision, recall, f1

# Calculate for both datasets
social_metrics = calculate_metrics(social_results, "Social Media")
clothing_metrics = calculate_metrics(clothing_results, "Clothing Reviews")

## 3. Confusion Matrices (Enhanced)

In [None]:
# Create enhanced confusion matrices
def _plot_confusion_matrix(ax, results, labels, title, cmap):
    """Draw a count heatmap with per-row percentage annotations on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        results: DataFrame with 'label' (actual) and 'sentiment_label'
            (predicted) columns.
        labels: Class names fixing the row/column order of the matrix.
        title: Axes title.
        cmap: Colormap name passed to seaborn.

    Returns:
        Tuple ``(cm, cm_norm)``: the raw count matrix and the matrix
        normalised by row (actual-class) totals.
    """
    cm = confusion_matrix(results['label'], results['sentiment_label'], labels=labels)
    row_totals = cm.sum(axis=1, keepdims=True)
    # A class with no actual samples has a zero row total. Dividing directly
    # yields NaN and a '(nan%)' annotation, so such rows are left at 0.
    cm_norm = np.divide(
        cm.astype('float'), row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap,
                xticklabels=labels, yticklabels=labels, ax=ax,
                cbar_kws={'label': 'Count'}, linewidths=1, linecolor='gray')
    ax.set_title(title, fontsize=14, fontweight='bold', pad=15)
    ax.set_ylabel('Actual', fontsize=12, fontweight='bold')
    ax.set_xlabel('Predicted', fontsize=12, fontweight='bold')

    # Add percentage annotations, offset below the count in each cell
    for i in range(len(labels)):
        for j in range(len(labels)):
            ax.text(j + 0.5, i + 0.7, f'({cm_norm[i, j] * 100:.1f}%)',
                    ha='center', va='center', fontsize=9, color='gray')

    return cm, cm_norm


fig, axes = plt.subplots(1, 2, figsize=(16, 6))

labels = ['negative', 'neutral', 'positive']

# Social Media
cm_social, cm_social_norm = _plot_confusion_matrix(
    axes[0], social_results, labels,
    'Social Media - Confusion Matrix', 'Blues')

# Clothing Reviews
cm_clothing, cm_clothing_norm = _plot_confusion_matrix(
    axes[1], clothing_results, labels,
    'Clothing Reviews - Confusion Matrix', 'Greens')

plt.tight_layout()
plt.savefig('../reports/final_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Per-Class Performance Analysis

In [None]:
# Get per-class metrics
def get_per_class_metrics(df, name):
    """Print and return precision/recall/F1/support for each sentiment class.

    Args:
        df: DataFrame with a 'label' column (ground truth) and a
            'sentiment_label' column (model prediction).
        name: Dataset name shown in the printed header.

    Returns:
        DataFrame with one row per class (Negative, Neutral, Positive) and
        the columns Class, Precision, Recall, F1-Score and Support.
    """
    class_labels = ['negative', 'neutral', 'positive']
    y_true = df['label']
    y_pred = df['sentiment_label']

    # Passing labels= guarantees a report entry for every class. Without it,
    # a class that appears in neither column is omitted from the report and
    # the lookups below raise KeyError.
    report = classification_report(
        y_true, y_pred, labels=class_labels, output_dict=True, zero_division=0
    )

    metrics_df = pd.DataFrame({
        'Class': [label.capitalize() for label in class_labels],
        'Precision': [report[label]['precision'] for label in class_labels],
        'Recall': [report[label]['recall'] for label in class_labels],
        'F1-Score': [report[label]['f1-score'] for label in class_labels],
        'Support': [report[label]['support'] for label in class_labels],
    })

    print(f"\n{name} - Per-Class Metrics:")
    print(metrics_df.to_string(index=False))

    return metrics_df

social_per_class = get_per_class_metrics(social_results, "Social Media")
clothing_per_class = get_per_class_metrics(clothing_results, "Clothing Reviews")

In [None]:
# Visualize per-class performance
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

metrics = ['Precision', 'Recall', 'F1-Score']
colors_metrics = ['#1976d2', '#388e3c', '#f57c00']
width = 0.25
x = np.arange(len(social_per_class))

# Top row: one grouped bar chart per dataset (three metrics per class)
bar_panels = (
    (axes[0, 0], social_per_class, 'Social Media - Per-Class Performance'),
    (axes[0, 1], clothing_per_class, 'Clothing Reviews - Per-Class Performance'),
)
for ax, per_class, panel_title in bar_panels:
    for i, (metric, color) in enumerate(zip(metrics, colors_metrics)):
        ax.bar(x + i*width, per_class[metric], width,
               label=metric, color=color, edgecolor='black')

    ax.set_xlabel('Sentiment Class', fontsize=12, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12, fontweight='bold')
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xticks(x + width)  # centre each tick under the middle bar
    ax.set_xticklabels(per_class['Class'])
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    ax.set_ylim(0, 1.0)

# Bottom-left: support (sample size) per class, datasets side by side
support_ax = axes[1, 0]
x_pos = np.arange(3)
support_ax.bar(x_pos - 0.2, social_per_class['Support'], 0.4,
               label='Social Media', color='#2196f3', edgecolor='black')
support_ax.bar(x_pos + 0.2, clothing_per_class['Support'], 0.4,
               label='Clothing', color='#4caf50', edgecolor='black')
support_ax.set_xticks(x_pos)
support_ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
support_ax.set_ylabel('Number of Samples', fontsize=12, fontweight='bold')
support_ax.set_title('Test Set - Class Distribution', fontsize=14, fontweight='bold')
support_ax.legend()
support_ax.grid(axis='y', alpha=0.3)

# Bottom-right: F1 per class and dataset as a heatmap (datasets as rows)
f1_comparison = pd.DataFrame(
    {
        'Social Media': social_per_class['F1-Score'].values,
        'Clothing Reviews': clothing_per_class['F1-Score'].values,
    },
    index=['Negative', 'Neutral', 'Positive'],
)

heatmap_ax = axes[1, 1]
sns.heatmap(f1_comparison.T, annot=True, fmt='.3f', cmap='RdYlGn',
            vmin=0, vmax=1, ax=heatmap_ax, cbar_kws={'label': 'F1-Score'},
            linewidths=1, linecolor='gray')
heatmap_ax.set_title('F1-Score Comparison Heatmap', fontsize=14, fontweight='bold')
heatmap_ax.set_xlabel('Sentiment Class', fontsize=12, fontweight='bold')
heatmap_ax.set_ylabel('Dataset', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('../reports/final_per_class_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Model Comparison Across Datasets

In [None]:
# Create comprehensive comparison
target_f1 = 0.80  # F1 goal drawn on both panels

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Overall metrics comparison (datasets on the x-axis, one bar per metric)
comparison_data = pd.DataFrame({
    'Social Media': list(social_metrics),
    'Clothing Reviews': list(clothing_metrics)
}, index=['Accuracy', 'Precision', 'Recall', 'F1-Score'])

comparison_data.T.plot(kind='bar', ax=axes[0], color=['#2196f3', '#4caf50', '#ff9800', '#f44336'],
                      edgecolor='black', linewidth=1.5)
axes[0].set_title('VADER Model - Overall Performance Comparison', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Dataset', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Score', fontsize=12, fontweight='bold')
axes[0].set_xticklabels(['Social Media', 'Clothing Reviews'], rotation=0)
axes[0].grid(axis='y', alpha=0.3)
axes[0].set_ylim(0, 1.0)

# Add target line
axes[0].axhline(y=target_f1, color='red', linestyle='--', linewidth=2,
                label=f'Target ({target_f1:.0%})')
axes[0].text(0.5, target_f1 + 0.02, f'Target: {target_f1:.0%} F1', ha='center', fontweight='bold',
            bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))
# Build the legend only after the target line exists; calling legend()
# before axhline() left the 'Target' entry out of the legend.
axes[0].legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')

# Progress toward target
datasets = ['Social Media', 'Clothing Reviews']
f1_scores = [social_metrics[3], clothing_metrics[3]]
progress = [(f1 / target_f1) * 100 for f1 in f1_scores]

bars = axes[1].barh(datasets, progress, color=['#2196f3', '#4caf50'], edgecolor='black', linewidth=1.5)
axes[1].axvline(x=100, color='red', linestyle='--', linewidth=2, label='Target (100%)')
axes[1].set_xlabel('Progress Toward Target (%)', fontsize=12, fontweight='bold')
axes[1].set_title(f'Progress Toward {target_f1:.0%} F1-Score Target', fontsize=14, fontweight='bold')
axes[1].grid(axis='x', alpha=0.3)
axes[1].set_xlim(0, 120)
# The target line carries a label, so show it (no legend was drawn before).
axes[1].legend(loc='lower right')

# Add value labels just past the end of each bar
for bar, prog, f1 in zip(bars, progress, f1_scores):
    axes[1].text(prog + 2, bar.get_y() + bar.get_height()/2,
                f'{prog:.1f}% (F1: {f1:.3f})',
                va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('../reports/final_model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Error Analysis

In [None]:
# Flag every prediction as right or wrong; the 'correct' column is added to
# both result frames.
social_results['correct'] = social_results['label'].eq(social_results['sentiment_label'])
clothing_results['correct'] = clothing_results['label'].eq(clothing_results['sentiment_label'])

# Social media: counts and rates of hits and misses
social_hits = social_results['correct']
social_misses = ~social_hits
print("Social Media Misclassification Rate:")
print(f"  Correct:   {social_hits.sum():,} ({social_hits.mean()*100:.2f}%)")
print(f"  Incorrect: {social_misses.sum():,} ({social_misses.mean()*100:.2f}%)")

# Clothing reviews: same breakdown
clothing_hits = clothing_results['correct']
clothing_misses = ~clothing_hits
print("\nClothing Misclassification Rate:")
print(f"  Correct:   {clothing_hits.sum():,} ({clothing_hits.mean()*100:.2f}%)")
print(f"  Incorrect: {clothing_misses.sum():,} ({clothing_misses.mean()*100:.2f}%)")

In [None]:
# Show examples of misclassifications
print("\n" + "="*80)
print("EXAMPLE MISCLASSIFICATIONS - Social Media")
print("="*80)

misclassified = social_results[~social_results['correct']]
# sample() raises ValueError when asked for more rows than exist, so cap the
# sample size at the number of misclassified rows.
errors = misclassified.sample(n=min(5, len(misclassified)), random_state=42)
if errors.empty:
    print("\nNo misclassifications found.")

for idx, row in errors.iterrows():
    # str() keeps the slice and the ':8s' format working when a text or label
    # cell is missing (NaN is a float and supports neither).
    print(f"\nText: {str(row['text'])[:100]}...")
    print(f"  Actual: {str(row['label']):8s} | Predicted: {str(row['sentiment_label']):8s} | Confidence: {row['sentiment_confidence']:.3f}")
    print("-" * 80)

## 7. Confidence Score Analysis

In [None]:
# Analyze confidence scores
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One panel per dataset, with one overlaid histogram per predicted class
confidence_panels = (
    (axes[0], social_results, 'Social Media - Prediction Confidence Distribution'),
    (axes[1], clothing_results, 'Clothing Reviews - Prediction Confidence Distribution'),
)
for ax, results, panel_title in confidence_panels:
    for label in ['negative', 'neutral', 'positive']:
        predicted_as_label = results['sentiment_label'] == label
        data = results.loc[predicted_as_label, 'sentiment_confidence']
        ax.hist(data, bins=30, alpha=0.6, label=label.capitalize(), edgecolor='black')

    ax.set_xlabel('Confidence Score', fontsize=12, fontweight='bold')
    ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../reports/final_confidence_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary statistics of confidence, grouped by predicted class
print("\nConfidence Score Statistics:")
for heading, results in (("\nSocial Media:", social_results),
                         ("\nClothing Reviews:", clothing_results)):
    print(heading)
    print(results.groupby('sentiment_label')['sentiment_confidence'].describe())

## 8. Final Summary

In [None]:
# Create final summary table
target_f1 = 0.80  # F1 goal used for the "Target Progress" row and findings

# social_metrics / clothing_metrics are (accuracy, precision, recall, f1)
social_f1 = social_metrics[3]
clothing_f1 = clothing_metrics[3]

summary = pd.DataFrame({
    'Metric': ['Test Samples', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Target Progress'],
    'Social Media': [
        f"{len(social_results):,}",
        f"{social_metrics[0]:.4f}",
        f"{social_metrics[1]:.4f}",
        f"{social_metrics[2]:.4f}",
        f"{social_f1:.4f}",
        f"{(social_f1/target_f1)*100:.1f}%"
    ],
    'Clothing Reviews': [
        f"{len(clothing_results):,}",
        f"{clothing_metrics[0]:.4f}",
        f"{clothing_metrics[1]:.4f}",
        f"{clothing_metrics[2]:.4f}",
        f"{clothing_f1:.4f}",
        f"{(clothing_f1/target_f1)*100:.1f}%"
    ]
})

print("\n" + "="*80)
print("FINAL RESULTS SUMMARY - VADER MODEL")
print("="*80)
print(summary.to_string(index=False))
print("="*80)

# Findings 1-2 are derived from the computed metrics instead of hard-coded
# numbers, so they cannot drift from the table above when results change.
# The emoji / check mark were stored as mis-decoded UTF-8 and are restored.
print("\n📊 Key Findings:")
print(f"  1. Social Media: {social_f1*100:.1f}% F1-score ({social_f1/target_f1*100:.1f}% of target)")
print(f"  2. Clothing Reviews: {clothing_f1*100:.1f}% F1-score ({clothing_f1/target_f1*100:.1f}% of target)")
# NOTE(review): findings 3-4 are interpretive statements, not computed here;
# confirm they still hold whenever the underlying results change.
print("  3. Balanced data (social media) performs better")
print("  4. Imbalanced data (clothing) shows bias toward majority class")
print("\n✓ Analysis complete! All visualizations saved to reports/ folder")