# IMDb Movie Article Sentiment Analysis - Part 4: Model Evaluation

## Overview
This notebook covers:
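1. Loading the best trained model, the TF-IDF vectorizer, the label encoder, and the test features
2. Comprehensive model evaluation with multiple metrics
3. Confusion matrices
4. ROC curves and Precision-Recall curves
5. Feature importance analysis
6. Word cloud analysis
7. Error analysis of misclassified articles
8. Insights and recommendations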


## Step 1: Import Libraries


In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import os
import pickle

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Model evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve, auc,
    confusion_matrix, classification_report
)

# Notebook-wide configuration
import warnings

# Install a blanket "ignore" filter so no warning category is shown in cell output.
warnings.simplefilter('ignore')

# Plot appearance: apply the dark-grid style sheet first, then set the husl
# colour palette on top of it (order matters: the palette call comes last).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print('Libraries imported successfully!')


## Step 2: Load Models and Data


In [None]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # NOTE: unpickling runs arbitrary code from the file; only load artifacts
    # that this project wrote itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Test-split feature matrix and targets stored in the features archive.
features = np.load('data/features.npz', allow_pickle=True)
X_test_tfidf, y_test = features['X_test_tfidf'], features['y_test']

# Fitted artifacts, loaded in the same order as before:
# label encoder, best model, TF-IDF vectorizer.
label_encoder = _read_pickle('models/label_encoder.pkl')
best_model = _read_pickle('models/best_model.pkl')
tfidf_vectorizer = _read_pickle('models/tfidf_vectorizer.pkl')

print("Models and data loaded successfully!")
print(f"Test set shape: {X_test_tfidf.shape}")
print(f"Test labels shape: {y_test.shape}")


## Step 3: Model Predictions


In [None]:
# Hard class predictions for every row of the test matrix.
y_pred = best_model.predict(X_test_tfidf)

# Scores from column 1 of predict_proba; None when the estimator does not
# expose predict_proba, so later cells can skip the curve plots.
if hasattr(best_model, 'predict_proba'):
    y_pred_proba = best_model.predict_proba(X_test_tfidf)[:, 1]
else:
    y_pred_proba = None

print("Predictions made successfully!")
print(f"Prediction shape: {y_pred.shape}")
if y_pred_proba is not None:
    print(f"Prediction probabilities shape: {y_pred_proba.shape}")


## Step 4: Comprehensive Metrics


In [None]:
# Headline metrics; precision, recall and F1 all use support-weighted averaging.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# ROC-AUC needs continuous scores, so it is only defined when they exist.
if y_pred_proba is not None:
    roc_auc = roc_auc_score(y_test, y_pred_proba)
else:
    roc_auc = None

# Summary table: labels are left-padded to a common width of 11 characters.
banner = "=" * 60
print(banner)
print("MODEL PERFORMANCE METRICS")
print(banner)
metric_rows = [
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
]
if roc_auc is not None:
    metric_rows.append(("ROC-AUC:", roc_auc))
for metric_label, metric_value in metric_rows:
    print(f"{metric_label:<11}{metric_value:.4f}")
print(banner)

# Per-class breakdown, labelled with the encoder's class names.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


## Step 5: Confusion Matrix


In [None]:
# Build the confusion matrix over the encoder's full label set.
# Passing `labels` pins the matrix to len(classes_) x len(classes_) even when a
# class is missing from y_test / y_pred, so the tick labels and the annotation
# loop below always match its shape.
# NOTE(review): assumes y_test / y_pred hold the encoder's integer codes
# (0..n_classes-1), as implied by label_encoder.inverse_transform(y_pred) later on.
class_names = list(label_encoder.classes_)
class_ids = np.arange(len(class_names))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Visualize confusion matrix (raw counts come from seaborn's own annotations)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
            xticklabels=class_names,
            yticklabels=class_names)
ax.set_xlabel('Predicted', fontsize=12, fontweight='bold')
ax.set_ylabel('Actual', fontsize=12, fontweight='bold')
ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')

# Row-normalised percentages (share of each actual class). A class with no
# actual samples has a zero row total; report 0% there instead of dividing by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_percent = np.divide(
    cm.astype('float'), row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
) * 100

# Place each percentage slightly below the count that seaborn drew at the cell centre.
for (i, j), pct in np.ndenumerate(cm_percent):
    ax.text(j + 0.5, i + 0.7, f'({pct:.1f}%)',
            ha='center', va='center', fontsize=9, color='red', fontweight='bold')

plt.tight_layout()
# Later plotting cells save into this directory as well, so create it here.
os.makedirs('models/visualizations', exist_ok=True)
plt.savefig('models/visualizations/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nConfusion Matrix:")
print(cm)


## Step 6: ROC Curve


In [None]:
if y_pred_proba is None:
    # No continuous scores were produced in the prediction step.
    print("ROC curve not available (model doesn't support probability predictions)")
else:
    # Operating points of the ROC curve and the area under it.
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Draw the model curve against the chance diagonal on an explicit Axes.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
    ax.set_title('ROC Curve', fontsize=14, fontweight='bold')
    ax.legend(loc="lower right", fontsize=11)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig('models/visualizations/roc_curve.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"ROC-AUC Score: {roc_auc:.4f}")


## Step 7: Precision-Recall Curve


In [None]:
if y_pred_proba is None:
    # No continuous scores were produced in the prediction step.
    print("Precision-Recall curve not available (model doesn't support probability predictions)")
else:
    # Precision/recall pairs across thresholds, and the trapezoidal area under them.
    precision_vals, recall_vals, thresholds = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall_vals, precision_vals)

    # Plot precision against recall on an explicit Axes.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(recall_vals, precision_vals, color='blue', lw=2, label=f'PR curve (AUC = {pr_auc:.4f})')
    ax.set_xlabel('Recall', fontsize=12, fontweight='bold')
    ax.set_ylabel('Precision', fontsize=12, fontweight='bold')
    ax.set_title('Precision-Recall Curve', fontsize=14, fontweight='bold')
    ax.legend(loc="lower left", fontsize=11)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig('models/visualizations/precision_recall_curve.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Precision-Recall AUC Score: {pr_auc:.4f}")


## Step 8: Feature Importance Analysis


In [None]:
# Get feature importance if available (for tree-based models)
if hasattr(best_model, 'feature_importances_'):
    feature_importances = best_model.feature_importances_
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Get top 20 most important features
    top_indices = np.argsort(feature_importances)[-20:][::-1]
    top_features = [(feature_names[i], feature_importances[i]) for i in top_indices]
    
    # Visualize top features
    features_df = pd.DataFrame(top_features, columns=['Feature', 'Importance'])
    
    plt.figure(figsize=(10, 8))
    sns.barplot(data=features_df, y='Feature', x='Importance', palette='viridis')
    plt.xlabel('Importance', fontsize=12, fontweight='bold')
    plt.ylabel('Feature', fontsize=12, fontweight='bold')
    plt.title('Top 20 Most Important Features', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('models/visualizations/feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print("\nTop 20 Most Important Features:")
    print(features_df.to_string(index=False))
    
elif hasattr(best_model, 'coef_'):
    # For linear models, use coefficients
    coef = best_model.coef_[0]
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Get top 20 features (positive and negative)
    top_positive_indices = np.argsort(coef)[-10:][::-1]
    top_negative_indices = np.argsort(coef)[:10]
    
    print("\nTop 10 Features for Positive Sentiment:")
    for idx in top_positive_indices:
        print(f"  {feature_names[idx]}: {coef[idx]:.4f}")
    
    print("\nTop 10 Features for Negative Sentiment:")
    for idx in top_negative_indices:
        print(f"  {feature_names[idx]}: {coef[idx]:.4f}")
else:
    print("Feature importance not available for this model type.")


## Step 9: Word Cloud Analysis


In [None]:
# Word clouds of test documents, split by actual class and by whether the
# prediction was correct. Also builds `df_results`, which the error-analysis
# cell reads (columns: article, category, predicted, correct).
#
# Fix: the previous version indexed the corpus with the *label values*
# (`.iloc[y_test]`), so every row was corpus row 0 or 1 rather than the
# matching test document. Categories now come from decoding y_test, and the
# text is taken from a source that is aligned with the test rows.

# Load original data for word cloud
df_original = pd.read_csv('data/full_processed_data.csv')

# Decode integer targets / predictions back to category names.
actual_categories = label_encoder.inverse_transform(np.asarray(y_test).astype(int))
predicted_categories = label_encoder.inverse_transform(y_pred)

# NOTE(review): assumed name of an array of corpus row positions for the test
# split inside data/features.npz. The feature-engineering notebook is not
# visible from here - confirm the key (or save the positions there) to get raw
# article text in the clouds.
TEST_INDEX_KEY = 'test_indices'

article_text = None
if TEST_INDEX_KEY in features.files:
    test_positions = np.asarray(features[TEST_INDEX_KEY]).astype(int)
    positions_usable = (
        test_positions.shape == (len(actual_categories),)
        and test_positions.size > 0
        and test_positions.min() >= 0
        and test_positions.max() < len(df_original)
    )
    if positions_usable:
        test_rows = df_original.iloc[test_positions]
        # Only trust the positions if the corpus categories agree with y_test.
        if (test_rows['category'].astype(str).to_numpy() == actual_categories.astype(str)).all():
            article_text = test_rows['article'].astype(str).to_numpy()

if article_text is None:
    # Fallback that is always aligned with the test rows: rebuild each document
    # from the vocabulary terms that have a non-zero TF-IDF weight in it.
    print("Note: test-row positions not available; using the TF-IDF terms of each "
          "test document instead of the raw article text.")
    article_text = np.array(
        [' '.join(terms) for terms in tfidf_vectorizer.inverse_transform(X_test_tfidf)],
        dtype=object,
    )

# Separate correctly and incorrectly predicted articles
df_results = pd.DataFrame({
    'article': article_text,
    'category': actual_categories,
    'predicted': predicted_categories,
    'correct': (y_test == y_pred)
})

# One panel per (actual category, prediction correct?) combination, laid out
# row by row in the 2x2 grid.
wordcloud_panels = [
    ('positive', True, 'Positive Articles (Correctly Predicted)'),
    ('negative', True, 'Negative Articles (Correctly Predicted)'),
    ('positive', False, 'Positive Articles (Misclassified)'),
    ('negative', False, 'Negative Articles (Misclassified)'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for panel_ax, (panel_category, panel_correct, panel_title) in zip(axes.ravel(), wordcloud_panels):
    panel_rows = df_results[(df_results['category'] == panel_category) &
                            (df_results['correct'] == panel_correct)]
    panel_text = ' '.join(panel_rows['article'].astype(str))

    # Title and hide the axes for every panel so empty groups do not show bare ticks.
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.axis('off')

    wordcloud = None
    if panel_text.strip():
        try:
            wordcloud = WordCloud(width=800, height=400, background_color='white',
                                  max_words=100).generate(panel_text)
        except ValueError:
            # WordCloud raises when no usable words remain (e.g. only stopwords).
            wordcloud = None

    if wordcloud is None:
        panel_ax.text(0.5, 0.5, 'No articles in this group', ha='center', va='center',
                      fontsize=11, transform=panel_ax.transAxes)
    else:
        panel_ax.imshow(wordcloud, interpolation='bilinear')

plt.tight_layout()
plt.savefig('models/visualizations/wordclouds_evaluation.png', dpi=300, bbox_inches='tight')
plt.show()


## Step 10: Error Analysis


In [None]:
# Rows of the results table whose prediction did not match the target.
misclassified = df_results.loc[df_results['correct'].eq(False)]
n_wrong, n_total = len(misclassified), len(df_results)

print(f"Total misclassified articles: {n_wrong}")
print(f"Misclassification rate: {n_wrong / n_total * 100:.2f}%")

# Cross-tabulate actual category against predicted category for the errors.
print("\nMisclassification breakdown:")
error_counts = misclassified.groupby(['category', 'predicted']).size()
print(error_counts.unstack(fill_value=0))

# Print the first ten errors, showing the first 200 characters of each text.
heavy_rule = "=" * 80
light_rule = "-" * 80
print("\n" + heavy_rule)
print("SAMPLE MISCLASSIFIED REVIEWS")
print(heavy_rule)

for sample in misclassified.head(10).itertuples(index=False):
    print(f"\nActual: {sample.category} | Predicted: {sample.predicted}")
    print(f"Article: {sample.article[:200]}...")
    print(light_rule)


## Step 11: Insights and Recommendations


In [None]:
# Print the closing report. Only the first section is data-driven (accuracy,
# F1 and, when available, ROC-AUC from the earlier cells); the remaining
# sections are fixed text.
report_rule = "=" * 80

performance_lines = [
    f"   - The model achieves {accuracy:.2%} accuracy on the test set",
    f"   - F1-Score of {f1:.4f} indicates good balance between precision and recall",
]
# Explicit None check, matching the metrics cell: a truthiness test would also
# hide a legitimate ROC-AUC of 0.0.
if roc_auc is not None:
    performance_lines.append(f"   - ROC-AUC of {roc_auc:.4f} shows strong discriminative ability")

report_sections = [
    ("\n1. MODEL PERFORMANCE:", performance_lines),
    ("\n2. STRENGTHS:", [
        "   - Model can effectively distinguish between positive and negative articles",
        "   - Good generalization to unseen data",
        "   - Balanced performance across both classes",
    ]),
    ("\n3. AREAS FOR IMPROVEMENT:", [
        "   - Analyze misclassified examples to identify patterns",
        "   - Consider ensemble methods for better performance",
        "   - Experiment with different feature extraction techniques",
        "   - Try deep learning models (LSTM, BERT) for potentially better results",
    ]),
    ("\n4. BUSINESS APPLICATIONS:", [
        "   - Real-time category analysis of movie articles",
        "   - Automated article moderation",
        "   - Marketing campaign effectiveness measurement",
        "   - Content recommendation systems",
    ]),
    ("\n5. NEXT STEPS:", [
        "   - Deploy model for production use",
        "   - Create API for real-time predictions",
        "   - Monitor model performance over time",
        "   - Retrain periodically with new data",
    ]),
]

print(report_rule)
print("KEY INSIGHTS AND RECOMMENDATIONS")
print(report_rule)

for section_heading, section_lines in report_sections:
    print(section_heading)
    for section_line in section_lines:
        print(section_line)

print(report_rule)


## Summary

### Key Accomplishments:
1. ✅ Comprehensive model evaluation with multiple metrics
2. ✅ Confusion matrix visualization
3. ✅ ROC and Precision-Recall curves
4. ✅ Feature importance analysis
5. ✅ Word cloud analysis for different scenarios
6. ✅ Error analysis and insights

### Next Steps:
- Use the prediction notebook to make predictions on new articles
