# IMDb Movie Article Sentiment Analysis - Part 3: Model Development

## Overview
This notebook covers:
1. Loading features and data splits
2. Training multiple classification models:
   - Logistic Regression
   - Naive Bayes
   - Support Vector Machine (SVM)
   - Random Forest
   - XGBoost
   - Neural Networks (LSTM) - optional; not trained in this notebook
3. Hyperparameter tuning
4. Saving trained models


## Step 1: Import Libraries


In [None]:
# Standard library
import os
import pickle
import time

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV

# Neural Networks (optional)
try:
    from tensorflow import keras
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    TENSORFLOW_AVAILABLE = True
except ImportError:
    TENSORFLOW_AVAILABLE = False
    print("TensorFlow not available. Neural network models will be skipped.")

# Utilities
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


## Step 2: Load Features and Data


In [None]:
# Load the feature matrices and labels from the saved .npz archive.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code. Only load archives produced by this project.
def _unwrap_feature(array):
    """Return the object held by a 0-d object array, otherwise the array itself.

    When a non-array object (for example a scipy sparse matrix) is written with
    np.savez it is stored wrapped in a 0-d object array; ``.item()`` recovers
    the original object so that ``.shape``, ``.toarray()`` and model fitting
    operate on the real feature matrix. Regular arrays pass through unchanged.
    """
    if array.dtype == object and array.ndim == 0:
        return array.item()
    return array


# The context manager closes the underlying file handle once every entry has
# been read into memory; the extracted arrays stay valid afterwards.
with np.load('data/features.npz', allow_pickle=True) as features:
    X_train_tfidf = _unwrap_feature(features['X_train_tfidf'])
    X_test_tfidf = _unwrap_feature(features['X_test_tfidf'])
    X_train_word2vec = _unwrap_feature(features['X_train_word2vec'])
    X_test_word2vec = _unwrap_feature(features['X_test_word2vec'])
    y_train = _unwrap_feature(features['y_train'])
    y_test = _unwrap_feature(features['y_test'])

print("Features loaded successfully!")
print(f"Training set (TF-IDF): {X_train_tfidf.shape}")
print(f"Test set (TF-IDF): {X_test_tfidf.shape}")
print(f"Training set (Word2Vec): {X_train_word2vec.shape}")
print(f"Test set (Word2Vec): {X_test_word2vec.shape}")
print(f"Training labels: {y_train.shape}")
print(f"Test labels: {y_test.shape}")


## Step 3: Model Training Functions


In [None]:
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit a classifier, score it on the test split, and print a short report.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``. If it also
            exposes ``predict_proba`` (or, failing that, ``decision_function``),
            the scores are used to compute ROC-AUC.
        X_train: Training feature matrix passed to ``model.fit``.
        X_test: Test feature matrix passed to ``model.predict``.
        y_train: Training labels.
        y_test: Test labels used for every metric.
        model_name: Display name used in the printed report and the result dict.

    Returns:
        dict with keys ``model`` (the fitted estimator), ``model_name``,
        ``accuracy``, ``precision``, ``recall``, ``f1_score`` (precision,
        recall and F1 are weighted averages), ``roc_auc`` (``None`` when no
        usable scores are available), ``training_time`` (seconds spent in
        ``fit``), ``y_pred`` and ``y_pred_proba`` (column 1 of
        ``predict_proba``, or ``None`` when the model has no ``predict_proba``).
    """
    print(f"\n{'='*60}")
    print(f"Training {model_name}...")
    print(f"{'='*60}")

    # Only the fit call is timed; prediction and scoring are excluded.
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    y_pred = model.predict(X_test)

    # Scores for ROC-AUC: prefer probabilities, fall back to decision values.
    # Column 1 is taken as the positive class -- assumes binary labels.
    y_pred_proba = None
    roc_scores = None
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        roc_scores = y_pred_proba
    elif hasattr(model, 'decision_function'):
        decision_values = np.asarray(model.decision_function(X_test))
        # Only 1-D decision values (binary problems) can be fed to
        # roc_auc_score directly; anything else keeps ROC-AUC as None.
        if decision_values.ndim == 1:
            roc_scores = decision_values

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    roc_auc = None
    if roc_scores is not None:
        roc_auc = roc_auc_score(y_test, roc_scores)

    # Print results
    print(f"\n{model_name} Results:")
    print(f"  Training Time: {training_time:.2f} seconds")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    if roc_auc is not None:
        print(f"  ROC-AUC: {roc_auc:.4f}")

    return {
        'model': model,
        'model_name': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'training_time': training_time,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

print("Model training function created!")


## Step 4: Train Models with TF-IDF Features


In [None]:
# Evaluation results of every TF-IDF model, keyed by model display name
results_tfidf = dict()

# 1. Logistic Regression
print("\n" + "="*60, "MODEL 1: Logistic Regression", "="*60, sep="\n")

lr_model = LogisticRegression(random_state=42, max_iter=1000, n_jobs=-1)
results_tfidf['Logistic Regression'] = train_and_evaluate_model(
    model=lr_model,
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train,
    y_test=y_test,
    model_name="Logistic Regression",
)


In [None]:
# 2. Naive Bayes
print("\n" + "="*60, "MODEL 2: Naive Bayes", "="*60, sep="\n")

# Multinomial Naive Bayes with additive smoothing alpha=1.0
nb_model = MultinomialNB(alpha=1.0)
results_tfidf['Naive Bayes'] = train_and_evaluate_model(
    model=nb_model,
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train,
    y_test=y_test,
    model_name="Naive Bayes",
)


In [None]:
# 3. Support Vector Machine (SVM)
print("\n" + "="*60, "MODEL 3: Support Vector Machine", "="*60, sep="\n")

# SVM training can be slow on large datasets, so the linear kernel is used
# for speed; probability=True enables predict_proba, which the evaluation
# helper needs for ROC-AUC.
svm_model = SVC(random_state=42, kernel='linear', probability=True)
results_tfidf['SVM'] = train_and_evaluate_model(
    model=svm_model,
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train,
    y_test=y_test,
    model_name="SVM",
)


In [None]:
# 4. Random Forest
print("\n" + "="*60, "MODEL 4: Random Forest", "="*60, sep="\n")

# 100 trees capped at depth 20, trained on all available cores
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
)
results_tfidf['Random Forest'] = train_and_evaluate_model(
    model=rf_model,
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train,
    y_test=y_test,
    model_name="Random Forest",
)


In [None]:
# 5. XGBoost
print("\n" + "="*60, "MODEL 5: XGBoost", "="*60, sep="\n")

# Densify the TF-IDF matrices when they are sparse (i.e. expose .toarray());
# arrays that are already dense are used as-is. The dense copies are kept in
# the notebook namespace because the tuning step reuses them.
if hasattr(X_train_tfidf, 'toarray'):
    X_train_tfidf_dense = X_train_tfidf.toarray()
else:
    X_train_tfidf_dense = X_train_tfidf

if hasattr(X_test_tfidf, 'toarray'):
    X_test_tfidf_dense = X_test_tfidf.toarray()
else:
    X_test_tfidf_dense = X_test_tfidf

xgb_model = XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1)
results_tfidf['XGBoost'] = train_and_evaluate_model(
    model=xgb_model,
    X_train=X_train_tfidf_dense,
    X_test=X_test_tfidf_dense,
    y_train=y_train,
    y_test=y_test,
    model_name="XGBoost",
)


## Step 5: Compare Model Performance


In [None]:
# Build the comparison table: one row per trained model.
# A missing ROC-AUC is stored as NaN so the column stays numeric.
comparison_columns = [
    'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'Training Time (s)'
]
comparison_rows = [
    {
        'Model': model_name,
        'Accuracy': result['accuracy'],
        'Precision': result['precision'],
        'Recall': result['recall'],
        'F1-Score': result['f1_score'],
        'ROC-AUC': result['roc_auc'] if result['roc_auc'] is not None else np.nan,
        'Training Time (s)': result['training_time'],
    }
    for model_name, result in results_tfidf.items()
]

# Passing the column list keeps the expected schema even if no model was trained.
comparison_df = pd.DataFrame(comparison_rows, columns=comparison_columns)
# Best model first: later cells read the top row as the best performer.
comparison_df = comparison_df.sort_values('F1-Score', ascending=False)

print("\n" + "="*80)
print("MODEL COMPARISON (TF-IDF Features)")
print("="*80)
print(comparison_df.to_string(index=False))

# Visualize comparison: one horizontal bar chart per metric on a 2x2 grid.
# matplotlib (plt) is imported in the notebook's import cell.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
for idx, metric in enumerate(metrics):
    ax = axes[idx // 2, idx % 2]
    # Ascending order puts the best model at the top of a horizontal bar chart.
    comparison_df_sorted = comparison_df.sort_values(metric, ascending=True)
    ax.barh(comparison_df_sorted['Model'], comparison_df_sorted[metric], color='steelblue')
    ax.set_xlabel(metric, fontsize=12)
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    ax.set_xlim([0, 1])
    # Annotate each bar with its value, just right of the bar end.
    for i, v in enumerate(comparison_df_sorted[metric]):
        ax.text(v + 0.01, i, f'{v:.3f}', va='center', fontsize=10)

plt.tight_layout()
# The figure is written next to the saved models, so make sure the folder exists.
os.makedirs('models', exist_ok=True)
plt.savefig('models/model_comparison_tfidf.png', dpi=300, bbox_inches='tight')
plt.show()


## Step 6: Hyperparameter Tuning (Best Model)


In [None]:
# Find best model: comparison_df is sorted by F1-Score (descending), so the
# first row is the top performer.
best_model_name = comparison_df.iloc[0]['Model']
print(f"\nBest performing model: {best_model_name}")

# Hyperparameter tuning for best model
print(f"\nPerforming hyperparameter tuning for {best_model_name}...")

# Every model is tuned on the TF-IDF matrices as loaded, except XGBoost,
# which uses the dense copies created in its training cell.
X_train_tuned = X_train_tfidf
X_test_tuned = X_test_tfidf

if best_model_name == 'Logistic Regression':
    # liblinear is the solver used for the whole grid because it supports both
    # the l1 and l2 penalties.
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    }
    base_model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)

elif best_model_name == 'Naive Bayes':
    param_grid = {
        'alpha': [0.1, 0.5, 1.0, 2.0]
    }
    base_model = MultinomialNB()

elif best_model_name == 'SVM':
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    }
    base_model = SVC(probability=True, random_state=42)

elif best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    }
    base_model = RandomForestClassifier(random_state=42, n_jobs=-1)

elif best_model_name == 'XGBoost':
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    }
    base_model = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss')
    X_train_tuned = X_train_tfidf_dense
    X_test_tuned = X_test_tfidf_dense

else:
    # Without this guard an unknown name would only fail later with a NameError
    # on base_model / param_grid. This happens, for instance, when the
    # comparison cell is re-run after tuning and a "(Tuned)" entry ranks first.
    raise ValueError(
        f"No tuning configuration for model '{best_model_name}'. "
        "Re-run the notebook from Step 4 so that only the base models are compared."
    )

# Perform grid search
print("Running GridSearchCV (this may take a while)...")
grid_search = GridSearchCV(
    base_model,
    param_grid,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_tuned, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Evaluate tuned model: best_estimator_ is the winning configuration refit on
# the full training set.
best_tuned_model = grid_search.best_estimator_
y_pred_tuned = best_tuned_model.predict(X_test_tuned)

# Compute every metric once and reuse it for both the report and the results.
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_precision = precision_score(y_test, y_pred_tuned, average='weighted')
tuned_recall = recall_score(y_test, y_pred_tuned, average='weighted')
tuned_f1 = f1_score(y_test, y_pred_tuned, average='weighted')

# Column 1 of predict_proba is taken as the positive class -- assumes binary labels.
y_pred_proba_tuned = None
tuned_roc_auc = None
if hasattr(best_tuned_model, 'predict_proba'):
    y_pred_proba_tuned = best_tuned_model.predict_proba(X_test_tuned)[:, 1]
    tuned_roc_auc = roc_auc_score(y_test, y_pred_proba_tuned)

print(f"\nTuned {best_model_name} Performance:")
print(f"  Accuracy: {tuned_accuracy:.4f}")
print(f"  F1-Score: {tuned_f1:.4f}")

# Update results. training_time is the time GridSearchCV spent refitting the
# best configuration on the full training set, which is comparable to the
# single-fit times recorded for the base models.
tuned_model_name = f'{best_model_name} (Tuned)'
results_tfidf[tuned_model_name] = {
    'model': best_tuned_model,
    'model_name': tuned_model_name,
    'accuracy': tuned_accuracy,
    'precision': tuned_precision,
    'recall': tuned_recall,
    'f1_score': tuned_f1,
    'roc_auc': tuned_roc_auc,
    'training_time': grid_search.refit_time_,
    'y_pred': y_pred_tuned,
    'y_pred_proba': y_pred_proba_tuned
}


In [None]:
# Save all models: one pickle per entry in results_tfidf, plus the strongest
# model again under a fixed file name.
os.makedirs('models', exist_ok=True)

for model_name, result in results_tfidf.items():
    # Clean model name for filename,
    # e.g. "Logistic Regression (Tuned)" -> "logistic_regression_tuned"
    filename = model_name.lower().replace(' ', '_').replace('(', '').replace(')', '')
    filepath = f'models/{filename}.pkl'

    with open(filepath, 'wb') as f:
        pickle.dump(result['model'], f)

    print(f"Saved: {filepath}")

# Save best model separately. The winner is chosen by test F1-score across
# every stored result, so a tuned model replaces its untuned counterpart when
# it scores higher. On a tie the earlier (untuned) entry is kept.
best_saved_name = max(results_tfidf, key=lambda name: results_tfidf[name]['f1_score'])
best_model_result = results_tfidf[best_saved_name]
with open('models/best_model.pkl', 'wb') as f:
    pickle.dump(best_model_result['model'], f)

print(f"\nBest model saved: models/best_model.pkl ({best_saved_name})")
print("\nAll models saved successfully!")


## Summary

### Key Accomplishments:
1. ✅ Trained 5 different classification models
2. ✅ Compared model performance across multiple metrics
3. ✅ Performed hyperparameter tuning on best model
4. ✅ Saved all trained models for evaluation

### Next Steps:
- Proceed to the Model Evaluation notebook for detailed analysis
