# 5. Model Training & Evaluation

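This notebook trains three candidate classifiers (random forest, gradient boosting, and a multi-layer perceptron), tunes their hyperparameters with randomized search, evaluates each tuned model on the test set, and saves the model with the best validation score.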

## Objectives
1. Load prepared data
2. Implement model architectures
3. Train models with hyperparameter tuning
4. Evaluate model performance
5. Save best model

In [None]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable between runs.
np.random.seed(42)

## 1. Load Prepared Data

In [None]:
def load_prepared_data(data_dir='../data/processed'):
    """Load the prepared train/validation/test splits and the feature names.

    Args:
        data_dir (str or Path): Directory holding ``X_<split>.npy`` and
            ``y_<split>.npy`` for the ``train``, ``val`` and ``test`` splits,
            plus ``feature_names.json``. Defaults to the directory this
            notebook has always read from.

    Returns:
        tuple: ``((X_train, y_train), (X_val, y_val), (X_test, y_test),
        feature_names)``. The arrays are whatever ``np.load`` returns for
        each file; ``feature_names`` is the parsed content of
        ``feature_names.json``.

    Raises:
        FileNotFoundError: If any of the expected files is missing.
    """
    data_dir = Path(data_dir)

    # Every split follows the same file naming, so load them in one pass
    # (X before y, train -> val -> test).
    splits = tuple(
        (
            np.load(data_dir / f'X_{split}.npy'),
            np.load(data_dir / f'y_{split}.npy'),
        )
        for split in ('train', 'val', 'test')
    )

    # Explicit encoding so parsing does not depend on the platform locale.
    with open(data_dir / 'feature_names.json', 'r', encoding='utf-8') as f:
        feature_names = json.load(f)

    return (*splits, feature_names)

# Load data, then unpack each (features, labels) pair
train_split, val_split, test_split, feature_names = load_prepared_data()
X_train, y_train = train_split
X_val, y_val = val_split
X_test, y_test = test_split

# Report the feature-matrix shape of every split
print("Data shapes:")
print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"X_test: {X_test.shape}")

## 2. Model Implementation

In [None]:
def create_models():
    """Build the candidate estimators together with their search spaces.

    Returns:
        dict: Maps a model name to ``{'model': estimator, 'params': grid}``,
        where ``grid`` maps a hyperparameter name to the list of values to
        sample from during tuning.
    """
    # Search space for the random forest
    forest_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    # Search space for gradient boosting
    boosting_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 5, 10],
    }

    # Search space for the multi-layer perceptron
    mlp_grid = {
        'hidden_layer_sizes': [(100,), (100, 50), (200, 100, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive'],
    }

    # Every estimator is created with the same fixed random_state.
    return {
        'random_forest': {
            'model': RandomForestClassifier(random_state=42),
            'params': forest_grid,
        },
        'gradient_boosting': {
            'model': GradientBoostingClassifier(random_state=42),
            'params': boosting_grid,
        },
        'neural_network': {
            'model': MLPClassifier(random_state=42, max_iter=1000),
            'params': mlp_grid,
        },
    }

# Instantiate the candidate models and list what is about to be trained
models = create_models()
print("Created models:")
for model_name in models:
    print(f"- {model_name}")

## 3. Model Training & Tuning

In [None]:
def train_and_tune_models(models, X_train, y_train, X_val, y_val,
                          *, n_iter=10, cv=5, n_jobs=-1, random_state=42):
    """Train and tune models using RandomizedSearchCV.

    For every entry in ``models`` a randomized hyperparameter search is
    fitted on the training data; the refitted best estimator is then scored
    on the validation data.

    Args:
        models (dict): Maps a model name to a dict with a ``'model'``
            estimator and a ``'params'`` search space.
        X_train, y_train: Training data passed to ``search.fit``.
        X_val, y_val: Validation data passed to the best estimator's
            ``score`` method.
        n_iter (int): Number of parameter settings sampled per model.
        cv (int): Cross-validation setting forwarded to RandomizedSearchCV.
        n_jobs (int): Parallelism setting forwarded to RandomizedSearchCV.
        random_state (int): Seed forwarded to RandomizedSearchCV so the
            sampled parameter settings are repeatable.

    Returns:
        dict: Maps each model name to a dict with keys ``'model'`` (best
        estimator), ``'best_params'``, ``'best_score'`` (best CV score) and
        ``'val_score'`` (score on the validation data). Empty when
        ``models`` is empty.
    """
    results = {}

    for name, config in models.items():
        print(f"\nTraining {name}...")

        # The search settings are parameters (with the previous hard-coded
        # values as defaults) so the tuning budget can be changed per call.
        search = RandomizedSearchCV(
            config['model'],
            config['params'],
            n_iter=n_iter,
            cv=cv,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=1
        )

        # Fit model
        search.fit(X_train, y_train)

        # Best estimator found by the search
        best_model = search.best_estimator_

        # Evaluate on data the search never saw
        val_score = best_model.score(X_val, y_val)

        results[name] = {
            'model': best_model,
            'best_params': search.best_params_,
            'best_score': search.best_score_,
            'val_score': val_score
        }

        print(f"Best CV score: {search.best_score_:.4f}")
        print(f"Validation score: {val_score:.4f}")

    return results

# Run the randomized search for every candidate model. `results` maps each
# model name to its tuned estimator, best parameters, best CV score and
# validation score.
results = train_and_tune_models(models, X_train, y_train, X_val, y_val)

## 4. Model Evaluation

In [None]:
def evaluate_models(results, X_test, y_test):
    """Print test-set metrics and show a confusion matrix for each model.

    Args:
        results (dict): Maps a model name to a dict whose ``'model'`` entry
            is a fitted estimator.
        X_test, y_test: Test data.

    Returns:
        None. All output is printed or plotted.
    """
    for model_name, entry in results.items():
        print(f"\nEvaluating {model_name}:")

        # Predict once and reuse the predictions for every metric below
        predictions = entry['model'].predict(X_test)

        acc = accuracy_score(y_test, predictions)
        # Precision/recall/F1 use average='weighted'; the support value that
        # comes back as the fourth item is not needed.
        prec, rec, f_score, _support = precision_recall_fscore_support(
            y_test, predictions, average='weighted'
        )

        summary = (
            ("Accuracy", acc),
            ("Precision", prec),
            ("Recall", rec),
            ("F1-score", f_score),
        )
        for label, value in summary:
            print(f"{label}: {value:.4f}")

        print("\nClassification Report:")
        print(classification_report(y_test, predictions))

        # Confusion matrix drawn as an annotated heatmap
        plt.figure(figsize=(10, 8))
        matrix = confusion_matrix(y_test, predictions)
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

# Print test-set metrics and show a confusion matrix for each tuned model
evaluate_models(results, X_test, y_test)

## 5. Save Best Model

In [None]:
def save_best_model(results, model_dir='../models'):
    """Save the model with the highest validation score and its metadata.

    Writes ``best_model.joblib`` and ``model_metadata.json`` into
    ``model_dir``, creating the directory if needed.

    Args:
        results (dict): Maps a model name to a dict with keys ``'model'``,
            ``'best_params'`` and ``'val_score'``.
        model_dir (str or Path): Output directory. Defaults to the directory
            this notebook has always written to.

    Raises:
        ValueError: If ``results`` is empty, so there is nothing to save.

    Note:
        The metadata includes ``feature_names``, which is read from the
        notebook-level variable of that name rather than from a parameter.
    """
    # Fail early with a clear message instead of max()'s generic error,
    # and before anything is written to disk.
    if not results:
        raise ValueError("results is empty: no trained model to save")

    # Find best model based on validation score
    best_model_name = max(results, key=lambda k: results[k]['val_score'])
    best_entry = results[best_model_name]

    # parents=True so a nested output directory can be created in one go
    model_dir = Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)

    # Save model
    joblib.dump(best_entry['model'], model_dir / 'best_model.joblib')

    # Save model metadata
    metadata = {
        'model_type': best_model_name,
        'best_params': best_entry['best_params'],
        'validation_score': best_entry['val_score'],
        'feature_names': feature_names  # notebook-level variable
    }

    with open(model_dir / 'model_metadata.json', 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"Saved best model ({best_model_name}):")
    print("- Model: best_model.joblib")
    print("- Metadata: model_metadata.json")

# Persist the model with the highest validation score, plus its metadata
save_best_model(results)