# Machine Learning Models – Train & Evaluate

**EEG Stress Level Classification**

Models: Random Forest, SVM, KNN, Gradient Boosting, Logistic Regression  
Target: `stress_level` (0=Natural, 1=Low, 2=Mid, 3=High)

## Part 1: Import Required Libraries

In [None]:
import os, json, warnings
import numpy as np
import pandas as pd
import joblib
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the rest of the session so the
# training logs below stay compact.
warnings.simplefilter("ignore")
print("✓ All libraries imported successfully")

## Part 2: Configuration & Constants

In [None]:
# Output locations, all relative to the current working directory.
ROOT = Path(".")
MODEL_DIR, STATIC_DIR = ROOT / "models", ROOT / "static"

# Create both output folders up front; a folder that already exists is fine.
MODEL_DIR.mkdir(exist_ok=True)
STATIC_DIR.mkdir(exist_ok=True)

# Display names of the stress classes (used as report / heatmap labels).
STRESS_NAMES = ["natural", "lowlevel", "midlevel", "highlevel"]

print(f"✓ Model directory: {MODEL_DIR.resolve()}")
print(f"✓ Static directory: {STATIC_DIR.resolve()}")

## Part 3: Load & Explore Dataset

In [None]:
# Read the preprocessed table from the project root.
df = pd.read_csv(ROOT / "processed_dataset.csv")

# Label / metadata columns that must not be fed to the models;
# every remaining column is treated as an input feature.
exclude = {"task", "stress_level", "stress_label", "participant"}
feature_cols = [col for col in df.columns if col not in exclude]

# Feature matrix and integer target vector as plain arrays.
X = df[feature_cols].values
y = df["stress_level"].values

# Quick overview: table size, feature count and class / task balance.
print(f"Dataset shape: {df.shape}")
print(f"Features: {len(feature_cols)}")
for heading, column in (("Stress", "stress_label"), ("Task", "task")):
    print(f"\n{heading} distribution:")
    print(df[column].value_counts())

# Last expression of the cell: render the first rows.
df.head()

## Part 4: Feature Scaling & Train/Test Split

In [None]:
# Train/Test split (80/20, stratified) on the RAW features.
# The split happens BEFORE scaling so that the scaler's mean/variance are
# estimated from training rows only; fitting it on the full matrix would leak
# test-set statistics into the features and inflate the test metrics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# StandardScaler for feature normalization: fit on the training rows, then
# apply the same (frozen) transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix transformed with the train-fitted scaler; kept so that any cell
# reading `X_scaled` continues to work.
X_scaled = scaler.transform(X)

# Save scaler & feature columns for later use in prediction
joblib.dump(scaler, MODEL_DIR / "scaler.pkl")
joblib.dump(feature_cols, MODEL_DIR / "feature_cols.pkl")

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set:     {X_test.shape[0]} samples")
print(f"Features:     {X_train.shape[1]}")

## Part 5: Define ML Models

In [None]:
# Registry of candidate classifiers, keyed by the name used in logs,
# saved file names and charts. Insertion order is the display order.
models = {}

# Tree ensemble (bagging)
models["RandomForest"] = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    random_state=42,
)

# RBF-kernel SVM; probability=True enables predict_proba
models["SVM"] = SVC(
    kernel="rbf",
    C=10,
    gamma="scale",
    probability=True,
    random_state=42,
)

# Distance-weighted nearest neighbours
models["KNN"] = KNeighborsClassifier(n_neighbors=5, weights="distance")

# Tree ensemble (boosting)
models["GradientBoosting"] = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)

# Linear baseline
models["LogisticRegression"] = LogisticRegression(
    max_iter=2000,
    C=1.0,
    random_state=42,
)

print(f"✓ Defined {len(models)} models:")
print("\n".join(f"  • {name}" for name in models))

## Part 6: Train & Evaluate All Models

Cross-validation (5-fold stratified) + test set evaluation for each model.  
Metrics: Accuracy, Precision, Recall, F1-Score.

In [None]:
# Per-model metrics keyed by model name; later cells plot and serialize this.
results = {}

# Integer class ids matching STRESS_NAMES by position
# (assumes stress_level is encoded 0..len(STRESS_NAMES)-1 — TODO confirm).
class_labels = list(range(len(STRESS_NAMES)))

# The splitter is loop-invariant: build it once. With a fixed integer seed
# every model is scored on the same five folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name} …")
    print(f"{'='*50}")

    # Cross-validation on the training split only.
    # NOTE(review): X_train is already scaled, so the scaler is not re-fitted
    # per fold; fold scores may be slightly optimistic compared with a
    # per-fold Pipeline.
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="accuracy")
    cv_mean, cv_std = float(cv_scores.mean()), float(cv_scores.std())
    print(f"  CV accuracy: {cv_mean:.4f} ± {cv_std:.4f}")

    # Fit on full training set, then score once on the held-out test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics (weighted averages account for class imbalance; zero_division=0
    # scores a class with no predictions as 0 instead of warning)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    rec = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    # Store plain Python floats so the dict holds no NumPy scalar types
    # when it is serialized later.
    results[name] = {
        "accuracy": round(float(acc), 4),
        "precision": round(float(prec), 4),
        "recall": round(float(rec), 4),
        "f1_score": round(float(f1), 4),
        "cv_mean": round(cv_mean, 4),
        "cv_std": round(cv_std, 4),
    }

    print(f"  Test accuracy:  {acc:.4f}")
    print(f"  Precision:      {prec:.4f}")
    print(f"  Recall:         {rec:.4f}")
    print(f"  F1-score:       {f1:.4f}")
    print("\n  Classification Report:")
    # Passing `labels` pins the report to all classes; without it the call
    # raises when a class is absent from both y_test and y_pred, because the
    # number of observed classes no longer matches len(target_names).
    print(classification_report(
        y_test, y_pred,
        labels=class_labels, target_names=STRESS_NAMES, zero_division=0,
    ))

    # Save model
    joblib.dump(model, MODEL_DIR / f"ml_{name}.pkl")

print("\n✓ All models trained and saved!")

## Part 7: Confusion Matrices

Visualize the confusion matrix of each trained model.

In [None]:
def _draw_confusion_matrix(cm, ax, title):
    """Render confusion matrix *cm* on *ax* as an annotated heatmap titled *title*."""
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=STRESS_NAMES, yticklabels=STRESS_NAMES, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(title)


# Integer class ids matching STRESS_NAMES by position
# (assumes stress_level is encoded 0..len(STRESS_NAMES)-1 — TODO confirm).
# Passing them to confusion_matrix keeps the matrix square with one row/column
# per tick label even if a class never occurs in y_test or y_pred.
class_labels = list(range(len(STRESS_NAMES)))

# Size the grid from the number of models instead of hard-coding 2x3
# (5 models still yields 2 rows x 3 columns at 18x10 inches).
n_cols = 3
n_rows = max(1, -(-len(models) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(6 * n_cols, 5 * n_rows), squeeze=False
)
axes = axes.flatten()

for idx, (name, model) in enumerate(models.items()):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Panel in the combined overview figure
    _draw_confusion_matrix(cm, axes[idx], f"{name}")

    # Also save individual confusion matrix
    fig_single, ax_single = plt.subplots(figsize=(6, 5))
    _draw_confusion_matrix(cm, ax_single, f"{name} – Confusion Matrix")
    fig_single.tight_layout()
    fig_single.savefig(STATIC_DIR / f"cm_{name}.png", dpi=100)
    plt.close(fig_single)  # free the figure; only the overview is shown inline

# Hide every grid cell that did not receive a model
for unused_ax in axes[len(models):]:
    unused_ax.set_visible(False)

fig.suptitle("Confusion Matrices – All ML Models", fontsize=14, fontweight="bold")
fig.tight_layout()
plt.show()

## Part 8: Model Comparison Chart

Bar chart comparing Accuracy, Precision, Recall, and F1-Score across all models.

In [None]:
# One row per model, one column per metric.
res_df = pd.DataFrame(results).transpose()

# Grouped bar chart of the four test-set metrics (CV columns are left out).
fig, ax = plt.subplots(figsize=(10, 5))
res_df[["accuracy", "precision", "recall", "f1_score"]].plot.bar(
    ax=ax,
    colormap="viridis",
)
ax.set(ylabel="Score", title="ML Model Comparison", ylim=(0, 1.05))
ax.legend(loc="lower right")
fig.tight_layout()

# Persist the chart before showing it inline.
fig.savefig(STATIC_DIR / "ml_comparison.png", dpi=100)
plt.show()

print("\nResults Table:")
display(res_df)

## Part 9: Save Results

Save all results as JSON and print a final summary.

In [None]:
# Serialize the metrics dictionary next to the notebook.
(ROOT / "ml_results.json").write_text(json.dumps(results, indent=2))

# Closing banner followed by one accuracy / F1 line per model.
print("=" * 50, "ML Training Complete!", "=" * 50, sep="\n")
for name, metrics in results.items():
    print(f"  {name:25s} → acc={metrics['accuracy']:.4f}  f1={metrics['f1_score']:.4f}")

# Where the artefacts ended up.
print("\n✓ Results saved to ml_results.json")
print(f"✓ Models saved to {MODEL_DIR}/")
print(f"✓ Charts saved to {STATIC_DIR}/")