# In [None]:
# notebooks/01_train_and_eval.ipynb

import os
import joblib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    accuracy_score,
)

# ------------------ Paths ------------------ #
# Both directories live under ../data; the paths are relative, so they are
# resolved against the current working directory at run time.
DATA_DIR = os.path.join("..", "data")
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")  # read from below
MODEL_DIR = os.path.join(DATA_DIR, "model")  # written to below
# Make sure the output directory exists; an existing one is left untouched.
os.makedirs(MODEL_DIR, exist_ok=True)

# ------------------ Load Artifacts ------------------ #
print("[INFO] Loading preprocessed data...")
# Load the four saved arrays in a fixed order and unpack them directly into
# their names: train/test features first, then train/test labels.
X_train, X_test, y_train, y_test = (
    np.load(os.path.join(PROCESSED_DIR, filename))
    for filename in (
        "X_train.npy",
        "X_test.npy",
        "y_train.npy",
        "y_test.npy",
    )
)

# Quick sanity check of what was loaded.
print(f"[INFO] Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# ------------------ Train XGBoost ------------------ #
print("[INFO] Training XGBoost model...")
# `use_label_encoder` is intentionally not passed: it is a deprecated
# XGBClassifier argument, and recent XGBoost releases ignore it and emit a
# "Parameters: { "use_label_encoder" } are not used" warning on fit.
# NOTE(review): on old XGBoost releases (before 1.6) omitting it falls back to
# the legacy label encoder with a deprecation warning — confirm the pinned
# version if this notebook must run there.
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    eval_metric="logloss",
    random_state=42,  # fixed seed so repeated runs are comparable
)
# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

# ------------------ Predictions ------------------ #
print("[INFO] Predicting...")
# Hard class labels: used for accuracy, the classification report and the
# confusion matrix.
y_pred = model.predict(X_test)
# Column 1 of the probability matrix is the score fed to the ROC metrics
# (presumably the "spam" class — confirm against the label encoding).
class_probabilities = model.predict_proba(X_test)
y_probs = class_probabilities[:, 1]

# ------------------ Evaluation ------------------ #
print("[INFO] Evaluating model...")
acc, roc_auc = accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_probs)
print(f"✅ Accuracy: {acc:.4f}", f"✅ ROC-AUC: {roc_auc:.4f}", sep="\n")

# Per-class precision / recall / F1, rendered as text.
# NOTE(review): the display names assume label 0 is ham and label 1 is spam.
class_names = ["ham", "spam"]
cls_report = classification_report(y_test, y_pred, target_names=class_names)
print(cls_report)

# ------------------ Save Model ------------------ #
print("[INFO] Saving model...")
# Persist the fitted estimator with joblib next to the other artifacts.
model_path = os.path.join(MODEL_DIR, "xgboost_model.pkl")
joblib.dump(model, model_path)

# ------------------ Save Classification Report ------------------ #
# Assemble the whole report first (headline metrics, blank line, per-class
# table), then write it to disk in a single call.
report_path = os.path.join(MODEL_DIR, "classification_report.txt")
report_text = "".join(
    [
        f"Accuracy: {acc:.4f}\n",
        f"ROC-AUC: {roc_auc:.4f}\n\n",
        cls_report,
    ]
)
with open(report_path, "w") as report_file:
    report_file.write(report_text)

# ------------------ Plot Confusion Matrix ------------------ #
cm = confusion_matrix(y_test, y_pred)
class_labels = ["ham", "spam"]
# Use an explicit figure/axes pair rather than pyplot's implicit "current
# figure", so every call below names the object it draws on.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,  # write the count inside each cell
    fmt="d",  # counts are integers
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.savefig(os.path.join(MODEL_DIR, "confusion_matrix.png"))
# Close the figure once it is on disk so it does not linger in pyplot state.
plt.close(fig)

# ------------------ Plot ROC Curve ------------------ #
# roc_curve also returns the thresholds; they are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, y_probs)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], "k--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
fig.savefig(os.path.join(MODEL_DIR, "roc_curve.png"))
plt.close(fig)

# Final status line once every artifact has been written.
print("[SUCCESS] Model training complete. Artifacts saved in data/model/")
