In [1]:
import os
import joblib
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, cohen_kappa_score, accuracy_score
from sklearn.model_selection import KFold
from utils.preprocess_data_KDD_plus_test import preprocess_data
from utils.plot import plot_combined_roc_curve

# File paths
train_file_path = os.path.join('data', 'KDDTrain+.csv')
test_file_path = os.path.join('data', 'KDDTest+.csv')

# Preprocess data
preprocessing_models_folder = os.path.join('preprocessing_pipeline')
X_train, X_test, y_train, y_test = preprocess_data(
    train_file_path=train_file_path,
    test_file_path=test_file_path,
    preprocessing_models_folder=preprocessing_models_folder
)

# Set output folder
current_directory = os.path.abspath(os.getcwd())
output_folder = os.path.join(current_directory, "xgboost")
os.makedirs(output_folder, exist_ok=True)


def class_based_accuracy(y_true, y_pred, num_classes):
    """Calculate class-based accuracy for each class."""
    class_acc = {}
    for cls in range(num_classes):
        mask = y_true == cls
        acc = accuracy_score(y_true[mask], y_pred[mask])
        class_acc[cls] = acc
    return class_acc


def xgboost_kfold(X, y, num_classes, k=5, output_folder=output_folder):
    """Train XGBoost with K-Fold cross-validation and evaluate with metrics."""
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    all_auc = []
    all_kappa = []
    class_accuracies = []

    for fold_index, (train_index, test_index) in enumerate(kf.split(X)):
        print(f"Running fold-{fold_index + 1}")
        X_train_fold, X_val_fold = X[train_index], X[test_index]
        y_train_fold, y_val_fold = y[train_index], y[test_index]

        # Initialize and train the XGBoost model
        xgb = XGBClassifier(
            objective='multi:softprob',
            num_class=num_classes,
            eval_metric='mlogloss',
            random_state=42
        )
        xgb.fit(X_train_fold, y_train_fold)

        # Predict probabilities and classes
        y_prob_val = xgb.predict_proba(X_val_fold)
        y_pred_val = np.argmax(y_prob_val, axis=1)

        # Compute metrics for the fold
        fold_auc = roc_auc_score(y_val_fold, y_prob_val, multi_class='ovr')
        fold_kappa = cohen_kappa_score(y_val_fold, y_pred_val)
        fold_class_acc = class_based_accuracy(y_val_fold, y_pred_val, num_classes)

        all_auc.append(fold_auc)
        all_kappa.append(fold_kappa)
        class_accuracies.append(fold_class_acc)

        print(f"Fold-{fold_index + 1} AUC: {fold_auc:.4f}, Kappa: {fold_kappa:.4f}")

    # Combine metrics across folds
    avg_auc = np.mean(all_auc)
    avg_kappa = np.mean(all_kappa)
    avg_class_acc = {cls: np.mean([acc[cls] for acc in class_accuracies]) for cls in range(num_classes)}

    print("\nK-Fold Cross-Validation Results:")
    print(f"Average AUC: {avg_auc:.4f}")
    print(f"Average Kappa: {avg_kappa:.4f}")
    print("Average Class-Based Accuracy:")
    for cls, acc in avg_class_acc.items():
        print(f"Class {cls}: {acc:.4f}")

    print(f"K-Fold cross-validation completed. Results saved in {output_folder}")


def train_xgboost(X, y, X_test, y_test, num_classes, output_folder=output_folder):
    """Train the final XGBoost model on the full dataset and evaluate on the test set."""
    print("Training final XGBoost model...")
    xgb_final = XGBClassifier(
        objective='multi:softprob',
        num_class=num_classes,
        eval_metric='mlogloss',
        random_state=42
    )
    xgb_final.fit(X, y)

    # Save the trained model
    model_path = os.path.join(output_folder, "xgboost_model.pkl")
    joblib.dump(xgb_final, model_path)

    # Predict probabilities and classes for the test set
    y_prob_test = xgb_final.predict_proba(X_test)
    y_pred_test = np.argmax(y_prob_test, axis=1)

    # Compute metrics
    test_auc = roc_auc_score(y_test, y_prob_test, multi_class='ovr')
    test_kappa = cohen_kappa_score(y_test, y_pred_test)
    test_class_acc = class_based_accuracy(y_test, y_pred_test, num_classes)

    print("\nTest Set Evaluation:")
    print(f"AUC: {test_auc:.4f}")
    print(f"Kappa: {test_kappa:.4f}")
    print("Class-Based Accuracy:")
    for cls, acc in test_class_acc.items():
        print(f"Class {cls}: {acc:.4f}")

    print(f"Final model evaluation completed. Results saved in {output_folder}")


# Run K-Fold cross-validation
num_classes = len(np.unique(y_train))  # Number of unique classes in the dataset
xgboost_kfold(X_train, y_train, num_classes=num_classes, k=5, output_folder=output_folder)

# Train the final model and evaluate on the test set
train_xgboost(X_train, y_train, X_test, y_test, num_classes=num_classes, output_folder=output_folder)


Running fold-1
Fold-1 AUC: 0.9970, Kappa: 0.9924
Running fold-2
Fold-2 AUC: 0.9977, Kappa: 0.9922
Running fold-3
Fold-3 AUC: 0.9995, Kappa: 0.9931
Running fold-4
Fold-4 AUC: 0.9985, Kappa: 0.9918
Running fold-5
Fold-5 AUC: 0.9993, Kappa: 0.9912

K-Fold Cross-Validation Results:
Average AUC: 0.9984
Average Kappa: 0.9921
Average Class-Based Accuracy:
Class 0: 0.9981
Class 1: 0.8746
Class 2: 0.9883
Class 3: 0.7397
Class 4: 0.9969
K-Fold cross-validation completed. Results saved in /Users/himanshupradhan/coding/Projects/Major Project/nid-system/notebooks/Himanshu_KDD+ Dataset-multiclass/xgboost
Training final XGBoost model...

Test Set Evaluation:
AUC: 0.8705
Kappa: 0.5472
Class-Based Accuracy:
Class 0: 0.7937
Class 1: 0.0056
Class 2: 0.7007
Class 3: 0.0019
Class 4: 0.9392
Final model evaluation completed. Results saved in /Users/himanshupradhan/coding/Projects/Major Project/nid-system/notebooks/Himanshu_KDD+ Dataset-multiclass/xgboost
