# XGBoost

In [51]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

In [52]:
df_final = pd.read_csv('../data/processed/final.csv')
X = df_final.drop('result', axis=1)
y = df_final['result']

# MLFlow

In [53]:
import mlflow
import mlflow.sklearn
from pathlib import Path

In [54]:
MLFLOW_DIR = Path("../mlruns").resolve()
ARTIFACTS_DIR = Path("../configs/models").resolve()
mlflow.set_tracking_uri(f"file:///{str(MLFLOW_DIR)}")

In [55]:
BASE_EXPERIMENT_NAME = "XGBoost"
mlflow.set_tracking_uri("../mlruns")

def set_experiment(experiment_name):
    experiment_name = f"{BASE_EXPERIMENT_NAME}_{experiment_name}"
    mlflow.set_experiment(experiment_name)

def create_experiment(experiment_name):
    experiment_name = f"{BASE_EXPERIMENT_NAME}_{experiment_name}"
    mlflow.create_experiment(experiment_name, artifact_location=f"file:///{str(ARTIFACTS_DIR)}")

In [56]:
def delete_experiment(experiment_name):
    experiment_name = f"{BASE_EXPERIMENT_NAME}_{experiment_name}"
    mlflow.delete_experiment(mlflow.get_experiment_by_name(experiment_name).experiment_id)

## Hyperparameter Tuning Libraries

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import xgboost as xgb
from xgboost import plot_tree
import time

In [58]:
hpo_run_count = 5
kfc_run_count = 5

In [59]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)

In [60]:
# Heavy focus on preventing overfitting
param_grid = {
    'max_depth': [3, 4, 5, 8],  # Smaller depth to reduce overfitting
    'learning_rate': [0.01, 0.05, 0.1],  # Low learning rates help avoid overfitting
    'n_estimators': [50, 100, 150],  # Fewer trees to prevent overfitting
    'subsample': [0.7, 0.8, 0.9],  # Regularization to prevent overfitting
    'colsample_bytree': [0.7, 0.8, 1.0],  # Fraction of features to use for each tree
    'gamma': [0, 0.05, 0.1],  # Regularization to control tree splitting
    'scale_pos_weight': [1, 2, 3],  # Helps if the dataset is imbalanced
    'min_child_weight': [1, 3, 5]  # Ensures trees are not too deep (reduces overfitting)
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='f1', n_jobs=-1, verbose=1)

In [61]:
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

In [62]:
set_experiment("GRID_SEARCH")

with mlflow.start_run(run_name=f"HPO_{hpo_run_count}_XGB"):
    hpo_run_count += 1
    mlflow.log_param("model", "XGBoost")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    y_train = y_train.replace(-1, 0)
    y_test = y_test.replace(-1, 0)
    
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    for param, value in best_params.items():
        mlflow.log_param(param, value)
    
    best_score = grid_search.best_score_
    mlflow.log_metric("best_accuracy", best_score)

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.log_metric("test_precision", precision)
    mlflow.log_metric("test_recall", recall)
    mlflow.log_metric("test_f1", f1)

    class_report = classification_report(y_test, y_pred)
    mlflow.log_param("classification_report", class_report)

    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    
    cm_image_path = "./images/confusion_matrix.png"
    plt.savefig(cm_image_path)
    plt.close()

    mlflow.log_artifact(cm_image_path, "confusion_matrix")

    mlflow.sklearn.log_model(best_model, "model")

    print("Best parameters found: ", best_params)
    print("Best cross-validation accuracy: {:.4f}".format(best_score))
    print("Test Accuracy: {:.4f}".format(accuracy))
    print("Test Precision: {:.4f}".format(precision))
    print("Test Recall: {:.4f}".format(recall))
    print("Test F1-Score: {:.4f}".format(f1))
    print("\nClassification Report:\n", class_report)
    print("\nConfusion Matrix:\n", cm)

Fitting 5 folds for each of 8748 candidates, totalling 43740 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters found:  {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 100, 'scale_pos_weight': 3, 'subsample': 0.9}
Best cross-validation accuracy: 0.9786
Test Accuracy: 0.9816
Test Precision: 0.9778
Test Recall: 0.9851
Test F1-Score: 0.9815

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       412
           1       0.98      0.99      0.98       403

    accuracy                           0.98       815
   macro avg       0.98      0.98      0.98       815
weighted avg       0.98      0.98      0.98       815


Confusion Matrix:
 [[403   9]
 [  6 397]]


# Kfold Cross Validation

In [63]:
best_params = grid_search.best_params_

In [64]:
set_experiment("K_FOLD")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_metric_data = []
metric_data = []
with mlflow.start_run(run_name=f"KFC_{kfc_run_count}_XGB"):
    kfc_run_count += 1
    mlflow.log_param("model", "XGBoost")
    mlflow.log_param("cross_validation", "StratifiedKFold")
    
    for param, value in best_params.items():
        mlflow.log_param(param, value)

    best_model = None
    best_accuracy = 0

    for i, (train_index, test_index) in enumerate(cv.split(X, y)):

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        y_train = y_train.replace(-1, 0)
        y_test = y_test.replace(-1, 0)

        model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', **best_params)
        start_train_time = time.time()
        model.fit(X_train, y_train)
        end_train_time = time.time()

        mlflow.log_metric(f"training_time_{i}", end_train_time - start_train_time)

        y_pred = model.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_pred)
        train_precision = precision_score(y_train, y_pred)
        train_recall = recall_score(y_train, y_pred)
        train_f1 = f1_score(y_train, y_pred)

        mlflow.log_metric(f"train_accuracy_{i}", train_accuracy)
        mlflow.log_metric(f"train_precision_{i}", train_precision)
        mlflow.log_metric(f"train_recall_{i}", train_recall)
        mlflow.log_metric(f"train_f1_{i}", train_f1)

        train_metric_data.append([train_accuracy, train_precision, train_recall, train_f1, end_train_time - start_train_time])

        start_test_time = time.time()
        y_pred = model.predict(X_test)
        end_test_time = time.time()

        mlflow.log_metric(f"testing_time_{i}", end_test_time - start_test_time)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        mlflow.log_metric(f"test_accuracy_{i}", accuracy)
        mlflow.log_metric(f"test_precision_{i}", precision)
        mlflow.log_metric(f"test_recall_{i}", recall)
        mlflow.log_metric(f"test_f1_{i}", f1)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model

        metric_data.append([accuracy, precision, recall, f1, end_test_time - start_test_time])

    mlflow.sklearn.log_model(best_model, "model")

    train_metric_data = np.array(train_metric_data)
    mean = np.mean(train_metric_data, axis=0)
    std = np.std(train_metric_data, axis=0)

    mlflow.log_metric("train_mean_accuracy", mean[0])
    mlflow.log_metric("train_mean_precision", mean[1])
    mlflow.log_metric("train_mean_recall", mean[2])
    mlflow.log_metric("train_mean_f1", mean[3])
    mlflow.log_metric("train_mean_time", mean[4])

    mlflow.log_metric("train_std_accuracy", std[0])
    mlflow.log_metric("train_std_precision", std[1])
    mlflow.log_metric("train_std_recall", std[2])
    mlflow.log_metric("train_std_f1", std[3])
    mlflow.log_metric("train_std_time", std[4])

    metric_data = np.array(metric_data)
    mean = np.mean(metric_data, axis=0)
    std = np.std(metric_data, axis=0)

    mlflow.log_metric("test_mean_accuracy", mean[0])
    mlflow.log_metric("test_mean_precision", mean[1])
    mlflow.log_metric("test_mean_recall", mean[2])
    mlflow.log_metric("mean_f1", mean[3])
    mlflow.log_metric("test_mean_time", mean[4])

    mlflow.log_metric("test_std_accuracy", std[0])
    mlflow.log_metric("test_std_precision", std[1])
    mlflow.log_metric("test_std_recall", std[2])
    mlflow.log_metric("test_std_f1", std[3])
    mlflow.log_metric("test_std_time", std[4])

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



## Summary

In [65]:
train_metric_data = np.array(train_metric_data)
train_mean = np.mean(train_metric_data, axis=0)
train_std = np.std(train_metric_data, axis=0)

train_mean_accuracy, train_mean_precision, train_mean_recall, train_mean_f1, train_mean_time = train_mean
train_std_accuracy, train_std_precision, train_std_recall, train_std_f1, train_std_time = train_std

metric_data = np.array(metric_data)
test_mean = np.mean(metric_data, axis=0)
test_std = np.std(metric_data, axis=0)

test_mean_accuracy, test_mean_precision, test_mean_recall, test_mean_f1, test_mean_time = test_mean
test_std_accuracy, test_std_precision, test_std_recall, test_std_f1, test_std_time = test_std

headers = ["Metric", "Train Mean", "Train Std", "Test Mean", "Test Std"]
table_data = [
    ["Accuracy", train_mean_accuracy, train_std_accuracy, test_mean_accuracy, test_std_accuracy],
    ["Precision", train_mean_precision, train_std_precision, test_mean_precision, test_std_precision],
    ["Recall", train_mean_recall, train_std_recall, test_mean_recall, test_std_recall],
    ["F1 Score", train_mean_f1, train_std_f1, test_mean_f1, test_std_f1],
    ["Time (s)", train_mean_time, train_std_time, test_mean_time, test_std_time]
]

fancy_table = tabulate(table_data, headers=headers, tablefmt="fancy_grid", floatfmt=".4f")

print(fancy_table)

╒═══════════╤══════════════╤═════════════╤═════════════╤════════════╕
│ Metric    │   Train Mean │   Train Std │   Test Mean │   Test Std │
╞═══════════╪══════════════╪═════════════╪═════════════╪════════════╡
│ Accuracy  │       0.9941 │      0.0014 │      0.9779 │     0.0036 │
├───────────┼──────────────┼─────────────┼─────────────┼────────────┤
│ Precision │       0.9879 │      0.0029 │      0.9680 │     0.0035 │
├───────────┼──────────────┼─────────────┼─────────────┼────────────┤
│ Recall    │       1.0000 │      0.0000 │      0.9868 │     0.0086 │
├───────────┼──────────────┼─────────────┼─────────────┼────────────┤
│ F1 Score  │       0.9939 │      0.0015 │      0.9773 │     0.0037 │
├───────────┼──────────────┼─────────────┼─────────────┼────────────┤
│ Time (s)  │       0.1754 │      0.0318 │      0.0151 │     0.0053 │
╘═══════════╧══════════════╧═════════════╧═════════════╧════════════╛


In [66]:
total_test = best_model.predict(X_test)
print(classification_report(y_test, total_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       422
           1       0.97      1.00      0.98       392

    accuracy                           0.98       814
   macro avg       0.98      0.98      0.98       814
weighted avg       0.98      0.98      0.98       814



In [67]:
plt.figure(figsize=(40, 20))
plot_tree(best_model, num_trees=4)
plt.savefig('tree_plot.png', dpi=3000, bbox_inches='tight')  # Higher DPI for clarity
plt.close()

<Figure size 4000x2000 with 0 Axes>

In [68]:
num_trees = best_model.get_params()['n_estimators']
total_nodes = sum(
    len(tree.splitlines()) - 1
    for tree in best_model.get_booster().get_dump()
)
max_depth = best_model.get_params()['max_depth']
learning_rate = best_model.get_params()['learning_rate']
n_features = len(X.columns)

# Prepare data for tabulation
data = [
    ["Number of Trees", num_trees],
    ["Total Nodes", total_nodes],
    ["Maximum Depth", max_depth],
    ["Learning Rate", learning_rate],
    ["Number of Features", n_features],
]

# Create a table
table = tabulate(data, headers=["Statistic", "Value"], tablefmt="grid")
print(table)

+--------------------+---------+
| Statistic          |   Value |
| Number of Trees    |   100   |
+--------------------+---------+
| Total Nodes        |  6454   |
+--------------------+---------+
| Maximum Depth      |     8   |
+--------------------+---------+
| Learning Rate      |     0.1 |
+--------------------+---------+
| Number of Features |    15   |
+--------------------+---------+
