In [5]:
import pandas as pd
import numpy as np
import optuna
import joblib

from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline

In [6]:
X_train = pd.read_pickle("dataset_preprocessed/X_train.pkl")
X_test = pd.read_pickle("dataset_preprocessed/X_test.pkl")
y_train_bin = pd.read_pickle("dataset_preprocessed/y_train_bin.pkl")
y_test_bin = pd.read_pickle("dataset_preprocessed/y_test_bin.pkl")

In [7]:
numeric_cols = X_train.columns

In [8]:
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_cols)
])

In [9]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 800),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "random_state": 42,
        "eval_metric": "logloss",
        "scale_pos_weight": (len(y_train_bin) - y_train_bin.sum()) / y_train_bin.sum()
    }

    xgb_model = XGBClassifier(**params)

    pipeline = ImbPipeline(steps=[
        ("scale", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("model", xgb_model)
    ])

    pipeline.fit(X_train, y_train_bin)
    preds = pipeline.predict(X_test)

    return f1_score(y_test_bin, preds)

In [10]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)

[I 2025-11-15 13:41:09,687] A new study created in memory with name: no-name-8d26d9bd-0f6d-409b-b821-03ff9fb1316f
[I 2025-11-15 13:41:10,269] Trial 0 finished with value: 0.4236111111111111 and parameters: {'n_estimators': 443, 'learning_rate': 0.08194414008649754, 'max_depth': 4, 'subsample': 0.8529971585690839, 'colsample_bytree': 0.5448930933615466, 'gamma': 2.0339864350522117, 'min_child_weight': 2}. Best is trial 0 with value: 0.4236111111111111.
[I 2025-11-15 13:41:10,460] Trial 1 finished with value: 0.449438202247191 and parameters: {'n_estimators': 329, 'learning_rate': 0.08955054746611865, 'max_depth': 11, 'subsample': 0.8998456340639869, 'colsample_bytree': 0.5506199586716556, 'gamma': 4.710287589722148, 'min_child_weight': 4}. Best is trial 1 with value: 0.449438202247191.
[I 2025-11-15 13:41:10,664] Trial 2 finished with value: 0.5128205128205128 and parameters: {'n_estimators': 437, 'learning_rate': 0.2821300436987093, 'max_depth': 4, 'subsample': 0.5720958007998747, 'col

In [11]:
print("Best Trial:", study.best_trial.params)

Best Trial: {'n_estimators': 799, 'learning_rate': 0.2122425951792502, 'max_depth': 8, 'subsample': 0.9882395289582837, 'colsample_bytree': 0.8366108240205682, 'gamma': 0.11432496528726657, 'min_child_weight': 9}


In [12]:
best_params = study.best_params
best_params["random_state"] = 42
best_params["eval_metric"] = "logloss"
best_params["scale_pos_weight"] = (len(y_train_bin) - y_train_bin.sum()) / y_train_bin.sum()

In [13]:
best_model = XGBClassifier(**best_params)

In [14]:
pipeline_best = ImbPipeline(steps=[
    ("scale", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", best_model)
])

In [15]:
pipeline_best.fit(X_train, y_train_bin)
preds = pipeline_best.predict(X_test)

In [16]:
print("\n=== Classification Report ===")
print(classification_report(y_test_bin, preds))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test_bin, preds))


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1932
           1       0.51      0.82      0.63        68

    accuracy                           0.97      2000
   macro avg       0.75      0.90      0.81      2000
weighted avg       0.98      0.97      0.97      2000


=== Confusion Matrix ===
[[1878   54]
 [  12   56]]
