In [2]:
import pandas as pd
import numpy as np
import optuna
import joblib


from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_recall_curve
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline

In [3]:
X_train = pd.read_pickle("dataset_preprocessed/X_train.pkl")
X_test = pd.read_pickle("dataset_preprocessed/X_test.pkl")
y_train_bin = pd.read_pickle("dataset_preprocessed/y_train_bin.pkl")
y_test_bin = pd.read_pickle("dataset_preprocessed/y_test_bin.pkl")

In [4]:
numeric_cols = X_train.columns

In [5]:
preprocessor = ColumnTransformer([
("num", StandardScaler(), numeric_cols)
])

In [6]:
def objective(trial):
    params = {
    "n_estimators": trial.suggest_int("n_estimators", 200, 800),
    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
    "max_depth": trial.suggest_int("max_depth", 3, 12),
    "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    "gamma": trial.suggest_float("gamma", 0, 5),
    "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    "random_state": 42,
    "eval_metric": "logloss",
    "scale_pos_weight": (len(y_train_bin) - y_train_bin.sum()) / y_train_bin.sum()
    }


    xgb_model = XGBClassifier(**params)


    pipeline = ImbPipeline(steps=[
    ("scale", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", xgb_model)
    ])


    pipeline.fit(X_train, y_train_bin)
    probs = pipeline.predict_proba(X_test)[:, 1]


    prec, rec, th = precision_recall_curve(y_test_bin, probs)
    f1_scores = 2 * (prec * rec) / (prec + rec + 1e-9)
    best_idx = np.argmax(f1_scores)


    return f1_scores[best_idx]

In [7]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)


print("Best Trial Params:", study.best_trial.params)

[I 2025-11-15 13:44:39,725] A new study created in memory with name: no-name-dddd9388-303f-4ce7-8204-f95021d0e94e
[I 2025-11-15 13:44:40,270] Trial 0 finished with value: 0.7586206891700357 and parameters: {'n_estimators': 614, 'learning_rate': 0.038375985900929, 'max_depth': 10, 'subsample': 0.8045453560567064, 'colsample_bytree': 0.9497204178682787, 'gamma': 1.8534457662483095, 'min_child_weight': 2}. Best is trial 0 with value: 0.7586206891700357.
[I 2025-11-15 13:44:40,426] Trial 1 finished with value: 0.7310344822605469 and parameters: {'n_estimators': 241, 'learning_rate': 0.22470065192009245, 'max_depth': 8, 'subsample': 0.5624588593596962, 'colsample_bytree': 0.6038403965936823, 'gamma': 1.7140382622158412, 'min_child_weight': 9}. Best is trial 0 with value: 0.7586206891700357.
[I 2025-11-15 13:44:40,538] Trial 2 finished with value: 0.7301587296618796 and parameters: {'n_estimators': 325, 'learning_rate': 0.24664487301269927, 'max_depth': 6, 'subsample': 0.8247682739971789, 'c

Best Trial Params: {'n_estimators': 495, 'learning_rate': 0.20836540412690088, 'max_depth': 10, 'subsample': 0.7987173142869121, 'colsample_bytree': 0.8801431868276532, 'gamma': 3.371646451399516, 'min_child_weight': 3}


In [8]:
best_params = study.best_params
best_params["random_state"] = 42
best_params["eval_metric"] = "logloss"
best_params["scale_pos_weight"] = (len(y_train_bin) - y_train_bin.sum()) / y_train_bin.sum()

In [9]:
best_model = XGBClassifier(**best_params)

In [10]:
pipeline_best = ImbPipeline(steps=[
    ("scale", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", best_model)
])

In [11]:
pipeline_best.fit(X_train, y_train_bin)
probs = pipeline_best.predict_proba(X_test)[:, 1]

In [12]:
prec, rec, thresholds = precision_recall_curve(y_test_bin, probs)
f1_scores = 2 * (prec * rec) / (prec + rec + 1e-9)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

In [13]:
print("Best Threshold:", best_threshold)
print("Best F1 Score:", f1_scores[best_idx])

Best Threshold: 0.99300134
Best F1 Score: 0.793650793153943


In [14]:
final_pred = (probs >= best_threshold).astype(int)

In [15]:
print("\n=== Classification Report ===")
print(classification_report(y_test_bin, final_pred))


print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test_bin, final_pred))


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1932
           1       0.86      0.74      0.79        68

    accuracy                           0.99      2000
   macro avg       0.93      0.87      0.89      2000
weighted avg       0.99      0.99      0.99      2000


=== Confusion Matrix ===
[[1924    8]
 [  18   50]]
