In [1]:
import pandas as pd
import joblib
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
X_train = pd.read_pickle("dataset_preprocessed/X_train.pkl")
X_test = pd.read_pickle("dataset_preprocessed/X_test.pkl")
y_train_bin = pd.read_pickle("dataset_preprocessed/y_train_bin.pkl")
y_test_bin = pd.read_pickle("dataset_preprocessed/y_test_bin.pkl")

In [3]:
numeric_cols = X_train.columns

In [4]:
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_cols)
])

In [5]:
xgb_model = XGBClassifier(
    n_estimators=600,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=42,
    scale_pos_weight=(len(y_train_bin) - y_train_bin.sum()) / y_train_bin.sum()
)

In [6]:
pipeline = ImbPipeline(steps=[
    ("scale", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", xgb_model)
])

In [7]:
pipeline.fit(X_train, y_train_bin)

0,1,2
,steps,"[('scale', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,
,early_stopping_rounds,
,enable_categorical,False


In [8]:
probs = pipeline.predict_proba(X_test)[:, 1]

In [9]:
prec, rec, thresholds = precision_recall_curve(y_test_bin, probs)
f1_scores = 2 * (prec * rec) / (prec + rec + 1e-9)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

In [10]:
print("Best Threshold:", best_threshold)
print("Best F1 Score:", f1_scores[best_idx])

Best Threshold: 0.9793919
Best F1 Score: 0.731343283082201


In [11]:
final_pred = (probs >= best_threshold).astype(int)

In [12]:
print(classification_report(y_test_bin, final_pred))
print(confusion_matrix(y_test_bin, final_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1932
           1       0.74      0.72      0.73        68

    accuracy                           0.98      2000
   macro avg       0.87      0.86      0.86      2000
weighted avg       0.98      0.98      0.98      2000

[[1915   17]
 [  19   49]]
