In [19]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, make_scorer, recall_score

In [20]:
# Load the preprocessed train/test features and binary targets in one pass.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
X_train, X_test, y_train_bin, y_test_bin = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "dataset_preprocessed/X_train.pkl",
        "dataset_preprocessed/X_test.pkl",
        "dataset_preprocessed/y_train_bin.pkl",
        "dataset_preprocessed/y_test_bin.pkl",
    )
)

In [21]:
# Candidate hyperparameter values for the randomized search; each key is an
# XGBClassifier constructor argument and each list holds the values to sample from.
param_dist = dict(
    n_estimators=[200, 400, 600, 800],
    max_depth=[4, 5, 6, 7, 8],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    gamma=[0, 1, 5],
    min_child_weight=[1, 3, 5, 7],
)

In [22]:
# Class-imbalance weight for the positive class: (# negative rows) / (# positive rows).
# The classes are counted by explicit label comparison rather than
# `value_counts()[0]` / `value_counts()[1]`: an integer key passed to Series `[]`
# is only label-based when the index is integer-typed (otherwise it falls back to
# deprecated positional lookup), and the original computed value_counts twice.
n_negative = int((y_train_bin == 0).to_numpy().sum())
n_positive = int((y_train_bin == 1).to_numpy().sum())
if n_negative == 0 or n_positive == 0:
    # Fail early with a readable message instead of a KeyError / division by zero.
    raise ValueError(
        "y_train_bin must contain both classes 0 and 1 to compute scale_pos_weight "
        f"(found {n_negative} negatives and {n_positive} positives)"
    )

# Base estimator handed to the hyperparameter search; tuned values from the
# search grid are applied on top of these fixed settings.
model = XGBClassifier(
    random_state=42,                            # fixed seed for reproducible training
    eval_metric="logloss",
    scale_pos_weight=n_negative / n_positive,   # up-weight the positive class
)

In [23]:
# Model-selection metric: recall computed for the positive class (label 1).
scorer = make_scorer(
    score_func=recall_score,
    pos_label=1,
)

In [24]:
# Randomized hyperparameter search over `param_dist`, ranking candidates by `scorer`.
random_search = RandomizedSearchCV(
    model,
    param_dist,
    # search budget: 30 sampled candidates x 3 folds = 90 fits
    n_iter=30,
    cv=3,
    # candidate ranking and reproducible sampling
    scoring=scorer,
    random_state=42,
    # execution: run fits in parallel on all processors, log each fit
    n_jobs=-1,
    verbose=2,
)

In [25]:
# Run the search on the training split, then report the winning configuration.
random_search.fit(X=X_train, y=y_train_bin)

print("Best Params:", random_search.best_params_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Params: {'subsample': 0.9, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.9}


In [26]:
# Keep a handle to the estimator the search exposes as `best_estimator_`
# (the search above does not override `refit`, so the library default applies).
best_model = random_search.best_estimator_

In [27]:
# Predict labels for the test split with the tuned model and print
# per-class precision / recall / F1 against the true test labels.
y_pred = best_model.predict(X=X_test)

print("\n=== tuned model report ===")
print(classification_report(y_true=y_test_bin, y_pred=y_pred))


=== tuned model report ===
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      1932
           1       0.30      0.93      0.45        68

    accuracy                           0.92      2000
   macro avg       0.65      0.92      0.71      2000
weighted avg       0.97      0.92      0.94      2000

