In [20]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score 
import optuna


In [None]:
# Base XGBoost classifier; passed as `estimator` to both hyper-parameter
# searches further down in the notebook.
xgb = XGBClassifier(
    booster="gbtree",             # classic tree booster
    tree_method="hist",           # fast histogram-based method
    objective="binary:logistic",  # binary classification
    eval_metric="logloss",        # log-loss metric
    n_jobs=-1,
    random_state=42,
)

In [38]:
# Discrete candidate values for the randomized search; this mapping is passed
# as `param_distributions` to RandomizedSearchCV.
RS_param_grid = dict(
    # ensemble size, tree depth and step size
    n_estimators=[300, 600, 900, 1200],
    max_depth=[3, 4, 6, 8],
    learning_rate=[0.01, 0.03, 0.1],
    # row / column subsampling
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    # split constraints and regularisation
    min_child_weight=[1, 2, 5, 8],
    gamma=[0.0, 1.0, 3.0],
    reg_lambda=[0.0, 1.0, 5.0, 10.0],
    reg_alpha=[0.0, 0.1, 1.0, 5.0],
    # weight applied to the positive class
    scale_pos_weight=[1.0, 2.0, 5.0],
    # additional tree-growing options
    colsample_bylevel=[0.6, 0.8, 1.0],
    grow_policy=["depthwise", "lossguide"],
    max_bin=[256, 512],
    max_leaves=[15, 31, 63],
)

In [23]:
from skopt.space import Real, Integer, Categorical

# Search space for the Bayesian search; this mapping is passed as
# `search_spaces` to BayesSearchCV.
search_space = dict(
    n_estimators=Integer(300, 1200),
    max_depth=Integer(3, 8),
    # learning rate uses a log-uniform prior over [0.01, 0.1]
    learning_rate=Real(0.01, 0.1, prior="log-uniform"),
    subsample=Real(0.6, 1.0),
    colsample_bytree=Real(0.6, 1.0),
    min_child_weight=Integer(1, 8),
    gamma=Real(0.0, 3.0),
    reg_lambda=Real(0.0, 10.0),
    reg_alpha=Real(0.0, 5.0),
    scale_pos_weight=Real(1.0, 5.0),
)

# DIABETES

In [39]:
# Read diabetes_transformed.csv and separate the 'diabetes' target column
# from the remaining feature columns.
df_diabetes = pd.read_csv("diabetes_transformed.csv")

y = df_diabetes["diabetes"]
X = df_diabetes.drop(columns=["diabetes"])

# Hold out 20% of the rows as a test set; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# Untuned reference model: only the settings listed here are specified,
# no hyper-parameter search is involved.
baseline_xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)
baseline_xgb.fit(X_train, y_train)

# Accuracy of the untuned model on the held-out test split.
baseline_acc = accuracy_score(
    y_true=y_test,
    y_pred=baseline_xgb.predict(X_test),
)
print(f"Baseline XGB (bez tuningu) – test accuracy: {baseline_acc:.4f}")

Baseline XGB (bez tuningu) – test accuracy: 0.9730


## RANDOMIZED SEARCH

In [42]:
# Randomized hyper-parameter search: draws n_iter=100 configurations from
# RS_param_grid and scores each one with 3-fold cross-validated accuracy
# on the training split.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=RS_param_grid,
    n_iter=100,
    cv=3,
    scoring='accuracy',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

results = random_search.cv_results_
# One entry in results['params'] per evaluated configuration.
num_iterations = len(results['params'])
best_params_random = random_search.best_params_

# Best parameters found by the search.
print("Najlepsze parametry:", best_params_random)

# Full search history: random_search.cv_results_ holds, for every evaluated
# configuration, its parameters, validation scores, timings, etc.
hist_random = pd.DataFrame(results).copy()
hist_random["method"] = "random"
hist_random["iter"] = np.arange(len(hist_random))
# Running maximum of the mean CV score. Series.cummax() skips NaN scores
# (a failed fit is scored NaN under the default error_score), so one failed
# candidate leaves NaN only in its own row. np.maximum.accumulate would turn
# every later running_best value into NaN.
hist_random["running_best"] = hist_random["mean_test_score"].cummax()

# Test accuracy of the best configuration; predict() uses the estimator that
# the search refitted with the best parameters.
y_pred_rs = random_search.predict(X_test)
test_acc_rs = accuracy_score(y_test, y_pred_rs)
print(f"RandomSearch – CV mean(best): {random_search.best_score_:.4f}")
print(f"RandomSearch – Test accuracy: {test_acc_rs:.4f}")
print(f"RandomSearch – Liczba iteracji: {num_iterations}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Najlepsze parametry: {'subsample': 0.6, 'scale_pos_weight': 1.0, 'reg_lambda': 5.0, 'reg_alpha': 5.0, 'n_estimators': 1200, 'min_child_weight': 5, 'max_leaves': 15, 'max_depth': 3, 'max_bin': 512, 'learning_rate': 0.03, 'grow_policy': 'depthwise', 'gamma': 0.0, 'colsample_bytree': 0.6, 'colsample_bylevel': 0.6}
RandomSearch – CV mean(best): 0.9720
RandomSearch – Test accuracy: 0.9728
RandomSearch – Liczba iteracji: 100


In [27]:
# Preview the first five rows of the random-search history.
hist_random.head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_scale_pos_weight,param_reg_lambda,param_reg_alpha,param_n_estimators,param_min_child_weight,...,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,method,iter,running_best
0,1.331953,0.015631,0.130363,0.011094,1.0,5.0,5.0,5.0,900,8,...,"{'subsample': 1.0, 'scale_pos_weight': 5.0, 'r...",0.950566,0.948503,0.948126,0.949065,0.001072,49,random,0,0.949065
1,2.078068,0.011593,0.282207,0.013623,1.0,2.0,5.0,5.0,1200,5,...,"{'subsample': 1.0, 'scale_pos_weight': 2.0, 'r...",0.969807,0.96962,0.971531,0.970319,0.00086,23,random,1,0.970319
2,0.71156,0.0178,0.073987,0.013805,0.8,1.0,10.0,0.0,300,2,...,"{'subsample': 0.8, 'scale_pos_weight': 1.0, 'r...",0.970632,0.971307,0.972132,0.971357,0.000613,16,random,2,0.971357
3,0.537392,0.001089,0.06569,0.00623,1.0,2.0,1.0,0.0,300,1,...,"{'subsample': 1.0, 'scale_pos_weight': 2.0, 'r...",0.968119,0.967744,0.969056,0.968307,0.000552,32,random,3,0.971357
4,0.469865,0.013372,0.051791,0.007606,1.0,5.0,10.0,0.1,300,1,...,"{'subsample': 1.0, 'scale_pos_weight': 5.0, 'r...",0.953867,0.955592,0.954053,0.954504,0.000773,40,random,4,0.971357


## BAYES SEARCH

In [28]:
from skopt import BayesSearchCV
# Bayesian hyper-parameter search: 60 iterations over search_space, each
# scored with 5-fold cross-validated accuracy on the training split.
opt = BayesSearchCV(
    estimator=xgb,
    search_spaces=search_space,
    n_iter=60,
    cv=5,
    scoring="accuracy",
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)

opt.fit(X_train, y_train)

print("Najlepsze parametry:", opt.best_params_)

# Full search history: opt.cv_results_ holds, for every iteration, the
# parameters, validation scores, timings, etc. Three bookkeeping columns are
# appended: the search method, the iteration index and the best mean CV
# score seen so far.
hist_bayes = pd.DataFrame(opt.cv_results_)
hist_bayes = hist_bayes.assign(
    method="bayes",
    iter=np.arange(len(hist_bayes)),
    running_best=np.maximum.accumulate(hist_bayes["mean_test_score"]),
)

# Predict the target with the model refitted on the best parameters and
# measure its accuracy (accuracy_score) on the test split.
y_pred_bayes = opt.predict(X_test)
test_acc_bayes = accuracy_score(y_test, y_pred_bayes)
print(f"BayesSearch – CV mean(best): {opt.best_score_:.4f}")
print(f"BayesSearch – Test accuracy: {test_acc_bayes:.4f}")
print(f"BayesSearch – Liczba iteracji: {len(hist_bayes)}")

Najlepsze parametry: OrderedDict({'colsample_bytree': 0.6707487107534322, 'gamma': 0.7778311189511045, 'learning_rate': 0.010635974318114841, 'max_depth': 5, 'min_child_weight': 6, 'n_estimators': 355, 'reg_alpha': 1.8572271814891825, 'reg_lambda': 2.4096666049396416, 'scale_pos_weight': 1.9723853412464256, 'subsample': 0.6809326594353681})
BayesSearch – CV mean(best): 0.9719
BayesSearch – Test accuracy: 0.9729
BayesSearch – Liczba iteracji: 60


# LOAN

In [29]:
# NOTE(review): this cell is entirely commented out, so the LOAN data is never
# loaded or split; presumably disabled on purpose - confirm, or delete the cell.
# df_loan = pd.read_csv("loan_transformed.csv")   

# X = df_loan.drop('loan_status', axis=1)  
# y = df_loan['loan_status']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# NOTE(review): disabled (fully commented-out) baseline model for the LOAN
# data; nothing in this cell executes.
# baseline_xgb = XGBClassifier(
#     objective="binary:logistic", eval_metric="logloss",
#     tree_method="hist", n_jobs=-1, random_state=42
# )
# baseline_xgb.fit(X_train, y_train)
# baseline_acc = accuracy_score(y_test, baseline_xgb.predict(X_test))
# print(f"Baseline XGB (bez tuningu) – test accuracy: {baseline_acc:.4f}")

## RANDOMIZED SEARCH

In [30]:
# NOTE(review): disabled (fully commented-out) randomized search for the LOAN
# data; nothing in this cell executes.
# random_search = RandomizedSearchCV(
#     estimator=xgb,
#     param_distributions=RS_param_grid,
#     n_iter=50,  
#     cv=3,  #
#     scoring='accuracy', 
#     verbose=1,
#     random_state=42,
#     n_jobs=-1 
# )

# random_search.fit(X_train, y_train)

# results = random_search.cv_results_
# num_iterations = len(results['params']) 
# best_params_random = random_search.best_params_

# # best parameters:
# print("Najlepsze parametry:", best_params_random)

# # number of iterations:
# num_iterations = len(results['params'])

# # build the full results history - random_search.cv_results_ holds, for each iteration, the parameters, validation score, timings, etc.
# hist_random = pd.DataFrame(results).copy()
# hist_random["method"] = "random"
# hist_random["iter"] = np.arange(len(hist_random))
# hist_random["running_best"] = np.maximum.accumulate(hist_random["mean_test_score"])

# # test accuracy for the best configuration
# y_pred_rs = random_search.predict(X_test)
# test_acc_rs = accuracy_score(y_test, y_pred_rs)
# print(f"RandomSearch – CV mean(best): {random_search.best_score_:.4f}")
# print(f"RandomSearch – Test accuracy: {test_acc_rs:.4f}")
# print(f"RandomSearch – Liczba iteracji: {num_iterations}")

## BAYES SEARCH

In [31]:
# NOTE(review): disabled (fully commented-out) Bayesian search for the LOAN
# data; nothing in this cell executes.
# from skopt import BayesSearchCV
# opt = BayesSearchCV(
#     estimator=xgb,
#     search_spaces=search_space,
#     n_iter=60,                
#     cv=5,
#     scoring="accuracy",      
#     n_jobs=-1,
#     random_state=42,
#     refit=True,
#     verbose=0,
# )

# opt.fit(X_train, y_train)

# print("Najlepsze parametry:", opt.best_params_)

# # build the full results history - opt.cv_results_ holds, for each iteration, the parameters, validation score, timings, etc.
# hist_bayes = pd.DataFrame(opt.cv_results_).copy()
# hist_bayes["method"] = "bayes"
# hist_bayes["iter"] = np.arange(len(hist_bayes))
# hist_bayes["running_best"] = np.maximum.accumulate(hist_bayes["mean_test_score"])

# # use the model refitted with the best parameters to predict the target and compute the accuracy (accuracy_score).
# y_pred_bayes = opt.predict(X_test)
# test_acc_bayes = accuracy_score(y_test, y_pred_bayes)
# print(f"BayesSearch – CV mean(best): {opt.best_score_:.4f}")
# print(f"BayesSearch – Test accuracy: {test_acc_bayes:.4f}")
# print(f"BayesSearch – Liczba iteracji: {len(hist_bayes)}")

# DEPRESSION

In [33]:
# Read depression_transformed.csv and separate the 'History of Mental Illness'
# target column from the remaining feature columns.
df_depression = pd.read_csv("depression_transformed.csv")

y = df_depression["History of Mental Illness"]
X = df_depression.drop(columns=["History of Mental Illness"])

# Hold out 20% of the rows as a test set; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Untuned reference model: only the settings listed here are specified,
# no hyper-parameter search is involved.
baseline_xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)
baseline_xgb.fit(X_train, y_train)

# Accuracy of the untuned model on the held-out test split.
baseline_acc = accuracy_score(
    y_true=y_test,
    y_pred=baseline_xgb.predict(X_test),
)
print(f"Baseline XGB (bez tuningu) – test accuracy: {baseline_acc:.4f}")

Baseline XGB (bez tuningu) – test accuracy: 0.6935


## RANDOMIZED SEARCH

In [34]:
# Randomized hyper-parameter search: draws n_iter=50 configurations from
# RS_param_grid and scores each one with 3-fold cross-validated accuracy
# on the training split.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=RS_param_grid,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

results = random_search.cv_results_
# One entry in results['params'] per evaluated configuration.
num_iterations = len(results['params'])
best_params_random = random_search.best_params_

# Best parameters found by the search.
print("Najlepsze parametry:", best_params_random)

# Full search history: random_search.cv_results_ holds, for every evaluated
# configuration, its parameters, validation scores, timings, etc.
hist_random = pd.DataFrame(results).copy()
hist_random["method"] = "random"
hist_random["iter"] = np.arange(len(hist_random))
# Running maximum of the mean CV score. Series.cummax() skips NaN scores
# (a failed fit is scored NaN under the default error_score), so one failed
# candidate leaves NaN only in its own row. np.maximum.accumulate would turn
# every later running_best value into NaN.
hist_random["running_best"] = hist_random["mean_test_score"].cummax()

# Test accuracy of the best configuration; predict() uses the estimator that
# the search refitted with the best parameters.
y_pred_rs = random_search.predict(X_test)
test_acc_rs = accuracy_score(y_test, y_pred_rs)
print(f"RandomSearch – CV mean(best): {random_search.best_score_:.4f}")
print(f"RandomSearch – Test accuracy: {test_acc_rs:.4f}")
print(f"RandomSearch – Liczba iteracji: {num_iterations}")

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Najlepsze parametry: {'subsample': 1.0, 'scale_pos_weight': 1.0, 'reg_lambda': 0.0, 'reg_alpha': 1.0, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0.0, 'colsample_bytree': 1.0}
RandomSearch – CV mean(best): 0.6963
RandomSearch – Test accuracy: 0.6945
RandomSearch – Liczba iteracji: 50


## BAYES SEARCH

In [None]:
from skopt import BayesSearchCV
# Bayesian hyper-parameter search: 60 iterations over search_space, each
# scored with 5-fold cross-validated accuracy on the training split.
opt = BayesSearchCV(
    estimator=xgb,
    search_spaces=search_space,
    n_iter=60,
    cv=5,
    scoring="accuracy",
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)

opt.fit(X_train, y_train)

print("Najlepsze parametry:", opt.best_params_)

# Full search history: opt.cv_results_ holds, for every iteration, the
# parameters, validation scores, timings, etc. Three bookkeeping columns are
# appended: the search method, the iteration index and the best mean CV
# score seen so far.
hist_bayes = pd.DataFrame(opt.cv_results_)
hist_bayes = hist_bayes.assign(
    method="bayes",
    iter=np.arange(len(hist_bayes)),
    running_best=np.maximum.accumulate(hist_bayes["mean_test_score"]),
)

# Predict the target with the model refitted on the best parameters and
# measure its accuracy (accuracy_score) on the test split.
y_pred_bayes = opt.predict(X_test)
test_acc_bayes = accuracy_score(y_test, y_pred_bayes)
print(f"BayesSearch – CV mean(best): {opt.best_score_:.4f}")
print(f"BayesSearch – Test accuracy: {test_acc_bayes:.4f}")
print(f"BayesSearch – Liczba iteracji: {len(hist_bayes)}")

Najlepsze parametry: OrderedDict({'colsample_bytree': 0.8115299714555839, 'gamma': 3.0, 'learning_rate': 0.02735832608348027, 'max_depth': 8, 'min_child_weight': 4, 'n_estimators': 300, 'reg_alpha': 2.6644196614086435, 'reg_lambda': 10.0, 'scale_pos_weight': 1.0, 'subsample': 1.0})
Accuracy (mean): 0.6963
Liczba iteracji BayesSearchCV: 60
Test accuracy: 0.6945
