In [1]:
import pandas as pd
from xgboost import XGBClassifier 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score 
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Discrete candidate values for the randomized search. Each key is the name of
# an estimator parameter and each list holds the values that may be sampled
# for it (this dict is passed as `param_distributions` to RandomizedSearchCV).
RS_param_grid = dict(
    n_estimators=[300, 600, 900, 1200],
    max_depth=[3, 4, 6, 8],
    learning_rate=[0.01, 0.03, 0.1],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    min_child_weight=[1, 2, 5, 8],
    gamma=[0.0, 1.0, 3.0],
    reg_lambda=[0.0, 1.0, 5.0, 10.0],
    reg_alpha=[0.0, 0.1, 1.0, 5.0],
    scale_pos_weight=[1.0, 2.0, 5.0],
)

In [3]:
# Base estimator passed as `estimator=` to every search in this notebook.
# Only the seed is fixed here; the tuned parameters are supplied by each search.
xgb = XGBClassifier(random_state=42)

# DIABETES

In [None]:
# Read diabetes_transformed.csv and separate the 'diabetes' target column
# from the remaining feature columns.
df_diabetes = pd.read_csv("diabetes_transformed.csv")

y = df_diabetes['diabetes']
X = df_diabetes.drop(columns='diabetes')

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## RANDOMIZED SEARCH

In [17]:
# Randomized search over RS_param_grid: 50 sampled settings, each scored by
# 3-fold cross-validated accuracy on the training split.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=RS_param_grid,
    scoring='accuracy',
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Collect the CV table, the winning setting and the number of settings tried.
results = random_search.cv_results_
best_params_random = random_search.best_params_
num_iterations = len(results['params'])

# Report the search outcome.
print("Najlepsze parametry:", best_params_random)
print(f"Accuracy (mean): {random_search.best_score_:.4f}")
print(f"Liczba iteracji RandomizedSearchCV: {num_iterations}")

# Accuracy of the fitted search object's predictions on the held-out test split.
y_pred = random_search.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Najlepsze parametry: {'subsample': 0.6, 'scale_pos_weight': 1.0, 'reg_lambda': 5.0, 'reg_alpha': 1.0, 'n_estimators': 1200, 'min_child_weight': 8, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0.0, 'colsample_bytree': 0.8}
Accuracy (mean): 0.9719
Liczba iteracji RandomizedSearchCV: 50
Test Accuracy: 0.9730959643946592


## BAYES SEARCH

In [18]:
from skopt.space import Real, Integer, Categorical

# Search ranges for BayesSearchCV, keyed by estimator parameter name.
# The bounds of every range equal the smallest and largest value listed for
# the same parameter in RS_param_grid; learning_rate uses a log-uniform prior.
search_space = dict(
    n_estimators=Integer(300, 1200),
    max_depth=Integer(3, 8),
    learning_rate=Real(0.01, 0.1, prior="log-uniform"),
    subsample=Real(0.6, 1.0),
    colsample_bytree=Real(0.6, 1.0),
    min_child_weight=Integer(1, 8),
    gamma=Real(0.0, 3.0),
    reg_lambda=Real(0.0, 10.0),
    reg_alpha=Real(0.0, 5.0),
    scale_pos_weight=Real(1.0, 5.0),
)

In [19]:
from skopt import BayesSearchCV

# Bayesian search over search_space: 60 iterations, each scored by 5-fold
# cross-validated accuracy on the training split.
opt = BayesSearchCV(
    estimator=xgb,
    search_spaces=search_space,
    scoring="accuracy",
    n_iter=60,
    cv=5,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
opt.fit(X_train, y_train)

# Number of parameter settings recorded in the CV results.
n_iters_done = len(opt.cv_results_["params"])

# Report the search outcome.
print("Najlepsze parametry:", opt.best_params_)
print("Accuracy (mean):", round(opt.best_score_, 4))
print("Liczba iteracji BayesSearchCV:", n_iters_done)

# Accuracy of the fitted search object's predictions on the held-out test split.
y_pred = opt.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test accuracy:", round(test_acc, 4))

Najlepsze parametry: OrderedDict({'colsample_bytree': 0.6707487107534322, 'gamma': 0.7778311189511045, 'learning_rate': 0.010635974318114841, 'max_depth': 5, 'min_child_weight': 6, 'n_estimators': 355, 'reg_alpha': 1.8572271814891825, 'reg_lambda': 2.4096666049396416, 'scale_pos_weight': 1.9723853412464256, 'subsample': 0.6809326594353681})
Accuracy (mean): 0.9719
Liczba iteracji BayesSearchCV: 60
Test accuracy: 0.9729


# LOAN

In [None]:
# NOTE: this cell is disabled. Its recorded output is
#   KeyError: "['loan_status'] not found in axis"
# i.e. dropping the 'loan_status' column from the loaded frame failed.
# TODO(review): confirm the target column name in loan_transformed.csv
# before re-enabling this section.
# df_loan = pd.read_csv("loan_transformed.csv")   

# X = df_loan.drop('loan_status', axis=1)  
# y = df_loan['loan_status']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

KeyError: "['loan_status'] not found in axis"

## RANDOMIZED SEARCH

In [None]:
# NOTE: disabled because the LOAN data-loading cell is commented out.
# If this cell were re-enabled on its own, X_train / y_train / X_test / y_test
# would still refer to whichever split was created last, not to the LOAN data.
# random_search = RandomizedSearchCV(
#     estimator=xgb,
#     param_distributions=RS_param_grid,
#     n_iter=50,  
#     cv=3,  #
#     scoring='accuracy', 
#     verbose=1,
#     random_state=42,
#     n_jobs=-1 
# )

# random_search.fit(X_train, y_train)

# results = random_search.cv_results_
# num_iterations = len(results['params']) 
# best_params_random = random_search.best_params_

# print("Najlepsze parametry:", best_params_random)
# print(f"Accuracy (mean): {random_search.best_score_:.4f}")
# print(f"Liczba iteracji RandomizedSearchCV: {num_iterations}")

# y_pred = random_search.predict(X_test)
# print("Test Accuracy:", accuracy_score(y_test, y_pred))

## BAYES SEARCH

In [None]:
# NOTE: disabled along with the rest of the LOAN section. The ranges below are
# identical to the search_space defined in the DIABETES section.
# from skopt.space import Real, Integer, Categorical

# search_space = {
#     "n_estimators": Integer(300, 1200),
#     "max_depth": Integer(3, 8),
#     "learning_rate": Real(0.01, 0.1, prior="log-uniform"),
#     "subsample": Real(0.6, 1.0),
#     "colsample_bytree": Real(0.6, 1.0),
#     "min_child_weight": Integer(1, 8),
#     "gamma": Real(0.0, 3.0),
#     "reg_lambda": Real(0.0, 10.0),
#     "reg_alpha": Real(0.0, 5.0),
#     "scale_pos_weight": Real(1.0, 5.0)
# }

In [None]:
# NOTE: disabled because the LOAN data-loading cell is commented out.
# If this cell were re-enabled on its own, X_train / y_train / X_test / y_test
# would still refer to whichever split was created last, not to the LOAN data.
# from skopt import BayesSearchCV
# opt = BayesSearchCV(
#     estimator=xgb,
#     search_spaces=search_space,
#     n_iter=60,                
#     cv=5,
#     scoring="accuracy",      
#     n_jobs=-1,
#     random_state=42,
#     refit=True,
#     verbose=0,
# )

# opt.fit(X_train, y_train)

# n_iters_done = len(opt.cv_results_["params"])

# print("Najlepsze parametry:", opt.best_params_)
# print("Accuracy (mean):", round(opt.best_score_, 4))
# print("Liczba iteracji BayesSearchCV:", n_iters_done)

# y_pred = opt.predict(X_test)
# test_acc = accuracy_score(y_test, y_pred)
# print("Test accuracy:", round(test_acc, 4))

# DEPRESSION

In [6]:
# Read depression_transformed.csv and separate the 'History of Mental Illness'
# target column from the remaining feature columns.
# Note: this rebinds X, y and the train/test variables used by earlier sections.
df_depression = pd.read_csv("depression_transformed.csv")

y = df_depression['History of Mental Illness']
X = df_depression.drop(columns='History of Mental Illness')

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## RANDOMIZED SEARCH

In [7]:
# Randomized search over RS_param_grid: 50 sampled settings, each scored by
# 3-fold cross-validated accuracy on the training split.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=RS_param_grid,
    scoring='accuracy',
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Collect the CV table, the winning setting and the number of settings tried.
results = random_search.cv_results_
best_params_random = random_search.best_params_
num_iterations = len(results['params'])

# Report the search outcome.
print("Najlepsze parametry:", best_params_random)
print(f"Accuracy (mean): {random_search.best_score_:.4f}")
print(f"Liczba iteracji RandomizedSearchCV: {num_iterations}")

# Accuracy of the fitted search object's predictions on the held-out test split.
y_pred = random_search.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Najlepsze parametry: {'subsample': 1.0, 'scale_pos_weight': 1.0, 'reg_lambda': 0.0, 'reg_alpha': 1.0, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0.0, 'colsample_bytree': 1.0}
Accuracy (mean): 0.6963
Liczba iteracji RandomizedSearchCV: 50
Test Accuracy: 0.6944800251347367


## BAYES SEARCH

In [8]:
from skopt.space import Real, Integer, Categorical

# Search ranges for BayesSearchCV, keyed by estimator parameter name.
# The bounds of every range equal the smallest and largest value listed for
# the same parameter in RS_param_grid; learning_rate uses a log-uniform prior.
search_space = dict(
    n_estimators=Integer(300, 1200),
    max_depth=Integer(3, 8),
    learning_rate=Real(0.01, 0.1, prior="log-uniform"),
    subsample=Real(0.6, 1.0),
    colsample_bytree=Real(0.6, 1.0),
    min_child_weight=Integer(1, 8),
    gamma=Real(0.0, 3.0),
    reg_lambda=Real(0.0, 10.0),
    reg_alpha=Real(0.0, 5.0),
    scale_pos_weight=Real(1.0, 5.0),
)

In [None]:
from skopt import BayesSearchCV

# Bayesian search over search_space: 60 iterations, each scored by 5-fold
# cross-validated accuracy on the training split.
opt = BayesSearchCV(
    estimator=xgb,
    search_spaces=search_space,
    scoring="accuracy",
    n_iter=60,
    cv=5,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
opt.fit(X_train, y_train)

# Number of parameter settings recorded in the CV results.
n_iters_done = len(opt.cv_results_["params"])

# Report the search outcome.
print("Najlepsze parametry:", opt.best_params_)
print("Accuracy (mean):", round(opt.best_score_, 4))
print("Liczba iteracji BayesSearchCV:", n_iters_done)

# Accuracy of the fitted search object's predictions on the held-out test split.
y_pred = opt.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Test accuracy:", round(test_acc, 4))