In [1]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, accuracy_score
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
df = pd.read_csv('./dados/cleaned_dataset.csv')

In [3]:
df = df.drop(columns=['Id'])

In [4]:
X = df[['Score', 'Gender', 'Age', 'Assets', 'Products', 'Active']]
y = df['Churned']

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [6]:
def xgb_cv(max_depth, learning_rate, n_estimators, gamma, min_child_weight, subsample, colsample_bytree, alpha, lambda_):
    try:
        model = xgb.XGBClassifier(
            max_depth=int(max_depth),
            learning_rate=learning_rate,
            n_estimators=int(n_estimators),
            gamma=gamma,
            min_child_weight=int(min_child_weight),
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            objective='binary:logistic',
            eval_metric='logloss',
            reg_alpha=alpha,
            reg_lambda=lambda_,
            random_state=42
        )
        
        # Perform cross-validation and return the mean accuracy
        cv_result = cross_val_score(model, X_train, y_train, cv=3, scoring=make_scorer(accuracy_score))
        return cv_result.mean()
    
    except ValueError as e:
        print("Error encountered during function evaluation:", e)
        return None  # Return None so Bayesian Optimization skips this iteration


In [7]:
# Set the parameter bounds for Bayesian Optimization
param_bounds = {
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.3),
    'n_estimators': (50, 500),
    'gamma': (0, 5),
    'min_child_weight': (1, 10),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'alpha': (0, 10),
    'lambda_': (0, 10)                   # L2 regularization term (reg_lambda)
}


In [8]:
# Run Bayesian Optimization
optimizer = BayesianOptimization(f=xgb_cv, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=25)

|   iter    |  target   |   alpha   | colsam... |   gamma   |  lambda_  | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.8398   [39m | [39m3.745    [39m | [39m0.9754   [39m | [39m3.66     [39m | [39m5.987    [39m | [39m0.05525  [39m | [39m4.092    [39m | [39m1.523    [39m | [39m439.8    [39m | [39m0.8006   [39m |
| [39m2        [39m | [39m0.7976   [39m | [39m7.081    [39m | [39m0.5103   [39m | [39m4.85     [39m | [39m8.324    [39m | [39m0.07158  [39m | [39m4.273    [39m | [39m2.651    [39m | [39m186.9    [39m | [39m0.7624   [39m |
| [39m3        [39m | [39m0.8276   [39m | [39m4.319    [39m | [39m0.6456   [39m | [39m3.059    [39m | [39m1.395    [39m | [39m0.09472  [39m | [39m5.565    [39m | [39m5.105    [39m | [39m403.3    [39m | [39m0.5998   [39m |


In [17]:
best_model = xgb.XGBClassifier(
    colsample_bytree=0.7982,          # colsample_bytree value from iter 16
    gamma=4.5,                        # gamma value from iter 16
    learning_rate=0.1096,             # learning_rate value from iter 16
    max_depth=6,                      # max_depth rounded from iter 16 (6.223)
    min_child_weight=1,               # min_child_weight rounded from iter 16 (1.085)
    n_estimators=434,                 # n_estimators rounded from iter 16 (433.8)
    subsample=0.9096,                 # subsample value from iter 16
    alpha=0.5554,                     # alpha value from iter 16
    lambda_=0.2544,                   # lambda value from iter 16
    objective='binary:logistic',      # Standard objective for binary classification
    eval_metric='logloss',            # Evaluation metric
    random_state=42                   # Fixed random seed for reproducibility
)


In [18]:
# Fit the model on the training data
best_model.fit(X_train, y_train)

Parameters: { "lambda_" } are not used.



In [19]:
# Make predictions on the test data
y_pred_class = best_model.predict(X_test)

In [20]:
cnf_matrix = confusion_matrix(y_test, y_pred_class)

In [21]:
print('Confusion Matrix:')
print(cnf_matrix)
print('\nAccuracy', accuracy_score(y_test, y_pred_class))

Confusion Matrix:
[[78  1]
 [13  8]]

Accuracy 0.86


In [22]:
cnf_table = pd.DataFrame(data=cnf_matrix, index=['Non-Churned', 'Churned'], columns=['Non_churned(pred)', 'Churned(pred)'])

In [23]:
print(cnf_table)

             Non_churned(pred)  Churned(pred)
Non-Churned                 78              1
Churned                     13              8


In [24]:
print(classification_report(y_test, y_pred_class, target_names=['Non Churned', 'Churned']))

              precision    recall  f1-score   support

 Non Churned       0.86      0.99      0.92        79
     Churned       0.89      0.38      0.53        21

    accuracy                           0.86       100
   macro avg       0.87      0.68      0.73       100
weighted avg       0.86      0.86      0.84       100

