In [1]:
# Restore variables previously saved with IPython's %store magic
# (presumably by an earlier preprocessing notebook — verify they exist in the store).
# X_resampled / y_resampled are used below as training data; X_test / y_test for evaluation.
%store -r X_resampled y_resampled y_test X_test

----
#### Building a model with XGBoost
----

In [9]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# %store may hand back pandas objects; coerce every split to a plain NumPy array.
# The conversion is unconditional and leaves the contents unchanged when a value
# is already an ndarray.
X_resampled, y_resampled, y_test, X_test = (
    np.array(split) for split in (X_resampled, y_resampled, y_test, X_test)
)

In [11]:
# Train a 200-tree XGBoost classifier on the resampled training data
churn_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    random_state=99,
).fit(X_resampled, y_resampled)

# Predict class labels for the test set
y_pred = churn_model.predict(X_test)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.81      0.86      1593
           1       0.50      0.75      0.60       407

    accuracy                           0.80      2000
   macro avg       0.71      0.78      0.73      2000
weighted avg       0.84      0.80      0.81      2000



----
#### Key Improvements & Issues
- Higher Accuracy (80%) → Big jump from 73% in Logistic Regression.
- Higher Recall for Churners (75%) → The model is catching more churners.
- Better F1-Score for Churners (0.60 vs. 0.48 in Logistic Regression).
- Still Low Precision for Churners (0.50) → Only about half of the customers flagged as churners actually churn; the rest are non-churners misclassified as churners.

#### Looking for better parameters
----

In [13]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from
param_dist = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=[3, 5, 7, 9],
    n_estimators=[100, 200, 300, 400],
    subsample=[0.7, 0.8, 1.0],
)

# Base estimator handed to the search
xgb_model = XGBClassifier(random_state=99)

# Try 10 sampled combinations, each evaluated with 5-fold cross-validation.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score — confirm that is the metric you want to optimise.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    verbose=1,
    random_state=99,
)

# Run the search on the resampled training data
random_search.fit(X_resampled, y_resampled)

# Report the winning combination
best_params_random = random_search.best_params_
print("Best parameters found by RandomizedSearchCV: ", best_params_random)

# Take the best model straight from the search; nothing is retrained here
# (RandomizedSearchCV's documented default, refit=True, should already have
# refit it on the full training data — TODO confirm for the installed version).
best_xgb_random = random_search.best_estimator_

# Evaluate the tuned model on the test set
y_pred_random = best_xgb_random.predict(X_test)
print(classification_report(y_test, y_pred_random))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found by RandomizedSearchCV:  {'subsample': 0.8, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.3}
              precision    recall  f1-score   support

           0       0.92      0.84      0.88      1593
           1       0.53      0.72      0.61       407

    accuracy                           0.81      2000
   macro avg       0.72      0.78      0.74      2000
weighted avg       0.84      0.81      0.82      2000



----
#### Tuning barely helped: accuracy 0.80 → 0.81 and churner F1 0.60 → 0.61, while churner recall dropped from 0.75 to 0.72. Additional tuning is needed.
----