In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_excel("data/customer_churn.xlsx")

In [3]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
numeric_columns = X_train.select_dtypes(include=['float64', 'int64']).columns
X_train_numeric = X_train[numeric_columns]
X_test_numeric = X_test[numeric_columns]

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

In [7]:
gb_model = GradientBoostingClassifier(random_state=42)

In [8]:
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb_random = RandomizedSearchCV(estimator=gb_model, param_distributions=param_dist, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
gb_random.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=GradientBoostingClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'learning_rate': [0.001, 0.01, 0.1, 0.2,
                                                          0.3],
                                        'max_depth': [3, 4, 5, 6],
                                        'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 100, 200, 300],
                                        'subsample': [0.8, 0.9, 1.0]},
                   random_state=42, verbose=2)

In [9]:
best_gb_model = gb_random.best_estimator_

In [10]:
y_pred = best_gb_model.predict(X_test_scaled)

In [11]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[143   5]
 [ 16  16]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93       148
           1       0.76      0.50      0.60        32

    accuracy                           0.88       180
   macro avg       0.83      0.73      0.77       180
weighted avg       0.87      0.88      0.87       180



In [12]:
print("\nFeature Importances:\n", best_gb_model.feature_importances_)


Feature Importances:
 [0.04717327 0.03211205 0.02647392 0.17719545 0.71704531]


In [13]:
import joblib
joblib.dump(best_gb_model, 'gradient_boosting_model.pkl')

['gradient_boosting_model.pkl']