In [1]:
import pandas as pd

def load_data(filepath):
    """Read the CSV at ``filepath`` (path or file-like object) into a DataFrame.

    Parameters
    ----------
    filepath : anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``read_csv`` defaults.
    """
    return pd.read_csv(filepath)

# Load the BankChurners CSV; the relative path is resolved against the
# kernel's current working directory.
df = load_data('../data/BankChurners.csv')


# Preprocessing

### Cleaning: dropping unused columns

In [2]:
def clean_df(df):
    """Return a new frame without the last two columns and without ``CLIENTNUM``.

    The two trailing columns are removed by position, so the function expects
    the frame exactly as loaded. ``CLIENTNUM`` must still be present among the
    remaining columns, otherwise ``DataFrame.drop`` raises ``KeyError``.
    The input frame itself is left unmodified.
    """
    kept_columns = df.columns[:-2]
    trimmed = df[kept_columns]
    return trimmed.drop(columns=['CLIENTNUM'])

# NOTE: rebinding `df` makes this cell non-idempotent. Running it a second time
# raises KeyError, because CLIENTNUM has already been dropped by the first run.
df = clean_df(df)

### Splitting

In [3]:
from sklearn.model_selection import train_test_split
# Separate the target column from the features, then hold out 20% of the rows
# as a test set (fixed seed so the split is repeatable).
y = df['Attrition_Flag']
X = df.drop(columns='Attrition_Flag')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

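### Rebalancing the target variable (SMOTE)

SMOTE is not applied here as a separate step; it is included as a stage of the model pipeline built below.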

# Gradient Boosting

### Pipeline creation

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


# Column groups, split on dtype: object/category columns are treated as
# categorical, every other column as numerical.
numerical_cols = X_train.select_dtypes(exclude=['object', 'category']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

# Categorical branch: one-hot encode; handle_unknown='ignore' avoids an error
# when a category that was not seen during fit shows up at transform time.
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

# Numerical branch: fill missing values with the column mean, then standardise.
numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# Route each column group through its branch; any column belonging to neither
# group is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('cat', categorical_pipeline, categorical_cols),
        ('num', numerical_pipeline, numerical_cols),
    ],
    remainder='passthrough',
)

In [5]:
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Seed the classifier too: the train/test split and SMOTE already use
# random_state=42, so leaving the model unseeded was the one remaining source
# of run-to-run variation.
model = GradientBoostingClassifier(random_state=42)

# Preprocess -> oversample with SMOTE -> fit the classifier.
# imblearn's Pipeline is required because a plain sklearn pipeline cannot hold
# a resampling step; it applies the sampler during fit only, so predictions
# are made on un-resampled data.
# The step name 'gradientboostingclassifier' is the prefix used by the
# hyperparameter grid later in the notebook, so keep the two in sync.
pipe = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('gradientboostingclassifier', model)
])
pipe.fit(X_train, y_train)

KeyboardInterrupt: 

### Evaluation
-  **Accuracy Printing**: Directly using the .score() method gives you the accuracy on the training and test sets.
-  **ROC AUC Calculation**: ROC_AUC is calculated from probabilities (y_proba), which are obtained using predict_proba() method.
-  **Classification Report**: This report gives you a breakdown of precision, recall, f1-score, and support for each class.
-  **Cross Validation**: Running cross-validation on the estimator provides an additional layer of validation, confirming that the model (and, after tuning, the selected parameters) indeed performs well across different subsets of your training data.

In [None]:
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import cross_validate


# Predictions on the held-out test set
y_pred = pipe.predict(X_test)

# Accuracy: .score() on a classifier pipeline returns mean accuracy, not R².
# Both scores are rounded to 3 decimals so they are directly comparable.
print("train_score: ", round(pipe.score(X_train, y_train), 3))
print("test_score: ", round(pipe.score(X_test, y_test), 3))

# ROC_AUC, computed from predicted probabilities rather than hard labels.
# Column 1 of predict_proba is the probability of pipe.classes_[1].
print("ROC_AUC: ", round(roc_auc_score(y_test, pipe.predict_proba(X_test)[:, 1]), 3))

# Classification report: per-class precision, recall, f1-score and support
report = classification_report(y_test, y_pred)
print(report)

# 5-fold cross validation on the training data; the whole pipeline is refit
# on each fold.
cv_results = cross_validate(pipe, X_train, y_train, cv=5, scoring='accuracy', return_train_score=True)
print("Mean Test Accuracy:", round(cv_results['test_score'].mean(), 3))
print("Mean Train Accuracy:", round(cv_results['train_score'].mean(), 3))
print("Mean Fit Time:", round(cv_results['fit_time'].mean(), 3))
print("Mean Score Time:", round(cv_results['score_time'].mean(), 3))

train_score:  0.966
test_score:  0.96
ROC_AUC:  0.988
                   precision    recall  f1-score   support

Attrited Customer       0.86      0.89      0.88       327
Existing Customer       0.98      0.97      0.98      1699

         accuracy                           0.96      2026
        macro avg       0.92      0.93      0.93      2026
     weighted avg       0.96      0.96      0.96      2026

Mean Test Accuracy: 0.957
Mean Train Accuracy: 0.968
Mean Fit Time: 2.755
Mean Score Time: 0.006


## Hyperparameter Tuning
**Best Estimator Usage**: By using grid_search.best_estimator_, you ensure that the predictions and evaluations are performed with the hyperparameter-tuned model. This is essential for getting accurate assessments of model performance.

In [None]:
# Search space for the tuning step. Each key is
# '<pipeline step name>__<estimator parameter>', so the prefix must match the
# name given to the classifier step in the pipeline.
# 3 x 3 x 3 = 27 candidate combinations.
param_grid = dict(
    gradientboostingclassifier__n_estimators=[100, 200, 300],
    gradientboostingclassifier__learning_rate=[0.01, 0.1, 0.2],
    gradientboostingclassifier__max_depth=[3, 5, 7],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid with 5-fold cross-validation, scored by
# accuracy. Every candidate refits the full pipeline once per fold, which is
# slow when run serially; n_jobs=-1 spreads those fits over all available
# cores without changing which candidates are evaluated.
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV (with the default refit=True the best candidate is refit
# on the whole training set afterwards)
grid_search.fit(X_train, y_train)

# Best parameters and best mean cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

# Predict with the refitted best estimator
y_pred_best = grid_search.predict(X_test)

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_validate

# Evaluate the refitted winner of the grid search
best_model = grid_search.best_estimator_

# Probabilities for best_model.classes_[1] and hard labels on the held-out set
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Accuracy on the training and test sets
print("Train Score: ", round(best_model.score(X_train, y_train), 3))
print("Test Score: ", round(best_model.score(X_test, y_test), 3))

# ROC AUC from the predicted probabilities
print("ROC_AUC: ", round(roc_auc_score(y_test, y_proba), 3))

# Per-class precision / recall / f1-score / support
report = classification_report(y_test, y_pred)
print(report)

# 5-fold cross-validation of the tuned pipeline on the training data
cv_results = cross_validate(
    best_model,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)

# Mean accuracies and mean fit / score times across the folds
print("Mean Test Accuracy:", round(cv_results['test_score'].mean(), 3))
print("Mean Train Accuracy:", round(cv_results['train_score'].mean(), 3))
print("Mean Fit Time:", round(cv_results['fit_time'].mean(), 3))
print("Mean Score Time:", round(cv_results['score_time'].mean(), 3))


Train Score:  1.0
Test Score:  0.968
ROC_AUC:  0.992
                   precision    recall  f1-score   support

Attrited Customer       0.93      0.87      0.90       327
Existing Customer       0.98      0.99      0.98      1699

         accuracy                           0.97      2026
        macro avg       0.95      0.93      0.94      2026
     weighted avg       0.97      0.97      0.97      2026

Mean Test Accuracy: 0.974
Mean Train Accuracy: 1.0
Mean Fit Time: 13.788
Mean Score Time: 0.025
