Dataset can be found at https://www.kaggle.com/cfpb/us-consumer-finance-complaints

In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [23]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from category_encoders import LeaveOneOutEncoder, OneHotEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

import seaborn as sns
import matplotlib.pyplot as plt
import gower

%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
df = pd.read_csv("BankChurners.csv")

<IPython.core.display.Javascript object>

# Explore the data

In [4]:
print(df.shape)
df.head()

(10127, 23)


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


<IPython.core.display.Javascript object>

In [5]:
df = df.drop(
    columns=[
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
        "CLIENTNUM",
    ],
    errors="ignore",
)

<IPython.core.display.Javascript object>

In [6]:
# pd.DataFrame(gower.gower_matrix(df)).style.background_gradient()

<IPython.core.display.Javascript object>

In [7]:
df.Income_Category.unique()

array(['$60K - $80K', 'Less than $40K', '$80K - $120K', '$40K - $60K',
       '$120K +', 'Unknown'], dtype=object)

<IPython.core.display.Javascript object>

In [8]:
df.Education_Level.unique()

array(['High School', 'Graduate', 'Uneducated', 'Unknown', 'College',
       'Post-Graduate', 'Doctorate'], dtype=object)

<IPython.core.display.Javascript object>

In [9]:
df.Attrition_Flag.value_counts()

Existing Customer    8500
Attrited Customer    1627
Name: Attrition_Flag, dtype: int64

<IPython.core.display.Javascript object>

In [10]:
df.Attrition_Flag.value_counts(normalize=True)

Existing Customer    0.83934
Attrited Customer    0.16066
Name: Attrition_Flag, dtype: float64

<IPython.core.display.Javascript object>

In [11]:
df.Card_Category.value_counts()

Blue        9436
Silver       555
Gold         116
Platinum      20
Name: Card_Category, dtype: int64

<IPython.core.display.Javascript object>

# Feature engineering

In [12]:
# mapping ordinal data
income_map = {
    "$60K - $80K": 3,
    "Less than $40K": 1,
    "$80K - $120K": 4,
    "$40K - $60K": 2,
    "$120K +": 5,
    "Unknown": 0,
}

education_map = {
    "High School": 2,
    "Graduate": 3,
    "Uneducated": 1,
    "Unknown": 0,
    "College": 4,
    "Post-Graduate": 5,
    "Doctorate": 6,
}


card_cat_map = {
    "Blue": 0,
    "Silver": 1,
    "Gold": 2,
    "Platinum": 3,
}


df["Attrition_Flag"] = (df["Attrition_Flag"] == "Existing Customer").astype(int)
df["Gender"] = (df["Gender"] == "F").astype(int)
df["Income_Category"] = df["Income_Category"].map(income_map)
df["Education_Level"] = df["Education_Level"].map(education_map)
df["Card_Category"] = df["Card_Category"].map(card_cat_map)

<IPython.core.display.Javascript object>

In [13]:
df.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,1,45,0,3,2,Married,3,0,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,1,49,1,5,3,Single,1,0,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,1,51,0,3,3,Married,4,0,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,1,40,1,4,2,Unknown,1,0,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,1,40,0,3,1,Married,3,0,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


<IPython.core.display.Javascript object>

# Split train and test data

In [14]:
X = df.drop(columns="Attrition_Flag")
y = df["Attrition_Flag"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22, stratify=y
)

<IPython.core.display.Javascript object>

In [15]:
df.columns

Index(['Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count',
       'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
      dtype='object')

<IPython.core.display.Javascript object>

# Preprocessing

In [16]:
num_cols = [
    "Customer_Age",
    "Dependent_count",
    "Months_on_book",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
    "Credit_Limit",
    "Total_Revolving_Bal",
    "Avg_Open_To_Buy",
    "Total_Amt_Chng_Q4_Q1",
    "Total_Trans_Amt",
    "Total_Trans_Ct",
    "Total_Ct_Chng_Q4_Q1",
    "Avg_Utilization_Ratio",
    "Income_Category",
    "Education_Level",
    "Card_Category",
]
cat_cols = ["Marital_Status"]
drop_cats = ["Unknown"]

<IPython.core.display.Javascript object>

In [17]:
preprocessing = ColumnTransformer(
    [
        ("scale", StandardScaler(), num_cols),
        ("encode", LeaveOneOutEncoder(), cat_cols),
    ],
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

# Run Random Forest Model

In [18]:
n_estimators = 100
learning_rate = 2 / n_estimators

pipeline_rf = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("sampling", SMOTE()),
        ("RFmodel", RandomForestClassifier()),
    ]
)

pipeline_rf.fit(X_train, y_train)

train_score = pipeline_rf.score(X_train, y_train)
test_score = pipeline_rf.score(X_test, y_test)

print(f"Train score for Linear Regression model before gridsearch: {train_score}")
print(f"Test score for Linear Regression model before gridsearch: {test_score}")


grid = {
    "RFmodel__n_estimators": [50, 100, 150],
    "RFmodel__max_depth": [80, 90, 100, 110],
    "RFmodel__min_samples_leaf": [3, 4, 5],
}

pipeline_rf_cv = GridSearchCV(pipeline_rf, grid, verbose=1, cv=5, n_jobs=-1)
pipeline_rf_cv.fit(X_train, y_train)

print(
    f"Train score for Linear Regression model: {pipeline_rf_cv.score(X_train, y_train)}"
)
print(f"Test score for Linear Regression model: {pipeline_rf_cv.score(X_test, y_test)}")

Train score for Linear Regression model before gridsearch: 0.9238365633872362
Test score for Linear Regression model before gridsearch: 0.8968410661401777
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  2.8min finished


Train score for Linear Regression model: 0.9222318232316998
Test score for Linear Regression model: 0.8998025666337611


<IPython.core.display.Javascript object>

In [34]:
print(pipeline_rf_cv.best_params_)

y_pred = pipeline_rf_cv.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

confusion_mat = confusion_matrix(y_test, y_pred)
confusion_df = pd.DataFrame(
    confusion_mat,
    index=["actual_Customer_Leaving", "actual_Customer_Stay"],
    columns=["pred_Customer_Leaving", "pred_Customer_Stay"],
)
print(f"percentage matrix{confusion_mat/(len(y_pred))}")

display(confusion_df)

{'RFmodel__max_depth': 80, 'RFmodel__min_samples_leaf': 3, 'RFmodel__n_estimators': 150}
[[ 178  147]
 [  56 1645]]
              precision    recall  f1-score   support

           0       0.76      0.55      0.64       325
           1       0.92      0.97      0.94      1701

    accuracy                           0.90      2026
   macro avg       0.84      0.76      0.79      2026
weighted avg       0.89      0.90      0.89      2026

percentage matrix[[0.08785785 0.07255676]
 [0.02764067 0.81194472]]


Unnamed: 0,pred_Customer_Leaving,pred_Customer_Stay
actual_Customer_Leaving,178,147
actual_Customer_Stay,56,1645


<IPython.core.display.Javascript object>

### Random Forest are decent results, slightly over fit but still doing better than guessing 1.

# KNN Model

In [19]:
pipeline_knn = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("sampling", SMOTE()),
        ("KNmodel", KNeighborsClassifier()),
    ]
)


pipeline_knn.fit(X_train, y_train)

train_score = pipeline_knn.score(X_train, y_train)
test_score = pipeline_knn.score(X_test, y_test)

print(f"Train score for Linear Regression model before gridsearch: {train_score}")
print(f"Test score for Linear Regression model before gridsearch: {test_score}")


grid = {
    "KNmodel__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "KNmodel__n_neighbors": [5, 10, 25, 50],
}

pipeline_knn_cv = GridSearchCV(pipeline_knn, grid, verbose=1, cv=5, n_jobs=-1)
pipeline_knn_cv.fit(X_train, y_train)

print(
    f"Train score for Linear Regression model: {pipeline_knn_cv.score(X_train, y_train)}"
)
print(
    f"Test score for Linear Regression model: {pipeline_knn_cv.score(X_test, y_test)}"
)

Train score for Linear Regression model before gridsearch: 0.9174176027650908
Test score for Linear Regression model before gridsearch: 0.8538993089832182
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   58.7s finished


Train score for Linear Regression model: 0.918158252067646
Test score for Linear Regression model: 0.849457058242843


<IPython.core.display.Javascript object>

In [35]:
print(pipeline_knn_cv.best_params_)

y_pred = pipeline_knn_cv.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

confusion_mat = confusion_matrix(y_test, y_pred)
confusion_df = pd.DataFrame(
    confusion_mat,
    index=["actual_Customer_Leaving", "actual_Customer_Stay"],
    columns=["pred_Customer_Leaving", "pred_Customer_Stay"],
)
print(f"percentage matrix{confusion_mat/(len(y_pred))}")
display(confusion_df)

{'KNmodel__algorithm': 'auto', 'KNmodel__n_neighbors': 5}
[[ 262   63]
 [ 242 1459]]
              precision    recall  f1-score   support

           0       0.52      0.81      0.63       325
           1       0.96      0.86      0.91      1701

    accuracy                           0.85      2026
   macro avg       0.74      0.83      0.77      2026
weighted avg       0.89      0.85      0.86      2026

percentage matrix[[0.12931885 0.03109576]
 [0.11944719 0.7201382 ]]


Unnamed: 0,pred_Customer_Leaving,pred_Customer_Stay
actual_Customer_Leaving,262,63
actual_Customer_Stay,242,1459


<IPython.core.display.Javascript object>

### KNN is more overfit and has a worse performance than our Random Forest model. 

# Logistic Regression Classifier

In [20]:
pipeline_lg = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("sampling", SMOTE()),
        ("LRmodel", LogisticRegression(max_iter=1000)),
    ]
)


pipeline_lg.fit(X_train, y_train)

train_score = pipeline_lg.score(X_train, y_train)
test_score = pipeline_lg.score(X_test, y_test)

print(f"Train score for Linear Regression model before gridsearch: {train_score}")
print(f"Test score for Linear Regression model before gridsearch: {test_score}")


grid = {
    "LRmodel__C": [0.1, 0.25, 0.5, 0.75, 1.0],
    "LRmodel__penalty": ["l1", "l2", "elasticnet", "none"],
}

pipeline_lg_cv = GridSearchCV(pipeline_lg, grid, verbose=1, cv=5, n_jobs=-1)
pipeline_lg_cv.fit(X_train, y_train)

print(
    f"Train score for Linear Regression model: {pipeline_lg_cv.score(X_train, y_train)}"
)
print(f"Test score for Linear Regression model: {pipeline_lg_cv.score(X_test, y_test)}")

Train score for Linear Regression model before gridsearch: 0.8491544253795827
Test score for Linear Regression model before gridsearch: 0.8543928923988154
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    7.6s finished


Train score for Linear Regression model: 0.848907542278731
Test score for Linear Regression model: 0.8548864758144127


<IPython.core.display.Javascript object>

In [36]:
print(pipeline_lg_cv.best_params_)

y_pred = pipeline_lg_cv.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

confusion_mat = confusion_matrix(y_test, y_pred)
confusion_df = pd.DataFrame(
    confusion_mat,
    index=["actual_Customer_Leaving", "actual_Customer_Stay"],
    columns=["pred_Customer_Leaving", "pred_Customer_Stay"],
)

print(f"percentage matrix{confusion_mat/(len(y_pred))}")
display(confusion_df)

{'LRmodel__C': 0.5, 'LRmodel__penalty': 'l2'}
[[ 270   55]
 [ 239 1462]]
              precision    recall  f1-score   support

           0       0.53      0.83      0.65       325
           1       0.96      0.86      0.91      1701

    accuracy                           0.85      2026
   macro avg       0.75      0.85      0.78      2026
weighted avg       0.89      0.85      0.87      2026

percentage matrix[[0.13326752 0.02714709]
 [0.11796644 0.72161895]]


Unnamed: 0,pred_Customer_Leaving,pred_Customer_Stay
actual_Customer_Leaving,270,55
actual_Customer_Stay,239,1462


<IPython.core.display.Javascript object>

### LG has a worse perfomance tha our Random Forest model and KNN model. 

In [21]:
n_estimators = 100
learning_rate = 2 / n_estimators

pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("sampling", SMOTE()),
        (
            "model",
            XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate),
        ),
    ]
)

pipeline.fit(X_train, y_train)

train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

grid = {
    "model__subsample": [0.5, 0.75, 1.0],
    "model__colsample_bytree": [0.5, 0.75, 1.0],
    "model__max_depth": [7, 10, 12],
}

pipeline_cv = GridSearchCV(pipeline, grid, verbose=1, cv=5, n_jobs=-1)
pipeline_cv.fit(X_train, y_train)



Train score: 0.7421306011603506
Test score: 0.7403751233958539
Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  3.4min finished




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scale',
                                                                         StandardScaler(),
                                                                         ['Customer_Age',
                                                                          'Dependent_count',
                                                                          'Months_on_book',
                                                                          'Total_Relationship_Count',
                                                                          'Months_Inactive_12_mon',
                                                                          'Contacts_Count_12_mon',
                                                                          'Credit_Limit',
  

<IPython.core.display.Javascript object>

In [22]:
print(f"Train score: {pipeline_cv.score(X_train, y_train)}")
print(f"Test score: {pipeline_cv.score(X_test, y_test)}")

Train score: 0.848907542278731
Test score: 0.8104639684106614


<IPython.core.display.Javascript object>

In [37]:
y_pred = pipeline_cv.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

confusion_mat = confusion_matrix(y_test, y_pred)
confusion_df = pd.DataFrame(
    confusion_mat,
    index=["actual_Customer_Leaving", "actual_Customer_Stay"],
    columns=["pred_Customer_Leaving", "pred_Customer_Stay"],
)
print(f"percentage matrix{confusion_mat/(len(y_pred))}")
display(confusion_df)

[[ 221  104]
 [ 280 1421]]
              precision    recall  f1-score   support

           0       0.44      0.68      0.54       325
           1       0.93      0.84      0.88      1701

    accuracy                           0.81      2026
   macro avg       0.69      0.76      0.71      2026
weighted avg       0.85      0.81      0.83      2026

percentage matrix[[0.10908193 0.05133268]
 [0.13820336 0.70138203]]


Unnamed: 0,pred_Customer_Leaving,pred_Customer_Stay
actual_Customer_Leaving,221,104
actual_Customer_Stay,280,1421


<IPython.core.display.Javascript object>

### XGB had the worst perfomance of all. 