In [1]:
# Install dependencies (add imbalanced-learn)
%pip install pandas matplotlib scikit-learn imbalanced-learn --quiet

from features import get_train_test_data

x_train, x_test, y_train, y_test = get_train_test_data("data/bank_data_train.csv")

print(f"x_train shape: {x_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Display first 5 rows of x_train
x_train.head()

Note: you may need to restart the kernel to use updated packages.
x_train shape: (284152, 32)
x_test shape: (71038, 32)
y_train shape: (284152,)
y_test shape: (71038,)


Unnamed: 0,CR_PROD_CNT_IL,TURNOVER_DYNAMIC_IL_1M,REST_DYNAMIC_FDEP_1M,REST_DYNAMIC_SAVE_3M,CR_PROD_CNT_VCU,REST_AVG_CUR,CR_PROD_CNT_TOVR,CR_PROD_CNT_PIL,TURNOVER_CC,TURNOVER_PAYM,AGE,CR_PROD_CNT_CC,REST_DYNAMIC_FDEP_3M,REST_DYNAMIC_IL_1M,CR_PROD_CNT_CCFP,REST_DYNAMIC_CUR_1M,REST_AVG_PAYM,LDEAL_GRACE_DAYS_PCT_MED,REST_DYNAMIC_CUR_3M,TURNOVER_DYNAMIC_CUR_1M,REST_DYNAMIC_PAYM_3M,REST_DYNAMIC_IL_3M,TURNOVER_DYNAMIC_IL_3M,REST_DYNAMIC_PAYM_1M,TURNOVER_DYNAMIC_CUR_3M,CLNT_SETUP_TENOR,TURNOVER_DYNAMIC_PAYM_3M,TURNOVER_DYNAMIC_PAYM_1M,REST_DYNAMIC_CC_1M,TURNOVER_DYNAMIC_CC_1M,REST_DYNAMIC_CC_3M,TURNOVER_DYNAMIC_CC_3M
0,2.061893,-0.044935,-0.052273,-0.312231,5.244695,-0.110851,1.176112,-0.192559,-0.038441,-0.093385,-0.976263,-0.242002,-0.086736,-0.068473,-0.064618,0.25893,-0.151018,-0.045033,0.283547,0.190178,-0.375206,-0.099585,-0.075343,-0.284607,0.326273,-0.610612,-0.347538,-0.24017,-0.083058,-0.031616,-0.108653,-0.07104
1,-0.244365,-0.044935,-0.052273,2.151389,-0.169621,0.192015,-0.526426,-0.192559,-0.038441,-0.093385,-1.064184,-0.242002,-0.086736,-0.068473,-0.064618,-0.286218,-0.151018,-0.045033,-0.22763,0.140643,-0.375206,-0.099585,-0.075343,-0.284607,-0.316017,1.144931,-0.347538,-0.24017,-0.083058,-0.031616,-0.108653,-0.07104
2,-0.244365,-0.044935,-0.052273,-0.312231,-0.169621,-0.260938,-0.526426,-0.192559,-0.038441,-0.093385,1.837201,-0.242002,-0.086736,-0.068473,-0.064618,-0.433742,-0.151018,-0.045033,-0.448334,-0.392441,-0.375206,-0.099585,-0.075343,-0.284607,-0.661855,-1.081618,-0.347538,-0.24017,-0.083058,-0.031616,-0.108653,-0.07104
3,2.061893,-0.044935,-0.052273,-0.312231,-0.169621,-0.332548,1.176112,-0.192559,-0.038441,-0.093385,-0.184976,-0.242002,-0.086736,-0.068473,-0.064618,0.039323,-0.151018,-0.045033,0.836577,0.003751,-0.375206,-0.099585,-0.075343,-0.284607,0.89938,-0.425565,-0.347538,-0.24017,-0.083058,-0.031616,-0.108653,-0.07104
4,-0.244365,-0.044935,-0.052273,-0.312231,-0.169621,-0.29102,1.176112,-0.192559,-0.038441,-0.093385,-1.327946,-0.242002,-0.086736,-0.068473,-0.064618,0.799208,-0.151018,-0.045033,1.675035,1.528313,-0.375206,-0.099585,-0.075343,-0.284607,1.55469,-0.719649,-0.347538,-0.24017,-0.083058,-0.031616,-0.108653,-0.07104


In [2]:
# print name of features used for training
print(f"Features used for training: {x_train.columns.tolist()}")

Features used for training: ['CR_PROD_CNT_IL', 'TURNOVER_DYNAMIC_IL_1M', 'REST_DYNAMIC_FDEP_1M', 'REST_DYNAMIC_SAVE_3M', 'CR_PROD_CNT_VCU', 'REST_AVG_CUR', 'CR_PROD_CNT_TOVR', 'CR_PROD_CNT_PIL', 'TURNOVER_CC', 'TURNOVER_PAYM', 'AGE', 'CR_PROD_CNT_CC', 'REST_DYNAMIC_FDEP_3M', 'REST_DYNAMIC_IL_1M', 'CR_PROD_CNT_CCFP', 'REST_DYNAMIC_CUR_1M', 'REST_AVG_PAYM', 'LDEAL_GRACE_DAYS_PCT_MED', 'REST_DYNAMIC_CUR_3M', 'TURNOVER_DYNAMIC_CUR_1M', 'REST_DYNAMIC_PAYM_3M', 'REST_DYNAMIC_IL_3M', 'TURNOVER_DYNAMIC_IL_3M', 'REST_DYNAMIC_PAYM_1M', 'TURNOVER_DYNAMIC_CUR_3M', 'CLNT_SETUP_TENOR', 'TURNOVER_DYNAMIC_PAYM_3M', 'TURNOVER_DYNAMIC_PAYM_1M', 'REST_DYNAMIC_CC_1M', 'TURNOVER_DYNAMIC_CC_1M', 'REST_DYNAMIC_CC_3M', 'TURNOVER_DYNAMIC_CC_3M']


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

N_ESTIMATORS = 100

# Reduced parameter grid for faster search
param_grid_small = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid_small,
    scoring='roc_auc',
    cv=3,  # Faster with 3 folds
    verbose=2,
    n_jobs=-1
)

grid_search.fit(x_train, y_train)

# Evaluate
y_train_pred = grid_search.predict_proba(x_train)[:, 1]
y_test_pred = grid_search.predict_proba(x_test)[:, 1]

test_auc = roc_auc_score(y_test, y_test_pred)
accuracy = grid_search.score(x_test, y_test)
print(f"AUC: {test_auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  25.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  26.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=  26.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  27.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  27.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  27.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  27.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total



[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.3min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  28.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.4min
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  56.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  57.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  58.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  27.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=  27.6s
[CV] END max_depth=20, max_features=sqrt, m

In [4]:
# print classification report
from sklearn.metrics import classification_report
y_test_pred = grid_search.predict(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     65253
           1       0.68      0.01      0.03      5785

    accuracy                           0.92     71038
   macro avg       0.80      0.51      0.49     71038
weighted avg       0.90      0.92      0.88     71038



In [7]:
import pandas as pd

LIBRARY_NAME = "scikit-learn"
ALGORITHM_NAME = "Random Forest"

pd.DataFrame({
    "library": [LIBRARY_NAME],
    "algorithm": [ALGORITHM_NAME],
    "AUC": [test_auc],
    "Accuracy": [accuracy],
    "Max_Depth": [grid_search.best_params_['max_depth']],
    "N_Estimators": [grid_search.best_params_['n_estimators']],
    "Min_Samples_Split": [grid_search.best_params_['min_samples_split']],
    "Min_Samples_Leaf": [grid_search.best_params_['min_samples_leaf']],
    "Max_Features": [grid_search.best_params_['max_features']]
}).T


Unnamed: 0,0
library,scikit-learn
algorithm,Random Forest
AUC,0.827194
Accuracy,0.827194
Max_Depth,20
N_Estimators,200
Min_Samples_Split,2
Min_Samples_Leaf,2
Max_Features,sqrt
