In [1]:
import json
import os
import warnings
from datetime import datetime
from shutil import copyfile

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import pandas_profiling as pp
import xgboost as xgb
import seaborn as sns
import shap
from catboost import CatBoostClassifier, Pool, cv
from category_encoders import TargetEncoder, WOEEncoder
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import (
    RandomizedSearchCV,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from tqdm import tqdm

# Notebook-wide display settings.
np.set_printoptions(formatter={"float": "{0:0.4f}".format})  # print floats with 4 decimals
pd.set_option("display.max_columns", None)  # show every column of a DataFrame
warnings.filterwarnings("ignore")  # "do not disturb" mode: hide all warnings

# Use seaborn's "paper" plotting context with fonts scaled up by 1.4.
sns.set_context(context="paper", font_scale=1.4)

## Full dataset: 30k instances

### Load data

In [2]:
# Read the binned dataset; the literal string "NULL" is treated as a missing value.
# The path is assembled with os.path.join instead of hard-coded backslashes so the
# notebook also runs on non-Windows systems (on Windows the path is unchanged).
data_path = os.path.join("Data", "data_preprocessed", "taiwan_data_binned.csv")
df = pd.read_csv(data_path, sep=",", na_values="NULL")
df

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,GOOD
0,1,1,2,1,1,2,2,-1,-1,-2,-2,3,3,2,1,1,1,1,2,1,1,1,1,0
1,5,1,2,2,2,-1,2,0,0,0,2,3,3,3,3,3,4,1,2,2,2,1,3,0
2,4,1,2,2,5,0,0,0,0,0,0,6,5,5,5,5,5,2,2,2,2,2,4,1
3,2,1,2,1,6,0,0,0,0,0,0,7,7,8,6,7,7,3,3,2,2,2,2,1
4,2,0,2,1,10,-1,0,-1,0,0,0,4,4,7,6,6,6,3,5,5,4,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,8,0,3,1,7,0,0,0,0,0,0,10,10,10,9,7,5,5,5,4,3,4,2,1
29996,6,0,3,2,8,-1,-1,-1,-1,0,0,2,3,3,4,4,1,3,4,5,1,1,1,1
29997,1,0,2,2,6,4,3,2,-1,0,0,3,3,3,6,6,6,1,1,5,4,3,3,0
29998,4,0,3,1,8,1,-1,0,0,0,-1,1,8,8,8,5,8,5,4,2,3,4,3,0


In [3]:
# Target is the GOOD column; every other column is a predictor.
y = df["GOOD"]
X = df.drop(columns=["GOOD"])

In [None]:
# Fit a weight-of-evidence encoder on all predictors (fit returns the encoder itself)
# and show the first encoded rows as the cell output.
encoder = WOEEncoder(cols=X.columns).fit(X, y)

encoder.transform(X).head()

### Model definition

In [None]:
# WOE encoding followed by logistic regression.
# With cols=None, WOEEncoder only encodes string columns; the frame loaded from the CSV
# displays integer codes, so the default would leave every feature untouched and the
# regression would be fitted on raw bin codes. Listing the columns explicitly makes
# the encoder transform them (re-fitted inside each CV fold by the pipeline).
pipe = Pipeline(
    steps=[
        ("scaler", WOEEncoder(cols=list(X.columns))),  # step name kept as in the original
        ("lr", LogisticRegression()),
    ]
)

### Model performance

In [None]:
# Repeated stratified 10-fold cross-validation (10 repeats), scored by ROC AUC.
# NOTE: the name `cv` rebinds the `cv` function imported from catboost at the top of the file.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
scores = cross_validate(
    pipe, X, y, scoring="roc_auc", cv=cv, return_train_score=True, n_jobs=-1
)

mean_train_auc = scores["train_score"].mean()
mean_test_auc = scores["test_score"].mean()
std_test_auc = scores["test_score"].std()

# GINI = 2 * AUC - 1, so the spread of GINI is twice the spread of AUC. The value
# printed in parentheses next to the GINI mean must be on the GINI scale as well.
std_test_gini = 2 * std_test_auc

print("GINI train:", np.round(mean_train_auc * 2 - 1, 3))
print("GINI dev:", np.round(mean_test_auc * 2 - 1, 3), f"({np.round(std_test_gini, 3)})")

### Model visualization

In [None]:
# Refit the pipeline on the whole sample to inspect the regression coefficients.
# The encoder gets an explicit column list: with cols=None it would only encode
# string columns and pass the integer-coded features through unchanged.
pipe = Pipeline(
    steps=[
        ("scaler", WOEEncoder(cols=list(X.columns))),
        ("lr", LogisticRegression()),
    ]
)
pipe.fit(X, y)

# One row per feature, listed by decreasing absolute coefficient.
coefs = pd.DataFrame(zip(X.columns, pipe["lr"].coef_[0]), columns=["Variable", "Coef"])
print(coefs.reindex(coefs.Coef.abs().sort_values(ascending=False).index))

In [None]:
# Hold-out evaluation: 67/33 split, WOE encoder fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

encoder = WOEEncoder(cols=X.columns)
encoder.fit(X_train, y_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

# The features are already WOE-encoded at this point, so only the logistic-regression
# step is fitted and used for prediction. Sending the encoded frames through the whole
# pipeline would run them through the pipeline's own encoder step a second time.
# Caveat: after this cell `pipe` as a whole is no longer a consistent model for raw
# (un-encoded) input; only pipe["lr"] together with `encoder` is.
pipe["lr"].fit(X_train, np.ravel(y_train))
preds = pipe["lr"].predict_proba(X_test)[:, 1]  # score of the class in column 1
test_auc = metrics.roc_auc_score(y_test, preds)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: density of the cross-validated test GINI (2 * AUC - 1).
sns.kdeplot(scores["test_score"] * 2 - 1, ax=axs[0])
axs[0].set_xlabel("GINI")
axs[0].set_ylabel("Density")

# Right panel: ROC curve on the hold-out set, with the diagonal as reference.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
lw = 2
axs[1].plot(fpr, tpr, lw=lw, label="ROC curve (GINI = %0.3f)" % (test_auc * 2 - 1))
axs[1].plot([0, 1], [0, 1], color="red", lw=lw, linestyle="--")
axs[1].set_xlim([0.0, 1.0])
axs[1].set_ylim([0.0, 1.05])
axs[1].set_xlabel("False Positive Rate")
axs[1].set_ylabel("True Positive Rate")
axs[1].legend(loc="lower right")

# 'Img\lr_cv_roc.png' relied on "\l" not being an escape sequence and only named a
# file inside Img on Windows. Build the path portably and make sure the folder exists.
os.makedirs("Img", exist_ok=True)
plt.savefig(os.path.join("Img", "lr_cv_roc.png"), facecolor="w")

In [None]:
# Explain the fitted logistic regression, using the encoded training frame as
# background data, and summarise the SHAP values computed on the encoded test frame.
explainer = shap.Explainer(pipe["lr"], X_train, feature_names=X.columns)
shap_values = explainer(X_test)

# show=False keeps the figure open so it can be saved below; the return value of
# summary_plot is not needed.
shap.summary_plot(shap_values, plot_size=(12, 6), show=False)

# Portable replacement for the backslash path 'Img\lr_shap.png' (invalid "\l" escape,
# Windows-only separator); the output folder is created when missing.
os.makedirs("Img", exist_ok=True)
plt.savefig(os.path.join("Img", "lr_shap.png"), facecolor="w")

## 1/3 dataset: 10k instances

### Load data

In [None]:
# Draw a reproducible random subset of 10,000 rows from the binned dataset.
N_SAMPLES = 10000
# os.path.join replaces the hard-coded backslash path so the cell also runs outside Windows.
data_path = os.path.join("Data", "data_preprocessed", "taiwan_data_binned.csv")
df = pd.read_csv(data_path, sep=",", na_values="NULL").sample(N_SAMPLES, random_state=42)
df

In [None]:
# Split the sampled frame into the GOOD target and the remaining predictor columns.
y = df["GOOD"]
X = df.drop(columns=["GOOD"])

In [None]:
# Weight-of-evidence encoder fitted on the sampled predictors (fit returns the
# encoder itself); the head of the encoded frame is the cell output.
encoder = WOEEncoder(cols=X.columns).fit(X, y)

encoder.transform(X).head()

### Model definition

In [None]:
# WOE encoding followed by logistic regression.
# WOEEncoder with cols=None encodes only string columns, which would leave the
# integer-coded features of this frame unencoded; pass the column list explicitly
# so the encoding step is actually applied (and re-fitted inside each CV fold).
pipe = Pipeline(
    steps=[
        ("scaler", WOEEncoder(cols=list(X.columns))),  # step name kept as in the original
        ("lr", LogisticRegression()),
    ]
)

### Model performance

In [None]:
# Repeated stratified 10-fold cross-validation (10 repeats), scored by ROC AUC.
# NOTE: the name `cv` rebinds the `cv` function imported from catboost at the top of the file.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
scores = cross_validate(
    pipe, X, y, scoring="roc_auc", cv=cv, return_train_score=True, n_jobs=-1
)

mean_train_auc = scores["train_score"].mean()
mean_test_auc = scores["test_score"].mean()
std_test_auc = scores["test_score"].std()

# GINI = 2 * AUC - 1, hence std(GINI) = 2 * std(AUC); report the spread on the same
# scale as the GINI mean it is printed next to.
std_test_gini = 2 * std_test_auc

print("GINI train:", np.round(mean_train_auc * 2 - 1, 3))
print("GINI dev:", np.round(mean_test_auc * 2 - 1, 3), f"({np.round(std_test_gini, 3)})")

## 1/15 dataset: 2k instances

### Load data

In [None]:
# Draw a reproducible random subset of 2,000 rows from the binned dataset.
N_SAMPLES = 2000
# os.path.join replaces the hard-coded backslash path so the cell also runs outside Windows.
data_path = os.path.join("Data", "data_preprocessed", "taiwan_data_binned.csv")
df = pd.read_csv(data_path, sep=",", na_values="NULL").sample(N_SAMPLES, random_state=42)
df

In [None]:
# GOOD is the target; the rest of the sampled frame forms the predictor matrix.
y = df["GOOD"]
X = df.drop(columns=["GOOD"])

In [None]:
# Fit the weight-of-evidence encoder on the small sample (fit returns the encoder
# itself) and display the first encoded rows.
encoder = WOEEncoder(cols=X.columns).fit(X, y)

encoder.transform(X).head()

### Model definition

In [None]:
# WOE encoding followed by logistic regression.
# The encoder needs an explicit column list: its default (cols=None) encodes only
# string columns, so the integer-coded features would reach the regression unencoded.
pipe = Pipeline(
    steps=[
        ("scaler", WOEEncoder(cols=list(X.columns))),  # step name kept as in the original
        ("lr", LogisticRegression()),
    ]
)

### Model performance

In [None]:
# Repeated stratified 10-fold cross-validation (10 repeats), scored by ROC AUC.
# NOTE: the name `cv` rebinds the `cv` function imported from catboost at the top of the file.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
scores = cross_validate(
    pipe, X, y, scoring="roc_auc", cv=cv, return_train_score=True, n_jobs=-1
)

mean_train_auc = scores["train_score"].mean()
mean_test_auc = scores["test_score"].mean()
std_test_auc = scores["test_score"].std()

# The bracketed spread must be on the GINI scale: GINI = 2 * AUC - 1 implies
# std(GINI) = 2 * std(AUC).
std_test_gini = 2 * std_test_auc

print("GINI train:", np.round(mean_train_auc * 2 - 1, 3))
print("GINI dev:", np.round(mean_test_auc * 2 - 1, 3), f"({np.round(std_test_gini, 3)})")