In [1]:
import json
import os
import warnings
from datetime import datetime
from shutil import copyfile

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import pandas_profiling as pp
import xgboost as xgb
from catboost import CatBoostClassifier, Pool, cv
from category_encoders import TargetEncoder, WOEEncoder
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_roc_curve, roc_auc_score
from sklearn.model_selection import (
    RandomizedSearchCV,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from tqdm import tqdm

np.set_printoptions(formatter={"float": lambda x: "{0:0.4f}".format(x)})  # `easy numbers` mode
# pd.set_option("display.max_columns", None)                                # `show whole df` mode
warnings.filterwarnings("ignore")                                         # `do not disturbe` mode

In [2]:
df = pd.read_csv("Data\\data_preprocessed\\taiwan_data_binned.csv", sep=",", na_values="NULL")

X = df.drop(["GOOD"], axis=1)
y = df["GOOD"]

df

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,GOOD
0,1,1,2,1,1,2,2,-1,-1,-2,...,1,1,1,1,2,1,1,1,1,0
1,5,1,2,2,2,-1,2,0,0,0,...,3,3,4,1,2,2,2,1,3,0
2,4,1,2,2,5,0,0,0,0,0,...,5,5,5,2,2,2,2,2,4,1
3,2,1,2,1,6,0,0,0,0,0,...,6,7,7,3,3,2,2,2,2,1
4,2,0,2,1,10,-1,0,-1,0,0,...,6,6,6,3,5,5,4,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,8,0,3,1,7,0,0,0,0,0,...,9,7,5,5,5,4,3,4,2,1
29996,6,0,3,2,8,-1,-1,-1,-1,0,...,4,4,1,3,4,5,1,1,1,1
29997,1,0,2,2,6,4,3,2,-1,0,...,6,6,6,1,1,5,4,3,3,0
29998,4,0,3,1,8,1,-1,0,0,0,...,8,5,8,5,4,2,3,4,3,0


In [None]:
encoder = WOEEncoder(cols=X.columns)
encoder.fit(X, y)

encoder.transform(X).head()

### Feature selection

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
sfs = SequentialFeatureSelector(
    LogisticRegression(),
    k_features=15,
    forward=True,
    floating=True,
    scoring="roc_auc",
    verbose=2,
    cv=cv,
    n_jobs=-1,
)

sfs = sfs.fit(encoder.transform(X), y)

In [None]:
# print chart
fig = plot_sfs(sfs.get_metric_dict(), kind="std_err")
plt.title("Sequential Forward Selection (w. StdErr)")
plt.grid()
plt.show()

In [None]:
results = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
results["avg_score"] = results["avg_score"] * 2 - 1
results

In [None]:
final_variables = list(results["feature_names"][11])
final_variables

### Final model

In [None]:
X = X[final_variables]
X = X[final_variables]

pipe = Pipeline([("scaler", WOEEncoder()), ("lr", LogisticRegression())])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
scores = cross_validate(
    pipe, X, y, scoring="roc_auc", cv=cv, return_train_score=True, n_jobs=-1
)

In [None]:
mean_train_auc = scores["train_score"].mean()
mean_test_auc = scores["test_score"].mean()
std_test_auc = scores["test_score"].std()
print("GINI train:", np.round(mean_train_auc * 2 - 1, 3))
print("GINI dev:", np.round(mean_test_auc * 2 - 1, 3), f"({np.round(std_test_auc, 3)})")