IV (Information Value)

In [None]:
from optbinning import OptimalBinning
import warnings; warnings.filterwarnings('ignore')

# Compute the total Information Value (IV) of every feature with OptimalBinning
# and collect the results in `iv_df` (columns: 'val' = feature name, 'IV').
iv_records = []

# The target is identical for every feature, so read it once outside the loop.
y = df.credit

for variable in numerical_list:  # use 'categorical_list' for categorical variables
    x = df[variable].values

    # numerical
    optb = OptimalBinning(name=variable, dtype='numerical', solver='cp',
                          max_n_prebins=3)  # more bins are harder to interpret, so 3 is a reasonable cap
    # categorical
    # optb = OptimalBinning(name=variable, dtype='categorical', solver='cp')
    optb.fit(x, y)  # in most cases the binner is fitted first and then used

    binning_table = optb.binning_table
    summary_table = binning_table.build()

    # The IV reported for the feature is read from the 'Totals' row of the built table.
    iv_records.append({'val': variable, 'IV': summary_table.loc['Totals', 'IV']})

# NOTE: after the loop `optb` is the binner fitted on the LAST variable only.

# Build the frame once from plain records. Explicit columns keep it well-formed
# (and sortable) when numerical_list is empty, where pd.concat([]) would raise.
iv_df = pd.DataFrame(iv_records, columns=['val', 'IV'])
iv_df.sort_values(by=['IV'], ascending=False)

In [None]:
# Label every row with the DAYS_BIRTH bin it falls into.
#
# A dedicated binner is fitted on DAYS_BIRTH here. The `optb` object left over
# from the IV loop is fitted on whichever variable the loop visited last, so
# transforming DAYS_BIRTH with it would apply another feature's split points
# unless DAYS_BIRTH happened to be that last variable.
optb_days_birth = OptimalBinning(name='DAYS_BIRTH', dtype='numerical', solver='cp',
                                 max_n_prebins=3)  # same settings as the IV loop; assumes DAYS_BIRTH is numerical — TODO confirm
optb_days_birth.fit(df['DAYS_BIRTH'].values, df['credit'])

# metric='bins' asks transform() for the bin label of each value.
x_transform_bins = optb_days_birth.transform(df['DAYS_BIRTH'].values, metric='bins')

# Target next to its bin label; copy so the new column never touches `df`.
df_bin = df[['credit']].copy()
df_bin['bin'] = x_transform_bins
df_bin.head()

3 models

In [None]:
# Logistic Regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# X & Y: features are every column except 'index', the target 'credit' and 'FLAG_MOBIL'.
X = df.drop(['index', 'credit', 'FLAG_MOBIL'], axis=1)
Y = df['credit']

# Stratified 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.3, stratify=Y, random_state=1234)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

# One-hot encoding, one categorical column at a time.
# Each encoder is fitted on the training split only and then applied to both splits.
for col in categorical_list:
    # handle_unknown='ignore': a category that appears only in the test split is
    # encoded as all zeros instead of making transform() raise.
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(x_train[[col]])
    onehot_cols = encoder.get_feature_names_out()
    onehot_train = pd.DataFrame(encoder.transform(x_train[[col]]).toarray(), columns=onehot_cols, index=x_train.index)
    onehot_test = pd.DataFrame(encoder.transform(x_test[[col]]).toarray(), columns=onehot_cols, index=x_test.index)
    # Replace the original column with its one-hot columns (appended at the end).
    x_train = pd.concat([x_train.drop(columns=[col]), onehot_train], axis=1)
    x_test = pd.concat([x_test.drop(columns=[col]), onehot_test], axis=1)

# Scaling: statistics come from the training split only.
scaler = StandardScaler()
x_train_sc = scaler.fit_transform(x_train)
x_test_sc = scaler.transform(x_test)

# model
LR = LogisticRegression()
LR.fit(x_train_sc, y_train)

y_pred_train = LR.predict(x_train_sc)
y_pred_test = LR.predict(x_test_sc)

# Report both splits, as the other model cells do, so train and test can be compared.
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

# Overfitting check: the smaller the gap between train and test performance,
# the less the model is overfitting.
y_pred_train_proba = LR.predict_proba(x_train_sc)[:, 1]
y_pred_test_proba = LR.predict_proba(x_test_sc)[:, 1]

roc_score_train = roc_auc_score(y_train, y_pred_train_proba)
roc_score_test = roc_auc_score(y_test, y_pred_test_proba)

print('roc_score_train: ', roc_score_train)
print('roc_score_test: ', roc_score_test)

# One summary row for logistic regression: F1 on the predicted labels,
# AUC on the class-1 probabilities (already computed above as roc_score_*).
lr_re = pd.DataFrame({'model' : ['LR'],
                      'f1_train' : metrics.f1_score(y_train, y_pred_train),
                      'f1_test' : metrics.f1_score(y_test, y_pred_test),
                      'AUC_train' : roc_score_train,
                      'AUC_test' : roc_score_test})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps the original row labels.
df_comparison = pd.concat([df_comparison, lr_re])
df_comparison

In [None]:
# random forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

# Features are every column except 'index', the target 'credit' and 'FLAG_MOBIL'.
X = df.drop(['index', 'credit', 'FLAG_MOBIL'], axis=1)
Y = df['credit']

# Stratified 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.3, stratify=Y, random_state=1234)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

# Baseline model with default hyper-parameters.
# Seeded (same seed as the tuned forest further down) so the metrics printed
# below are reproducible from one run of the notebook to the next.
rfc = RandomForestClassifier(random_state=1121)
rfc.fit(x_train, y_train)

y_pred_train = rfc.predict(x_train)
y_pred_test = rfc.predict(x_test)

print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

# Overfitting check: the smaller the gap between train and test performance,
# the less the model is overfitting.
y_pred_train_proba = rfc.predict_proba(x_train)[:, 1]
y_pred_test_proba = rfc.predict_proba(x_test)[:, 1]

roc_score_train = roc_auc_score(y_train, y_pred_train_proba)
roc_score_test = roc_auc_score(y_test, y_pred_test_proba)

print('roc_score_train: ', roc_score_train)
print('roc_score_test: ', roc_score_test)

# Requires the bayesian-optimization package: %pip install -q bayesian-optimization

def model_evaluate(n_estimators, maxDepth):
    """Objective for the hyper-parameter search: mean 5-fold CV ROC AUC of a random forest.

    Parameters
    ----------
    n_estimators : number
        Candidate number of trees; converted with ``int()`` (truncation).
    maxDepth : number
        Candidate maximum tree depth; converted with ``int()`` (truncation).

    Returns
    -------
    The mean of the five ``'roc_auc'`` cross-validation scores.

    Notes
    -----
    The data is not passed in: ``x_train`` and ``y_train`` are read from the
    notebook's global namespace.
    """
    clf = RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(maxDepth),
        # Fixed seed (the one used for the final tuned forest) so the same
        # candidate always yields the same score; an unseeded forest makes
        # the objective noisy for the optimiser.
        random_state=1121,
    )
    scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc')
    return np.mean(scores)

def bayesOpt(x_train, y_train, init_points=5, n_iter=10, random_state=None):
    """Run Bayesian optimisation of ``model_evaluate`` and return the best result.

    Parameters
    ----------
    x_train, y_train
        Accepted for call compatibility but not used by this function:
        ``model_evaluate`` reads ``x_train`` / ``y_train`` from the notebook's
        global namespace on its own.
    init_points : int, default 5
        Number of initial exploration points passed to ``maximize``.
    n_iter : int, default 10
        Number of optimisation steps passed to ``maximize``.
    random_state : optional, default None
        Forwarded to ``BayesianOptimization``; pass an int to make the search
        repeatable. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    ``clfBO.max``, the best observation found, so callers can use the best
    parameters directly instead of copying them by hand from the printed log.
    """
    clfBO = BayesianOptimization(model_evaluate,
                                 {'n_estimators' : (100,200),
                                  'maxDepth' : (2,4)},
                                 random_state=random_state)
    clfBO.maximize(init_points=init_points, n_iter=n_iter)
    print(clfBO.res)
    return clfBO.max

bayesOpt(x_train, y_train)

# Refit with fixed hyper-parameters.
# NOTE(review): 147 / 4 are hard-coded, presumably copied from an earlier
# search run — confirm they still match what the search above reports.
rfc = RandomForestClassifier(n_estimators=147,
                             max_depth=4,
                             random_state=1121)
rfc.fit(x_train, y_train)

y_pred_train = rfc.predict(x_train)
y_pred_test = rfc.predict(x_test)

y_pred_train_proba = rfc.predict_proba(x_train)[:, 1]
y_pred_test_proba = rfc.predict_proba(x_test)[:, 1]

# One summary row for the tuned forest: F1 on labels, AUC on class-1 probabilities.
rfc_re = pd.DataFrame({'model' : ['RFC(BO)'],
                      'f1_train' : metrics.f1_score(y_train, y_pred_train),
                      'f1_test' : metrics.f1_score(y_test, y_pred_test),
                      'AUC_train' : roc_auc_score(y_train, y_pred_train_proba),
                      'AUC_test' : roc_auc_score(y_test, y_pred_test_proba)})

# DataFrame.append was removed in pandas 2.0; pd.concat replaces it, and
# ignore_index=True renumbers the rows 0..n-1 (what reset_index(drop=True) did).
df_comparison = pd.concat([df_comparison, rfc_re], ignore_index=True)

In [None]:
# LGBM
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

# Target and feature matrix: every column except 'index', 'credit' and 'FLAG_MOBIL'.
Y = df['credit']
X = df.drop(columns=['index', 'credit', 'FLAG_MOBIL'])

# Stratified 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=.3, stratify=Y, random_state=1234
)

# Shapes of the four pieces, train first.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

# Baseline LightGBM with default hyper-parameters.
LGBM = LGBMClassifier()
LGBM.fit(x_train, y_train)

y_pred_train = LGBM.predict(x_train)
y_pred_test = LGBM.predict(x_test)

# Per-class report for train, then test.
for y_true_part, y_pred_part in ((y_train, y_pred_train), (y_test, y_pred_test)):
    print(classification_report(y_true_part, y_pred_part))

# Overfitting check: the smaller the gap between train and test performance,
# the less the model is overfitting.
y_pred_train_proba = LGBM.predict_proba(x_train)[:, 1]
roc_score_train = roc_auc_score(y_train, y_pred_train_proba)

y_pred_test_proba = LGBM.predict_proba(x_test)[:, 1]
roc_score_test = roc_auc_score(y_test, y_pred_test_proba)

print('roc_score_train: ', roc_score_train)
print('roc_score_test: ', roc_score_test)

# Requires the bayesian-optimization package: %pip install -q bayesian-optimization

def model_evaluate(n_estimators, maxDepth):
    """Objective for the hyper-parameter search: mean 5-fold CV ROC AUC of a LightGBM model.

    ``n_estimators`` and ``maxDepth`` are converted with ``int()`` before being
    handed to the classifier; the remaining settings are fixed. The training
    data (``x_train``, ``y_train``) is read from the notebook's global namespace.
    Returns the mean of the five ``'roc_auc'`` cross-validation scores.
    """
    lgbm_params = dict(
        objective='binary',  # binary classification problem
        metric='auc',
        learning_rate=.01,
        n_estimators=int(n_estimators),
        max_depth=int(maxDepth),
        verbose=-1,
    )
    candidate = LGBMClassifier(**lgbm_params)
    fold_auc = cross_val_score(candidate, x_train, y_train, cv=5, scoring='roc_auc')
    return np.mean(fold_auc)

def bayesOpt(x_train, y_train, init_points=5, n_iter=10, random_state=None):
    """Run Bayesian optimisation of ``model_evaluate`` and return the best result.

    Parameters
    ----------
    x_train, y_train
        Accepted for call compatibility but not used by this function:
        ``model_evaluate`` reads ``x_train`` / ``y_train`` from the notebook's
        global namespace on its own.
    init_points : int, default 5
        Number of initial exploration points passed to ``maximize``.
    n_iter : int, default 10
        Number of optimisation steps passed to ``maximize``.
    random_state : optional, default None
        Forwarded to ``BayesianOptimization``; pass an int to make the search
        repeatable. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    ``clfBO.max``, the best observation found, so callers can use the best
    parameters directly instead of copying them by hand from the printed log.
    """
    clfBO = BayesianOptimization(model_evaluate,
                                 {'n_estimators' : (100,200),
                                  'maxDepth' : (2,4)},
                                 random_state=random_state)
    clfBO.maximize(init_points=init_points, n_iter=n_iter)
    print(clfBO.res)
    return clfBO.max

bayesOpt(x_train, y_train)

# Refit with fixed hyper-parameters.
# NOTE(review): 147 / 4 are hard-coded, presumably copied from an earlier
# search run — confirm they still match what the search above reports.
# NOTE(review): the search objective `model_evaluate` fixes objective='binary'
# and learning_rate=.01, but this refit leaves both at the LightGBM defaults,
# so the model scored here is not the configuration that was tuned — confirm
# whether that is intended.
LGBM = LGBMClassifier(n_estimators=147,
                    max_depth=4,
                    random_state=1121)
LGBM.fit(x_train, y_train)

y_pred_train = LGBM.predict(x_train)
y_pred_test = LGBM.predict(x_test)

y_pred_train_proba = LGBM.predict_proba(x_train)[:, 1]
y_pred_test_proba = LGBM.predict_proba(x_test)[:, 1]

# One summary row for the tuned LightGBM: F1 on labels, AUC on class-1 probabilities.
lgbm_re = pd.DataFrame({'model' : ['LGBM(BO)'],
                      'f1_train' : metrics.f1_score(y_train, y_pred_train),
                      'f1_test' : metrics.f1_score(y_test, y_pred_test),
                      'AUC_train' : roc_auc_score(y_train, y_pred_train_proba),
                      'AUC_test' : roc_auc_score(y_test, y_pred_test_proba)})

# DataFrame.append was removed in pandas 2.0; pd.concat replaces it, and
# ignore_index=True renumbers the rows 0..n-1 (what reset_index(drop=True) did).
df_comparison = pd.concat([df_comparison, lgbm_re], ignore_index=True)

In [None]:
# checking final result
df_comparison.style.background_gradient(cmap='coolwarm', low=1)