# Import Libraries

In [1]:
from math import log, ceil
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, auc, classification_report, roc_auc_score

from xverse.transformer import WOE
from xverse.ensemble import VotingSelector
from xgboost import XGBClassifier
import xgboost

warnings.filterwarnings('ignore')

# Load Data and Set Target

In [17]:
# Read the credit-risk CSV and drop exact duplicate rows in one expression.
data = pd.read_csv('credit_risk_dataset.csv').drop_duplicates()

# Handling Missing Value and Outlier with WOE

Di sini WOE (Weight of Evidence) digunakan untuk menangani missing value dan outlier pada dataset. Hal ini dimungkinkan karena WOE memberikan bobot (weight) tersendiri pada data point yang memiliki missing value maupun data point yang tergolong outlier. Selain itu, penggunaan logistic regression pada credit card dapat memberikan gambaran kepada divisi bisnis mengenai pengaruh setiap fitur nasabah, baik fitur demografi maupun behavior.

In [18]:
# Hand-specified Weight-of-Evidence lookup, one entry per numeric feature:
#   'binning' - bin edges passed to pd.cut (default right-closed intervals),
#   'labels'  - WOE value assigned to each bin, in bin order,
#   'nan'     - WOE value assigned to rows where the feature is missing.
woe = {
    'person_age' : {'binning':[-float("inf"), 21, float("inf")], 'labels':[-0.433, -0.019],'nan':0},
    'loan_int_rate' : {'binning':[-float("inf"), 14, 18, float("inf")], 'labels':[-0.465, 1.303, 2.066],'nan':-0.068},
    'loan_percent_income' : {'binning':[-float("inf"), 0.3, 0.5, 0.75, float("inf")], 'labels':[-0.431, 2.111, 2.572, 2.662],'nan':0},
    'person_income' : {'binning':[-float("inf"), 20000, 60000, 100000, float("inf")], 'labels':[2.397, 0.222, -0.595, -0.973],'nan':0},
    'loan_amnt' : {'binning':[-float("inf"), 13000, 18000, 23000, float("inf")], 'labels':[-0.154, 0.261, 0.572, 0.704],'nan':0},
    'person_emp_length' : {'binning':[-float("inf"), 2, 4, float("inf")], 'labels':[0.285, -0.071, -0.253],'nan':0.499},
    'cb_person_cred_hist_length' : {'binning':[-float("inf"), 3, 4, 8, 15.5, float("inf")], 'labels':[0.066, 0.031, -0.071, -0.076, -0.010],'nan':0}
}


def apply_woe(frame, woe_map):
    """Replace every feature listed in ``woe_map`` by a ``<feature>_WOE`` column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing one column per key of ``woe_map``.
    woe_map : dict
        Mapping ``feature -> {'binning': edges, 'labels': values, 'nan': value}``
        in the layout of the ``woe`` dictionary above.

    Returns
    -------
    pandas.DataFrame
        A copy of ``frame`` in which each listed feature has been dropped and a
        float ``<feature>_WOE`` column appended (in ``woe_map`` order).

    Raises
    ------
    KeyError
        If a feature named in ``woe_map`` is not a column of ``frame``.
    """
    encoded = frame.copy()
    for feature, woe_info in woe_map.items():
        binned = pd.cut(encoded[feature], bins=woe_info['binning'], labels=woe_info['labels'])
        # The bin labels are already the numeric WOE values, so casting the
        # categorical to float yields them directly; rows pd.cut could not bin
        # (missing inputs) come out as NaN and receive the dedicated 'nan' weight.
        encoded[f'{feature}_WOE'] = binned.astype(float).fillna(woe_info['nan'])
        encoded = encoded.drop(columns=feature)
    return encoded


data = apply_woe(data, woe)

In [19]:
# Separate the 'loan_status' label from the remaining feature columns.
y, x = (
    data.loc[:, 'loan_status'].copy(),
    data.drop(columns=['loan_status']).copy(),
)

In [5]:
import pickle

# Serialize the WOE lookup dictionary to disk.
woe_result = 'woe-1.0.0.pkl'
with open(woe_result, mode='wb') as woe_file:
    pickle.dump(woe, woe_file)

# Split Dataset into Train and Test

In [20]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report each split's size and its share of the full target vector.
n_total = len(y)
n_train, n_test = len(y_train), len(y_test)
print("Number of Train :", n_train, f"{round(n_train / n_total * 100, 2)}%")
print("Number of Test :", n_test, f"{round(n_test / n_total * 100, 2)}%")

Number of Train : 25932 80.0%
Number of Test : 6484 20.0%


# Split by Data Types

In [21]:
# Partition the training feature names by dtype: numeric vs. everything else.
num_columns = list(X_train.select_dtypes(include='number').columns)
cat_columns = list(X_train.select_dtypes(exclude='number').columns)
num_columns

['person_age_WOE',
 'loan_int_rate_WOE',
 'loan_percent_income_WOE',
 'person_income_WOE',
 'loan_amnt_WOE',
 'person_emp_length_WOE',
 'cb_person_cred_hist_length_WOE']

In [22]:
# Display the non-numeric feature names selected from X_train.
cat_columns

['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file']

# Pipeline

In [23]:
# Non-numeric features: one-hot encode, dropping the first level of each.
cat_pipe = Pipeline([('ohe', OneHotEncoder(drop="first"))])

# Numeric (WOE) features: standardize.
num_pipe = Pipeline([('scaler', StandardScaler())])

# Route each column group through its own sub-pipeline; any column not listed
# in either group is passed through unchanged.
preprocess = ColumnTransformer(
    [
        ('cat', cat_pipe, cat_columns),
        ('num', num_pipe, num_columns),
    ],
    remainder='passthrough',
)

# Preprocessing followed by the logistic-regression classifier.
full_pipe = Pipeline([
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=1000, random_state=42)),
])

In [24]:
# Grid over the 'model' step: only C varies; max_iter and random_state are
# pinned to a single value each.
param_space = dict(
    model__C=[0.0001, 0.001, 0.01, 0.1, 2, 1, 3, 4, 5, 10, 100, 1000],
    model__max_iter=[1000],
    model__random_state=[42],
)

# 5-fold search scored by ROC AUC; refit=True retrains the best candidate.
clf = GridSearchCV(
    estimator=full_pipe,
    param_grid=param_space,
    scoring='roc_auc',
    cv=5,
    refit=True,
)

In [25]:
# Run the cross-validated grid search on the training split only.
clf.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(drop='first'))]),
                                                                         ['person_home_ownership',
                                                                          'loan_intent',
                                                                          'loan_grade',
                                                                          'cb_person_default_on_file']),
                                                                        ('num',
                                                                         Pipeline(st

In [26]:
# Show the pipeline that the grid search refit with its best-scoring parameters.
clf.best_estimator_

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['person_home_ownership',
                                                   'loan_intent', 'loan_grade',
                                                   'cb_person_default_on_file']),
                                                 ('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['person_age_WOE',
                                                   'loan_int_rate_WOE',
                                                   'loan_percent_income_WOE',


In [27]:
# Keep the refit best pipeline under a short name for scoring and saving.
model = clf.best_estimator_

In [28]:
# Hard class labels for both splits.
train_pred, test_pred = (model.predict(split) for split in (X_train, X_test))

# Probability of the second class (column index 1) for both splits.
yhat_train, yhat_test = (
    model.predict_proba(split)[:, 1] for split in (X_train, X_test)
)

# Evaluate Model

In [47]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def evaluate(true, predicted, scores=None):
    """Compute accuracy and ROC AUC for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Predicted class labels; always used for the accuracy.
    scores : array-like, optional
        Continuous scores (e.g. positive-class probabilities). When given, the
        ROC AUC is computed from these; when omitted, the AUC is computed from
        ``predicted``, which is the original behaviour. An AUC computed from
        hard labels only sees a single operating point, so pass ``scores``
        whenever they are available.

    Returns
    -------
    tuple
        ``(accuracy, auc)``.
    """
    auc_input = predicted if scores is None else scores
    accuracy = metrics.accuracy_score(true, predicted)
    auc_value = metrics.roc_auc_score(true, auc_input)
    return accuracy, auc_value


def print_evaluate(true, predicted, scores=None):
    """Print the accuracy and ROC AUC returned by :func:`evaluate`.

    Takes the same arguments as :func:`evaluate` and returns ``None``.
    """
    accuracy, auc_value = evaluate(true, predicted, scores)
    print('Accuracy:', accuracy)
    print('AUC:', auc_value)
    print('__________________________________')

In [61]:
# NOTE: test_pred / train_pred are the hard labels from model.predict, so the
# 'AUC' printed here is computed from class labels, not from probabilities.
for heading, truth, labels in (
    ('Test set evaluation:\n_____________________________________', y_test, test_pred),
    ('Train set evaluation:\n_____________________________________', y_train, train_pred),
):
    print(heading)
    print_evaluate(truth, labels)

Test set evaluation:
_____________________________________
Accuracy: 0.8871067242442936
AUC: 0.8040984533077342
__________________________________
Train set evaluation:
_____________________________________
Accuracy: 0.8848912540490513
AUC: 0.8015786204804239
__________________________________


In [62]:
# ROC AUC computed from the predicted probabilities of each split.
auc_train_score = roc_auc_score(y_train, yhat_train)
print("ROC-AUC Score Train:", auc_train_score)
auc_test_score = roc_auc_score(y_test, yhat_test)
print("ROC-AUC Score Test:", auc_test_score)

ROC-AUC Score Train: 0.8904320135196364
ROC-AUC Score Test: 0.8883885740663302


In [65]:
# Per-class precision / recall / F1 for the hard-label test predictions.
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93      5066
           1       0.79      0.66      0.72      1418

    accuracy                           0.89      6484
   macro avg       0.85      0.80      0.82      6484
weighted avg       0.88      0.89      0.88      6484



# Plot ROC AUC

In [66]:
# https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python
def get_optimal_cutoff(tpr, fpr, thresholds):
    """Find the ROC threshold where ``tpr`` and ``1 - fpr`` are closest.

    The selected row is printed, ``tpr`` and ``1 - fpr`` are plotted against
    the ROC point index, and the selected threshold is returned so callers do
    not have to copy it from the printed output.

    Parameters
    ----------
    tpr, fpr, thresholds : array-like
        Equal-length sequences of true positive rates, false positive rates
        and the thresholds they correspond to.

    Returns
    -------
    float
        The threshold at the index minimising ``|tpr - (1 - fpr)|`` (the first
        such index on ties).

    Raises
    ------
    ValueError
        If the inputs are empty.
    """
    tpr = np.asarray(tpr, dtype=float)
    fpr = np.asarray(fpr, dtype=float)
    thresholds = np.asarray(thresholds, dtype=float)
    if tpr.size == 0:
        raise ValueError('tpr, fpr and thresholds must not be empty')

    roc = pd.DataFrame({
        'fpr': fpr,
        'tpr': tpr,
        '1-fpr': 1 - fpr,
        'tf': tpr - (1 - fpr),
        'thresholds': thresholds,
    })

    # Row where sensitivity and specificity cross (|tf| is smallest).
    best = int(roc['tf'].abs().to_numpy().argmin())
    print(roc.iloc[[best]])

    # Plot tpr vs 1-fpr; the x axis is the ROC point index, so tick labels are hidden.
    fig, ax = plt.subplots()
    ax.plot(roc['tpr'])
    ax.plot(roc['1-fpr'], color='red')
    ax.set_xlabel('1-False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic')
    ax.set_xticklabels([])

    return float(roc['thresholds'].iloc[best])
    
    
def _new_roc_axes():
    """Create the figure/axes shared by every ROC plot (diagonal, limits, labels)."""
    fig, ax = plt.subplots(figsize=(8, 6.5))
    sns.despine(ax=ax)
    ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    return fig, ax


def plot_roc_auc(tpr, fpr, labels):
    """Plot one or several ROC curves with their area in the legend.

    Parameters
    ----------
    tpr, fpr : array-like, or list/tuple of array-likes
        A single curve (1-D arrays of true / false positive rates), or a
        list/tuple holding one array per curve.
    labels : str, or list/tuple of str
        Legend name of the single curve, or one name per curve.

    Raises
    ------
    ValueError
        In the multi-curve form, if ``tpr``, ``fpr`` and ``labels`` do not all
        have the same length or ``labels`` is a plain string.
    """
    # Multi-curve input is a list/tuple whose elements are themselves curves.
    # A bare ndarray (or a flat list of floats) is a single curve.
    several = (
        isinstance(tpr, (list, tuple))
        and isinstance(fpr, (list, tuple))
        and len(tpr) > 0
        and np.ndim(tpr[0]) > 0
    )

    if several:
        if isinstance(labels, str) or not (len(tpr) == len(fpr) == len(labels)):
            raise ValueError('length of parameter should be the same')
        curves = list(zip(fpr, tpr, labels))
    else:
        curves = [(fpr, tpr, labels)]

    fig, ax = _new_roc_axes()
    for curve_fpr, curve_tpr, curve_label in curves:
        roc_auc = auc(curve_fpr, curve_tpr)
        ax.plot(
            curve_fpr,
            curve_tpr,
            label=f'ROC curve {curve_label} (area = {roc_auc:0.2f})',
        )

    # One legend per figure, whichever input form was used.
    fig.legend(loc="right")

In [68]:
from sklearn.metrics import roc_curve, auc, roc_auc_score, classification_report

# ROC operating points for each split, from the predicted probabilities.
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, yhat_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, yhat_test)

# Area under each curve.
roc_auc_train = auc(fpr_train, tpr_train)
roc_auc_test = auc(fpr_test, tpr_test)

# The cutoff search is run on the test-split curve.
tpr, fpr, thresholds = tpr_test, fpr_test, thresholds_test
get_optimal_cutoff(tpr, fpr, thresholds)

         fpr       tpr    1-fpr        tf  thresholds
805  0.18259  0.817348  0.81741 -0.000062    0.171982


In [71]:
# Derive the cutoff from the test ROC arrays instead of typing in a rounded
# copy of a previously printed value: take the threshold at which tpr and
# 1 - fpr are closest, the same criterion get_optimal_cutoff prints.
threshold = thresholds_test[np.argmin(np.abs(tpr_test - (1 - fpr_test)))]

# roc_curve's point i describes predictions with score >= thresholds[i], so
# '>=' reproduces exactly the operating point that was selected.
y_pred_train = np.where(yhat_train >= threshold, 1, 0)
y_pred_test = np.where(yhat_test >= threshold, 1, 0)

print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.94      0.82      0.88      5066
           1       0.56      0.82      0.66      1418

    accuracy                           0.82      6484
   macro avg       0.75      0.82      0.77      6484
weighted avg       0.86      0.82      0.83      6484



# Save Model

In [50]:
import pickle

# Serialize the fitted pipeline to disk.
model_name = 'model-LR-1.0.0.pkl'
with open(model_name, mode='wb') as model_file:
    pickle.dump(model, model_file)

In [55]:
# Reload the pipeline from the pickle written above. pickle.load executes
# arbitrary code, so only use it on files this notebook produced.
with open(model_name, mode='rb') as model_file:
    model = pickle.load(model_file)

In [56]:
# Re-score both splits with the reloaded pipeline (probability column 1).
yhat_train, yhat_test = (
    model.predict_proba(split)[:, 1] for split in (X_train, X_test)
)

In [57]:
# ROC AUC of the reloaded pipeline on each split.
auc_train_reloaded = roc_auc_score(y_train, yhat_train)
print("ROC-AUC Score Train:", auc_train_reloaded)
auc_test_reloaded = roc_auc_score(y_test, yhat_test)
print("ROC-AUC Score Test:", auc_test_reloaded)

ROC-AUC Score Train: 0.8904320135196364
ROC-AUC Score Test: 0.8883885740663302


In [81]:
# Display the feature frame.
# NOTE(review): the saved output below lists the raw feature columns rather
# than the *_WOE columns built earlier -- presumably from an out-of-order run
# (execution count 81); confirm with Restart & Run All.
x

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0.10,N,26
