## Import Package

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import log, ceil 
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve, r2_score, accuracy_score, confusion_matrix, classification_report, fbeta_score     # to evaluate our model
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
import scipy
import sklearn

# Record the environment: print each core library's version, one per line.
for library in (pd, np, scipy, sklearn, sns):
    print(library.__version__)

## Import Data

### Load CSV

In [None]:
# Read the data into a data frame
data = pd.read_csv("credit_risk_dataset.csv")

FileNotFoundError: ignored

In [None]:
data[data.duplicated()]

In [None]:
data = data.drop_duplicates()

In [None]:
data.shape

## Data Exploration

In [None]:
data.info()

The `person_emp_length` and `loan_int_rate` features contain missing (NULL) values.

In [None]:
# Name of the label column; it is excluded from the predictor lists below.
target = 'loan_status'

# Numeric predictors: all numeric columns minus the label.
num_columns = data.select_dtypes(include='number').columns.tolist()
num_columns.remove(target)

# Categorical predictors: all object-typed columns.
cat_columns = data.select_dtypes(include='object').columns.tolist()

print(f"Total numerical columns : {len(num_columns)}")
print(f"Total categorical columns : {len(cat_columns)}")

In [None]:
data.isnull().sum()

### Descriptive Statistics

### Continuous Variables

In [None]:
# Allow long tables to be displayed in full.
pd.options.display.max_rows = 1000

# One row per numeric column; every statistic rendered as a 2-decimal string.
two_decimals = '{0:.2f}'.format
data.describe(include='number').T.apply(lambda column: column.map(two_decimals))

In [None]:
def plot_correlation(data):
    """Show an annotated heatmap of the pairwise correlations in ``data``.

    Only the strictly lower triangle is drawn; the diagonal and the upper
    triangle are masked out.

    Parameters
    ----------
    data : object exposing ``.corr()``
        Called in this notebook with a numeric slice of the data frame.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    corr_matrix = data.corr()
    # True on and above the diagonal -> those cells are hidden by seaborn.
    hidden_cells = np.triu(np.ones_like(corr_matrix, dtype=bool))

    plt.figure(figsize=(10, 10))

    heatmap_options = {
        "mask": hidden_cells,
        "vmin": -1,
        "vmax": 1,
        "center": 0,
        "cmap": 'coolwarm',
        "square": True,
        "linewidths": 0.3,
        "annot": True,
        "fmt": ".2f",
        "annot_kws": {"size": 8},
        "cbar_kws": {"shrink": .5},
    }
    axes = sns.heatmap(corr_matrix, **heatmap_options)

    # Slant the x tick labels so long column names do not overlap.
    tick_labels = axes.get_xticklabels()
    axes.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right')

    axes.set_title("Correlation", size=14)

    plt.show()

In [None]:
plot_correlation(data[num_columns])

### Categorical Variables

In [None]:
pd.options.display.max_rows = 1000
data.describe(include='object').T

## Handling Missing Values (NaN)

In [None]:
# Count the columns overall and those holding at least one NaN
total_columns = data.shape[1]
columns_with_nan = data.columns[data.isna().any()]

print('Total columns:', total_columns)
print('Total columns with nan:', len(columns_with_nan))

In [None]:
# NaN percentage per column, restricted to columns that contain any NaN
nan_counts = data.isnull().sum()
list_column_nan_data = nan_counts[nan_counts > 0] * 100 / data.shape[0]
list_column_nan_data

In [None]:
# Columns whose NaN share is below 75% are selected for imputation; columns at
# or above that share are left out of this list.
list_fill_na = list_column_nan_data[list_column_nan_data<75].index.tolist()
list_fill_na

In [None]:
# Numeric columns that need imputation. Membership is tested against a set,
# but the result keeps the order of `num_columns`: a plain set intersection
# would return the columns in an arbitrary order that can differ between
# interpreter runs (string hash randomisation), reshuffling the plots below.
columns_to_fill = set(list_fill_na)
list_fill_na_num = [name for name in num_columns if name in columns_to_fill]
list_fill_na_num

In [None]:
# Grid layout: up to four histograms per row.
n_cols = 4
n_rows = ceil(len(list_fill_na_num) / n_cols)

plt.figure(figsize=(20, 5))
# Subplot positions are 1-based, hence start=1.
for position, col in enumerate(list_fill_na_num, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.xlabel(col)
    plt.hist(data[col], bins=20)

plt.show()

In [None]:
# Replace missing values in each selected numeric column with that column's median.
# NOTE(review): the median is taken over the full data set, i.e. before the
# train/validation/test split performed later in the notebook.
for column in list_fill_na_num:
    column_median = data[column].median()
    data[column] = data[column].fillna(column_median)

In [None]:
data.head()

In [None]:
data.isnull().sum()

## OneHotEncoding

In [None]:
# Learn the category levels of every categorical column, then expand them into
# a dense 0/1 indicator matrix (the encoder's transform output is densified
# with .toarray()).
encoder = OneHotEncoder()
encoder.fit(data[cat_columns])
encoded_sparse = encoder.transform(data[cat_columns])
data_transformed = encoded_sparse.toarray()

In [None]:
# Names of the generated indicator columns, in the order of the encoder output.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on releases that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    column_name = encoder.get_feature_names_out(cat_columns)
else:
    column_name = encoder.get_feature_names(cat_columns)
column_name

In [None]:
# Wrap the indicator matrix in a frame that shares the row index of `data`,
# storing the indicators as integers.
data_one_hot_encoded = pd.DataFrame(
    data_transformed,
    index=data.index,
    columns=column_name,
).astype(int)

In [None]:
data.head()

In [None]:
data_one_hot_encoded

In [None]:
# Append the indicator columns next to the original columns (the raw
# categorical columns are kept), then renumber the rows from 0.
data = pd.concat([data,data_one_hot_encoded], axis=1).reset_index(drop=True)

In [None]:
import pickle

# Artefact file names for the fitted encoder and the list of encoded input columns.
ONEHOTENCODER = 'OHE-1.0.0.pkl'
COLUMN_NAME = 'COL-NAME1.0.0.pkl'

# Persist the fitted one-hot encoder.
with open(ONEHOTENCODER, 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# Persist the names of the categorical input columns (not the generated indicator names).
with open(COLUMN_NAME, 'wb') as columns_file:
    pickle.dump(cat_columns, columns_file)

## Handling Outliers

In [None]:
num_columns

In [None]:
# Check Outliers: one horizontal boxplot per numeric column, stacked vertically
cols = num_columns

plt.figure(figsize=(15, 25))
for position, col in enumerate(cols, start=1):
    # The grid is fixed at 10 rows x 1 column; slots beyond len(cols) stay empty.
    axis = plt.subplot(10, 1, position)
    sns.boxplot(x=data[col], data=data, ax=axis)
    axis.set_title(col)
    plt.subplots_adjust(hspace=1)
plt.show()

In [None]:
feature = 'person_age'
binning = [-float("inf"), 20, 44, 55, float("inf")]

# Bucket the feature; values that fall in no interval (missing values) are
# collected in an explicit 'Nan' bucket.
bin_feature = pd.cut(data[feature], bins=binning).values
bin_feature = bin_feature.add_categories('Nan').fillna('Nan')

# Share of each bucket within each target class (every class column sums to 1).
data_woe_iv = pd.crosstab(bin_feature, data[target], normalize='columns')
# Weight of evidence per bucket: ln(share within target == 1 / share within target == 0).
data_woe_iv['woe'] = np.log(data_woe_iv[1] / data_woe_iv[0])
# Information value: a single number for the feature, repeated on every row.
data_woe_iv['iv'] = np.sum(data_woe_iv['woe'] * (data_woe_iv[1] - data_woe_iv[0]))

data_woe_iv

In [None]:
feature = 'person_income'
binning = [-float("inf"), 50000, 75000, 100000, float("inf")]

# Bucket the feature; values that fall in no interval (missing values) are
# collected in an explicit 'Nan' bucket.
bin_feature = pd.cut(data[feature], bins=binning).values
bin_feature = bin_feature.add_categories('Nan').fillna('Nan')

# Share of each bucket within each target class (every class column sums to 1).
data_woe_iv = pd.crosstab(bin_feature, data[target], normalize='columns')
# Weight of evidence per bucket: ln(share within target == 1 / share within target == 0).
data_woe_iv['woe'] = np.log(data_woe_iv[1] / data_woe_iv[0])
# Information value: a single number for the feature, repeated on every row.
data_woe_iv['iv'] = np.sum(data_woe_iv['woe'] * (data_woe_iv[1] - data_woe_iv[0]))

data_woe_iv

In [None]:
feature = 'loan_amnt'
binning = [-float("inf"), 10000, 20000, float("inf")]

# Bucket the feature; values that fall in no interval (missing values) are
# collected in an explicit 'Nan' bucket.
bin_feature = pd.cut(data[feature], bins=binning).values
bin_feature = bin_feature.add_categories('Nan').fillna('Nan')

# Share of each bucket within each target class (every class column sums to 1).
data_woe_iv = pd.crosstab(bin_feature, data[target], normalize='columns')
# Weight of evidence per bucket: ln(share within target == 1 / share within target == 0).
data_woe_iv['woe'] = np.log(data_woe_iv[1] / data_woe_iv[0])
# Information value: a single number for the feature, repeated on every row.
data_woe_iv['iv'] = np.sum(data_woe_iv['woe'] * (data_woe_iv[1] - data_woe_iv[0]))

data_woe_iv

In [None]:
feature = 'loan_int_rate'
binning = [-float("inf"), 10, 15, float("inf")]

# Bucket the feature; values that fall in no interval (missing values) are
# collected in an explicit 'Nan' bucket.
bin_feature = pd.cut(data[feature], bins=binning).values
bin_feature = bin_feature.add_categories('Nan').fillna('Nan')

# Share of each bucket within each target class (every class column sums to 1).
data_woe_iv = pd.crosstab(bin_feature, data[target], normalize='columns')
# Weight of evidence per bucket: ln(share within target == 1 / share within target == 0).
data_woe_iv['woe'] = np.log(data_woe_iv[1] / data_woe_iv[0])
# Information value: a single number for the feature, repeated on every row.
data_woe_iv['iv'] = np.sum(data_woe_iv['woe'] * (data_woe_iv[1] - data_woe_iv[0]))

data_woe_iv

In [None]:
feature = 'loan_percent_income'
binning = [-float("inf"), 0.3, 0.5, 0.75, float("inf")]

# Bucket the feature; values that fall in no interval (missing values) are
# collected in an explicit 'Nan' bucket.
bin_feature = pd.cut(data[feature], bins=binning).values
bin_feature = bin_feature.add_categories('Nan').fillna('Nan')

# Share of each bucket within each target class (every class column sums to 1).
data_woe_iv = pd.crosstab(bin_feature, data[target], normalize='columns')
# Weight of evidence per bucket: ln(share within target == 1 / share within target == 0).
data_woe_iv['woe'] = np.log(data_woe_iv[1] / data_woe_iv[0])
# Information value: a single number for the feature, repeated on every row.
data_woe_iv['iv'] = np.sum(data_woe_iv['woe'] * (data_woe_iv[1] - data_woe_iv[0]))

data_woe_iv

In [None]:
feature = 'person_emp_length'
binning = [-float("inf"), 5, 10, float("inf")]

# Bucket the feature; values that fall in no interval (missing values) are
# collected in an explicit 'Nan' bucket.
bin_feature = pd.cut(data[feature], bins=binning).values
bin_feature = bin_feature.add_categories('Nan').fillna('Nan')

# Share of each bucket within each target class (every class column sums to 1).
data_woe_iv = pd.crosstab(bin_feature, data[target], normalize='columns')
# Weight of evidence per bucket: ln(share within target == 1 / share within target == 0).
data_woe_iv['woe'] = np.log(data_woe_iv[1] / data_woe_iv[0])
# Information value: a single number for the feature, repeated on every row.
data_woe_iv['iv'] = np.sum(data_woe_iv['woe'] * (data_woe_iv[1] - data_woe_iv[0]))

data_woe_iv

In [None]:
feature = 'cb_person_cred_hist_length'
binning = [-float("inf"), 5, float("inf")]

# Bucket the feature; values that fall in no interval (missing values) are
# collected in an explicit 'Nan' bucket.
bin_feature = pd.cut(data[feature], bins=binning).values
bin_feature = bin_feature.add_categories('Nan').fillna('Nan')

# Share of each bucket within each target class (every class column sums to 1).
data_woe_iv = pd.crosstab(bin_feature, data[target], normalize='columns')
# Weight of evidence per bucket: ln(share within target == 1 / share within target == 0).
data_woe_iv['woe'] = np.log(data_woe_iv[1] / data_woe_iv[0])
# Information value: a single number for the feature, repeated on every row.
data_woe_iv['iv'] = np.sum(data_woe_iv['woe'] * (data_woe_iv[1] - data_woe_iv[0]))

data_woe_iv

In [None]:
# WOE encoding table. For every raw feature:
#   'binning' - bin edges handed to pd.cut (right-closed intervals),
#   'labels'  - the WOE value assigned to each bin, in bin order,
#   'nan'     - the value used for rows that fall in no bin (missing values).
woe = {
    'person_age': {'binning': [-float("inf"), 20, 44, 55, float("inf")], 'labels': [-0.598, -0.0006, 0.031, 0.066], 'nan': 0},
    'person_income': {'binning': [-float("inf"), 50000, 75000, 100000, float("inf")], 'labels': [0.521, -0.267, -0.814, -0.975], 'nan': 0},
    'person_emp_length': {'binning': [-float("inf"), 5, 10, float("inf")], 'labels': [-0.169, 0.196, 0.649], 'nan': 0},
    # Edges match the loan_amnt analysis cell (10000 / 20000). The previous
    # edges (10 / 15) were the loan_int_rate edges and pushed practically
    # every loan amount into the last bin.
    'loan_amnt': {'binning': [-float("inf"), 10000, 20000, float("inf")], 'labels': [-0.826, 0.01, 1.599], 'nan': 0},
    'loan_int_rate': {'binning': [-float("inf"), 10, 15, float("inf")], 'labels': [0.131, -0.236, -0.366], 'nan': 0},
    'loan_percent_income': {'binning': [-float("inf"), 0.3, 0.5, 0.75, float("inf")], 'labels': [-0.431, 2.114, 2.569, 2.659], 'nan': 0},
    'cb_person_cred_hist_length': {'binning': [-float("inf"), 5, float("inf")], 'labels': [0.042, -0.067], 'nan': 0},
}

# Add one '<feature>_WOE' float column per entry of the table.
for feature, woe_info in woe.items():
    print('feature:', feature)
    # pd.cut labels each row with the WOE value of its bin (categorical dtype).
    binned = pd.cut(data[feature], bins=woe_info['binning'], labels=woe_info['labels'])
    # Casting to float turns unbinned rows into NaN, which then receive the
    # configured fallback value.
    data[f'{feature}_WOE'] = binned.astype(float).fillna(woe_info['nan'])

In [None]:
# File that stores the WOE encoding table.
WOE_DICT = 'WOE-1.0.0.pkl'

# Persist the table so it can be loaded again later.
with open(WOE_DICT, mode='wb') as woe_file:
    pickle.dump(woe, woe_file)

In [None]:
# Read the WOE table back to verify the export. The context manager closes
# the file handle, which a bare open() inside pickle.load() never did.
with open(WOE_DICT, 'rb') as f:
    loaded_woe = pickle.load(f)
loaded_woe

In [None]:
data.head()

## Split Data

In [None]:
# Predictors: every column except the label column.
is_predictor = data.columns != 'loan_status'
X = data.loc[:, is_predictor]
# Label, kept as a one-column data frame.
y = data[[target]]

In [None]:
# Stratified 60/20/20 split: first hold out 20% as the test set, then take 25%
# of the remaining 80% (i.e. 20% of the total) as the validation set.
# The same random_state is used for both calls so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

In [None]:
# Re-attach the label column to the training and validation predictors
# (column-wise concatenation, aligned on the row index).
data_train_x_y = pd.concat([X_train, y_train], axis=1)
data_valid_x_y = pd.concat([X_val, y_val], axis=1)

In [None]:
# WOE-encoded versions of the numeric features.
woe_features = ['person_age_WOE', 'person_income_WOE', 'person_emp_length_WOE', 'loan_amnt_WOE', 
                'loan_int_rate_WOE', 'loan_percent_income_WOE', 'cb_person_cred_hist_length_WOE']
# One-hot indicator columns of the categorical features.
# NOTE(review): every level of each categorical variable is listed (e.g. both
# cb_person_default_on_file_Y and _N), so the indicators of one variable
# presumably sum to 1 on every row — confirm this redundancy is intended.
cat_features = ['cb_person_default_on_file_Y','cb_person_default_on_file_N', 'loan_grade_A','loan_grade_B', 'loan_grade_C',
       'loan_grade_D', 'loan_grade_E', 'loan_grade_F', 'loan_grade_G',
       'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT',
       'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_intent_DEBTCONSOLIDATION', 
       'person_home_ownership_OTHER', 'person_home_ownership_OWN',
       'person_home_ownership_RENT', 'person_home_ownership_MORTGAGE']

In [None]:
all_features = cat_features + woe_features

In [None]:
def evaluate(true, predicted):
    """Score predicted probabilities against the true labels.

    Parameters
    ----------
    true : labels accepted by ``roc_auc_score`` / ``accuracy_score``.
    predicted : array of scores; must support ``>`` and ``.astype``.

    Returns
    -------
    tuple
        ``(accuracy, auc)``. Accuracy uses hard labels obtained with a fixed
        0.5 cut-off (scores strictly above it become 1); AUC uses the raw
        scores.
    """
    cutoff = 0.5
    hard_labels = (predicted > cutoff).astype(int)

    auc_value = roc_auc_score(true, predicted)
    accuracy_value = accuracy_score(true, hard_labels)
    return accuracy_value, auc_value

In [None]:
def logreg(features, X, y, X_val, y_val, X_test, y_test):
    """Grid-search a logistic regression and tabulate its scores per split.

    The search is fitted on ``(X, y)`` with 3-fold cross-validation, ranked by
    ROC-AUC, and the best configuration is refitted on all of ``(X, y)``. The
    best parameters are printed.

    Parameters
    ----------
    features : value written to the 'Model' column of the result.
    X, y : training predictors and labels.
    X_val, y_val : validation predictors and labels.
    X_test, y_test : test predictors and labels.

    Returns
    -------
    pandas.DataFrame
        One row: a 'Model' column followed by (split, metric) columns for
        'Train', 'Valid', 'Test' x 'Accuracy', 'AUC'.
    """
    search_grid = {
        "C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        "solver": ['newton-cg', 'lbfgs', 'liblinear'],
        "penalty": ['none', 'l1', 'l2', 'elasticnet'],
        "max_iter": [100, 1000, 2500, 5000],
    }
    clf = GridSearchCV(LogisticRegression(), search_grid, scoring='roc_auc', cv=3, refit=True)
    clf.fit(X, y)

    # (split name, predictors, labels), in the column order of the result.
    splits = (('Train', X, y), ('Valid', X_val, y_val), ('Test', X_test, y_test))
    # Probability of the positive class for every split.
    probabilities = [clf.predict_proba(inputs)[:, 1] for _, inputs, _ in splits]

    print(clf.best_params_)

    # Flatten to [acc, auc, acc, auc, acc, auc], matching the column header.
    scores = []
    for (_, _, labels), proba in zip(splits, probabilities):
        scores.extend(evaluate(labels, proba))

    header = pd.MultiIndex.from_product([[name for name, _, _ in splits], ['Accuracy', 'AUC']])
    result = pd.DataFrame([scores], columns=header)
    result.insert(loc=0, column='Model', value=features)
    return result

In [None]:
result = logreg("all features", X_train[all_features], y_train, 
                                          X_val[all_features], y_val, 
                                          X_test[all_features], y_test)
result

In [None]:
# Final model: a logistic regression fitted on the training split with a
# hard-coded configuration (presumably the grid-search winner — confirm
# against the printed best_params_).
# NOTE(review): penalty='none' requests an unregularised fit, so C presumably
# has no effect here; newer scikit-learn releases spell this penalty=None —
# confirm against the installed version.
best_param = LogisticRegression(solver='lbfgs', C = 0.0001, max_iter=100, penalty='none')
best_param.fit(X_train[all_features], y_train)

In [None]:
# Predicted probability of class 1 (second predict_proba column) for each split.
yhat_train = best_param.predict_proba(X_train[all_features])[:, 1]
yhat_val = best_param.predict_proba(X_val[all_features])[:, 1]
yhat_test = best_param.predict_proba(X_test[all_features])[:, 1]
# Cut-off used below to turn probabilities into 0/1 predictions.
threshold = 0.5

In [None]:
# A single hand-built applicant: raw inputs plus the already-encoded one-hot
# and WOE columns the model consumes.
test_input = {'person_age': 23,
 'person_income': 95000,
 'person_emp_length': 7,
 'loan_amnt': 35000,
 'loan_int_rate': 7.9,
 'loan_percent_income': 0.37,
 'cb_person_cred_hist_length': 4,
 'person_home_ownership': 'MORTGAGE',
 'loan_intent': 'EDUCATION',
 'loan_grade': 'A',
 'cb_person_default_on_file': 'N',
 'cb_person_default_on_file_Y': 0,
 'cb_person_default_on_file_N': 1,
 'loan_grade_A': 1,
 'loan_grade_B': 0,
 'loan_grade_C': 0,
 'loan_grade_D': 0,
 'loan_grade_E': 0,
 'loan_grade_F': 0,
 'loan_grade_G': 0,
 'loan_intent_EDUCATION': 1,
 'loan_intent_HOMEIMPROVEMENT': 0,
 'loan_intent_MEDICAL': 0,
 'loan_intent_PERSONAL': 0,
 'loan_intent_VENTURE': 0,
 'loan_intent_DEBTCONSOLIDATION': 0,
 'person_home_ownership_OTHER': 0,
 'person_home_ownership_OWN': 0,
 'person_home_ownership_RENT': 0,
 'person_home_ownership_MORTGAGE': 1,
 'person_age_WOE': -0.0006,
 'person_income_WOE': -0.814,
 'person_emp_length_WOE': 0.196,
 'loan_amnt_WOE': 1.599,
 'loan_int_rate_WOE': 0.131,
 'loan_percent_income_WOE': 2.114,
 'cb_person_cred_hist_length_WOE': 0.042}

test_data = pd.DataFrame([test_input])
model = best_param

# The columns must be passed in the same order the model was fitted with,
# i.e. the notebook-level `all_features` (cat_features + woe_features).
# Redefining the list here with the WOE columns first misaligned the values
# with the fitted coefficients (newer scikit-learn releases reject such input
# outright), so the training-time list is reused unchanged.
pred_proba = model.predict_proba(test_data[all_features])[:, 1]
threshold = 0.5
# Probabilities strictly above the threshold are classified as 1.
prediction = (pred_proba > threshold).astype(int)

print({ "data": [ { "pred proba": float(pred_proba[0]), "prediction" : int(prediction[0])} ] })

In [None]:
# Accuracy (at `threshold`) and ROC-AUC for each split, in train/valid/test order.
split_scores = (
    ("Train", y_train, yhat_train),
    ("Valid", y_val, yhat_val),
    ("Test", y_test, yhat_test),
)
for split_name, labels, proba in split_scores:
    hard_labels = (proba > threshold).astype(int)
    print(f"Accuracy {split_name}:", accuracy_score(labels, hard_labels))
    print(f"ROC-AUC {split_name}:", roc_auc_score(labels, proba))

In [None]:
# Export Model: pickle the fitted logistic regression to disk
MODELNAME = 'LR-ALL-WOE-1.0.0.pkl'

with open(MODELNAME, mode='wb') as model_file:
    pickle.dump(best_param, model_file)

In [None]:
# Export Test: write the test predictors together with the true label, the
# predicted probability and the thresholded prediction to a CSV file.
threshold = 0.5

# Work on a copy so X_test itself is not modified.
test_set = X_test.copy()
test_set.loc[:, target] = y_test
test_set.loc[:, 'score_proba'] = yhat_test 
# Probabilities strictly above the threshold are labelled 1.
test_set.loc[:, 'prediction'] = (yhat_test > threshold).astype(int)

# The row index is written as the first CSV column (to_csv default).
test_set.to_csv('test_set.csv')

In [None]:
accuracy_score(test_set['loan_status'], test_set['prediction'])