# Credit Scoring - Machine Learning Final Project 2023

<a id='toc'></a>
## Table of contents
* <a href='#importdata'>Import Data</a>
* <a href='#datatransformation'>Data Transformation</a>
    * <a href='#stdscl'>Standard Scaler</a>
    * <a href='#minmax'>MinMax Scaler</a>
    * <a href='#woe'>WOE</a>
* <a href='#Dealing with class imbalance'>Dealing with class imbalance</a>
    * <a href='#upsample'>Upsample</a>
    * <a href='#downsample'>Downsample</a>
    * <a href='#smote'>SMOTE</a>
* <a href='#combination'>Features Combination</a>
* <a href='#classifier'>Classifier</a>
    * <a href='#lr'>Logistic Regression</a>
    * <a href='#dt'>Decision Tree</a>
    * <a href='#rf'>Random Forest</a>

## Importing package

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier    
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


#importing date for labelling file
from datetime import datetime
date=datetime.today().strftime('%Y%m%d')
print('today date ', date)

plt.ioff() #prevent showing figure

def filldata(df,value):
    return df.fillna(value)

today date  20230501


#### Dictionary Variable to Store Classifier Results

In [2]:
dfclassification_result=pd.DataFrame(columns=['variable','precision','recall','f1','auc','auc+f1+prec',
                                              'data_fill','transformation','resample','classifier'])

<a id='importdata'></a>
## Importing Data
<a href='#toc'>Table of contents</a>

#### Import Training and Test Data

In [201]:
#importing training data to pandas dataframe
df=pd.read_csv(r'E:\phbs\module03\ml_finance\final_project\training.csv',index_col='id')
df=df.drop(columns=['split'])
df['event'].replace({0:1, 1:0},inplace=True)

#importing TEST data to pandas dataframe
df1=pd.read_csv(r'E:\phbs\module03\ml_finance\final_project\test.csv',index_col='id')
df1=df1.drop(columns=['split'])
df1['event'].replace({0:1, 1:0},inplace=True)

#set the target column separate from dataframe
dfy1=df1['event']
dfy=df['event'] 

#### Fill NA Value - <font color='red'>Mean</font>

In [177]:
fillvalue=df.mean()
data_fill='mean'

df=filldata(df,fillvalue)
df1=filldata(df1,fillvalue)

X=df.values
y=dfy.values
X1=df1.values
y1=dfy1.values

y_train=y

print('Credit Default - train %:',dfy.sum()/df.shape[0]*100)
print('Credit Default - test %:',dfy1.sum()/df1.shape[0]*100)

Credit Default - train %: 2.5255824080121925
Credit Default - test %: 2.5004605670836892


#### Fill NA Value - <font color='red'>Median</font>

In [202]:
fillvalue=df.median()
data_fill='median'

df=filldata(df,fillvalue)
df1=filldata(df1,fillvalue)

X=df.values
y=dfy.values
X1=df1.values
y1=dfy1.values

y_train=y

print('Credit Default - train %:',dfy.sum()/df.shape[0]*100)
print('Credit Default - test %:',dfy1.sum()/df1.shape[0]*100)

Credit Default - train %: 2.5255824080121925
Credit Default - test %: 2.5004605670836892


<a id='datatransformation'></a>
## Data Transformation
<a href='#toc'>Table of contents</a>

<a id='stdscl'></a>
## Standard Scaler

In [160]:
scaler=StandardScaler()

transformation='stdscl'

scaler.fit(df)

df=pd.DataFrame(scaler.fit_transform(df),columns=df.columns)
df1=pd.DataFrame(scaler.fit_transform(df1),columns=df1.columns)
X_train=df.drop("event" ,axis= 1)

X_test=df1.drop(columns=['event'])
y_test=y1

<a id='minmax'></a>
## MinMax Scaler

In [203]:
from sklearn.preprocessing import MinMaxScaler

scaler=MinMaxScaler()

transformation='minmax'

scaler.fit(df)

df=pd.DataFrame(scaler.fit_transform(df),columns=df.columns)
df1=pd.DataFrame(scaler.fit_transform(df1),columns=df1.columns)
X_train=df.drop("event" ,axis= 1)

X_test=df1.drop(columns=['event'])
y_test=y1

<a id='woe'></a>
## WOE

In [118]:
import scorecardpy as sc
print(pd.__version__)

transformation='WOE'

df['event']=dfy
df1['event']=dfy1

bins = sc.woebin(df, y="event")

'''
pd.concat([pd.DataFrame(bins['var1']),
           pd.DataFrame(bins['var2']),
           pd.DataFrame(bins['var3']),
           pd.DataFrame(bins['var4']),
           pd.DataFrame(bins['var5']),
           pd.DataFrame(bins['var6']),
           pd.DataFrame(bins['var7']),
           pd.DataFrame(bins['var8'])
          ])
'''

# converting train and test into woe values
X_train = sc.woebin_ply(df.drop(columns=['event']), bins)
X_test = sc.woebin_ply(df1.drop(columns=['event']), bins)

1.1.3
[INFO] creating woe binning ...
Binning on 238836 rows and 9 columns in 00:00:18
[INFO] converting into woe values ...
[INFO] converting into woe values ...


<a id='woe'></a>
## WOE + Original Data
Original data means the variable that hasn't been transformed using WOE (It should either run the Standard scaler or minmax scaler first). So the result is 16 features. 8 from standard scaled features, 8 from WOE transformation.

In [204]:
import scorecardpy as sc
print(pd.__version__)

df['event']=dfy
df1['event']=dfy1

bins = sc.woebin(df, y="event")

dfbinned = pd.concat([pd.DataFrame(bins['var1']),
           pd.DataFrame(bins['var2']),
           pd.DataFrame(bins['var3']),
           pd.DataFrame(bins['var4']),
           pd.DataFrame(bins['var5']),
           pd.DataFrame(bins['var6']),
           pd.DataFrame(bins['var7']),
           pd.DataFrame(bins['var8'])
          ])

# converting train and test into woe values
X_train = sc.woebin_ply(df.drop(columns=['event']), bins)
X_test = sc.woebin_ply(df1.drop(columns=['event']), bins)

X_train=pd.concat([X_train, df.drop(columns=['event'])], axis=1, join='inner')
X_test=pd.concat([X_test, df1.drop(columns=['event'])], axis=1, join='inner')

1.1.3
[INFO] creating woe binning ...
Binning on 238836 rows and 9 columns in 00:00:20
[INFO] converting into woe values ...
Woe transformating on 238836 rows and 8 columns in 00:00:14
[INFO] converting into woe values ...


## Dealing with class imbalance <a id='Dealing with class imbalance'></a>

<a href='#toc'>Table of contents</a>

<a id='upsample'></a>
### Upsample

In [192]:
from sklearn.utils import resample

resamplelbl='upsample'

y_train=dfy

print('Number of class 1 examples before:', X_train[y_train == 1].shape)

#UPSAMPLE
X_upsampled, y_upsampled = resample(X_train[y_train == 1],
                                    y_train[y_train == 1],
                                    replace=True,
                                    n_samples=X_train[y_train == 0].shape[0],
                                    random_state=123)

print('Number of class 1 examples after:', X_upsampled.shape)

print(X_train.shape,X_upsampled.shape,y_train.shape,y_upsampled.shape)

X_bal = np.vstack((X_train[y_train==0], X_upsampled))
y_bal = np.hstack((y_train[y_train==0], y_upsampled))

#check the data label balance
#plt.title('Data after up sampling')
#plt.hist(y_bal)
#plt.savefig('images/data_upsample_dist.png', dpi=300)

X_train=pd.DataFrame(X_bal,columns=X_train.columns[0:])
y_train=y_bal

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

Number of class 1 examples before: (6032, 16)
Number of class 1 examples after: (232804, 16)
(238836, 16) (232804, 16) (238836,) (232804,)
(465608, 16)
(59709, 16)
(465608,)
(59709,)


<a id='downsample'></a>
### Downsample

In [103]:
#DOWNSAMPLE
from sklearn.utils import resample

resamplelbl='downsample'

y_train=dfy

print('Number of class 1 examples before:', X_train[y_train == 1].shape)

X_downsampled, y_downsampled = resample(X_train[y_train == 0],
                                    y_train[y_train == 0],
                                    replace=True,
                                    n_samples=X_train[y_train == 1].shape[0],
                                    random_state=123)

print('Number of class 1 examples after:', X_downsampled.shape)
print(X_train.shape,X_downsampled.shape,y_train.shape,y_downsampled.shape)

X_bal = np.vstack((X_train[y_train==1], X_downsampled))
y_bal = np.hstack((y_train[y_train==1], y_downsampled))

#check the data label balance
#plt.title('Data after down sampling')
#plt.hist(y_bal)
#plt.savefig('images/data_downsample_dist.png', dpi=300)

X_train=pd.DataFrame(X_bal,columns=X_train.columns[0:])
y_train=y_bal

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

Number of class 1 examples before: (6032, 8)
Number of class 1 examples after: (6032, 8)
(238836, 8) (6032, 8) (238836,) (6032,)
(12064, 8)
(59709, 8)
(12064,)
(59709,)


<a id='smote'></a>
### SMOTE

In [205]:
from imblearn.over_sampling import SMOTE
resamplelbl='SMOTE'
sm = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=123)

X_bal, y_bal = sm.fit_resample(X_train, y_train)
print(X_train.shape,X_bal.shape,y_train.shape,y_bal.shape)

#check the data label balance
#plt.title('Data after SMOTE')
#plt.hist(y_bal)
#plt.savefig('images/data_SMOTE_dist.png', dpi=300)

y_train=y_bal
y_test=y1

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

X_train=pd.DataFrame(X_bal)
#X_test.drop("event" ,axis= 1,inplace=True)

(238836, 16) (465608, 16) (238836,) (465608,)
(238836, 16)
(59709, 16)
(465608,)
(59709,)


## LR gridsearch

In [51]:
#Logistic regression grid search

pipe_lr = make_pipeline(LogisticRegression(random_state=1, solver='lbfgs'))


param_range = [.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

param_grid = [{'logisticregression__C': param_range}]

for i in X_train.columns:
    gs = GridSearchCV(estimator=pipe_lr, 
                      param_grid=param_grid, 
                      scoring='roc_auc', 
                      refit=True,
                      cv=5,
                      n_jobs=-1)
    gs = gs.fit(pd.DataFrame(X_train[i]), y_train)
    print(i)
    print(gs.best_score_)
    print(gs.best_params_)

var1
0.5367512399927217
{'logisticregression__C': 0.001}
var2
0.6031097329997864
{'logisticregression__C': 0.001}
var3
0.5750612637967172
{'logisticregression__C': 0.001}
var4
0.5862370348015984
{'logisticregression__C': 0.001}
var5
0.49825676917356987
{'logisticregression__C': 0.001}
var6
0.4998361604416006
{'logisticregression__C': 0.001}
var7
0.5113545659213239
{'logisticregression__C': 0.001}
var8
0.5434373906283821
{'logisticregression__C': 10.0}


RF grid search
https://towardsdatascience.com/a-practical-guide-to-implementing-a-random-forest-classifier-in-python-979988d8a263

<a id='combination'></a>
## Generate List of Features Combination
Rather than choosing specifically 2 or 3 features using PCA, we tried to observed how the result of the machine learning classifier to combinations of features that exist.

In [207]:
listfeature1=[]
label=X_train.columns 
# A Python program to print all
# combinations of a given length
from itertools import combinations

max_element=3

for n in range(3,max_element+1):    #len(label) + 1): (uncomment for making list of all possible combination)
    comb = combinations(label, n)
    for i in list(comb):
        listfeature1.append(list(i))

print(len(listfeature1))
print(listfeature1)

560
[['var3_woe', 'var4_woe', 'var7_woe'], ['var3_woe', 'var4_woe', 'var5_woe'], ['var3_woe', 'var4_woe', 'var6_woe'], ['var3_woe', 'var4_woe', 'var2_woe'], ['var3_woe', 'var4_woe', 'var1_woe'], ['var3_woe', 'var4_woe', 'var8_woe'], ['var3_woe', 'var4_woe', 'var1'], ['var3_woe', 'var4_woe', 'var2'], ['var3_woe', 'var4_woe', 'var3'], ['var3_woe', 'var4_woe', 'var4'], ['var3_woe', 'var4_woe', 'var5'], ['var3_woe', 'var4_woe', 'var6'], ['var3_woe', 'var4_woe', 'var7'], ['var3_woe', 'var4_woe', 'var8'], ['var3_woe', 'var7_woe', 'var5_woe'], ['var3_woe', 'var7_woe', 'var6_woe'], ['var3_woe', 'var7_woe', 'var2_woe'], ['var3_woe', 'var7_woe', 'var1_woe'], ['var3_woe', 'var7_woe', 'var8_woe'], ['var3_woe', 'var7_woe', 'var1'], ['var3_woe', 'var7_woe', 'var2'], ['var3_woe', 'var7_woe', 'var3'], ['var3_woe', 'var7_woe', 'var4'], ['var3_woe', 'var7_woe', 'var5'], ['var3_woe', 'var7_woe', 'var6'], ['var3_woe', 'var7_woe', 'var7'], ['var3_woe', 'var7_woe', 'var8'], ['var3_woe', 'var5_woe', 'var6_wo

<a id='classifier'></a>
## Classifier
<a href='#toc'>Table of contents</a>

<a id='lr'></a>
### Logistic Regression

In [209]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')
clr = ['black', 'orange', 'blue', 'green']
ls = [':', '--', '-.', '-']

classifier='LR'

clf = LogisticRegression(penalty='l2', 
                          C=1e-3,
                          solver='lbfgs',
                          random_state=1
                        )
#listfeature=['var3_woe','var1_woe']
for listfeature in listfeature1:
    clf.fit(X_train[listfeature], y_train)
    y_pred = clf.fit(X_train[listfeature],y_train).predict_proba(X_test[listfeature])[:, 1]
    
    fpr, tpr, thresholds = roc_curve(y_true=y_test,y_score=y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    #roc_auc = auc(x=fpr, y=tpr)
    
                     
    confmat = confusion_matrix(y_test, clf.predict(X_test[listfeature]))
    #print(confmat)
    
    print(','.join(listfeature), '\t','Score\t: ',clf.score(X_test[listfeature],y_test),'\t ROC AUC\t: ',auc,'\n')                 

    cr=classification_report(y_test, clf.predict(X_test[listfeature]),digits=4,output_dict=True)
    
    dfclassification_result=dfclassification_result.append({'variable':','.join(listfeature),
                                                            'precision':cr['weighted avg']['precision'],
                                                            'recall':cr['weighted avg']['recall'],
                                                            'f1':cr['weighted avg']['f1-score'],
                                                           'auc':auc,
                                                            'data_fill':data_fill,
                                                            'resample':resamplelbl,
                                                            'classifier':classifier,
                                                            'transformation':transformation,
                                                            'auc+f1+prec':cr['weighted avg']['precision']+cr['weighted avg']['f1-score']+auc
                                                           },
                                                           ignore_index=True)    
    
    fig, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

    plt.xlabel('Predicted label')
    plt.ylabel('True label')

    plt.title(data_fill+' '+classifier+' '+resamplelbl+' '+transformation+' '+' '.join(listfeature))
    plt.savefig('images/'+date+'_conmat_'+data_fill+'_'+classifier+'_'+resamplelbl+'_'+transformation+'_'+'_'.join(listfeature)+'.png', dpi=300, bbox_inches="tight")
    #plt.show()
    plt.clf()
    
    
    fig, ax = plt.subplots(figsize=(4, 3))
    plt.plot(fpr, tpr,
         color='red',
         #linestyle=ls,
         label='%s (auc = %0.3f)' % (' '.join(listfeature), auc))
    
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1],
             linestyle='--',
             color='gray',
             linewidth=2)

    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.grid(alpha=0.5)
    plt.xlabel('False positive rate (FPR)')
    plt.ylabel('True positive rate (TPR)')
    plt.title(data_fill+' '+classifier+' '+resamplelbl+' '+transformation+' '+' '.join(listfeature))
    plt.savefig('images/'+date+'_TPRvsVPR_'+data_fill+'_'+classifier+'_'+resamplelbl+'_'+transformation+'_'+'_'.join(listfeature)+'.png', dpi=300, bbox_inches="tight")
    #plt.show() 
    
plt.ioff() #prevent showing figure

var3_woe,var4_woe,var7_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5834103593785337 

var3_woe,var4_woe,var5_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5834103593785337 

var3_woe,var4_woe,var6_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5834103593785337 

var3_woe,var4_woe,var2_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.6347655809562853 

var3_woe,var4_woe,var1_woe 	 Score	:  0.8952419233281415 	 ROC AUC	:  0.5929794701322952 

var3_woe,var4_woe,var8_woe 	 Score	:  0.9238808219866351 	 ROC AUC	:  0.5955662520556513 

var3_woe,var4_woe,var1 	 Score	:  0.9333768778576094 	 ROC AUC	:  0.5894883200987137 

var3_woe,var4_woe,var2 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.642251479374086 

var3_woe,var4_woe,var3 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.6061620437309894 

var3_woe,var4_woe,var4 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5866927745630957 

var3_woe,var4_woe,var5 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5825303019606591 

var3_woe,var4_woe,va

var3_woe,var4,var5 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5858228360538452 

var3_woe,var4,var6 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.585386474658295 

var3_woe,var4,var7 	 Score	:  0.925572359275821 	 ROC AUC	:  0.5898887964732307 

var3_woe,var4,var8 	 Score	:  0.9310321727042824 	 ROC AUC	:  0.6137205693354751 

var3_woe,var5,var6 	 Score	:  0.9748446632835921 	 ROC AUC	:  0.49778035210074295 

var3_woe,var5,var7 	 Score	:  0.9469594198529535 	 ROC AUC	:  0.5050332855142513 

var3_woe,var5,var8 	 Score	:  0.8304945653084125 	 ROC AUC	:  0.5498966778317136 

var3_woe,var6,var7 	 Score	:  0.9469594198529535 	 ROC AUC	:  0.5095802881497006 

var3_woe,var6,var8 	 Score	:  0.8304945653084125 	 ROC AUC	:  0.550093222818667 

var3_woe,var7,var8 	 Score	:  0.8286355490797033 	 ROC AUC	:  0.5501023810350001 

var4_woe,var7_woe,var5_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5834103593785337 

var4_woe,var7_woe,var6_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.583410359

var4_woe,var5,var7 	 Score	:  0.9265604850190089 	 ROC AUC	:  0.5850026809642838 

var4_woe,var5,var8 	 Score	:  0.9324892394781357 	 ROC AUC	:  0.6112902076761316 

var4_woe,var6,var7 	 Score	:  0.9265604850190089 	 ROC AUC	:  0.587785248524998 

var4_woe,var6,var8 	 Score	:  0.9324892394781357 	 ROC AUC	:  0.6113119699452193 

var4_woe,var7,var8 	 Score	:  0.927615602338006 	 ROC AUC	:  0.6118347648837352 

var7_woe,var5_woe,var6_woe 	 Score	:  0.9749953943291632 	 ROC AUC	:  0.5 

var7_woe,var5_woe,var2_woe 	 Score	:  0.209315178616289 	 ROC AUC	:  0.5676104170246731 

var7_woe,var5_woe,var1_woe 	 Score	:  0.9220218057579259 	 ROC AUC	:  0.5354849646018831 

var7_woe,var5_woe,var8_woe 	 Score	:  0.9650136495335712 	 ROC AUC	:  0.5298890182953551 

var7_woe,var5_woe,var1 	 Score	:  0.8876216315798289 	 ROC AUC	:  0.5378479282319829 

var7_woe,var5_woe,var2 	 Score	:  0.2309199618148018 	 ROC AUC	:  0.6028755211554337 

var7_woe,var5_woe,var3 	 Score	:  0.0994657421829205 	 ROC AUC	: 

var5_woe,var2_woe,var1 	 Score	:  0.209315178616289 	 ROC AUC	:  0.5994490539010272 

var5_woe,var2_woe,var2 	 Score	:  0.209315178616289 	 ROC AUC	:  0.6028755211554337 

var5_woe,var2_woe,var3 	 Score	:  0.209315178616289 	 ROC AUC	:  0.608384665749495 

var5_woe,var2_woe,var4 	 Score	:  0.9351856504044617 	 ROC AUC	:  0.6364483686915651 

var5_woe,var2_woe,var5 	 Score	:  0.209315178616289 	 ROC AUC	:  0.5664910609365624 

var5_woe,var2_woe,var6 	 Score	:  0.209315178616289 	 ROC AUC	:  0.5658215792152117 

var5_woe,var2_woe,var7 	 Score	:  0.209315178616289 	 ROC AUC	:  0.572424773996851 

var5_woe,var2_woe,var8 	 Score	:  0.27289018405935456 	 ROC AUC	:  0.5919554814501939 

var5_woe,var1_woe,var8_woe 	 Score	:  0.9205144953022157 	 ROC AUC	:  0.5481531651393922 

var5_woe,var1_woe,var1 	 Score	:  0.9220218057579259 	 ROC AUC	:  0.5378479282319829 

var5_woe,var1_woe,var2 	 Score	:  0.9220218057579259 	 ROC AUC	:  0.60302958858623 

var5_woe,var1_woe,var3 	 Score	:  0.922021805757

var6_woe,var3,var8 	 Score	:  0.8305280610963172 	 ROC AUC	:  0.5663779293521386 

var6_woe,var4,var5 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5858228360538452 

var6_woe,var4,var6 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.585386474658295 

var6_woe,var4,var7 	 Score	:  0.925572359275821 	 ROC AUC	:  0.5898887964732307 

var6_woe,var4,var8 	 Score	:  0.9310321727042824 	 ROC AUC	:  0.6137205693354751 

var6_woe,var5,var6 	 Score	:  0.9748446632835921 	 ROC AUC	:  0.49778035210074295 

var6_woe,var5,var7 	 Score	:  0.9469594198529535 	 ROC AUC	:  0.5050332855142513 

var6_woe,var5,var8 	 Score	:  0.8304945653084125 	 ROC AUC	:  0.5498966778317136 

var6_woe,var6,var7 	 Score	:  0.9469594198529535 	 ROC AUC	:  0.5095802881497006 

var6_woe,var6,var8 	 Score	:  0.8304945653084125 	 ROC AUC	:  0.550093222818667 

var6_woe,var7,var8 	 Score	:  0.8286355490797033 	 ROC AUC	:  0.5501023810350001 

var2_woe,var1_woe,var8_woe 	 Score	:  0.935805322480698 	 ROC AUC	:  0.5961177757205285 


var8_woe,var1,var8 	 Score	:  0.9627694317439582 	 ROC AUC	:  0.5623509373733554 

var8_woe,var2,var3 	 Score	:  0.9650136495335712 	 ROC AUC	:  0.5829974457780669 

var8_woe,var2,var4 	 Score	:  0.9335778525850374 	 ROC AUC	:  0.6202586556419536 

var8_woe,var2,var5 	 Score	:  0.9650136495335712 	 ROC AUC	:  0.572420355962841 

var8_woe,var2,var6 	 Score	:  0.9650136495335712 	 ROC AUC	:  0.5727113134161611 

var8_woe,var2,var7 	 Score	:  0.9528881743120803 	 ROC AUC	:  0.5727285138350275 

var8_woe,var2,var8 	 Score	:  0.9650136495335712 	 ROC AUC	:  0.5692980139740575 

var8_woe,var3,var4 	 Score	:  0.9335778525850374 	 ROC AUC	:  0.6080465538368278 



MemoryError: Unable to allocate 10.7 MiB for an array with shape (465608, 3) and data type float64

<a id='dt'></a>
### Decision Tree

In [208]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')
clr = ['black', 'orange', 'blue', 'green']
ls = [':', '--', '-.', '-']

classifier='DT'

clf = DecisionTreeClassifier(max_depth=5,
                          criterion='entropy',
                          random_state=1)
    
for listfeature in listfeature1:
    clf.fit(X_train[listfeature], y_train)
    y_pred = clf.fit(X_train[listfeature],y_train).predict_proba(X_test[listfeature])[:, 1]
    
    fpr, tpr, thresholds = roc_curve(y_true=y_test,y_score=y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    #roc_auc = auc(x=fpr, y=tpr)
    
                     
    confmat = confusion_matrix(y_test, clf.predict(X_test[listfeature]))
    #print(confmat)
    
    print(','.join(listfeature), '\t','Score\t: ',clf.score(X_test[listfeature],y_test),'\t ROC AUC\t: ',auc,'\n') 
    
    
    cr=classification_report(y_test, clf.predict(X_test[listfeature]),digits=4,output_dict=True)
    #print(cr)

    dfclassification_result=dfclassification_result.append({'variable':','.join(listfeature),
                                                            'precision':cr['weighted avg']['precision'],
                                                            'recall':cr['weighted avg']['recall'],
                                                            'f1':cr['weighted avg']['f1-score'],
                                                           'auc':auc,
                                                            'data_fill':data_fill,
                                                            'resample':resamplelbl,
                                                            'classifier':classifier,
                                                            'transformation':transformation,
                                                            'auc+f1+prec':cr['weighted avg']['precision']+cr['weighted avg']['f1-score']+auc
                                                           },
                                                           ignore_index=True)    
    
    fig, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

    plt.xlabel('Predicted label')
    plt.ylabel('True label')

    plt.title(data_fill+' '+classifier+' '+resamplelbl+' '+transformation+' '+' '.join(listfeature))
    plt.savefig('images/'+date+'_conmat_'+data_fill+'_'+classifier+'_'+resamplelbl+'_'+transformation+'_'+'_'.join(listfeature)+'.png', dpi=300, bbox_inches="tight")
    #plt.show()
    plt.clf()
    
    
    fig, ax = plt.subplots(figsize=(4, 3))
    plt.plot(fpr, tpr,
         color='red',
         #linestyle=ls,
         label='%s (auc = %0.3f)' % (' '.join(listfeature), auc))
    
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1],
             linestyle='--',
             color='gray',
             linewidth=2)

    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.grid(alpha=0.5)
    plt.xlabel('False positive rate (FPR)')
    plt.ylabel('True positive rate (TPR)')
    plt.title(data_fill+' '+classifier+' '+resamplelbl+' '+transformation+' '+' '.join(listfeature))
    plt.savefig('images/'+date+'_TPRvsVPR_'+data_fill+'_'+classifier+'_'+resamplelbl+'_'+transformation+'_'+'_'.join(listfeature)+'.png', dpi=300, bbox_inches="tight")
    #plt.show() 
    
plt.ioff() #prevent showing figure

var3_woe,var4_woe,var7_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5834103593785337 

var3_woe,var4_woe,var5_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5834103593785337 

var3_woe,var4_woe,var6_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5834103593785337 

var3_woe,var4_woe,var2_woe 	 Score	:  0.9375805992396457 	 ROC AUC	:  0.6347589769158644 

var3_woe,var4_woe,var1_woe 	 Score	:  0.8952419233281415 	 ROC AUC	:  0.5929794701322952 

var3_woe,var4_woe,var8_woe 	 Score	:  0.9238808219866351 	 ROC AUC	:  0.5955662520556513 

var3_woe,var4_woe,var1 	 Score	:  0.8952419233281415 	 ROC AUC	:  0.5933983549818533 

var3_woe,var4_woe,var2 	 Score	:  0.9365924734964578 	 ROC AUC	:  0.6234418088775056 

var3_woe,var4_woe,var3 	 Score	:  0.9409301780301127 	 ROC AUC	:  0.6073267019256462 

var3_woe,var4_woe,var4 	 Score	:  0.9371451539968849 	 ROC AUC	:  0.5842229727459766 

var3_woe,var4_woe,var5 	 Score	:  0.9334941131152757 	 ROC AUC	:  0.5824463305512299 

var3_woe,var4_woe,v

var3_woe,var4,var5 	 Score	:  0.9335276089031804 	 ROC AUC	:  0.5879709727802164 

var3_woe,var4,var6 	 Score	:  0.9442797568205799 	 ROC AUC	:  0.4488045985014949 

var3_woe,var4,var7 	 Score	:  0.9191746637860289 	 ROC AUC	:  0.5816725130449356 

var3_woe,var4,var8 	 Score	:  0.9343984993887019 	 ROC AUC	:  0.5840592638763775 

var3_woe,var5,var6 	 Score	:  0.9749618985412585 	 ROC AUC	:  0.4999484677751821 

var3_woe,var5,var7 	 Score	:  0.9526704516906999 	 ROC AUC	:  0.506271778951768 

var3_woe,var5,var8 	 Score	:  0.8820780786816058 	 ROC AUC	:  0.5422105354740059 

var3_woe,var6,var7 	 Score	:  0.9524862248572242 	 ROC AUC	:  0.507110722191168 

var3_woe,var6,var8 	 Score	:  0.8808722303170377 	 ROC AUC	:  0.5405624074456389 

var3_woe,var7,var8 	 Score	:  0.9550151568440268 	 ROC AUC	:  0.5432287657550083 

var4_woe,var7_woe,var5_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5834103593785337 

var4_woe,var7_woe,var6_woe 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.583410359

var4_woe,var5,var7 	 Score	:  0.9098795826424827 	 ROC AUC	:  0.5845154604037843 

var4_woe,var5,var8 	 Score	:  0.9333936257515617 	 ROC AUC	:  0.5904515032866952 

var4_woe,var6,var7 	 Score	:  0.9548476779045035 	 ROC AUC	:  0.5048659122075895 

var4_woe,var6,var8 	 Score	:  0.9313671305833291 	 ROC AUC	:  0.5981184548091727 

var4_woe,var7,var8 	 Score	:  0.9207154700296438 	 ROC AUC	:  0.6074018372670557 

var7_woe,var5_woe,var6_woe 	 Score	:  0.9749953943291632 	 ROC AUC	:  0.5 

var7_woe,var5_woe,var2_woe 	 Score	:  0.209315178616289 	 ROC AUC	:  0.5676104170246731 

var7_woe,var5_woe,var1_woe 	 Score	:  0.9220218057579259 	 ROC AUC	:  0.5354849646018831 

var7_woe,var5_woe,var8_woe 	 Score	:  0.9650136495335712 	 ROC AUC	:  0.5298890182953551 

var7_woe,var5_woe,var1 	 Score	:  0.8876216315798289 	 ROC AUC	:  0.5340958783332341 

var7_woe,var5_woe,var2 	 Score	:  0.7609740575122679 	 ROC AUC	:  0.5814796727635843 

var7_woe,var5_woe,var3 	 Score	:  0.8866167579426887 	 ROC AUC	

var5_woe,var2_woe,var1 	 Score	:  0.22157463698939858 	 ROC AUC	:  0.5888098239772414 

var5_woe,var2_woe,var2 	 Score	:  0.7609573096183155 	 ROC AUC	:  0.5943130318381018 

var5_woe,var2_woe,var3 	 Score	:  0.8998140983771291 	 ROC AUC	:  0.6025785349265377 

var5_woe,var2_woe,var4 	 Score	:  0.9386524644525951 	 ROC AUC	:  0.6352764564072125 

var5_woe,var2_woe,var5 	 Score	:  0.9661357584283776 	 ROC AUC	:  0.5650396274640088 

var5_woe,var2_woe,var6 	 Score	:  0.9749618985412585 	 ROC AUC	:  0.4744699590254958 

var5_woe,var2_woe,var7 	 Score	:  0.21336816895275418 	 ROC AUC	:  0.5697675163773299 

var5_woe,var2_woe,var8 	 Score	:  0.8824465323485572 	 ROC AUC	:  0.5705718804468952 

var5_woe,var1_woe,var8_woe 	 Score	:  0.9205144953022157 	 ROC AUC	:  0.5485360384096514 

var5_woe,var1_woe,var1 	 Score	:  0.9220218057579259 	 ROC AUC	:  0.5333023177374585 

var5_woe,var1_woe,var2 	 Score	:  0.7814232360280695 	 ROC AUC	:  0.5858023738833075 

var5_woe,var1_woe,var3 	 Score	:  0.9

var6_woe,var3,var8 	 Score	:  0.878393542012092 	 ROC AUC	:  0.557966418293385 

var6_woe,var4,var5 	 Score	:  0.9335276089031804 	 ROC AUC	:  0.5879709727802164 

var6_woe,var4,var6 	 Score	:  0.9442797568205799 	 ROC AUC	:  0.4488045985014949 

var6_woe,var4,var7 	 Score	:  0.9191746637860289 	 ROC AUC	:  0.5816725130449356 

var6_woe,var4,var8 	 Score	:  0.9343984993887019 	 ROC AUC	:  0.5840592638763775 

var6_woe,var5,var6 	 Score	:  0.9749618985412585 	 ROC AUC	:  0.4999484677751821 

var6_woe,var5,var7 	 Score	:  0.9526704516906999 	 ROC AUC	:  0.506271778951768 

var6_woe,var5,var8 	 Score	:  0.8820780786816058 	 ROC AUC	:  0.5422105354740059 

var6_woe,var6,var7 	 Score	:  0.9524862248572242 	 ROC AUC	:  0.507110722191168 

var6_woe,var6,var8 	 Score	:  0.8808722303170377 	 ROC AUC	:  0.5405624074456389 

var6_woe,var7,var8 	 Score	:  0.9550151568440268 	 ROC AUC	:  0.5432287657550083 

var2_woe,var1_woe,var8_woe 	 Score	:  0.9359895493141738 	 ROC AUC	:  0.5951709358067941 



var8_woe,var1,var8 	 Score	:  0.920129293741312 	 ROC AUC	:  0.541240949588299 

var8_woe,var2,var3 	 Score	:  0.9286204759751461 	 ROC AUC	:  0.591807856985662 

var8_woe,var2,var4 	 Score	:  0.9493711165820898 	 ROC AUC	:  0.6469080814678108 

var8_woe,var2,var5 	 Score	:  0.33003399822472324 	 ROC AUC	:  0.5826744920940663 

var8_woe,var2,var6 	 Score	:  0.8102128657321341 	 ROC AUC	:  0.4932314568439534 

var8_woe,var2,var7 	 Score	:  0.904754727093068 	 ROC AUC	:  0.5719390663828938 

var8_woe,var2,var8 	 Score	:  0.8114354619906546 	 ROC AUC	:  0.5964440544353333 

var8_woe,var3,var4 	 Score	:  0.9409301780301127 	 ROC AUC	:  0.5953868959822675 

var8_woe,var3,var5 	 Score	:  0.8292552211559396 	 ROC AUC	:  0.5704669924077007 

var8_woe,var3,var6 	 Score	:  0.9138655814031386 	 ROC AUC	:  0.5120605080131633 

var8_woe,var3,var7 	 Score	:  0.9361737761476494 	 ROC AUC	:  0.563332310435737 

var8_woe,var3,var8 	 Score	:  0.8547957594332513 	 ROC AUC	:  0.5577794514661015 

var8_woe

<matplotlib.pyplot._IoffContext at 0x20220ba8880>

<a id='rf'></a>
### Random Forest

In [166]:
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')
clr = ['black', 'orange', 'blue', 'green']
ls = [':', '--', '-.', '-']

classifier='RFC'

clf = RandomForestClassifier(random_state=123)

listfeature1=[['var3','var4'],['var1','var3'],['var1','var4']]

for listfeature in listfeature1:
    clf.fit(X_train[listfeature], y_train)
    y_pred = clf.fit(X_train[listfeature],y_train).predict_proba(X_test[listfeature])[:, 1]
    
    fpr, tpr, thresholds = roc_curve(y_true=y_test,y_score=y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    #roc_auc = auc(x=fpr, y=tpr)
    
                     
    confmat = confusion_matrix(y_test, clf.predict(X_test[listfeature]))
    #print(confmat)
    
    print(','.join(listfeature), '\t','Score\t: ',clf.score(X_test[listfeature],y_test),'\t ROC AUC\t: ',auc,'\n') 
    
    
    cr=classification_report(y_test, clf.predict(X_test[listfeature]),digits=4,output_dict=True)
    #print(cr)

    dfclassification_result=dfclassification_result.append({'variable':','.join(listfeature),
                                                            'precision':cr['weighted avg']['precision'],
                                                            'recall':cr['weighted avg']['recall'],
                                                            'f1':cr['weighted avg']['f1-score'],
                                                           'auc':auc,
                                                            'data_fill':data_fill,
                                                            'resample':resample,
                                                            'classifier':classifier,
                                                            'transformation':transformation,
                                                            'auc+f1+prec':cr['weighted avg']['precision']+cr['weighted avg']['f1-score']+auc
                                                           },
                                                           ignore_index=True)    
    
    fig, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

    plt.xlabel('Predicted label')
    plt.ylabel('True label')

    plt.title(data_fill+' '+classifier+' '+resamplelbl+' '+transformation+' '+' '.join(listfeature))
    plt.savefig('images/'+date+'_conmat_'+data_fill+'_'+classifier+'_'+resamplelbl+'_'+transformation+'_'+'_'.join(listfeature)+'.png', dpi=300, bbox_inches="tight")
    #plt.show()
    plt.clf()
    
    
    fig, ax = plt.subplots(figsize=(4, 3))
    plt.plot(fpr, tpr,
         color='red',
         #linestyle=ls,
         label='%s (auc = %0.3f)' % (' '.join(listfeature), auc))
    
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1],
             linestyle='--',
             color='gray',
             linewidth=2)

    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.grid(alpha=0.5)
    plt.xlabel('False positive rate (FPR)')
    plt.ylabel('True positive rate (TPR)')
    plt.title(data_fill+' '+classifier+' '+resamplelbl+' '+transformation+' '+' '.join(listfeature))
    plt.savefig('images/'+date+'_TPRvsVPR_'+data_fill+'_'+classifier+'_'+resamplelbl+'_'+transformation+'_'+'_'.join(listfeature)+'.png', dpi=300, bbox_inches="tight")
    #plt.show() 
    
plt.ioff() #prevent showing figure

KeyboardInterrupt: 

## Result table

In [None]:
'''
############
# For interactive pandas table view

#!pip install qgrid

import qgrid
widget = qgrid.show_grid(dfclassification_result,show_toolbar = True)
widget
############
'''

dfclassification_result.sort_values('auc+f1+prec',ascending=False).head(25)

#### SAVE classification table result

In [None]:
dfclassification_result.to_csv(date+'classification_result.csv')

#### Read the saved classification table result

In [26]:
namefile=date+'classification_result.csv'
dfclassification_result=pd.read_csv(namefile',index_col=0)

#### Sort values by several different metrics

In [15]:
dfclassification_result.sort_values('precision',ascending=False).head(10)

Unnamed: 0,variable,precision,recall,f1,auc,auc+f1+prec,data_fill,transformation,resample,classifier
44,"var1,var2",0.971875,0.093503,0.129734,0.571076,1.672685,mean,stdscl,SMOTE,DT
129,"var3,var4",0.971483,0.106265,0.151875,0.462283,1.585641,median,stdscl,SMOTE,DT
95,"var3,var6",0.970863,0.10248,0.145454,0.556761,1.673078,median,stdscl,SMOTE,LR
74,var3,0.970859,0.102413,0.145339,0.558782,1.67498,median,stdscl,SMOTE,LR
117,"var1,var3",0.970273,0.108845,0.156434,0.578893,1.7056,median,stdscl,SMOTE,DT
94,"var3,var5",0.970199,0.107722,0.15453,0.557564,1.682293,median,stdscl,SMOTE,LR
110,var3,0.96948,0.113601,0.164608,0.563664,1.697752,median,stdscl,SMOTE,DT
132,"var3,var7",0.96939,0.114874,0.166768,0.571077,1.707234,median,stdscl,SMOTE,DT
123,"var2,var3",0.969139,0.162605,0.24376,0.58272,1.795618,median,stdscl,SMOTE,DT
124,"var2,var4",0.968973,0.227353,0.338124,0.517291,1.824388,median,stdscl,SMOTE,DT


In [172]:
dfclassification_result.sort_values('f1',ascending=False).head(10)

Unnamed: 0,variable,precision,recall,f1,auc,auc+f1+prec,data_fill,transformation,resample,classifier
246,"var5,var6",0.950616,0.974995,0.962651,0.489425,2.402692,mean,minmax,SMOTE,LR
221,var6,0.950616,0.974995,0.962651,0.492564,2.405831,mean,minmax,SMOTE,LR
149,var6,0.950616,0.974995,0.962651,0.492564,2.405831,median,minmax,SMOTE,LR
174,"var5,var6",0.950616,0.974995,0.962651,0.489425,2.402692,median,minmax,SMOTE,LR
220,var5,0.950616,0.974995,0.962651,0.489725,2.402992,mean,minmax,SMOTE,LR
148,var5,0.950616,0.974995,0.962651,0.489725,2.402992,median,minmax,SMOTE,LR
185,var6,0.950614,0.974912,0.96261,0.499948,2.413172,median,minmax,SMOTE,DT
257,var6,0.950614,0.974912,0.96261,0.499948,2.413172,mean,minmax,SMOTE,DT
113,var6,0.950603,0.974443,0.962375,0.498429,2.411407,median,stdscl,SMOTE,DT
112,var5,0.950912,0.973689,0.962029,0.501722,2.414663,median,stdscl,SMOTE,DT


In [171]:
dfclassification_result.sort_values('recall',ascending=False).head(10)

Unnamed: 0,variable,precision,recall,f1,auc,auc+f1+prec,data_fill,transformation,resample,classifier
174,"var5,var6",0.950616,0.974995,0.962651,0.489425,2.402692,median,minmax,SMOTE,LR
148,var5,0.950616,0.974995,0.962651,0.489725,2.402992,median,minmax,SMOTE,LR
149,var6,0.950616,0.974995,0.962651,0.492564,2.405831,median,minmax,SMOTE,LR
220,var5,0.950616,0.974995,0.962651,0.489725,2.402992,mean,minmax,SMOTE,LR
221,var6,0.950616,0.974995,0.962651,0.492564,2.405831,mean,minmax,SMOTE,LR
246,"var5,var6",0.950616,0.974995,0.962651,0.489425,2.402692,mean,minmax,SMOTE,LR
257,var6,0.950614,0.974912,0.96261,0.499948,2.413172,mean,minmax,SMOTE,DT
185,var6,0.950614,0.974912,0.96261,0.499948,2.413172,median,minmax,SMOTE,DT
113,var6,0.950603,0.974443,0.962375,0.498429,2.411407,median,stdscl,SMOTE,DT
112,var5,0.950912,0.973689,0.962029,0.501722,2.414663,median,stdscl,SMOTE,DT
