In [138]:
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import (
    RandomForestClassifier,
    BaggingClassifier,
    AdaBoostClassifier
)


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, mean_squared_error, f1_score, accuracy_score 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import (
    SMOTE,
    BorderlineSMOTE
)

from imblearn.ensemble import (
    BalancedBaggingClassifier,
    BalancedRandomForestClassifier,
    RUSBoostClassifier,
    EasyEnsembleClassifier
)

In [139]:
# Samplers applied to the training split before fitting a plain RandomForest.
# Keys are the short labels printed by the resampling loop; values are
# unfitted imbalanced-learn sampler objects.
resampling_dict = {
    # Random under-sampling of the training rows, drawn without replacement.
    'random': RandomUnderSampler(
        sampling_strategy='auto',
        random_state=1,
        replacement=False
    ),

    # SMOTE over-sampling using 5 nearest neighbours.
    # `n_jobs` is deliberately not passed: it is deprecated since
    # imbalanced-learn 0.10 and removed in 0.12 (TypeError on current
    # releases). It only controlled parallelism of the neighbour search,
    # so the generated samples are unchanged.
    'smote': SMOTE(
        sampling_strategy='auto',
        random_state=1,
        k_neighbors=5
    )
}

In [140]:
# Ensemble classifiers compared in this notebook, keyed by the label that is
# written to the model log. Every entry is an unfitted estimator.
# NOTE(review): newer scikit-learn / imbalanced-learn releases rename
# `base_estimator` to `estimator` -- confirm against the installed versions
# before upgrading.
ensemble_dict = dict(
    # Random forest with per-tree balancing of the bootstrap sample.
    balancedRF=BalancedRandomForestClassifier(
        n_estimators=20,
        criterion='gini',
        sampling_strategy='auto',
        n_jobs=2,
        random_state=2909,
    ),

    # Plain bagging of logistic regressions (no class re-balancing).
    bagging=BaggingClassifier(
        base_estimator=LogisticRegression(random_state=2909),
        n_estimators=20,
        n_jobs=2,
        random_state=2909,
    ),

    # Bagging of logistic regressions with re-sampling of each bag;
    # both rows and features are bootstrapped.
    balancedBagging=BalancedBaggingClassifier(
        base_estimator=LogisticRegression(random_state=2909),
        n_estimators=20,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=True,
        sampling_strategy='auto',
        random_state=2909,
    ),

    # RUSBoost whose base learner is itself a 600-round AdaBoost model.
    rusboost=RUSBoostClassifier(
        base_estimator=AdaBoostClassifier(n_estimators=600, random_state=39),
        n_estimators=20,
        learning_rate=1.0,
        sampling_strategy='auto',
        random_state=2909,
    ),

    # EasyEnsemble with 20 members.
    easyEnsemble=EasyEnsembleClassifier(
        n_estimators=20,
        sampling_strategy='auto',
        n_jobs=2,
        random_state=2909,
    ),
)

In [141]:
# Read the loan data set from the working directory and preview the first rows.
loan_csv = 'loan_data.csv'
df = pd.read_csv(loan_csv)
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [142]:
# Column dtypes and non-null counts: shows whether any column has missing
# values and which columns are non-numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [143]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
count,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0
mean,0.80497,0.12264,319.089413,10.932117,12.606679,710.846314,4560.767197,16913.96,46.799236,1.577469,0.163708,0.062122,0.160054
std,0.396245,0.026847,207.071301,0.614813,6.88397,37.970537,2496.930377,33756.19,29.014417,2.200245,0.546215,0.262126,0.366676
min,0.0,0.06,15.67,7.547502,0.0,612.0,178.958333,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.1039,163.77,10.558414,7.2125,682.0,2820.0,3187.0,22.6,0.0,0.0,0.0,0.0
50%,1.0,0.1221,268.95,10.928884,12.665,707.0,4139.958333,8596.0,46.3,1.0,0.0,0.0,0.0
75%,1.0,0.1407,432.7625,11.291293,17.95,737.0,5730.0,18249.5,70.9,2.0,0.0,0.0,0.0
max,1.0,0.2164,940.14,14.528354,29.96,827.0,17639.95833,1207359.0,119.0,33.0,13.0,5.0,1.0


In [144]:
# One-hot encode the categorical `purpose` column; drop_first=True omits the
# first category so the remaining indicator columns are not redundant.
#
# The result overwrites `df`, so the encoding is guarded: re-running this cell
# after `purpose` has already been expanded is a no-op instead of a KeyError.
if 'purpose' in df.columns:
    df = pd.get_dummies(df, columns=['purpose'], drop_first=True)
df.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,1,0,0,0,0,0


In [145]:
# Build the feature matrix X and the target vector y.
# `not.fully.paid` is the target; `delinq.2yrs` and `pub.rec` are also left
# out of the features.
target_col = 'not.fully.paid'
excluded_features = ['delinq.2yrs', 'pub.rec']

X = df.drop(columns=[target_col] + excluded_features)
y = df[target_col]

# Sanity check: same number of rows in X and y.
print(X.shape, y.shape, sep="\n")

(9578, 16)
(9578,)


In [146]:
# Row count per target class; the output below shows the classes are
# imbalanced (8045 vs 1533), which motivates the samplers and balanced ensembles.
df.groupby(['not.fully.paid'])['not.fully.paid'].count()

not.fully.paid
0    8045
1    1533
Name: not.fully.paid, dtype: int64

In [147]:
# Optionally rescale the features (X only -- the target y is left untouched).
# `data_scaled` is also written to the model log so each row records whether
# scaled inputs were used.
data_scaled = True

if data_scaled:
    # NOTE(review): the scaler is fitted on the full X, before the train/test
    # split performed further down, so test-set min/max values influence the
    # training features -- consider fitting on the training split only.
    scaler = MinMaxScaler()
    scaled_X = scaler.fit_transform(X)
    # Wrap the returned array back into a DataFrame with the original column names.
    X = pd.DataFrame(scaled_X, columns=X.columns)


In [148]:
import time
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, roc_auc_score
import datetime
import os

# Date stamp (dd_mm_YYYY) shared by both log file names of this run.
xDate = datetime.datetime.now()
sdate = xDate.strftime("%d_%m_%Y")

f_1 = "Models_Ensemble_FE_Scale_"
f_2 = "Ensemble_FE_Scale_"


def _log_name(prefix, run_index):
    """Return the CSV log file name for `prefix`, the run date `sdate` and `run_index`."""
    return prefix + sdate + "_" + str(run_index) + ".csv"


# Use the first run index for which NEITHER log file exists yet, so the two
# files written by one run always carry the same numeric suffix and no
# existing log is appended to.
i = 1
while os.path.exists(_log_name(f_1, i)) or os.path.exists(_log_name(f_2, i)):
    i += 1

# Per-model metrics log; closed by the last cell of the notebook.
# Header fields are separated by bare commas (no padding), matching the data
# rows, so the column names parse without leading blanks.
fname = _log_name(f_1, i)
model_log = open(fname, "a")
model_log.write("Model,Accuracy,f1_Score_0,f1_Score_1,roc_auc_score,Time_taken,Scaled_data,Under_Sampling" + "\n")

# Sampling log; only the header is written here.
fname = _log_name(f_2, i)
en_log = open(fname, "a")
# Trailing semicolon keeps the character count returned by write() out of the cell output.
en_log.write("Oversampling,Sampling_Size,Time_taken" + "\n");


40

In [149]:
def run_RandomForest(X_tr, y_tr, X_tst, y_tst, reSampler='None', scaled_data=False):
    """Fit a 600-tree random forest, score it on the test data and log one CSV row.

    Parameters
    ----------
    X_tr, y_tr : training features and labels passed to ``fit``.
    X_tst, y_tst : test features and labels used for scoring.
    reSampler : str, label written to the last field of the log row.
    scaled_data : value written (via ``str``) to the ``Scaled_data`` field.

    Returns
    -------
    None. The only result is a row appended to the module-level ``model_log``:
    model name, accuracy, per-class F1 (index 0 and 1), ROC-AUC, fit time in
    seconds, ``scaled_data`` and ``reSampler``.
    """
    forest = RandomForestClassifier(n_estimators=600, random_state=39, n_jobs=2)

    # Only the fit itself is timed.
    fit_started = time.time()
    forest.fit(X_tr, y_tr)
    fit_seconds = time.time() - fit_started

    # ROC-AUC is computed from column 1 of the predicted probabilities.
    class_proba = forest.predict_proba(X_tst)
    auc = roc_auc_score(y_tst, class_proba[:, 1])

    # Hard predictions feed the per-class F1 scores and the accuracy.
    labels = forest.predict(X_tst)
    per_class_f1 = f1_score(y_tst, labels, average=None)
    accuracy = accuracy_score(y_tst, labels)

    row = [
        'RandomForestClassifier',
        str(round(accuracy, 2)),
        str(round(per_class_f1[0], 2)),
        str(round(per_class_f1[1], 2)),
        str(auc),                      # written unrounded
        str(round(fit_seconds, 2)),
        str(scaled_data),
        reSampler,
    ]
    model_log.write(",".join(row) + "\n")

In [150]:
def run_adaBoost(X_tr, y_tr, X_tst, y_tst, reSampler='None', scaled_data=False):
    """Fit a 600-round AdaBoost classifier, score it on the test data and log one CSV row.

    Parameters
    ----------
    X_tr, y_tr : training features and labels passed to ``fit``.
    X_tst, y_tst : test features and labels used for scoring.
    reSampler : str, label written to the last field of the log row.
    scaled_data : value written (via ``str``) to the ``Scaled_data`` field.

    Returns
    -------
    None. The only result is a row appended to the module-level ``model_log``:
    model name, accuracy, per-class F1 (index 0 and 1), ROC-AUC, fit time in
    seconds, ``scaled_data`` and ``reSampler``.
    """
    booster = AdaBoostClassifier(n_estimators=600, random_state=39)

    # Only the fit itself is timed.
    fit_started = time.time()
    booster.fit(X_tr, y_tr)
    fit_seconds = time.time() - fit_started

    # ROC-AUC is computed from column 1 of the predicted probabilities.
    class_proba = booster.predict_proba(X_tst)
    auc = roc_auc_score(y_tst, class_proba[:, 1])

    # Hard predictions feed the per-class F1 scores and the accuracy.
    labels = booster.predict(X_tst)
    per_class_f1 = f1_score(y_tst, labels, average=None)
    accuracy = accuracy_score(y_tst, labels)

    row = [
        'adaBoostClassifier',
        str(round(accuracy, 2)),
        str(round(per_class_f1[0], 2)),
        str(round(per_class_f1[1], 2)),
        str(auc),                      # written unrounded
        str(round(fit_seconds, 2)),
        str(scaled_data),
        reSampler,
    ]
    model_log.write(",".join(row) + "\n")

In [151]:
def run_Ensemble(X_tr, y_tr, X_tst, y_tst, ensemble, obj_Ens, scaled_data=False):
    """Fit the given ensemble estimator, score it on the test data and log one CSV row.

    Parameters
    ----------
    X_tr, y_tr : training features and labels passed to ``obj_Ens.fit``.
    X_tst, y_tst : test features and labels used for scoring.
    ensemble : str, label of the estimator; written as both the first and the
        last field of the log row.
    obj_Ens : estimator exposing ``fit``, ``predict_proba`` and ``predict``;
        it is fitted in place.
    scaled_data : value written (via ``str``) to the ``Scaled_data`` field.

    Returns
    -------
    None. The only result is a row appended to the module-level ``model_log``.
    """
    # Only the fit itself is timed.
    fit_started = time.time()
    obj_Ens.fit(X_tr, y_tr)
    fit_seconds = time.time() - fit_started

    # ROC-AUC is computed from column 1 of the predicted probabilities.
    class_proba = obj_Ens.predict_proba(X_tst)
    auc = roc_auc_score(y_tst, class_proba[:, 1])

    # Hard predictions feed the per-class F1 scores and the accuracy.
    labels = obj_Ens.predict(X_tst)
    per_class_f1 = f1_score(y_tst, labels, average=None)
    accuracy = accuracy_score(y_tst, labels)

    row = [
        ensemble,
        str(round(accuracy, 2)),
        str(round(per_class_f1[0], 2)),
        str(round(per_class_f1[1], 2)),
        str(auc),                      # written unrounded
        str(round(fit_seconds, 2)),
        str(scaled_data),
        ensemble,                      # the label is repeated in the last field
    ]
    model_log.write(",".join(row) + "\n")

In [152]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split reproducible. No `stratify` argument is passed.
split_parts = train_test_split(X, y, test_size=0.3, random_state=101)
X_train, X_test, y_train, y_test = split_parts

In [153]:
# Baseline: RandomForest on the training split as-is (no resampling).
run_RandomForest(
    X_train, y_train, X_test, y_test,
    reSampler='None',
    scaled_data=data_scaled,
)

In [154]:
# Baseline: AdaBoost on the training split as-is (no resampling).
run_adaBoost(
    X_train, y_train, X_test, y_test,
    reSampler='None',
    scaled_data=data_scaled,
)

In [155]:
# Resample the training split with each sampler, then fit a RandomForest on
# the resampled data and score it on the untouched test split.
for sampler, resampler in resampling_dict.items():
    print(sampler)
    X_Resampled, y_Resampled = resampler.fit_resample(X_train, y_train)
    # Pass the sampler's label (not the literal 'None') so the log row records
    # which resampling produced it and can be told apart from the baseline run.
    run_RandomForest(X_Resampled, y_Resampled, X_test, y_test, sampler, data_scaled)
    # Size of the resampled training set.
    print(X_Resampled.shape, y_Resampled.shape)
    

random
(2180, 16) (2180,)
smote




(11228, 16) (11228,)


In [156]:
# Fit and score every ensemble on the original (non-resampled) training split;
# each call appends one row to the model log under the ensemble's label.
for ensemble, obj_Ensemble in ensemble_dict.items():
    run_Ensemble(X_train, y_train, X_test, y_test, ensemble, obj_Ensemble, data_scaled)



In [158]:
# Close both CSV logs so buffered rows are written to disk.
for log_handle in (model_log, en_log):
    log_handle.close()