In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
from imblearn.datasets import fetch_datasets
from imblearn.pipeline import make_pipeline

from imblearn.under_sampling import (
    RandomUnderSampler,
    CondensedNearestNeighbour,
    TomekLinks,
    OneSidedSelection,
    EditedNearestNeighbours,
    RepeatedEditedNearestNeighbours,
    AllKNN,
    NeighbourhoodCleaningRule,
    NearMiss,
    InstanceHardnessThreshold
)

In [3]:
from sklearn.metrics import average_precision_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_validate

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import MinMaxScaler

In [4]:
# Under-sampling strategies to benchmark, keyed by the short label that is
# written to the log files. Every sampler uses sampling_strategy='auto';
# 'random', 'cnn', 'oss' and 'iht' are seeded with random_state=0.
undersampler_dict = {
    'random': RandomUnderSampler(
        sampling_strategy='auto',
        random_state=0,
        replacement=False,
    ),

    'cnn': CondensedNearestNeighbour(
        sampling_strategy='auto',
        random_state=0,
        n_neighbors=1,
        n_jobs=2,
    ),

    'tomek': TomekLinks(
        sampling_strategy='auto',
        n_jobs=2,
    ),

    'oss': OneSidedSelection(
        sampling_strategy='auto',
        random_state=0,
        n_neighbors=1,
        n_jobs=2,
    ),

    'enn': EditedNearestNeighbours(
        sampling_strategy='auto',
        n_neighbors=3,
        kind_sel='all',
        n_jobs=2,
    ),

    'renn': RepeatedEditedNearestNeighbours(
        sampling_strategy='auto',
        n_neighbors=3,
        kind_sel='all',
        n_jobs=2,
        max_iter=100,
    ),

    'allKNN': AllKNN(
        sampling_strategy='auto',
        n_neighbors=5,
        kind_sel='all',
        n_jobs=2,
    ),

    'ncr': NeighbourhoodCleaningRule(
        sampling_strategy='auto',
        n_neighbors=3,
        kind_sel='mode',
        n_jobs=2,
        threshold_cleaning=0.1,
    ),

    'nm1': NearMiss(
        sampling_strategy='auto',
        version=1,
        n_neighbors=3,
        n_jobs=2,
    ),

    # Must differ from 'nm1' in `version`; with version=1 here the two
    # entries would configure the same sampler.
    'nm2': NearMiss(
        sampling_strategy='auto',
        version=2,
        n_neighbors=3,
        n_jobs=2,
    ),

    'iht': InstanceHardnessThreshold(
        estimator=RandomForestClassifier(n_estimators=100, random_state=39, max_depth=3, n_jobs=2),
        sampling_strategy='auto',
        random_state=0,
        n_jobs=2,
        cv=3,
    ),
}

In [5]:
# Classifiers to benchmark, keyed by the label written to the model log.
# Insertion order is the order in which the models are trained and logged.
model_dict = {
    'Logistic_Regression': LogisticRegression(max_iter=500),
    # The tree-based models are randomized; seed them (as the samplers are)
    # so that repeated runs of the notebook produce comparable log rows.
    'Decision_Tree_Model': DecisionTreeClassifier(random_state=0),
    'Random_Forest_Classifier': RandomForestClassifier(n_estimators=600, random_state=0),
    'SVM': SVC(),
    # train_test_models replaces this instance with a classifier whose
    # n_neighbors it searches for itself.
    'K_Nearest_Neighbour': KNeighborsClassifier(n_neighbors=1),
}

In [6]:
# Load the loan data from a CSV file located in the working directory.
df = pd.read_csv('loan_data.csv')

In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [8]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [9]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
count,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0
mean,0.80497,0.12264,319.089413,10.932117,12.606679,710.846314,4560.767197,16913.96,46.799236,1.577469,0.163708,0.062122,0.160054
std,0.396245,0.026847,207.071301,0.614813,6.88397,37.970537,2496.930377,33756.19,29.014417,2.200245,0.546215,0.262126,0.366676
min,0.0,0.06,15.67,7.547502,0.0,612.0,178.958333,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.1039,163.77,10.558414,7.2125,682.0,2820.0,3187.0,22.6,0.0,0.0,0.0,0.0
50%,1.0,0.1221,268.95,10.928884,12.665,707.0,4139.958333,8596.0,46.3,1.0,0.0,0.0,0.0
75%,1.0,0.1407,432.7625,11.291293,17.95,737.0,5730.0,18249.5,70.9,2.0,0.0,0.0,0.0
max,1.0,0.2164,940.14,14.528354,29.96,827.0,17639.95833,1207359.0,119.0,33.0,13.0,5.0,1.0


In [10]:
# One-hot encode the categorical 'purpose' column (first level dropped).
# The guard makes the cell safe to re-run: after the first run 'purpose' no
# longer exists in df, and encoding it again would raise a KeyError.
if 'purpose' in df.columns:
    df = pd.get_dummies(df, columns=['purpose'], drop_first=True)
df.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,1,0,0,0,0,0


In [11]:
# Inspect the schema again now that 'purpose' has been one-hot encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   credit.policy               9578 non-null   int64  
 1   int.rate                    9578 non-null   float64
 2   installment                 9578 non-null   float64
 3   log.annual.inc              9578 non-null   float64
 4   dti                         9578 non-null   float64
 5   fico                        9578 non-null   int64  
 6   days.with.cr.line           9578 non-null   float64
 7   revol.bal                   9578 non-null   int64  
 8   revol.util                  9578 non-null   float64
 9   inq.last.6mths              9578 non-null   int64  
 10  delinq.2yrs                 9578 non-null   int64  
 11  pub.rec                     9578 non-null   int64  
 12  not.fully.paid              9578 non-null   int64  
 13  purpose_credit_card         9578 

In [12]:
# Separate the target column from the predictors.
target_col = 'not.fully.paid'
y = df[target_col]
X = df.drop(columns=[target_col])

# Report the dimensions of the feature matrix, then of the target.
for part in (X, y):
    print(part.shape)

(9578, 18)
(9578,)


In [13]:
# Class balance of the target: number of rows per 'not.fully.paid' value.
df.groupby(['not.fully.paid'])['not.fully.paid'].count()

not.fully.paid
0    8045
1    1533
Name: not.fully.paid, dtype: int64

In [14]:
## Optional feature scaling (currently disabled).
## data_scaled is passed on to train_test_models, which records it in every log row.
data_scaled = False
## Uncommenting the lines below standardizes X (y is left untouched) and sets the flag to True.
## NOTE(review): the scaler would be fit on all of X, before the train/test split — confirm this is intended.
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#scaled_X = scaler.fit_transform(X)
#X = pd.DataFrame(scaled_X,columns = X.columns)
#data_scaled = True

In [15]:
# Preview the feature matrix that is handed to the samplers and models.
X.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,1,0,0,0,0,0


In [16]:
import datetime
import os
import time
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score


def _first_unused_log_name(prefix, start):
    """Return (file_name, index) for the first '<prefix><index>.csv' that does not exist yet.

    The search begins at `start` and increases the index by one per attempt.
    """
    index = start
    while os.path.exists(prefix + str(index) + ".csv"):
        index += 1
    return prefix + str(index) + ".csv", index


# Date stamp (day_month_year) shared by both log file names.
xDate = datetime.datetime.now()
sdate = xDate.strftime("%d_%m_%Y")

# Per-model results log; numbering starts at 1.
fname, i = _first_unused_log_name("Learning_Models_" + sdate + "_", 1)
model_log = open(fname, "a")
model_log.write("Model, Accuracy, f1_Score_0,f1_Score_1,Time_taken, Scaled_data, Under_Sampling" + "\n")

# Under-sampling log; numbering continues from the index chosen for the model log.
fname, i = _first_unused_log_name("Under_Sampling_" + sdate + "_", i)
us_log = open(fname, "a")
us_log.write("Undersampling, Sampling_Size, Time_taken" + "\n")


41

In [17]:
def train_test_models(X_tr, y_tr, X_tst, y_tst, under_sample, scaled_data = False):
    """Fit every classifier in model_dict and append its test scores to model_log.

    Each model is fitted on (X_tr, y_tr) and used to predict X_tst. One CSV row
    is then written to model_log containing: model label, accuracy, the first
    two per-class F1 scores, fit time in seconds, scaled_data and under_sample.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels.
    X_tst, y_tst
        Evaluation features and labels.
    under_sample : str
        Label of the under-sampling method; written to the log row.
    scaled_data : bool, default False
        Flag written to the log row as-is.

    Notes
    -----
    For 'K_Nearest_Neighbour' the instance stored in model_dict is not used.
    n_neighbors is chosen from 1..49 by the lowest RMSE on (X_tst, y_tst) and a
    fresh classifier is fitted with it; the logged time includes that search.
    Because the same data both selects k and scores the model, the logged KNN
    scores are optimistic.
    """
    for name, model in model_dict.items():
        startTime = time.time()

        if name == 'K_Nearest_Neighbour':
            rmse_per_k = []
            for n_neighbors in range(1, 50):
                candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
                candidate.fit(X_tr, y_tr)
                candidate_preds = candidate.predict(X_tst)
                # Score against the y_tst argument rather than the
                # notebook-level y_test, so the function only depends on
                # what it is given.
                rmse_per_k.append(np.sqrt(mean_squared_error(y_tst, candidate_preds)))
            # rmse_per_k[0] belongs to k=1; ties resolve to the smallest k.
            best_k = rmse_per_k.index(min(rmse_per_k)) + 1
            model = KNeighborsClassifier(n_neighbors=best_k)

        model.fit(X_tr, y_tr)
        reqTime = time.time() - startTime

        preds = model.predict(X_tst)
        # average=None yields one F1 score per class.
        arrf1 = f1_score(y_tst, preds, average=None)
        row = [
            name,
            str(round(accuracy_score(y_tst, preds), 2)),
            str(round(arrf1[0], 2)),
            str(round(arrf1[1], 2)),
            str(round(reqTime, 2)),
            str(scaled_data),
            under_sample,
        ]
        model_log.write(",".join(row) + "\n")

In [18]:
def _log_gridsearch_result(label, search, elapsed, X_tst, y_tst, under_sample, scaled_data):
    """Score a fitted search on the test data and append one CSV row to model_log."""
    preds = search.predict(X_tst)
    # average=None yields one F1 score per class.
    arrf1 = f1_score(y_tst, preds, average=None)
    row = [
        label,
        str(round(accuracy_score(y_tst, preds), 2)),
        str(round(arrf1[0], 2)),
        str(round(arrf1[1], 2)),
        str(round(elapsed, 2)),
        str(scaled_data),
        under_sample,
    ]
    model_log.write(",".join(row) + "\n")


def train_test_gridsearch(X_tr, y_tr, X_tst, y_tst, under_sample, scaled_data = False):
    """Grid-search a decision tree, a random forest and an SVM, logging each result.

    Each search is fitted on (X_tr, y_tr) with refit=True, then used to predict
    X_tst. One CSV row per search is appended to model_log containing: label,
    accuracy, the first two per-class F1 scores, search time in seconds,
    scaled_data and under_sample.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels.
    X_tst, y_tst
        Evaluation features and labels.
    under_sample : str
        Label of the under-sampling method; written to the log row.
    scaled_data : bool, default False
        Flag written to the log row as-is.
    """
    param_tree = {'criterion': ['gini', 'entropy'],
                  'splitter': ['best', 'random'],
                  'min_samples_leaf': [1, 2, 3, 4, 5],
                  'class_weight': ['balanced', None]}
    param_RF = {'criterion': ['gini', 'entropy'],
                'min_samples_leaf': [1, 2, 3, 4, 5],
                'n_estimators': [500, 600]}
    param_svm = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}

    # Each search is its own object, so the SVM row is produced by the SVM
    # search and not by a re-fit of the random-forest search.
    searches = [
        ("GridSearch_Decision_Tree",
         GridSearchCV(DecisionTreeClassifier(), param_grid=param_tree, cv=5, refit=True)),
        ("GridSearch_Random_Forest",
         GridSearchCV(RandomForestClassifier(), param_grid=param_RF, cv=10, refit=True)),
        ("GridSearch_SVM",
         GridSearchCV(SVC(), param_svm, refit=True)),
    ]

    for label, search in searches:
        startTime = time.time()
        search.fit(X_tr, y_tr)
        reqTime = time.time() - startTime
        _log_gridsearch_result(label, search, reqTime, X_tst, y_tst, under_sample, scaled_data)
    

In [19]:
## Split the dataset into training (70%) and test (30%) sets with a fixed seed.
## NOTE(review): the split is not stratified although the target classes are imbalanced — consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=101)

In [20]:
# Resample the training data with each under-sampler, log the resampled size
# and the time taken, then train and score every classifier on that data.

for sampler_name, sampler in undersampler_dict.items():
    t0 = time.time()
    X_Resample, y_resample = sampler.fit_resample(X_train, y_train)
    elapsed = time.time() - t0

    us_row = ",".join([sampler_name, str(X_Resample.shape[0]), str(round(elapsed, 2))])
    us_log.write(us_row + "\n")

    train_test_models(X_Resample, y_resample, X_test, y_test, sampler_name, data_scaled)
    # Grid-search variant, currently disabled:
    # train_test_gridsearch(X_Resample, y_resample, X_test, y_test, sampler_name, data_scaled)

In [21]:
# Close both CSV logs so that everything written to them reaches disk.
for log_file in (model_log, us_log):
    log_file.close()