
1. Title: Credit Approval

2. Sources: 
    (confidential)
    Submitted by quinlan@cs.su.oz.au

3.  Past Usage:

    See Quinlan,
    * "Simplifying decision trees", Int J Man-Machine Studies 27,
      Dec 1987, pp. 221-234.
    * "C4.5: Programs for Machine Learning", Morgan Kaufmann, Oct 1992
  
4.  Relevant Information:

    This file concerns credit card applications.  All attribute names
    and values have been changed to meaningless symbols to protect
    confidentiality of the data.
  
    This dataset is interesting because there is a good mix of
    attributes -- continuous, nominal with small numbers of
    values, and nominal with larger numbers of values.  There
    are also a few missing values.
  
5.  Number of Instances: 690

6.  Number of Attributes: 15 + class attribute

7.  Attribute Information:

    A1:	b, a.
    A2:	continuous.
    A3:	continuous.
    A4:	u, y, l, t.
    A5:	g, p, gg.
    A6:	c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
    A7:	v, h, bb, j, n, z, dd, ff, o.
    A8:	continuous.
    A9:	t, f.
    A10:	t, f.
    A11:	continuous.
    A12:	t, f.
    A13:	g, p, s.
    A14:	continuous.
    A15:	continuous.
    A16: +,-         (class attribute)

8.  Missing Attribute Values:
    37 cases (5%) have one or more missing values.  The missing
    values from particular attributes are:

    A1:  12
    A2:  12
    A4:   6
    A5:   6
    A6:   9
    A7:   9
    A14: 13

9.  Class Distribution
  
    +: 307 (44.5%)
    -: 383 (55.5%)



In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
#Read data
data_frame = pd.read_csv("crx.data", header = None)
data_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [4]:
data_frame.shape
data_frame.dtypes

0      object
1      object
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13     object
14      int64
15     object
dtype: object

In [5]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(['+','-'])

LabelEncoder()

In [6]:
data_frame[15]=label_encoder.transform(data_frame[15])
data_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,0
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,0
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,0
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,0
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,0


In [7]:
#Fill undefined valus with NaN

def replace_undef_with_NaN():
  for column_index in data_frame.axes[1]:
    data_frame[column_index][data_frame[column_index] == '?'] = float("NaN")

replace_undef_with_NaN()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
  res_values = method(rvalues)


In [8]:
def calculate_mean(col_index):
  return round(data_frame[col_index].mean(), 3)

def get_max_in_categorical(col_index):
   data_t = data_frame.groupby(col_index)[col_index].count()
   return data_t.axes[0][data_t.argmax()]

# Filling category column nan values
categorical_cols = [0, 3, 4, 5, 6, 8 , 9, 11, 12]
categorical_mapping = {}

for cat_col in categorical_cols:
  categorical_mapping[cat_col] = get_max_in_categorical(cat_col)

#Fill categorical value undefs with max count option
data_frame.fillna(categorical_mapping, inplace=True)

# Filling numeric columns nan values
numeric_cols = [1, 2, 7, 10, 13, 14]
numeric_mapping = {}

data_frame.fillna({13: "0000"}, inplace=True)

In [9]:
#Correct data types of columns

data_frame[1] = data_frame[1].astype('float64')
data_frame[13] = data_frame[13].astype('int64')
data_frame[15] = data_frame[15].astype('int64')

#Fill numeric value undefs with mean
for num_col in numeric_cols:
  numeric_mapping[num_col] = calculate_mean(num_col)

data_frame.fillna(numeric_mapping, inplace=True)

data_frame.dtypes

0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13      int64
14      int64
15      int64
dtype: object

In [10]:
X = data_frame.iloc[:, 0:15]
y = data_frame.iloc[:, 15]

X.head()
y.head()


0    0
1    0
2    0
3    0
4    0
Name: 15, dtype: int64

In [11]:
# cat_cols = [0, 3, 4, 5, 6, 8 , 9, 11, 12]
# Creating dummy variables
dummies = [X]
for cat_col in categorical_cols:
  dummy = pd.get_dummies(X[cat_col], prefix=cat_col, drop_first=True)
  dummies.append(dummy)

X = pd.concat(dummies, axis=1)

# Droping unnecessary columns
X = X.drop(categorical_cols, axis=1)

X.head()

Unnamed: 0,1,2,7,10,13,14,0_b,3_u,3_y,4_gg,4_p,5_c,5_cc,5_d,5_e,5_ff,5_i,5_j,5_k,5_m,5_q,5_r,5_w,5_x,6_dd,6_ff,6_h,6_j,6_n,6_o,6_v,6_z,8_t,9_t,11_t,12_p,12_s
0,30.83,0.0,1.25,1,202,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0
1,58.67,4.46,3.04,6,43,560,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0
2,24.5,0.5,1.5,0,280,824,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,27.83,1.54,3.75,5,100,3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0
4,20.17,5.625,1.71,0,120,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1


In [12]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print(X_train.shape, y_train.shape)

(552, 37) (552,)


**Hyper Parameter Tunning**

In [18]:
#Hyper parameter tunning

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, LeakyReLU, BatchNormalization, Dropout
from keras.activations import relu, sigmoid

In [19]:
def create_model(layers, activation):
  model = Sequential()
  for i, nodes in enumerate(layers):
    if i==0:
      model.add(Dense(nodes, input_dim=X_train.shape[-1]))
      model.add(Activation(activation))
      model.add(Dropout(0.3))
      continue
    model.add(Dense(nodes))
    model.add(Activation(activation))
    model.add(Dropout(0.3))
 
  model.add(Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'))
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [27]:
model = KerasClassifier(build_fn=create_model, verbose=0)
 
layers = [(15,), (40,20), (30,20,6)]
activations = ['sigmoid', 'relu']
batch_size = [128, 265]
epochs = [30]
 
param_grid = dict(layers=layers, activation=activations, batch_size= batch_size, epochs=epochs)
 
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
 
grid_result = grid.fit(X_train, y_train)
 
print("Results: ", [grid_result.best_score_, grid_result.best_params_])

Results:  [0.8478624105453492, {'activation': 'relu', 'batch_size': 128, 'epochs': 30, 'layers': (40, 20)}]


In [29]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU,PReLU,ELU
from keras.layers import Dropout


In [30]:
#Best so far : Results:  [0.862276816368103, {'activation': 'relu', 'batch_size': 128, 'epochs': 30, 'layers': (125, 60, 6)}]

classifier = Sequential()

classifier.add(Dense(units = 40, kernel_initializer = 'he_uniform',activation='relu',input_dim = 37))

classifier.add(Dense(units = 20, kernel_initializer = 'he_uniform',activation='relu'))


classifier.add(Dense(units = 1, kernel_initializer = 'glorot_uniform', activation = 'sigmoid'))


classifier.compile(optimizer = 'Adamax', loss = 'binary_crossentropy', metrics = ['accuracy'])

model_history=classifier.fit(X_train, y_train,validation_split=0.33, batch_size = 128, epochs = 100)

Train on 369 samples, validate on 183 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

In [31]:
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)



# Calculate the Accuracy
from sklearn.metrics import accuracy_score
score = accuracy_score(y_pred,y_test)
print("Resulting score: ",score )

Resulting score:  0.8043478260869565


In [66]:
#Computing F1 score
#from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.80      0.75      0.78        60
           1       0.82      0.86      0.84        78

    accuracy                           0.81       138
   macro avg       0.81      0.80      0.81       138
weighted avg       0.81      0.81      0.81       138



**Doing 5 fold cross validation**

In [34]:
#Doing 5 fold cross validation

def create_NN_model():
  classifier = Sequential()
  classifier.add(Dense(units = 100, kernel_initializer = 'he_uniform',activation='relu',input_dim = 37))
  classifier.add(Dense(units = 60, kernel_initializer = 'he_uniform',activation='relu'))
  classifier.add(Dense(units = 6, kernel_initializer = 'he_uniform',activation='relu'))
  classifier.add(Dense(units = 1, kernel_initializer = 'glorot_uniform', activation = 'sigmoid'))
  classifier.compile(optimizer = 'Adamax', loss = 'binary_crossentropy', metrics = ['accuracy'])
  return classifier


def create_NN_model_2():
  classifier = Sequential()
  classifier.add(Dense(units = 20, kernel_initializer = 'he_uniform',activation='sigmoid',input_dim = 37))
  classifier.add(Dense(units = 1, kernel_initializer = 'glorot_uniform', activation = 'sigmoid'))
  classifier.compile(optimizer = 'Adamax', loss = 'mean_squared_error', metrics = ['accuracy'])
  return classifier

def create_NN_model_3():
  classifier = Sequential()
  classifier.add(Dense(units = 40, kernel_initializer = 'he_uniform',activation='relu',input_dim = 37))
  classifier.add(Dense(units = 20, kernel_initializer = 'he_uniform',activation='relu'))
  classifier.add(Dense(units = 1, kernel_initializer = 'glorot_uniform', activation = 'sigmoid'))
  classifier.compile(optimizer = 'Adamax', loss = 'binary_crossentropy', metrics = ['accuracy'])
  return classifier


from sklearn.model_selection import StratifiedKFold

def evaluateTheModel():
   skf = StratifiedKFold(n_splits = 5)
   for train_index, test_index in skf.split(X, y):
        X_train_temp, X_test_temp = X.iloc[train_index], X.iloc[test_index]
        y_train_temp, y_test_temp = y.iloc[train_index], y.iloc[test_index]

        model=create_NN_model_1()
        model.fit(X_train_temp, y_train_temp,validation_split=0.33, batch_size = 128, epochs = 100, verbose=0)
        evaluationMetrics = model.evaluate(X_test_temp,y_test_temp, verbose=1)
        print("Test loss: " , evaluationMetrics[0] , " Test accuracy: ", evaluationMetrics[0])
        model.summary()
        y_pred_temp = model.predict(X_test_temp)
        y_pred_temp = (y_pred_temp > 0.5)
        report = classification_report(y_true=y_test_temp, y_pred=y_pred_temp)
        print(report)

In [None]:
evaluateTheModel()

**Regularization on the loss function**

In [45]:
#regularization on the loss function
from keras.regularizers import l2, l1_l2, l1
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

def create_NN_model_with_regularization():
  classifier = Sequential()
  classifier.add(Dense(units = 40, kernel_initializer = 'he_uniform', kernel_regularizer=l1(0.001), bias_regularizer=l2(0.01), activation='relu',input_dim = 37))
  classifier.add(Dense(units = 20, kernel_initializer = 'he_uniform', kernel_regularizer=l1(0.001), activation='relu'))
  classifier.add(Dense(units = 1, kernel_initializer = 'glorot_uniform', activation = 'sigmoid', ))
  classifier.compile(optimizer = 'Adamax', loss = 'binary_crossentropy', metrics = ['accuracy'])
  return classifier


def evaluateTheModelWithReg():
   skf = StratifiedKFold(n_splits = 5)
   for train_index, test_index in skf.split(X, y):
        X_train_temp_2, X_test_temp_2 = X.iloc[train_index], X.iloc[test_index]
        y_train_temp_2, y_test_temp_2 = y.iloc[train_index], y.iloc[test_index]

        model=create_NN_model_with_regularization()
        model.fit(X_train_temp_2, y_train_temp_2, validation_split=0.33, batch_size = 128, epochs = 100, verbose=0)

        y_pred_temp_2 = model.predict(X_test_temp_2)
        y_pred_temp_2 = (y_pred_temp_2 > 0.5)
        report = classification_report(y_true=y_test_temp_2, y_pred=y_pred_temp_2)
        print(report)


In [38]:
evaluateTheModelWithReg()

              precision    recall  f1-score   support

           0       0.63      0.63      0.63        62
           1       0.70      0.70      0.70        76

    accuracy                           0.67       138
   macro avg       0.66      0.66      0.66       138
weighted avg       0.67      0.67      0.67       138

              precision    recall  f1-score   support

           0       0.77      0.74      0.75        62
           1       0.79      0.82      0.81        76

    accuracy                           0.78       138
   macro avg       0.78      0.78      0.78       138
weighted avg       0.78      0.78      0.78       138

              precision    recall  f1-score   support

           0       0.70      0.52      0.60        61
           1       0.68      0.82      0.75        77

    accuracy                           0.69       138
   macro avg       0.69      0.67      0.67       138
weighted avg       0.69      0.69      0.68       138

              preci

In [None]:
classifier=create_NN_model_with_regularization()

classifier.compile(optimizer = 'Adamax', loss = 'binary_crossentropy', metrics = ['accuracy'])

model_history=classifier.fit(X_train, y_train,validation_split=0.33, batch_size = 128, epochs = 100)

In [47]:
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Calculate the Accuracy
from sklearn.metrics import accuracy_score
score = accuracy_score(y_pred,y_test)
print("Resulting score: ",score )

Resulting score:  0.8260869565217391
