## Importing the required libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics

## Loading the dataset

In [2]:
# Read the credit data from 'Credit.csv' (path is relative to the working directory).
dataset = pd.read_csv('Credit.csv')

In [3]:
# List the column names to see which fields are available.
dataset.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'DEFAULT_PAYMENT_NEXT_MONTH'],
      dtype='object')

In [4]:
# Preview the first rows of the frame.
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT_PAYMENT_NEXT_MONTH
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [5]:
# Features: every column except the last one.
# NOTE(review): this keeps the 'ID' column as a feature; it looks like a row
# identifier rather than a predictor -- confirm whether it should be dropped
# (doing so changes the feature count that later cells rely on).
x = dataset.iloc[:, :-1].values
# Target: the last column ('DEFAULT_PAYMENT_NEXT_MONTH'). Using -1 instead of a
# hard-coded position keeps this selection complementary to the slice above.
y = dataset.iloc[:, -1].values

In [6]:
# Show the feature matrix and the target vector as a tuple.
x, y

(array([[     1,  20000,      2, ...,      0,      0,      0],
        [     2, 120000,      2, ...,   1000,      0,   2000],
        [     3,  90000,      2, ...,   1000,   1000,   5000],
        ...,
        [ 29998,  30000,      1, ...,   4200,   2000,   3100],
        [ 29999,  80000,      1, ...,   1926,  52964,   1804],
        [ 30000,  50000,      1, ...,   1000,   1000,   1000]], dtype=int64),
 array([1, 1, 0, ..., 1, 1, 1], dtype=int64))

## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 82)

In [8]:
# Show the four arrays produced by the split.
x_train, x_test, y_train, y_test

(array([[ 25548, 150000,      2, ...,    250,  12500,  12500],
        [ 14173, 200000,      1, ...,   3500,   4000,   3000],
        [ 19487, 180000,      2, ...,   5000,   5000,   5000],
        ...,
        [  8324,  20000,      1, ...,      3,   1800,      0],
        [  4547, 220000,      2, ...,      0,   6118,   6097],
        [ 12420, 130000,      2, ...,   4500,   4500,   4500]], dtype=int64),
 array([[ 29310, 140000,      1, ...,   4800,   5100,   5000],
        [ 12577,  50000,      2, ...,    800,   1000,   1000],
        [  8491,  10000,      1, ...,    126,      0,      0],
        ...,
        [ 20966, 250000,      1, ...,   5000,   4000,   5000],
        [ 18444, 100000,      2, ...,   3300,  11000,      0],
        [ 18468,  30000,      2, ...,   1011,   2450,      0]], dtype=int64),
 array([0, 0, 0, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 0, ..., 0, 1, 0], dtype=int64))

## Feature scaling to bring the variables in a single scale

In [9]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions so the test set does not influence
# the statistics.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [10]:
# Show the scaled training and test feature matrices.
x_train, x_test

(array([[ 1.22272292, -0.13267193,  0.81279712, ..., -0.28715532,
          0.51841517,  0.4090419 ],
        [-0.09293356,  0.25276127, -1.23031932, ..., -0.08452383,
         -0.04973751, -0.12319057],
        [ 0.52169488,  0.09858799,  0.81279712, ...,  0.00899839,
          0.01710399, -0.01114163],
        ...,
        [-0.76944124, -1.13479824, -1.23031932, ..., -0.30255532,
         -0.19678879, -0.29126398],
        [-1.20629702,  0.40693454,  0.81279712, ..., -0.30274236,
          0.09183277,  0.05031721],
        [-0.29568924, -0.28684521,  0.81279712, ..., -0.02217568,
         -0.01631676, -0.03915387]]),
 array([[ 1.65784377, -0.20975857, -1.23031932, ..., -0.00347124,
          0.02378814, -0.01114163],
        [-0.27753029, -0.90353832,  0.81279712, ..., -0.25286384,
         -0.25026198, -0.23523951],
        [-0.75012566, -1.21188488, -1.23031932, ..., -0.29488649,
         -0.31710347, -0.29126398],
        ...,
        [ 0.69275914,  0.63819446, -1.23031932, ...,  

## Implementation of Machine Learning Algorithm
## 1. Applying Logistic Regression (LogR) Algorithm

### - Setting the model and fitting

In [11]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default settings.
LogR = LogisticRegression()
# Fit on the scaled training data.
LogR.fit(x_train, y_train)

### - Predicting the Test set results

In [12]:
# Predict class labels for the held-out test rows.
y_pred_LogR = LogR.predict(x_test)
print(y_pred_LogR)

[0 1 0 ... 0 1 0]


### - Let's see the actual and predicted values side by side, actual values on the left side and predicted values on the right side, printing the top 5 values

In [13]:
# Two-column array: column 0 holds the actual labels, column 1 the
# logistic-regression predictions.
y_compare_LogR = np.column_stack((y_test, y_pred_LogR))
# First five rows.
y_compare_LogR[:5]

array([[1, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0]], dtype=int64)

In [45]:
# Tabulate the actual test labels next to the logistic-regression predictions.
# The prediction array is y_pred_LogR; the previous name used here was a typo
# for it and raised NameError.
Randpred = pd.DataFrame({ "actual": y_test, "pred": y_pred_LogR })
Randpred


Unnamed: 0,actual,pred
0,1,0
1,1,1
2,0,0
3,1,1
4,0,0
...,...,...
7495,0,0
7496,1,0
7497,0,0
7498,1,1


### - Making the Confusion Matrix, Classification Report and Accuracy Score

In [14]:
# Confusion matrix: actual test labels passed first, predictions second.
metrics.confusion_matrix(y_test, y_pred_LogR)

array([[5727,  153],
       [1237,  383]], dtype=int64)

In [15]:
# Per-class precision, recall and F1 for the logistic-regression predictions.
print(metrics.classification_report(y_test, y_pred_LogR))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      5880
           1       0.71      0.24      0.36      1620

    accuracy                           0.81      7500
   macro avg       0.77      0.61      0.62      7500
weighted avg       0.80      0.81      0.78      7500



In [16]:
# Fraction of test rows whose predicted label matches the actual label.
metrics.accuracy_score(y_test, y_pred_LogR)

0.8146666666666667

## 2. Applying Support Vector Machine (SVM) Algorithm

### - Setting the model and fitting

In [17]:
from sklearn.svm import SVC
# Support vector classifier with the library's default settings.
svm = SVC()
# Fit on the scaled training data.
svm.fit(x_train, y_train)

### - Predicting the Test set results

In [18]:
# Predict class labels for the held-out test rows.
y_pred_svm = svm.predict(x_test)
print(y_pred_svm)

[0 1 0 ... 0 1 1]


### - Let's see the actual and predicted values side by side, actual values on the left side and predicted values on the right hand side, printing the top 5 values

In [19]:
# Two-column array: column 0 holds the actual labels, column 1 the SVM
# predictions.
y_compare_svm = np.column_stack((y_test, y_pred_svm))
# First five rows.
y_compare_svm[:5]

array([[1, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0]], dtype=int64)

### - Making the Confusion Matrix, Classification Report and Accuracy Score

In [20]:
# Confusion matrix: actual test labels passed first, predictions second.
metrics.confusion_matrix(y_test, y_pred_svm)

array([[5636,  244],
       [1091,  529]], dtype=int64)

In [21]:
# Per-class precision, recall and F1 for the SVM predictions.
print(metrics.classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89      5880
           1       0.68      0.33      0.44      1620

    accuracy                           0.82      7500
   macro avg       0.76      0.64      0.67      7500
weighted avg       0.80      0.82      0.80      7500



In [22]:
# Fraction of test rows whose predicted label matches the actual label.
metrics.accuracy_score(y_test, y_pred_svm)

0.822

## 3. Applying K-Fold Cross Validation (KFCV) Algorithm

### - Setting the model, fitting and finding score

In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Start from a new SVC with default settings. Note that this rebinds `svm`,
# replacing the classifier fitted earlier in the notebook.
svm = SVC()
# Score the classifier with 3-fold cross-validation on the training data.
accuracies = cross_val_score(svm, x_train, y_train, cv=3)
# Average score across the folds.
np.mean(accuracies)

0.8184

### - Applying grid search to find the best model and the best parameters

In [24]:
from sklearn.model_selection import GridSearchCV
# Two candidate grids: linear and RBF kernels, each tried with C in {1, 10, 100, 1000}.
parameters = [{'C':[1,10,100,1000], 'kernel': ['linear']}, {'C':[1,10,100,1000], 'kernel': ['rbf']}]
# Configure a 3-fold, accuracy-scored search run in a single job.
grid_search = GridSearchCV(estimator=svm, param_grid=parameters, scoring='accuracy', cv=3, n_jobs=1)
# The search itself is disabled: the fit call below is commented out, so
# grid_search is only configured here and never run.
# grid_search = grid_search.fit(x_train, y_train)

In [25]:
# Disabled cell: a wider grid search (adds a gamma grid for the RBF kernel,
# 10-fold CV, all cores via n_jobs=-1) that would also print the best score and
# parameters. Everything below is commented out, so this cell does nothing.
# from sklearn.model_selection import GridSearchCV
# parameters = [{'C':[1,10,100,1000], 'kernel': ['linear']}, {'C':[1,10,100,1000], 'kernel': ['rbf'], 'gamma': [0.5, 0.1 ,0.001 ,0.0001]}]
# grid_search = GridSearchCV(estimator=svm, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1)
# grid_search = grid_search.fit(x_train, y_train)
# best_accuracy = grid_search.best_score_
# best_parameters = grid_search.best_params_
# print(best_accuracy, best_parameters)

## 4. Applying Random Forest (RF) Algorithm

### - Setting the model and fitting

In [26]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with tree depth capped at 10; random_state fixes the seed so results are repeatable.
rfc = RandomForestClassifier(max_depth=10, random_state=0)
# Fit on the scaled training data.
rfc.fit(x_train, y_train)

### - Predicting the Test set results

In [27]:
# Predict class labels for the held-out test rows.
y_pred_rf = rfc.predict(x_test)
print(y_pred_rf)

[0 1 0 ... 0 1 1]


### - Let's see the actual and predicted values side by side, actual values on the left side and predicted values on the right hand side, printing the top 5 values

In [28]:
# Two-column array: column 0 holds the actual labels, column 1 the
# random-forest predictions.
y_compare_rf = np.column_stack((y_test, y_pred_rf))
# First five rows.
y_compare_rf[:5]

array([[1, 0],
       [1, 1],
       [0, 0],
       [1, 0],
       [0, 0]], dtype=int64)

### - Making the Confusion Matrix, Classification Report and Accuracy Score

In [29]:
# Confusion matrix: actual test labels passed first, predictions second.
metrics.confusion_matrix(y_test, y_pred_rf)

array([[5600,  280],
       [1053,  567]], dtype=int64)

In [30]:
# Per-class precision, recall and F1 for the random-forest predictions.
print(metrics.classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      5880
           1       0.67      0.35      0.46      1620

    accuracy                           0.82      7500
   macro avg       0.76      0.65      0.68      7500
weighted avg       0.80      0.82      0.80      7500



In [31]:
# Fraction of test rows whose predicted label matches the actual label.
metrics.accuracy_score(y_test, y_pred_rf)

0.8222666666666667

## 5. Applying K-Nearest Neighbor (KNN) Algorithm

### - Setting the model and fitting

In [32]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier with the library's default settings.
knc = KNeighborsClassifier()
# Fit on the scaled training data.
knc.fit(x_train, y_train)

### - Predicting the Test set results

In [33]:
# Predict class labels for the held-out test rows.
y_pred_knn = knc.predict(x_test)
print(y_pred_knn)

[1 1 0 ... 0 0 1]


### - Let's see the actual and predicted values side by side, actual values on the left side and predicted values on the right hand side, printing the top 5 values

In [34]:
# Two-column array: column 0 holds the actual labels, column 1 the KNN
# predictions.
y_compare_knn = np.column_stack((y_test, y_pred_knn))
# First five rows.
y_compare_knn[:5]

array([[1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0]], dtype=int64)

### - Making the Confusion Matrix, Classification Report and Accuracy Score

In [35]:
# Confusion matrix: actual test labels passed first, predictions second.
metrics.confusion_matrix(y_test, y_pred_knn)

array([[5418,  462],
       [1066,  554]], dtype=int64)

In [36]:
# Per-class precision, recall and F1 for the KNN predictions.
print(metrics.classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      5880
           1       0.55      0.34      0.42      1620

    accuracy                           0.80      7500
   macro avg       0.69      0.63      0.65      7500
weighted avg       0.77      0.80      0.78      7500



In [37]:
# Fraction of test rows whose predicted label matches the actual label.
metrics.accuracy_score(y_test, y_pred_knn)

0.7962666666666667

## 6. Applying Artificial Neural Network (ANN) Algorithm

### - Setting the model and fitting

In [38]:
import keras
from keras.models import Sequential
from keras.layers import Dense

# Small fully connected binary classifier: two ReLU hidden layers (12 and 7
# units) followed by one sigmoid unit.
ann = Sequential()
# Take the input width from the training matrix instead of hard-coding it, so
# the network stays in step with however many feature columns x_train has.
ann.add(Dense(12, activation='relu', input_shape=(x_train.shape[1],)))
ann.add(Dense(7, activation='relu'))
ann.add(Dense(1, activation='sigmoid'))
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch.
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): no random seed is set in this cell, so the trained weights (and
# the scores reported below) may differ between runs -- confirm if that matters.
ann.fit(x_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x185153ca5c0>

### - Predicting the Test set results

In [39]:
# Raw network outputs for the test rows (one value per row).
y_pred_ann = ann.predict(x_test)
print(y_pred_ann)
# Threshold at 0.5: True where the output exceeds 0.5, False otherwise.
y_pred_ann = np.greater(y_pred_ann, 0.5)
y_pred_ann

[[0.5283924 ]
 [0.67364126]
 [0.23568837]
 ...
 [0.02337574]
 [0.59596246]
 [0.8133173 ]]


array([[ True],
       [ True],
       [False],
       ...,
       [False],
       [ True],
       [ True]])

In [40]:
# Predict afresh so this cell does not depend on what y_pred_ann currently
# holds, then threshold the first output column at 0.5 in one vectorised step
# to get a flat array of 0/1 labels.
y_pred_ann = (ann.predict(x_test)[:, 0] > 0.5).astype(int)
print(y_pred_ann)

[1 1 0 ... 0 1 1]


### - Let's see the actual and predicted values side by side, actual values on the left side and predicted values on the right hand side, printing the top 5 values

In [41]:
# Two-column array: column 0 holds the actual labels, column 1 the ANN
# predictions.
y_compare_ann = np.column_stack((y_test, y_pred_ann))
# First five rows.
y_compare_ann[:5]

array([[1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0]], dtype=int64)

### - Making the Confusion Matrix, Classification Report and Accuracy Score

In [42]:
# Confusion matrix: actual test labels passed first, predictions second.
metrics.confusion_matrix(y_test, y_pred_ann)

array([[5524,  356],
       [ 999,  621]], dtype=int64)

In [43]:
# Per-class precision, recall and F1 for the ANN predictions.
print(metrics.classification_report(y_test, y_pred_ann))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      5880
           1       0.64      0.38      0.48      1620

    accuracy                           0.82      7500
   macro avg       0.74      0.66      0.68      7500
weighted avg       0.80      0.82      0.80      7500



In [44]:
# Fraction of test rows whose predicted label matches the actual label.
metrics.accuracy_score(y_test, y_pred_ann)

0.8193333333333334