In [1]:
# Import Statements

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
import pickle

In [2]:
# Step 1 : Load data set
data = pd.read_csv('credit_card_data.csv')
data = data.drop(['ID'], axis=1)

null_counts = data.isnull().sum()
print("Missing Values Count : " ,null_counts)

print("\nColumn Names:")
print(data.columns.tolist())

print("Number of Data Points")
print(data.shape)

Missing Values Count :  LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
DEFAULT      0
dtype: int64

Column Names:
['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'DEFAULT']
Number of Data Points
(30000, 24)


In [3]:
# Step 2 : One hot Encoding only the Categorical - Nominal Data
cat_c = ['SEX', 'MARRIAGE']

data = pd.get_dummies(data, columns=cat_c, drop_first=True)
bool_c = data.select_dtypes(include=['bool']).columns

# Convert them to int (True → 1, False → 0)
data[bool_c] = data[bool_c].astype(int)
print(data.iloc[0])



LIMIT_BAL     20000
EDUCATION         2
AGE              24
PAY_0             2
PAY_2             2
PAY_3            -1
PAY_4            -1
PAY_5            -2
PAY_6            -2
BILL_AMT1      3913
BILL_AMT2      3102
BILL_AMT3       689
BILL_AMT4         0
BILL_AMT5         0
BILL_AMT6         0
PAY_AMT1          0
PAY_AMT2        689
PAY_AMT3          0
PAY_AMT4          0
PAY_AMT5          0
PAY_AMT6          0
DEFAULT           1
SEX_2             1
MARRIAGE_1        1
MARRIAGE_2        0
MARRIAGE_3        0
Name: 0, dtype: int64


In [4]:
# Step 3 : Train Split Data
y = data['DEFAULT']
X = data.drop('DEFAULT', axis=1)

X_train,X_test,y_train,y_test= train_test_split(X, y, test_size = 0.20, stratify=y, random_state = 17)

In [5]:
# Step 4 : Train Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)

In [6]:
with open("nb_model.pkl", "wb") as file:
    pickle.dump(nb, file)

In [7]:
# Step 5 : Predict
y_pred = nb.predict(X_test)


In [8]:
# Step 6 : Accuracy Score
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.37633333333333335


In [9]:
# Step 7 : Confusion Matrix

conf_matrix = confusion_matrix(y_test, y_pred) 
print('Confusion Matrix :')
print(conf_matrix)

Confusion Matrix :
[[1071 3602]
 [ 140 1187]]


In [10]:
# Step 8 : Classification Report
print('Classification Report: ')
print(classification_report(y_test, y_pred))

Classification Report: 
              precision    recall  f1-score   support

           0       0.88      0.23      0.36      4673
           1       0.25      0.89      0.39      1327

    accuracy                           0.38      6000
   macro avg       0.57      0.56      0.38      6000
weighted avg       0.74      0.38      0.37      6000



In [11]:
# Step 8 : AUC Score
y_prob = nb.predict_proba(X_test)[:,1]
auc_score = roc_auc_score(y_test, y_prob)

print("AUC:", auc_score)

AUC: 0.6878182817129492


In [12]:
# Step 9 : Mathews Correlation Coefficient ( MCC Score )

mcc = matthews_corrcoef(y_test, y_pred)

print(" MCC : ", mcc)

 MCC :  0.1278986095979632
