In [1]:
# Import Statements

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
import pickle

In [2]:
# Step 1 : Load data set
data = pd.read_csv('credit_card_data.csv')
data = data.drop(['ID'], axis=1)

null_counts = data.isnull().sum()
print("Missing Values Count : " ,null_counts)

print("\nColumn Names:")
print(data.columns.tolist())

print("Number of Data Points")
print(data.shape)

Missing Values Count :  LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
DEFAULT      0
dtype: int64

Column Names:
['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'DEFAULT']
Number of Data Points
(30000, 24)


In [3]:
# Step 2 : Scaling and One Hot Encoding
num_c = ['LIMIT_BAL', 'AGE', 
                'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3','BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 
                'PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6' ]
cat_c = ['SEX', 'MARRIAGE']

data[num_c] = StandardScaler().fit_transform(data[num_c])
data = pd.get_dummies(data, columns=cat_c, drop_first=True)

bool_c = data.select_dtypes(include=['bool']).columns

# Convert them to int (True → 1, False → 0)
data[bool_c] = data[bool_c].astype(int)
print(data.iloc[0])

LIMIT_BAL    -1.136720
EDUCATION     2.000000
AGE          -1.246020
PAY_0         2.000000
PAY_2         2.000000
PAY_3        -1.000000
PAY_4        -1.000000
PAY_5        -2.000000
PAY_6        -2.000000
BILL_AMT1    -0.642501
BILL_AMT2    -0.647399
BILL_AMT3    -0.667993
BILL_AMT4    -0.672497
BILL_AMT5    -0.663059
BILL_AMT6    -0.652724
PAY_AMT1     -0.341942
PAY_AMT2     -0.227086
PAY_AMT3     -0.296801
PAY_AMT4     -0.308063
PAY_AMT5     -0.314136
PAY_AMT6     -0.293382
DEFAULT       1.000000
SEX_2         1.000000
MARRIAGE_1    1.000000
MARRIAGE_2    0.000000
MARRIAGE_3    0.000000
Name: 0, dtype: float64


In [4]:
# Step 3 : Train and Split
y = data['DEFAULT']
X = data.drop('DEFAULT', axis=1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, stratify=y, random_state = 17)

In [5]:
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 200, random_state = 42, max_depth = 3)

In [6]:
# Train the model on training data
rf.fit(X_train, y_train)

In [7]:
with open("rf_classifier_model.pkl", "wb") as file:
    pickle.dump(rf, file)

In [8]:
y_predict = rf.predict(X_test)

In [9]:
# Accuracy of the classifier

acc = accuracy_score(y_test, y_predict)
print(acc * 100 , "%")

80.98333333333333 %


In [10]:
# Step 6 : Build the confusion matrix

conf_matrix = confusion_matrix(y_test, y_predict) 
print('Confusion Matrix :')
print(conf_matrix)

Confusion Matrix :
[[4508  165]
 [ 976  351]]


In [11]:
# Step 7 : Classification Report
print('Classification Report: ')
print(classification_report(y_test, y_predict))

Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.96      0.89      4673
           1       0.68      0.26      0.38      1327

    accuracy                           0.81      6000
   macro avg       0.75      0.61      0.63      6000
weighted avg       0.79      0.81      0.78      6000



In [12]:
# Step 8 : AUC Score
y_prob = rf.predict_proba(X_test)[:,1]
auc_score = roc_auc_score(y_test, y_prob)

print("AUC:", auc_score)

AUC: 0.7710272951236972


In [13]:
# Step 9 : Mathews Correlation Coefficient ( MCC Score )

mcc = matthews_corrcoef(y_test, y_predict)

print(" MCC : ", mcc)

 MCC :  0.3392884315832388
