# Credit Card Fraud Detection

In [None]:
# Install the package that provides `imblearn.over_sampling.SMOTE`, which is imported below.
# NOTE(review): the install is not version-pinned — consider pinning for reproducibility.
%pip install imblearn

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
import os, sys; sys.path.append(os.path.dirname(os.getcwd()))

from src.loadingdata.read_dataset import readData
from src.features.data_preprocessing import preprocessData
from src.visualization.visualize import visualizeData, DisplayCallback
from src.modules.build_model import buildModel
from src.modules.train_model import  trainModel
from src.modules.predict_model import predictor
from src.hyper_parameters.hps import get_hyper_paras
from src.github_commands.git_utils import gitCommands


In [13]:
# Unpack the project-level settings returned by get_hyper_paras().
# NOTE(review): none of these names are referenced again in the visible notebook
# (the training cell hard-codes its batch size and epoch count) — confirm whether
# BATCH / EPOCHS are meant to be used there.
BATCH,STEPS_PER_EPOCH,VALIDATION_STEPS,EPOCHS,VAL_SUBSPLITS,FINE_TUNE,model_dir,refRepoName,sourceRepoName,refRepoDir = get_hyper_paras()

In [None]:
# import zipfile
# with zipfile.ZipFile('../data/creditcard.csv.zip', 'r') as zip_ref:
#     zip_ref.extractall('../data/')

In [None]:
# Load the raw transactions table from the project's data directory.
csv_path = "../data/creditcard.csv"
data = pd.read_csv(csv_path)

In [None]:
# (rows, columns) of the loaded table.
data.shape

In [None]:
# Preview the first few rows.
data.head()

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Column dtypes and non-null counts.
data.info()

Let's see how the features are distributed with respect to our target variable, "Class".

In [None]:
# Overlay, for every input feature, the kernel density estimate of the
# "Class = 0" rows and the "Class = 1" rows so the two distributions can be compared.
var = data.columns.values  # kept so any later reference to `var` still resolves

# The target column itself is not a feature, so it is left out of the grid.
feature_names = [col for col in var if col != 'Class']

t0 = data.loc[data['Class'] == 0]
t1 = data.loc[data['Class'] == 1]

# Size the grid from the number of features instead of hard-coding 8x4.
n_cols = 4
n_rows = max(1, -(-len(feature_names) // n_cols))  # ceiling division

sns.set_style('whitegrid')
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3.5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, feature in zip(axes, feature_names):
    # `bw_method` replaces the deprecated `bw` keyword of sns.kdeplot.
    sns.kdeplot(t0[feature], bw_method=0.5, label="Class = 0", ax=ax)
    sns.kdeplot(t1[feature], bw_method=0.5, label="Class = 1", ax=ax)
    ax.set_xlabel(feature, fontsize=12)
    ax.tick_params(axis='both', which='major', labelsize=12)

# Hide any grid cells left over after the last feature.
for ax in axes[len(feature_names):]:
    ax.set_visible(False)

# The line labels are only rendered once a legend is requested.
if feature_names:
    axes[0].legend()

fig.tight_layout()
plt.show()

To work with this dataset, we need to bring all the variables onto the same scale. Before that, we will do a little more exploration.

In [None]:
# Heatmap of the pairwise correlations between all columns (target included).
corr = data.corr()

plt.figure(figsize=(16, 10))
corr_ax = sns.heatmap(
    corr,
    annot=True,
    fmt='.1f',
    cmap="Greens",
    linewidths=.1,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)
corr_ax.set_title('Credit Card Transactions features correlation plot')
plt.show()

Our data is pretty imbalanced as can be seen just below.

In [None]:
# Bar chart of how many rows fall into each target class.
class_freq_ax = data["Class"].value_counts().plot.bar(color="red")
class_freq_ax.set_title("Frequency of the target classes", size=20)
class_freq_ax.set_xlabel("Target Labels", size=18)

Below are the exact frequencies of both target labels.

In [None]:
# One-column table of per-class row counts, indexed by class label.
target = data["Class"].value_counts().to_frame()
target.style.background_gradient(cmap="Reds")

In [None]:
# Count stored in the second row of `target`.
# NOTE(review): presumably the minority (fraud) class, since value_counts() lists the
# most frequent label first — confirm.
target.iloc[1].values[0]

In [None]:
# Inverse-frequency class weights, consumed later by the Keras training cell.
# Counts are looked up by class label (.loc) rather than by row position (.iloc), so the
# result does not depend on the order in which value_counts() happens to list the classes.
# NOTE(review): the extra factor of 100 on the positive class is an unexplained
# multiplier — confirm it is intended.
weight_for_0 = 1.0 / target.loc[0].values[0]
weight_for_1 = (1.0 / target.loc[1].values[0]) * 100


Now we will standardize all our input features. To make that easier, we first separate the input features from the output feature.

In [None]:
# Separate the label column from the predictor columns.
y = data["Class"]
X = data.drop("Class", axis=1)

In [None]:
# Standardize every predictor column and rebuild a DataFrame with the original column names.
# NOTE(review): the scaling statistics are computed on the full dataset, before the
# train/test split performed further down — confirm this is acceptable for the evaluation.
names = X.columns
scaled_df = pd.DataFrame(preprocessing.scale(X), columns=names)

In [None]:
# Preview the standardized features.
scaled_df.head()

Looking at the Time and Amount features, we can see that the features have been scaled.

In [None]:
# Summary statistics of the two standardized columns.
scaled_df[["Amount","Time"]].describe()

Because we are dealing with an imbalanced dataset, we will try to balance it using a technique called "**SMOTE**", short for **Synthetic Minority Over-sampling Technique**. It is a form of over-sampling, but instead of simply duplicating the minority class, it produces synthetic data. In my opinion this makes it much better than simple over-sampling, which just randomly duplicates minority-class samples to balance the data. Another way to handle unbalanced data is "Down-Sampling", but I am not a big fan of that technique because a lot of data is lost in the process.


# Splitting the Data


Now we will split the standardized dataset into training and test sets, over-sample only the training set, and then train the classifiers on the over-sampled training data.

In [None]:
# Stratified 70/30 split so both partitions keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df,
    y,
    test_size=0.30,
    stratify=y,
    shuffle=True,
    random_state=0,
)

In [None]:
# Shapes of the training and test feature matrices.
X_train.shape, X_test.shape

Let's also check a few things about the split data before we proceed.

In [None]:
# Class counts in the training labels.
y_train.value_counts()

In [None]:
# Class counts in the test labels.
y_test.value_counts()

# SMOTE


Now we will start the process.

In [None]:
# Over-sample the minority class of the training split only; the test split is left untouched.
sm = SMOTE(random_state=33)
# Series.ravel() is deprecated in pandas; to_numpy() yields the same 1-D label array.
X_train_new, y_train_new = sm.fit_resample(X_train, y_train.to_numpy())

Now we will see whether it has been balanced or not.

In [None]:
# Bar chart of the class counts after resampling.
resampled_counts = pd.Series(y_train_new).value_counts()
resampled_counts.plot.bar()

So it is pretty much balanced now, and we can build our predictive model with it now.

# Using Logistic Regression

In [None]:
# Fit a logistic-regression baseline on the resampled training data, then predict on
# both the (resampled) training set and the untouched test set.
clf = LogisticRegression(solver='lbfgs').fit(X_train_new, y_train_new)
train_pred, test_pred = clf.predict(X_train_new), clf.predict(X_test)

In [None]:
# accuracy_score expects (y_true, y_pred); the original call had them reversed. The value is
# the same for accuracy, but the order now matches the API and the confusion_matrix call
# in the next cell.
print('Accuracy score for Training Dataset = ', accuracy_score(y_train_new, train_pred))
print('Accuracy score for Testing Dataset = ', accuracy_score(y_test, test_pred))

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set.
cm = confusion_matrix(y_true=y_test, y_pred=test_pred)
cm

# Confusion Matrix for Logistic Regression Model

In [None]:
# Annotated heatmap of the logistic-regression confusion matrix.
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
logreg_cm_ax = sns.heatmap(cm, annot=True, fmt='g', cmap="Reds", cbar=False)
logreg_cm_ax.set_xlabel("Predicted Label", size=18)
logreg_cm_ax.set_ylabel("True Label", size=18)
logreg_cm_ax.set_title("Confusion Matrix Plotting for Logistic Regression model", size=20)

From the confusion matrix above, we can see that the number of wrong classifications for class 0 ("no fraud") is 2018 out of 85295, and the number of wrong classifications for class 1 ("fraud happened") is 13 out of 148. The same figures are expressed as percentages below.

In [None]:
# Derive the per-class error rates from the confusion matrix itself instead of
# hard-coding numbers copied from an earlier run.
# For binary labels, confusion_matrix(...).ravel() is ordered (tn, fp, fn, tp).
tn, fp, fn, tp = cm.ravel()
print("Percentage for 'no fraud' cases wrong classification using Logistic Regression is:", (fp / (tn + fp)) * 100)
print("Percentage for 'Fraud' cases wrong prediction Logistic Regression is:", (fn / (fn + tp)) * 100)

# Using Neural Network

# Model Architecture

In [None]:
# First candidate architecture: an input-width dense layer followed by two 256-unit
# hidden blocks (Dense -> BatchNormalization -> Dropout) and a single sigmoid output.
# NOTE(review): the next cell assigns a different network to `model`, so when the
# notebook is run top to bottom this definition is replaced before training.
n_features = X_train_new.shape[1]

model = Sequential([
    Dense(n_features, activation='relu', input_dim=n_features),
    BatchNormalization(),

    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(1, activation='sigmoid'),
])

In [None]:


# Second architecture (the one that is compiled and trained below): four ReLU dense
# layers of 128, 18, 20 and 24 units with one Dropout(0.25) after the second layer,
# followed by a single sigmoid output unit.
input_width = X_train_new.shape[1]

model = Sequential()
model.add(Dense(units=128, kernel_initializer='uniform', input_dim=input_width, activation='relu'))
model.add(Dense(units=18, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(20, kernel_initializer='uniform', activation='relu'))
model.add(Dense(24, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))



The first architecture above uses **Batch Normalization** and **Dropout** layers; the second definition, which replaces it and is the one actually trained, uses only **Dropout**. In both, the activation function for the hidden layers is "relu", and for the output layer it is the "sigmoid" function.

Now let's compile the model.

In [None]:
# Track the raw confusion-matrix counts plus precision and recall during training;
# with a heavily imbalanced target these are more informative than accuracy.
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]
# `lr` is the deprecated spelling of this argument: depending on the Keras version it is
# either ignored with a warning or rejected, so the intended 1e-4 rate was not reliably applied.
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='binary_focal_crossentropy', metrics=metrics)

To protect the model from overfitting, we use TensorFlow's early-stopping callback: once the monitored metric (here, the validation loss) has stopped improving for a set number of epochs, training is halted instead of running for all the remaining epochs.

In [None]:
# Per-class loss weights, keyed by class label, for model.fit.
class_weight = {
    0: weight_for_0,
    1: weight_for_1,
}

# Stop training once the validation loss has not improved for 10 consecutive epochs.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    verbose=1,
)

In [None]:
# Train on the SMOTE-resampled training data with early stopping.
# NOTE(review): the test split doubles as the validation set that drives early stopping,
# and class weights are applied on top of already-resampled data — confirm both are intended.
history = model.fit(
    X_train_new,
    y_train_new,
    validation_data=(X_test, y_test),
    batch_size=256,
    epochs=150,
    callbacks=[early_stop],
    class_weight=class_weight,
    verbose=1,
)

In [None]:
# Per-epoch training history as a table (all tracked metrics are kept in `evaluation_metrics`).
evaluation_metrics = pd.DataFrame(history.history)

# Plot only the two loss curves: the title describes a loss plot, and the raw
# TP/FP/TN/FN counts would otherwise share the axis and flatten the losses.
evaluation_metrics[["loss", "val_loss"]].plot(figsize=(10, 5))
plt.title("Loss for both Training and Validation", size=20)

In [None]:

# Replacement for the removed `model.predict_classes`.
import numpy as np

# The network ends in a single sigmoid unit, so predict() returns one probability per row
# with shape (n_samples, 1). np.argmax over axis=1 of a one-column array is always 0, which
# would label every transaction "no fraud"; threshold the probability at 0.5 instead.
predict_x = model.predict(X_test)
y_pred = (predict_x > 0.5).astype(int).ravel()

In [None]:
# Confusion matrix of the neural-network predictions on the test set.
cm_nn = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_nn

# Confusion Matrix for Neural Network

In [None]:
# Annotated heatmap of the neural-network confusion matrix.
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
nn_cm_ax = sns.heatmap(cm_nn, annot=True, fmt='g', cmap="winter", cbar=False)
nn_cm_ax.set_xlabel("Predicted Label", size=18)
nn_cm_ax.set_ylabel("True Label", size=18)
nn_cm_ax.set_title("Confusion Matrix Plotting for Neural Network model", size=20)

Comparing this with the Logistic Regression model, the neural network predicts the majority class (0, or "**No Fraud**") very well, but for the minority class (1, or "**Fraud**") it performs slightly worse than Logistic Regression. With a little more hyperparameter tuning, I expect the model could outperform Logistic Regression even on the **minority class**.