<a href="https://colab.research.google.com/github/mohkharma/Credit-Card-Fraud-Detection-Using-Machine-Learning-Comparative-Study/blob/main/credit_card_fraud_detection_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Credit-Card-Fraud-Detection-Using-Machine-Learning-Comparative-Study**
Author: Mohammed Kharma, Feb-2023


This book is using the dataset taken from:

[Credit Card Fraud on Kaggle](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud)



-------------------------------------------------------------------

In [None]:
#Importing the required libraries:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
#Read the dataset into DataFrame using pandas

credit_card_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/creditcard.csv')

In [None]:
# dataset informations
credit_card_data.info()

In [None]:
# head 5 rows of the dataset
credit_card_data.head()

In [None]:
# tail 5 rows of the dataset
credit_card_data.tail()

In [None]:
# dataset description
credit_card_data.describe()

In [None]:
# missing values in each column check
credit_card_data.isnull().sum()

In [None]:
# shape of the data
credit_card_data.shape

(284807, 31)

In [None]:
# legit and fraudulent transactions per class
credit_card_data['Class'].value_counts()

In [None]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

print(legit.shape)
print(fraud.shape)

In [None]:
# statistical measures of the data
legit.Amount.describe()

In [None]:
fraud.Amount.describe()

In [None]:
# compare the values for both transactions so we can compare the generated sample from the legitmite df where it need to refelect relevint means
credit_card_data.groupby('Class').mean()

Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [None]:
#legit_test = legit.sample(n=2000)
legit_test = legit.iloc[0:2001,0:32]
legit = legit.iloc[2002:,0:32]

In [None]:
from sklearn.utils import shuffle
legit_sample = legit.sample(n=492)

In [None]:
legit_sample.shape

(492, 31)

In [None]:
legit_test

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1998,1542.0,1.194910,0.093867,-0.073016,1.008538,0.454666,0.809557,-0.069582,0.219049,0.146503,...,-0.126805,-0.180904,-0.220110,-1.134723,0.779307,-0.257532,0.032098,-0.005676,13.08,0
1999,1542.0,-0.090760,0.430191,0.587889,-1.468904,0.315714,-0.765142,0.737765,-0.348267,-1.877661,...,0.238175,0.619908,-0.381979,-0.387806,0.492030,-0.078039,0.023719,0.036965,25.00,0
2000,1544.0,-0.781938,1.594474,2.067660,1.823249,0.260623,-0.502556,1.090112,-0.490576,-1.142105,...,-0.113361,-0.411819,0.006180,0.681210,-0.286224,-0.412347,-0.663629,-0.134151,9.35,0
2001,1545.0,-0.693979,0.863780,1.782080,-0.621203,-0.034457,-0.556248,0.704357,0.061961,-0.290994,...,-0.124982,-0.122718,-0.158336,0.590370,-0.014215,0.344616,0.284906,0.157360,5.00,0


Concatenating two DataFrames

In [None]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [None]:
new_dataset.head()
new_dataset.tail()
new_dataset['Class'].value_counts()
new_dataset.groupby('Class').mean()


Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,96224.72561,0.002366,0.075517,0.023714,0.059047,0.104433,-0.05049,0.010652,-0.029354,-0.001286,...,-0.083836,-0.045855,0.051065,0.001702,-0.029183,0.014025,0.010112,0.008844,-0.028621,64.140305
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Splitting the data into Features & Targets

In [None]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [None]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
117840   74838.0  1.135858 -0.106367  1.001555  0.693739 -0.942031 -0.478576   
131278   79541.0 -1.089192  1.033084  1.142932  0.272566  1.303121 -0.979236   
246887  153375.0  0.128395  0.968289 -0.423312 -0.536925  0.853749 -0.762438   
280607  169640.0 -0.415778  0.712262  0.690968 -0.793964 -0.239076 -0.661871   
220254  142089.0 -0.952154  1.532240 -0.172828 -0.338676 -0.689891 -1.764746   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [None]:
print(Y)

117840    0
131278    0
246887    0
280607    0
220254    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


Split the data into Training data & Testing Data

In [None]:
X.shape

(984, 30)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

(984, 30) (787, 30) (197, 30)


Model Training

Logistic Regression

In [None]:
model = LogisticRegression()

In [None]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

Model Evaluation

Accuracy Score

In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

print('Accuracy on Training data : ', training_data_accuracy)

# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print('Accuracy score on Test Data : ', test_data_accuracy)



Accuracy on Training data :  0.951715374841169
Accuracy score on Test Data :  0.934010152284264


In [None]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.datasets import make_classification
# import sklearn
# temp_dataset = pd.concat([legit, fraud], axis=0)
# X_temp = temp_dataset.drop(columns='Class', axis=1)
# Y_temp = temp_dataset['Class']

# X, y = make_classification(n_samples=5000, n_features=2, n_informative=2,
#                                             n_redundant=0, n_repeated=0, n_classes=2,
#                                             n_clusters_per_class=1,
#                                             weights=[0.95, 0.05],
#                                             class_sep=0.5, random_state=0)

# dataset_df = pd.DataFrame({'X1':X_temp[:,0],'X2':X_temp[:,1], 'Y':Y_temp})

In [None]:
#10 runs preperation using dimensionality reduction, tested and it showed no change in the F1 and accuricy
from sklearn.decomposition import PCA

legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

temp_dataset = pd.concat([legit, fraud], axis=0)
X_temp = temp_dataset.drop(columns='Class', axis=1)
Y_temp = temp_dataset['Class']

# Initialize PCA and fit the data
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_temp)

# Print the explained variance ratio for each principal component
print(pca.explained_variance_ratio_)

# Create a new dataframe with the reduced features
df_reduced = pd.DataFrame(X_reduced, columns=['PC1', 'PC2'])

# Add the target variable back to the dataframe
df_reduced['Class'] = Y_temp

legit = df_reduced[df_reduced.Class == 0]
fraud = df_reduced[df_reduced.Class == 1]

[9.99972249e-01 2.77382192e-05]


In [None]:
#10 runs 
from sklearn import ensemble

from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier  # Include
from sklearn.naive_bayes import GaussianNB   # Include
from sklearn.svm import SVC                      # Include
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

model = LogisticRegression()     # Include
# model = RandomForestClassifier(n_estimators=100)
#model = GaussianNB()
#model = SVC(kernel='linear', C=1.0)
#model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=6)

for i in range(1, 11):
  legit_new_sample = legit.sample(n=492)

  new_dataset = pd.concat([legit_new_sample, fraud], axis=0)
  X_new = new_dataset.drop(columns='Class', axis=1)
  Y_new = new_dataset['Class']

  X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split(X_new, Y_new, test_size=0.2, stratify=Y, random_state=2)

  model.fit(X_train_new, Y_train_new)
  # accuracy on training data
  X_train_prediction_new = model.predict(X_train_new)
  training_data_accuracy_new = accuracy_score(X_train_prediction_new, Y_train_new)

  print('Accuracy on Training data : ', training_data_accuracy_new)

  # accuracy on test data
  X_test_prediction_new = model.predict(X_test_new)
  test_data_accuracy_new = accuracy_score(X_test_prediction_new, Y_test_new)

  print(classification_report(X_test_prediction_new,Y_test_new))

Accuracy on Training data :  0.5743329097839899
              precision    recall  f1-score   support

           0       0.47      0.57      0.52        82
           1       0.64      0.55      0.59       115

    accuracy                           0.56       197
   macro avg       0.56      0.56      0.56       197
weighted avg       0.57      0.56      0.56       197

Accuracy on Training data :  0.5667090216010165
              precision    recall  f1-score   support

           0       0.42      0.55      0.48        76
           1       0.65      0.53      0.58       121

    accuracy                           0.54       197
   macro avg       0.54      0.54      0.53       197
weighted avg       0.56      0.54      0.54       197

Accuracy on Training data :  0.5476493011435832
              precision    recall  f1-score   support

           0       0.40      0.53      0.46        76
           1       0.63      0.51      0.57       121

    accuracy                          

In [None]:
# 10 Flods
# from sklearn.model_selection import KFold
# import numpy as np

# # Define the number of folds
# num_folds = 10


# # Generate some example data
# X = credit_card_data.drop(columns='Class', axis=1).values
# y = credit_card_data['Class'].values

# # Create a KFold object to split the data
# kf = KFold(n_splits=num_folds, shuffle=True)

# # Iterate over each fold
# for fold, (train_index, test_index) in enumerate(kf.split(X)):
#     # Get the training and testing data for this fold
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
    
#     # Train your model on the training data
#     model.fit(X_train, y_train)
    
#     # Test your model on the testing data
#     accuracy = model.score(X_test, y_test)
    
#     # Print out the accuracy for this fold
#     print(f"Fold {fold+1}: accuracy = {accuracy}")


In [None]:
from sklearn.metrics import classification_report

print(classification_report(X_test_prediction,Y_test))

              precision    recall  f1-score   support

           0       0.96      0.91      0.94       104
           1       0.91      0.96      0.93        93

    accuracy                           0.93       197
   macro avg       0.93      0.94      0.93       197
weighted avg       0.94      0.93      0.93       197



In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

print('Accuracy on Training data : ', training_data_accuracy)

# accuracy on test data
X_test_prediction = model.predict(legit_test.drop(columns='Class', axis=1))
test_data_accuracy = accuracy_score(X_test_prediction, legit_test['Class'])

print('Accuracy score on Test Data : ', test_data_accuracy)



import matplotlib.pyplot as plt
import numpy
from sklearn import metrics

actual = legit_test['Class']
predicted = X_test_prediction

confusion_matrix = metrics.confusion_matrix(actual, predicted)

cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

cm_display.plot()
plt.show()

# X1 = legit_test.drop(columns='Class', axis=1)
# Y1 = legit_test['Class']

Feature names unseen at fit time:
- Amount
- Time
- V1
- V10
- V11
- ...
Feature names seen at fit time, yet now missing:
- PC1
- PC2



ValueError: ignored

In [None]:
#calculate F1 score on imbalanced dataset
from sklearn.metrics import classification_report

print(classification_report(actual,predicted))