dataset link: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

In [None]:
!pip install -q kaggle

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/


In [None]:
# Restrict the API token file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the mlg-ulb/creditcardfraud dataset archive with the Kaggle CLI.
! kaggle datasets download -d mlg-ulb/creditcardfraud

In [None]:
! unzip -q /content/creditcardfraud.zip

# Loading python packages

**Packages for data loading, data analysis, and data preparation**

In [None]:

import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot

from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler

**Packages for model evaluation and classification models**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

**Packages for deep learning model**

In [None]:
!pip install tensorflow
!pip install tensorflow scikit-learn


In [None]:
!pip install tensorflow==2.9.1

from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


In [None]:
from pickle import dump
from pickle import load

In [None]:
# Load the extracted CSV into a DataFrame.
dataset = read_csv('creditcard.csv')

## Exploratory data analysis

In [None]:
# Dimensions of the loaded data as a (rows, columns) tuple
dataset.shape

In [None]:
# Preview the first five rows, with a wider text display so fewer columns wrap.
pd.set_option('display.width', 100)
dataset.head()

In [None]:
# Count transactions per class, printed with readable labels instead of the raw 0/1 codes.
class_names = {0: 'Not Fraud', 1: 'Fraud'}
class_counts = dataset['Class'].value_counts()
print(class_counts.rename(index=class_names))

***Data Visualization: Since the feature descriptions are not provided, visualizing the data will not lead to much insight. This step will be skipped in this case study.***

***Data Preparation: This data is from Kaggle and is already in a cleaned format without any empty rows or columns. Data cleaning or categorization is unnecessary.***

# **Evaluate Models**

**Train-Test split and evaluation metrics**

In [None]:
# Separate the target column from the feature columns.
Y = dataset['Class']
X = dataset.drop(columns='Class')

# Hold out 20% of the rows for validation. The classes are heavily imbalanced (the
# notebook later balances them by under-sampling), so the split is stratified on Y to
# keep the same fraud ratio in the training and validation partitions.
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed, stratify=Y
)

In [None]:
# Cross-validation settings shared by the spot-check cells: fold count and metric
num_folds, scoring = 10, 'accuracy'

In [None]:
# Baseline classifiers to spot-check, as (label, estimator) pairs with default settings
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
]


In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Cross-validate each candidate on the training split, keeping the per-fold scores
# for the comparison plot and printing mean (std) per model.
# The splitter has a fixed integer seed, so one instance yields the same folds for every model.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results, names = [], []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, scoring=scoring, cv=kfold)
    names.append(name)
    results.append(cv_results)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))


In [None]:
# Box plot of the per-fold scores, one box per algorithm
fig, ax = pyplot.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

In [None]:
# Fit a decision tree on the training split.
# random_state is fixed (as for the split and the CV folds) so the fitted tree, and every
# metric computed from it below, is reproducible across runs.
model = DecisionTreeClassifier(random_state=seed)
model.fit(X_train, Y_train)

In [None]:
# Score the fitted model on the held-out validation split: overall accuracy first,
# then the per-class classification report.
predictions = model.predict(X_validation)
for metric in (accuracy_score, classification_report):
    print(metric(Y_validation, predictions))

In [None]:
# Confusion matrix of the validation predictions, drawn as an annotated heatmap
# (rows = actual class, columns = predicted class).
class_labels = np.unique(Y_validation)
df_cm = pd.DataFrame(
    confusion_matrix(Y_validation, predictions),
    index=class_labels,
    columns=class_labels,
).rename_axis(index='Actual', columns='Predicted')
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, cmap="Blues")

**Model Tuning**

**Model tuning by choosing the correct evaluation metric**

In [None]:
# Switch the cross-validation metric from accuracy to recall for the spot-checks that follow.
scoring = "recall"

**Let us spot-check some basic classification algorithms for recall.**


In [None]:
# Rebuild the same four baseline classifiers as fresh, unfitted (label, estimator) pairs.
models = [
    (label, estimator_cls())
    for label, estimator_cls in (
        ('LR', LogisticRegression),
        ('LDA', LinearDiscriminantAnalysis),
        ('KNN', KNeighborsClassifier),
        ('CART', DecisionTreeClassifier),
    )
]

**Running cross validation:**

In [None]:
# Cross-validate each candidate with the current `scoring` metric and report mean (std).
# The splitter is seeded with an integer, so a single instance gives identical folds per model.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results, names = [], []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, scoring=scoring, cv=kfold)
    names.append(name)
    results.append(cv_results)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Fit LDA on the training split.
model = LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)

# Evaluate on the validation set. This section selects models by recall, so print the
# per-class report (which includes recall) in addition to plain accuracy.
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(classification_report(Y_validation, predictions))


**Model tuning: balancing the sample by random under-sampling**

In [None]:
df = pd.concat([X_train, Y_train], axis=1)

# Balance the classes by random under-sampling of the majority class.
# The number of non-fraud rows to keep is derived from the fraud rows actually present in
# the *training* split (a hard-coded count from the full dataset would leave the classes
# unequal), and the rows are drawn at random rather than taken from the top of the frame.
fraud_df = df.loc[df['Class'] == 1]
all_non_fraud_df = df.loc[df['Class'] == 0]
n_per_class = min(len(fraud_df), len(all_non_fraud_df))
non_fraud_df = all_non_fraud_df.sample(n=n_per_class, random_state=seed)

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# shuffle dataframe rows so the two classes are interleaved
df_new = normal_distributed_df.sample(frac=1, random_state=42)

# balanced training features / target; the original validation split is kept for the end
Y_train_new = df_new['Class']
X_train_new = df_new.loc[:, df_new.columns != 'Class']


**Let us look at the distribution of the classes in the dataset:**

In [None]:
import matplotlib.pyplot as plt

# Print the per-class row counts and total size of the under-sampled frame, then plot them.
print('Distribution of the Classes in the subsample dataset')
subsample_counts = df_new['Class'].value_counts()
print(subsample_counts, len(df_new))

sns.countplot(data=df_new, x='Class')
pyplot.title('Equally Distributed Classes', fontsize=14)
pyplot.show()

In [None]:
# Evaluation metric for the spot-check on the balanced sample
scoring = 'accuracy'

# Candidate models as (label, estimator) pairs. Each label appears exactly once:
# registering RF/ET twice would only repeat the same cross-validation work and
# duplicate their boxes in the comparison plot.
models = []

# basic classifiers
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# neural network
models.append(('NN', MLPClassifier()))

# ensemble models: boosting methods
models.append(('AB', AdaBoostClassifier()))
models.append(('GBM', GradientBoostingClassifier()))

# ensemble models: bagging methods
models.append(('RF', RandomForestClassifier()))
models.append(('ET', ExtraTreesClassifier()))

**Keras-based deep learning model**

In [None]:
# Function to create model, required for KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def create_model(neurons=12, activation='relu', learn_rate=0.01, momentum=0):
    """Build and compile a small feed-forward binary classifier.

    Architecture: an input-width Dense layer, a 32-unit Dense layer, and a single
    sigmoid output unit. The input width is read from the notebook-level `X_train`.

    Args:
        neurons: Accepted for interface compatibility but currently unused; the
            hidden layer width is fixed at 32.
        activation: Activation function for the two hidden Dense layers.
        learn_rate: Learning rate passed to the SGD optimizer.
        momentum: Momentum passed to the SGD optimizer.

    Returns:
        A compiled Keras `Sequential` model using binary cross-entropy loss and
        reporting accuracy.
    """
    n_features = X_train.shape[1]

    # create model
    model = Sequential()
    model.add(Dense(n_features, input_dim=n_features, activation=activation))
    model.add(Dense(32, activation=activation))
    model.add(Dense(1, activation='sigmoid'))

    # compile model — pass the configured SGD instance itself so that learn_rate and
    # momentum take effect (compiling with the string 'adam' silently ignored them).
    # `learning_rate` is the current argument name; `lr` is deprecated.
    optimizer = SGD(learning_rate=learn_rate, momentum=momentum)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Register the Keras network alongside the scikit-learn candidates.
models.append(('DNN', KerasClassifier(build_fn=create_model, epochs=50, batch_size=10, verbose=0)))

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt

# Spot-check every candidate on the balanced, under-sampled training set.
# Cross-validating on the full X, Y would (a) ignore the under-sampling this section is
# about and (b) include the held-out validation rows in model selection.
# Fold count, seed and metric come from the shared notebook settings.
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train_new, Y_train_new, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f"{name}: {cv_results.mean():.4f} ({cv_results.std():.4f})")

# Compare Algorithms
fig, ax = plt.subplots(figsize=(12, 8))
ax.boxplot(results)
# Tick labels are set on the axes (as in the earlier comparison plot) instead of via the
# boxplot `labels=` keyword, which newer Matplotlib releases deprecate.
ax.set_xticklabels(names)
ax.set_title('Algorithm Comparison')
plt.show()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LR: 0.9990 (0.0002)
LDA: 0.9994 (0.0001)
KNN: 0.9984 (0.0001)
CART: 0.9992 (0.0002)
NB: 0.9929 (0.0005)
SVM: 0.9983 (0.0000)
NN: 0.9978 (0.0018)
AB: 0.9991 (0.0002)
GBM: 0.9987 (0.0003)
RF: 0.9996 (0.0001)
ET: 0.9995 (0.0001)
RF: 0.9996 (0.0001)
ET: 0.9995 (0.0001)


  super(SGD, self).__init__(name, **kwargs)




  super(SGD, self).__init__(name, **kwargs)




  super(SGD, self).__init__(name, **kwargs)




  super(SGD, self).__init__(name, **kwargs)




  super(SGD, self).__init__(name, **kwargs)




  super(SGD, self).__init__(name, **kwargs)




  super(SGD, self).__init__(name, **kwargs)


In [None]:
# Grid Search: GradientBoosting tuning on the under-sampled training set.
# Every combination of tree count and tree depth is cross-validated with the
# current `scoring` metric.
param_grid = {
    'n_estimators': [20, 180, 1000],
    'max_depth': [2, 3, 5],
}
model = GradientBoostingClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(model, param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train_new, Y_train_new)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# Prepare the final model with the hyper-parameters the grid search actually selected,
# rather than values copied by hand from an earlier run (which go stale when the
# search result changes).
model = GradientBoostingClassifier(**grid_result.best_params_)
model.fit(X_train_new, Y_train_new)

# Estimate performance on the original validation set. That set keeps the original
# class imbalance, so the per-class report is printed alongside accuracy.
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
