# Hyperparameters specified in the paper

In [27]:
# no random seed given
TESTING_SPLIT = 0.2
VALIDATION_SPLIT = 0.2

# Hyperparameters
BATCHSIZE = 1024
EPOCH = 300
LEARNING_RATE = 0.0001

# Importing libraries

Using tensorflow implement the multilayer for the sparse auto encoder

In [28]:
# https://www.geeksforgeeks.org/sparse-autoencoders-in-deep-learning/
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping


from tensorflow.keras import layers, Input, Model
from sklearn.svm import SVC
from sklearn.metrics import fbeta_score, precision_score, recall_score, classification_report

# Importing data and data preprocessing

Importing data from mounted drive of the credit card data from [Kaggle](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud).

Display the same data as shown in the paper as sample of data.

Data preprocessing
- Convert time from seconds to hour of the day
- Apply log transformation to amount as right skewed
- Split data into training and testing
- Standardscaler fitted to training set then apply to training and testing

# Sparse Autoencoding Model

In [29]:
def SAE_Model():
  inputs = Input(shape=(30,))

  encoder_1 = layers.Dense(100, activation='relu', name='encoder_1')(inputs)
  encoded = layers.Dense(20, activation='relu', name='encode',
                        kernel_regularizer=tf.keras.regularizers.L1(0.005))(encoder_1)

  decoder_1 = layers.Dense(100, activation='relu', name='decoder_1')(encoded)
  outputs = layers.Dense(30, activation='linear', name='output')(decoder_1)

  autoencoder = Model(inputs=inputs, outputs=outputs, name='sparse_autoencoder')

  adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

  autoencoder.compile(
      optimizer=adam_optimizer,
      loss='mse'
  )

  return autoencoder, encoded, inputs

# Comparison analysis between SAE-SVM vs SVM

In [30]:
def calc_metrics(y_true, y_pred):
    f2 = fbeta_score(y_true, y_pred, beta=2)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    return f2, precision, recall

In [31]:
sae_output_preds = []
svm_output_preds = []
y_output_test = []

In [32]:
sae_metrics = []
svm_metrics = []

f2_scores = []
precision_scores = []
recall_scores = []

f2_scores_sae = []
precision_scores_sae = []
recall_scores_sae = []

# Five Fold Validation

In [33]:
def five_fold_validation(dataset):
  skf = StratifiedKFold(n_splits=5, shuffle=True)
  undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
  oversample = RandomOverSampler(sampling_strategy=1.0, random_state=42)

  for i, (train_idx, test_idx) in enumerate(skf.split(dataset, dataset['Class'])):
    print(f"Fold {i+1}")
    print(len(train_idx), len(test_idx))

    # Use 4 subsets for training and 1 for testing
    train_subset = dataset.iloc[train_idx]
    test_subset = dataset.iloc[test_idx]

    print(train_subset[train_subset['Class'] == 0].shape, train_subset[train_subset['Class'] == 1].shape)
    print(test_subset[test_subset['Class'] == 0].shape, test_subset[test_subset['Class'] == 1].shape)

    # Split the training
    X_train_split = train_subset.drop('Class', axis=1)
    y_train_split = train_subset['Class']
    X_test_split = test_subset.drop('Class', axis=1)
    y_test_split = test_subset['Class']

    # Scale the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_split)
    X_test_scaled = scaler.transform(X_test_split)

    # Undersample the data
    # print("Resampling")
    # X_train_undersample, y_train_undersample = undersample.fit_resample(X_train_scaled, y_train_split)
    # print(len(X_train_undersample))
    # X_train_oversample, y_train_oversample = oversample.fit_resample(X_train_undersample, y_train_undersample)

    X_train_final = X_train_scaled
    y_train_final = y_train_split
    # print(np.sum(y_train_final == 0), np.sum(y_train_final == 1), len(y_train_final))

    # SAE Model
    autoencoder, encoded, inputs = SAE_Model()

    earlyStop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, min_delta=0.0075)
    sae = autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=EPOCH,
                batch_size=BATCHSIZE,
                verbose=False,
                callbacks=[earlyStop],
                validation_data=(X_test_scaled, X_test_scaled))

    # Middle layer encoder to output
    encoder = Model(inputs=inputs, outputs=encoded, name='encoder')
    encoder_output = encoder.predict(X_train_final)

    sae_svm_model = SVC(kernel='rbf', C=1.0, random_state=42)
    sae_svm_model.fit(encoder_output, y_train_final)

    # SAE-SVM predictions
    sae_svm_preds = sae_svm_model.predict(encoder.predict(X_test_scaled))
    sae_output_preds.append(sae_svm_preds)

    # SVM predictions
    svm_model = SVC(kernel='rbf', C=1.0, random_state=42)
    svm_model.fit(X_train_final, y_train_final)
    svm_preds = svm_model.predict(X_test_scaled)
    svm_output_preds.append(svm_preds)

    y_output_test.append(y_test_split)

# Five Fold Cross Validation Output

In [34]:
df = pd.read_csv('creditcard.csv')

df = df.dropna()
# convert the time in dataframe to hour of the day
df['Time'] = df['Time'].apply(lambda x: x/3600 % 24)

# log on the amounts, handle zero
# Gemini suggestion to fix issue of 0 in "Amount" column
df['Amount'] = df['Amount'].apply(lambda x: np.log(x + 1e-10) if x > 0 else np.log(1e-10)) # Added a small constant (1e-10) to handle zero

In [35]:
five_fold_validation(df)

Fold 1
227845 56962
(227452, 31) (393, 31)
(56863, 31) (99, 31)
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 467us/step
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 459us/step
Fold 2
227845 56962
(227452, 31) (393, 31)
(56863, 31) (99, 31)
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 395us/step
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 452us/step
Fold 3
227846 56961
(227452, 31) (394, 31)
(56863, 31) (98, 31)
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 446us/step
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 437us/step
Fold 4
227846 56961
(227452, 31) (394, 31)
(56863, 31) (98, 31)
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 492us/step
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 454us/step
Fold 5
227846 56961
(227452, 31) (394, 31)
(56863, 31) (98, 31)
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [36]:
for i in range(5):
    # Calculate metrics for SVM
    f2, precision, recall = calc_metrics(y_output_test[i], svm_output_preds[i])
    f2_scores.append(f2)
    precision_scores.append(precision)
    recall_scores.append(recall)

    # Calculate metrics for SAE-SVM
    f2_sae, precision_sae, recall_sae = calc_metrics(y_output_test[i], sae_output_preds[i])
    f2_scores_sae.append(f2_sae)
    precision_scores_sae.append(precision_sae)
    recall_scores_sae.append(recall_sae)

    # Append classification reports
    svm_metrics.append(classification_report(y_output_test[i], svm_output_preds[i]))
    sae_metrics.append(classification_report(y_output_test[i], sae_output_preds[i]))

In [37]:
print(f"SVM F2 Scores: {f2_scores}")
print(f"SVM Precision Scores: {precision_scores}")
print(f"SVM Recall Scores: {recall_scores}")

print(f"SAE-SVM F2 Scores: {f2_scores_sae}")
print(f"SAE-SVM Precision Scores: {precision_scores_sae}")
print(f"SAE-SVM Recall Scores: {recall_scores_sae}")

print(f"Average SVM F2 Score: {np.mean(f2_scores)}")
print(f"Average SVM Precision Score: {np.mean(precision_scores)}")
print(f"Average SVM Recall Score: {np.mean(recall_scores)}")

print(f"Average SAE-SVM F2 Score: {np.mean(f2_scores_sae)}")
print(f"Average SAE-SVM Precision Score: {np.mean(precision_scores_sae)}")
print(f"Average SAE-SVM Recall Score: {np.mean(recall_scores_sae)}")

SVM F2 Scores: [np.float64(0.5862831858407079), np.float64(0.7371794871794872), np.float64(0.6971677559912854), np.float64(0.7065217391304348), np.float64(0.7251082251082251)]
SVM Precision Scores: [np.float64(0.9464285714285714), np.float64(0.9583333333333334), np.float64(0.9552238805970149), np.float64(0.9558823529411765), np.float64(0.9571428571428572)]
SVM Recall Scores: [np.float64(0.5353535353535354), np.float64(0.696969696969697), np.float64(0.6530612244897959), np.float64(0.6632653061224489), np.float64(0.6836734693877551)]
SAE-SVM F2 Scores: [np.float64(0.6318082788671024), np.float64(0.7537154989384289), np.float64(0.7905982905982906), np.float64(0.7618025751072961), np.float64(0.7855626326963907)]
SAE-SVM Precision Scores: [np.float64(0.9206349206349206), np.float64(0.9466666666666667), np.float64(0.9736842105263158), np.float64(0.9594594594594594), np.float64(0.9367088607594937)]
SAE-SVM Recall Scores: [np.float64(0.5858585858585859), np.float64(0.7171717171717171), np.floa