<a href="https://colab.research.google.com/github/manish190502/financial-fraud-detection-using-generative-AI/blob/main/gan_with_rnn_and_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.layers import Input, Dense, Concatenate, LSTM, Conv1D, MaxPooling1D, Flatten
from keras.models import Model, Sequential
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load your dataset
data = pd.read_excel('/content/drive/MyDrive/fraudset.xlsx')

# Preprocessing
X = data.drop('Class', axis=1).values  # Features
y = data['Class'].values.reshape(-1, 1)  # Target

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Generator model
gen_input_dim = X_train_scaled.shape[1]
gen_input = Input(shape=(gen_input_dim,))
gen = Dense(128, activation='relu')(gen_input)
gen = Dense(64, activation='relu')(gen)
gen_output = Dense(gen_input_dim, activation='tanh')(gen)
generator = Model(gen_input, gen_output)

# Discriminator model
disc_input_dim = gen_input_dim + 1  # Features + Class label
disc_input = Input(shape=(disc_input_dim,))
disc = Dense(128, activation='relu')(disc_input)
disc = Dense(64, activation='relu')(disc)
disc_output = Dense(1, activation='sigmoid')(disc)
discriminator = Model(disc_input, disc_output)
discriminator.compile(optimizer=Adam(lr=0.0002, beta_1=0.5), loss='binary_crossentropy', metrics=['accuracy'])

# Combined model (CGAN)
cgan_input = Input(shape=(gen_input_dim,))
label_input = Input(shape=(1,))
concatenated_input = Concatenate()([cgan_input, label_input])
cgan_output = discriminator(concatenated_input)
cgan = Model(inputs=[cgan_input, label_input], outputs=cgan_output)
cgan.compile(optimizer=Adam(lr=0.0002, beta_1=0.5), loss='binary_crossentropy', metrics=['accuracy'])

# Train CGAN
batch_size = 128
epochs = 10000
for epoch in range(epochs):
    # Train discriminator
    idx = np.random.randint(0, X_train_scaled.shape[0], batch_size)
    real_transactions = X_train_scaled[idx]
    real_labels = y_train[idx]
    fake_labels = np.random.randint(0, 2, (batch_size, 1))
    fake_transactions = generator.predict(np.random.normal(0, 1, (batch_size, gen_input_dim)))
    disc_loss_real = discriminator.train_on_batch(np.concatenate([real_transactions, real_labels], axis=1), np.ones((batch_size, 1)))
    disc_loss_fake = discriminator.train_on_batch(np.concatenate([fake_transactions, fake_labels], axis=1), np.zeros((batch_size, 1)))
    disc_loss = 0.5 * np.add(disc_loss_real, disc_loss_fake)

    # Train generator
    noise = np.random.normal(0, 1, (batch_size, gen_input_dim))
    valid_y = np.ones((batch_size, 1))
    gen_loss = cgan.train_on_batch([noise, fake_labels], valid_y)

    # Print progress
    if epoch % 100 == 0:
        print(f'Epoch: {epoch}, Disc_loss: {disc_loss[0]}, Gen_loss: {gen_loss}')

# Generate synthetic data
num_synthetic_samples = 1000
synthetic_noise = np.random.normal(0, 1, (num_synthetic_samples, gen_input_dim))
synthetic_labels = np.random.randint(0, 2, (num_synthetic_samples, 1))
synthetic_data = generator.predict(synthetic_noise)

# Save synthetic data to a CSV file
synthetic_df = pd.DataFrame(synthetic_data, columns=data.columns[:-1])  # Assuming last column is the target 'Class'
synthetic_df.to_csv('fraudgan.csv', index=False)

# Load synthetic data
synthetic_data = pd.read_csv('fraudgan.csv')

# Preprocessing
scaler = StandardScaler()
scaled_data = scaler.fit_transform(synthetic_data)

# Define sequence length
sequence_length = 10

# Generate sequences
sequences = []
for i in range(len(scaled_data) - sequence_length):
    sequences.append(scaled_data[i:i + sequence_length])

# Convert to numpy array
sequences = np.array(sequences)

# Split sequences into features and target
X_rnn = sequences[:, :-1]    # Features
y_true_rnn = sequences[:, -1]   # Target (next data point in the sequence)

# Define and train RNN model
rnn_model = Sequential()
rnn_model.add(LSTM(units=64, input_shape=(X_rnn.shape[1], X_rnn.shape[2])))
rnn_model.add(Dense(units=X_rnn.shape[2]))  # Output layer
rnn_model.compile(optimizer='adam', loss='mse')
rnn_model.fit(X_rnn, y_true_rnn, epochs=10, batch_size=32)

# Make predictions for the next data point in the sequence
rnn_predicted_next_point = rnn_model.predict(X_rnn)

# Calculate reconstruction errors for RNN
rnn_reconstruction_errors = np.mean(np.abs(rnn_predicted_next_point - y_true_rnn), axis=1)

# Set anomaly threshold for RNN
rnn_threshold = np.mean(rnn_reconstruction_errors) + 3 * np.std(rnn_reconstruction_errors)

# Detect anomalies using RNN
rnn_anomalies = np.where(rnn_reconstruction_errors > rnn_threshold)[0]

# Print anomalies detected by RNN
print("Anomalies indices (RNN):", rnn_anomalies)

# Define true labels for anomalies
true_anomalies = np.zeros_like(y_true_rnn)
true_anomalies[rnn_anomalies] = 1

# Define predicted labels for RNN anomalies
predicted_rnn_anomalies = np.zeros_like(y_true_rnn)
predicted_rnn_anomalies[rnn_anomalies] = 1

# Define and train CNN model
cnn_model = Sequential()
cnn_model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_rnn.shape[1], X_rnn.shape[2])))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(units=X_rnn.shape[2]))
cnn_model.compile(optimizer='adam', loss='mse')
cnn_model.fit(X_rnn, y_true_rnn, epochs=10, batch_size=32)

# Make predictions for the next data point in the sequence
cnn_predicted_next_point = cnn_model.predict(X_rnn)

# Calculate reconstruction errors for CNN
cnn_reconstruction_errors = np.mean(np.abs(cnn_predicted_next_point - y_true_rnn), axis=1)

# Set anomaly threshold for CNN
cnn_threshold = np.mean(cnn_reconstruction_errors) + 3 * np.std(cnn_reconstruction_errors)

# Detect anomalies using CNN
cnn_anomalies = np.where(cnn_reconstruction_errors > cnn_threshold)[0]

# Print anomalies detected by CNN
print("Anomalies indices (CNN):", cnn_anomalies)





[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 5100, Disc_loss: 8.22822312329663e-06, Gen_loss: [4.5214188590516025e-17, 1.0]
Epoch: 5200, Disc_loss: 1.1492534213668086e-05, Gen_loss: [1.1791967520839517e-17, 1.0]
Epoch: 5300, Disc_loss: 7.358924904110609e-06, Gen_loss: [4.939887148662015e-16, 1.0]
Epoch: 5400, Disc_loss: 1.7977978359340345e-06, Gen_loss: [4.393141850300708e-09, 1.0]
Epoch: 5500, Disc_loss: 1.914467704072058e-06, Gen_loss: [9.44643560422953e-13, 1.0]
Epoch: 5600, Disc_loss: 4.909247581994775e-05, Gen_loss: [1.2700483011940378e-06, 1.0]
Epoch: 5700, Disc_loss: 4.177818027528708e-06, Gen_loss: [2.1088509072342276e-07, 1.0]
Epoch: 5800, Disc_loss: 9.419809430255555e-06, Gen_loss: [9.223028360461592e-16, 1.0]
Epoch: 5900, Disc_loss: 5.8948971854999234e-06, Gen_loss: [0.011805716902017593, 0.9921875]
Epoch: 6000, Disc_loss: 3.3268002880504355e-05, Gen_loss: [7.0937653617875185e-06, 1.0]
Epoch: 6100, Disc_loss: 1.7546410163049586e-05, Gen_loss: [1.19

In [9]:
# Convert anomaly indices to binary labels
true_labels = np.zeros_like(y_true_rnn)
true_labels[rnn_anomalies] = 1

# Convert CNN anomaly indices to binary labels
cnn_labels = np.zeros_like(y_true_rnn)
cnn_labels[cnn_anomalies] = 1

# Compute evaluation metrics for RNN

rnn_accuracy = accuracy_score(true_labels, predicted_rnn_anomalies)
rnn_precision = precision_score(true_labels, predicted_rnn_anomalies, average=None)[1]
rnn_recall = recall_score(true_labels, predicted_rnn_anomalies, average=None)[1]
rnn_f1 = f1_score(true_labels, predicted_rnn_anomalies, average=None)[1]

# Compute evaluation metrics for CNN
cnn_accuracy = accuracy_score(true_labels, cnn_labels)
cnn_precision = precision_score(true_labels, cnn_labels, average=None)[1]
cnn_recall = recall_score(true_labels, cnn_labels, average=None)[1]
cnn_f1 = f1_score(true_labels, cnn_labels, average=None)[1]

print("True labels:", true_labels)
print("Predicted anomalies (RNN):", predicted_rnn_anomalies)



True labels: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Predicted anomalies (RNN): [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
print("Evaluation Metrics for RNN:")
print("Accuracy:", rnn_accuracy)
print("Precision:", rnn_precision)
print("Recall:", rnn_recall)
print("F1 Score:", rnn_f1)

print("\nEvaluation Metrics for CNN:")
print("Accuracy:", cnn_accuracy)
print("Precision:", cnn_precision)
print("Recall:", cnn_recall)
print("F1 Score:", cnn_f1)

Evaluation Metrics for RNN:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Evaluation Metrics for CNN:
Accuracy: 0.9929292929292929
Precision: 0.375
Recall: 0.6
F1 Score: 0.4615384615384615
