<a href="https://colab.research.google.com/github/manish190502/financial-fraud-detection-using-generative-AI/blob/main/cgan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.layers import Input, Dense, Concatenate
from keras.models import Model
from keras.optimizers import Adam

# Read the transaction table from the workbook on the mounted Drive.
data = pd.read_excel('/content/drive/MyDrive/fraudset.xlsx')

# Separate the target from the features: 'Class' becomes an (n, 1) column
# vector, every remaining column is treated as a feature.
y = data['Class'].values.reshape(-1, 1)
X = data.drop(columns=['Class']).values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same statistics to
# both partitions so no information from the test rows reaches the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Generator model
# Maps a noise vector (same width as the feature matrix) to one synthetic
# transaction expressed in the standardized feature space.
gen_input_dim = X_train_scaled.shape[1]
gen_input = Input(shape=(gen_input_dim,))
gen = Dense(128, activation='relu')(gen_input)
gen = Dense(64, activation='relu')(gen)
# Linear output head: the real samples shown to the discriminator are
# StandardScaler output (zero mean, unit variance, unbounded), so a tanh head
# restricted to [-1, 1] could never reproduce values further than one standard
# deviation from the mean and would be trivially separable from real data.
gen_output = Dense(gen_input_dim, activation='linear')(gen)
generator = Model(gen_input, gen_output)

# Discriminator model
# Scores a (features, class label) row with the probability that it is real.
disc_input_dim = gen_input_dim + 1  # Features + Class label
disc_input = Input(shape=(disc_input_dim,))
disc = Dense(128, activation='relu')(disc_input)
disc = Dense(64, activation='relu')(disc)
disc_output = Dense(1, activation='sigmoid')(disc)
discriminator = Model(disc_input, disc_output)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
discriminator.compile(
    optimizer=Adam(learning_rate=0.0002, beta_1=0.5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Combined model (CGAN): noise -> generator -> [generated features, label] -> discriminator.
# This stacked model is what trains the generator, so the generator must be
# part of its graph; feeding the noise straight into the discriminator would
# leave the generator's weights untouched by every training step.

# Freeze the discriminator inside the stacked model so that the generator
# update does not also push the discriminator towards calling fakes "real".
# NOTE(review): this relies on the standalone `discriminator` keeping the
# trainable state it had when it was compiled (Keras 2.x behaviour) - confirm
# on the installed Keras version.
discriminator.trainable = False

cgan_input = Input(shape=(gen_input_dim,))  # noise vector
label_input = Input(shape=(1,))             # class label paired with the sample
generated_transactions = generator(cgan_input)
concatenated_input = Concatenate()([generated_transactions, label_input])
cgan_output = discriminator(concatenated_input)
cgan = Model(inputs=[cgan_input, label_input], outputs=cgan_output)
# `learning_rate` replaces the deprecated `lr` alias.
cgan.compile(
    optimizer=Adam(learning_rate=0.0002, beta_1=0.5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train CGAN
# Each iteration performs two discriminator updates (one real batch, one fake
# batch) followed by one update through the combined `cgan` model.
batch_size = 128
epochs = 10000
for epoch in range(epochs):
    # Train discriminator
    # Real batch: rows sampled with replacement from the scaled training set.
    idx = np.random.randint(0, X_train_scaled.shape[0], batch_size)
    real_transactions = X_train_scaled[idx]
    real_labels = y_train[idx]
    # Fake batch: generator output paired with uniformly random 0/1 labels.
    fake_labels = np.random.randint(0, 2, (batch_size, 1))
    # verbose=0: recent Keras versions log a progress bar on every predict()
    # call by default, which floods the cell output over thousands of iterations.
    fake_transactions = generator.predict(
        np.random.normal(0, 1, (batch_size, gen_input_dim)), verbose=0
    )
    disc_loss_real = discriminator.train_on_batch(
        np.concatenate([real_transactions, real_labels], axis=1),
        np.ones((batch_size, 1)),
    )
    disc_loss_fake = discriminator.train_on_batch(
        np.concatenate([fake_transactions, fake_labels], axis=1),
        np.zeros((batch_size, 1)),
    )
    # Element-wise average of the [loss, accuracy] pairs from the two updates.
    disc_loss = 0.5 * np.add(disc_loss_real, disc_loss_fake)

    # Train generator
    # Fresh noise is labelled "real" (ones) for the combined-model update.
    noise = np.random.normal(0, 1, (batch_size, gen_input_dim))
    valid_y = np.ones((batch_size, 1))
    gen_loss = cgan.train_on_batch([noise, fake_labels], valid_y)

    # Print progress
    # Both models are compiled with an accuracy metric, so train_on_batch
    # returns [loss, accuracy]; index 0 reports the loss for both of them.
    if epoch % 100 == 0:
        print(f'Epoch: {epoch}, Disc_loss: {disc_loss[0]}, Gen_loss: {gen_loss[0]}')

# Generate synthetic data
num_synthetic_samples = 1000
synthetic_noise = np.random.normal(0, 1, (num_synthetic_samples, gen_input_dim))
# Sampled alongside the noise, but the generator only takes the noise vector,
# so these labels are not consumed anywhere below.
synthetic_labels = np.random.randint(0, 2, (num_synthetic_samples, 1))
synthetic_scaled = generator.predict(synthetic_noise)

# The discriminator is shown StandardScaler-transformed rows as "real", so the
# generator's output lives in that standardized space. Undo the scaling so the
# saved values are in the same units as the original feature columns.
synthetic_data = scaler.inverse_transform(synthetic_scaled)

# Save synthetic data to a CSV file
# Derive the feature names exactly the way X was built (drop 'Class' by name)
# rather than assuming 'Class' happens to be the last column of the sheet.
feature_columns = data.drop('Class', axis=1).columns
synthetic_df = pd.DataFrame(synthetic_data, columns=feature_columns)
synthetic_df.to_csv('fraudgan.csv', index=False)




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 5100, Disc_loss: 3.4534703445388004e-05, Gen_loss: [1.5933802260941176e-23, 1.0]
Epoch: 5200, Disc_loss: 5.959953796264017e-06, Gen_loss: [7.676177639218736e-15, 1.0]
Epoch: 5300, Disc_loss: 1.7524163922644227e-05, Gen_loss: [9.154995521171074e-15, 1.0]
Epoch: 5400, Disc_loss: 3.3090330479568085e-08, Gen_loss: [6.825726066989318e-16, 1.0]
Epoch: 5500, Disc_loss: 0.0003852292138617486, Gen_loss: [3.069478035619164e-19, 1.0]
Epoch: 5600, Disc_loss: 7.623919827892678e-06, Gen_loss: [1.3350616189452803e-28, 1.0]
Epoch: 5700, Disc_loss: 9.822292668104637e-07, Gen_loss: [0.00024851757916621864, 1.0]
Epoch: 5800, Disc_loss: 3.56508553523112e-06, Gen_loss: [8.143910767939778e-18, 1.0]
Epoch: 5900, Disc_loss: 2.5652680051280186e-05, Gen_loss: [1.0584449228190351e-07, 1.0]
Epoch: 6000, Disc_loss: 2.5929016373993363e-06, Gen_loss: [4.367694053653004e-11, 1.0]
Epoch: 6100, Disc_loss: 9.290095476899296e-06, Gen_loss: [4.9009796

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Read back the generated samples from the CSV produced by the CGAN cell.
synthetic_data = pd.read_csv('fraudgan.csv')

# Standardize every column (zero mean, unit variance); the scaler is fitted on
# the same frame it transforms.
scaler = StandardScaler().fit(synthetic_data)
scaled_data = scaler.transform(synthetic_data)

# Define sequence length
# Each window holds `sequence_length` consecutive rows: the first
# `sequence_length - 1` rows are the model input and the last row is the target.
sequence_length = 10

# Fail early with a clear message instead of an opaque indexing error further
# down when there are not enough rows to form a single window.
if len(scaled_data) < sequence_length:
    raise ValueError(
        f"Need at least {sequence_length} rows to build a sequence, "
        f"got {len(scaled_data)}"
    )

# Generate sequences
# The `+ 1` keeps the final full window (the one ending on the last row);
# without it the last row of the data would never be used as a target.
sequences = [
    scaled_data[i:i + sequence_length]
    for i in range(len(scaled_data) - sequence_length + 1)
]

# Convert to numpy array -> shape (n_windows, sequence_length, n_features)
sequences = np.array(sequences)

# Split sequences into features and target
X = sequences[:, :-1]    # Features: all but the last step of each window
y_true = sequences[:, -1]   # Target: the last step of each window

# Next-step predictor: a single LSTM layer reads the input steps of a window
# and a dense layer emits one value per feature for the step that follows.
n_steps, n_features = X.shape[1], X.shape[2]
model = Sequential([
    LSTM(units=64, input_shape=(n_steps, n_features)),
    Dense(units=n_features),  # Output layer
])
model.compile(loss='mse', optimizer='adam')

# Fit on every window for 10 passes over the data.
model.fit(X, y_true, batch_size=32, epochs=10)

# Make predictions for the next data point in the sequence
predicted_next_point = model.predict(X)

# Calculate reconstruction errors
# One score per window: mean absolute difference, across features, between the
# predicted and the actual last step of that window.
reconstruction_errors = np.mean(np.abs(predicted_next_point - y_true), axis=1)

# Set anomaly threshold (e.g., based on mean and standard deviation of errors)
threshold = np.mean(reconstruction_errors) + 3 * np.std(reconstruction_errors)

# Detect anomalies
# These are WINDOW indices (positions along the first axis of X / y_true).
anomalies = np.where(reconstruction_errors > threshold)[0]

# Window k starts at row k and its target is its last step, i.e. row
# k + sequence_length - 1 of the loaded data. Report that row as well so the
# flagged samples can be looked up directly in the CSV.
anomaly_rows = anomalies + sequence_length - 1

# Print anomalies
print("Anomalies indices:", anomalies)
print("Anomalous row indices in fraudgan.csv:", anomaly_rows)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Anomalies indices: [ 18 389 405 616 894 947]
