In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable, grad
import random

In [2]:
data = pd.read_csv('Fraud.csv')

In [3]:
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,330,CASH_IN,607193.64,C104196924,15458649.28,16065842.92,C2020875465,3394537.70,2787344.06,0,0
1,281,PAYMENT,3473.75,C1283140808,0.00,0.00,M2044695613,0.00,0.00,0,0
2,322,CASH_OUT,66170.69,C1376666142,32803.00,0.00,C52514926,0.00,66170.69,0,0
3,401,TRANSFER,335384.26,C2136161445,197.00,0.00,C2094766437,5311958.61,5647342.87,0,0
4,180,CASH_IN,234392.25,C801137003,20882.00,255274.25,C1255594470,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
599995,259,CASH_OUT,214536.92,C1068316386,8038.00,0.00,C841415377,78268.57,292805.49,0,0
599996,353,CASH_OUT,261789.26,C1879745961,86576.05,0.00,C1127017196,381514.02,643303.28,0,0
599997,21,CASH_OUT,10186.93,C1803632133,103123.00,92936.07,C613565294,41667.22,51854.15,0,0
599998,330,TRANSFER,125737.11,C984665201,0.00,0.00,C1621899657,1013921.69,1139658.80,0,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            600000 non-null  int64  
 1   type            600000 non-null  object 
 2   amount          600000 non-null  float64
 3   nameOrig        600000 non-null  object 
 4   oldbalanceOrg   600000 non-null  float64
 5   newbalanceOrig  600000 non-null  float64
 6   nameDest        600000 non-null  object 
 7   oldbalanceDest  600000 non-null  float64
 8   newbalanceDest  600000 non-null  float64
 9   isFraud         600000 non-null  int64  
 10  isFlaggedFraud  600000 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 50.4+ MB


In [5]:
Q1 = data['amount'].quantile(0.25)
Q3 = data['amount'].quantile(0.75)

# Calculate IQR
IQR = Q3 - Q1

# Define outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data_clean = data[(data['amount'] >= lower_bound) & (data['amount'] <= upper_bound)]

data_clean = data_clean.drop('isFlaggedFraud', axis=1)
data_clean = data_clean.drop('step', axis=1)
data_clean = data_clean.drop(['nameOrig','nameDest'], axis=1)
data_clean = data_clean.drop('type', axis=1)


data_clean = data_clean[:4350]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_clean = scaler.fit_transform(data_clean.values)

data_clean

array([[-0.89018086, -0.29311738, -0.29719661, -0.39826831, -0.4151249 ,
        -0.02144718],
       [-0.37376286, -0.28158751, -0.29719661, -0.39826831, -0.38819108,
        -0.02144718],
       [ 1.84367762, -0.29304814, -0.29719661,  1.79203574,  1.88354421,
        -0.02144718],
       ...,
       [ 1.12644624,  0.08906577,  0.16649732,  4.29569427,  3.93831554,
        -0.02144718],
       [ 1.37814484,  2.54640088,  2.60425858, -0.24244412, -0.37481128,
        -0.02144718],
       [-0.87829105, -0.29311738, -0.29719661, -0.39826831, -0.4151249 ,
        -0.02144718]])

In [6]:
# Generator Network
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(100, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 6),
            nn.Tanh()  # Output values will be normalized between -1 and 1
        )

    def forward(self, x):
        return self.model(x)

In [7]:
# Discriminator Network
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(6, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 1)  # Outputs a real-valued score
        )

    def forward(self, x):
        score = self.fc(x)
        return score

In [8]:
def compute_gradient_penalty(critic, real_samples, fake_samples):
    alpha = torch.rand(real_samples.size(0), 1, 1, 1).to(real_samples.device)
    interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)

    scores = critic(interpolates)

    # Compute gradients of the critic output with respect to the interpolated samples
    gradients = grad(
        outputs=scores, # The critic scores on the interpolated samples
        inputs=interpolates, # The interpolated samples themselves
        grad_outputs=torch.ones(scores.size()).to(real_samples.device),  # Gradients w.r.t. critic output
        create_graph=True,  # Retain computational graph for higher order derivatives
        retain_graph=True,  # Retain the graph for backpropagation
        only_inputs=True    # Only compute gradients for interpolates
    )[0]

     # Compute the L2 norm of the gradients for each sample
    gradients = gradients.view(gradients.size(0), -1)    # Flatten the gradients
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()   #  Compute the gradient penalty (||grad||_2 - 1)^2

    return gradient_penalty


In [9]:
lambda_gp = 10
critic_iters = 5

generator = Generator()
critic = Critic()


# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator.to(device)
critic.to(device)

# Binary Cross Entropy Loss
adversarial_loss = nn.BCELoss()

# Optimizers for Generator and Discriminator
optimizer_G = optim.Adam(generator.parameters(), lr=0.001 ,betas=(0.5, 0.9))
optimizer_C = optim.Adam(critic.parameters(), lr=0.001,betas=(0.5, 0.9))

In [10]:
# Training Parameters
n_epochs = 500
batch_size = 58

# Sample real transaction data for training
class FraudDataset(Dataset):
    def __init__(self, data):
        self.data = torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Load your real transaction data here (after preprocessing)
dataset = FraudDataset(data_clean)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(n_epochs):
    for i , (real_data) in enumerate(dataloader):
        real_data = real_data.to(device)

        # ---------------------
        #  Train Discriminator
        # ---------------------
        optimizer_C.zero_grad()

        # Generate fake transactions
        z = torch.randn(batch_size, 100).to(device)
        fake_data = generator(z)

        # Real transactions as labels = 1
        real_labels = torch.ones(real_data.size(0), 1).to(device)
        # Fake transactions as labels = 0
        fake_labels = torch.zeros(fake_data.size(0), 1).to(device)

        # Discriminator loss for real and fake data
        real_score = critic(real_data)
        fake_score = critic(fake_data)
        gradient_penalty = compute_gradient_penalty(critic, real_data, fake_data)
        loss_C = fake_score.mean() - real_score.mean() + lambda_gp * gradient_penalty
        loss_C.backward()
        optimizer_C.step()

         # Train generator every critic_iters iterations
        if i % critic_iters == 0:
            # ---------------------
            #  Train Generator
            # ---------------------
            optimizer_G.zero_grad()

            # Generate fake images
            z = torch.randn(batch_size, 100).to(device)
            fake_data = generator(z)

            # Generator loss (minimize the critic's score for fake images)
            loss_G = -critic(fake_data).mean()

            loss_G.backward()
            optimizer_G.step()

    # Print progress every few epochs
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, critic Loss: {loss_C.item()}, Generator Loss: {loss_G.item()}')

Epoch 0, critic Loss: -0.21551337838172913, Generator Loss: 0.10687978565692902
Epoch 10, critic Loss: -0.13598036766052246, Generator Loss: -0.034636542201042175
Epoch 20, critic Loss: -0.06154690682888031, Generator Loss: -0.2531042695045471
Epoch 30, critic Loss: -0.0529705286026001, Generator Loss: -0.1887790560722351
Epoch 40, critic Loss: -0.0604693666100502, Generator Loss: -0.3644365966320038
Epoch 50, critic Loss: 0.14056412875652313, Generator Loss: -0.457924485206604
Epoch 60, critic Loss: -0.03217973932623863, Generator Loss: -0.24016402661800385
Epoch 70, critic Loss: -0.06737381964921951, Generator Loss: -0.46905943751335144
Epoch 80, critic Loss: -0.03661087527871132, Generator Loss: -0.15540023148059845
Epoch 90, critic Loss: -0.05828024074435234, Generator Loss: -0.33379611372947693
Epoch 100, critic Loss: 0.022121071815490723, Generator Loss: -0.33127909898757935
Epoch 110, critic Loss: -0.03797934949398041, Generator Loss: -0.37710195779800415
Epoch 120, critic Loss:

In [11]:
# Generate synthetic fraud transactions
generator.eval()
noise = torch.randn(100, 100).to(device)
synthetic_data = generator(noise).detach().cpu().numpy()
print("Synthetic Fraud Transactions Generated: \n", pd.DataFrame(scaler.inverse_transform(synthetic_data)))

Synthetic Fraud Transactions Generated: 
                 0             1             2             3             4  \
0   114403.843750 -4.489089e+04  2.488303e+04  3.906978e+05  4.624595e+05   
1   116251.656250 -3.503383e+04  1.716785e+04  3.150825e+05  3.757486e+05   
2    77854.867188 -1.413652e+04 -4.975528e+04  1.497015e+06  1.377632e+06   
3   232955.593750  3.678978e+06  3.736477e+06  3.391083e+06  3.475944e+06   
4     4249.436035  3.678978e+06  3.736477e+06  3.391102e+06  3.476662e+06   
..            ...           ...           ...           ...           ...   
95   19815.287109 -5.266308e+04 -2.892553e+04  1.295542e+05 -4.986468e+03   
96    7331.123535 -8.918695e+04 -3.153984e+04  9.521941e+04 -1.052868e+05   
97  232955.593750 -1.285358e+04 -1.088248e+05  1.559536e+06  1.883962e+06   
98    3626.967041 -9.579970e+04 -2.252128e+04  1.321619e+05 -9.293284e+04   
99   90960.375000  3.678978e+06  3.736477e+06  3.167449e+05  2.662997e+04   

           5  
0   0.000307  
1  

In [12]:
# Generate label column
labels = random.choices([0, 1], k=len(synthetic_data))
#Save synthetic data
synthetic_data[:,5] = labels
df = pd.DataFrame(synthetic_data, columns=['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFraud'])

df

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,0.023521,-0.308896,-0.288558,-0.237170,-0.226887,1.0
1,0.038741,-0.305431,-0.291236,-0.268349,-0.262182,0.0
2,-0.277523,-0.298086,-0.314470,0.219003,0.145620,1.0
3,1.000000,1.000000,1.000000,0.999992,0.999708,1.0
4,-0.883792,1.000000,1.000000,1.000000,1.000000,0.0
...,...,...,...,...,...,...
95,-0.755580,-0.311628,-0.307239,-0.344849,-0.417155,1.0
96,-0.858409,-0.324466,-0.308146,-0.359006,-0.457980,1.0
97,1.000000,-0.297635,-0.334977,0.244782,0.351715,0.0
98,-0.888919,-0.326790,-0.305015,-0.343773,-0.452952,0.0


In [13]:
df.to_csv('final_generative_Data', index=False)
print("Synthetic data generated and saved")

Synthetic data generated and saved
