### Installing Required Libraries

In [None]:
!pip install -q sdv

### Imports

In [None]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

### Sample credit card data taken from Kaggle
https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud?resource=download

In [None]:
# Read the credit-card transactions CSV into a DataFrame.
# NOTE: despite its name, `url` holds a relative local file path, so the file
# is resolved against the kernel's current working directory.
url = "creditcard.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Number of rows that were loaded.
df.shape[0]

249556

### Checking the distribution

In [None]:
# Tally how many rows carry each label in the `Class` column.
# NOTE(review): the recorded counts sum to 249,555 while len(df) reported
# 249,556 - presumably one row has a missing Class, which value_counts()
# leaves out by default; confirm against the CSV.
df["Class"].value_counts()

Class
0.0    249100
1.0       455
Name: count, dtype: int64

### Isolating the fraudulent transactions

In [None]:
# Keep only the rows whose `Class` label equals 1.0 (the fraud cases); the
# original row index is preserved, so it is not contiguous.
sampled_df = df.loc[df["Class"] == 1.0]

# Preview the first rows of the filtered frame.
sampled_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1.0
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,1.0
4920,4462.0,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,239.93,1.0
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.0,1.0
6329,7519.0,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.0,1.0


### Synthetic Data Generation using CTGANSynthesizer

In [None]:
# --- Describe the table ------------------------------------------------
# Start from empty single-table metadata and let SDV infer each column's
# sdtype from the fraud-only frame, then print what was inferred.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=sampled_df)
print(metadata.to_dict())

# --- Train ---------------------------------------------------------------
# Build a CTGAN synthesizer from that metadata and fit it on the fraud rows.
synthesizer = CTGANSynthesizer(metadata)
synthesizer.fit(sampled_df)

# --- Sample --------------------------------------------------------------
# Draw exactly as many synthetic rows as there are rows in the training frame.
synthetic_data = synthesizer.sample(num_rows=len(sampled_df))

# --- Preview -------------------------------------------------------------
# Header line followed by the plain-text rendering of the first rows.
print("\nSynthetic Dataset:", synthetic_data.head(), sep="\n")


{'columns': {'Time': {'sdtype': 'numerical'}, 'V1': {'sdtype': 'numerical'}, 'V2': {'sdtype': 'numerical'}, 'V3': {'sdtype': 'numerical'}, 'V4': {'sdtype': 'numerical'}, 'V5': {'sdtype': 'numerical'}, 'V6': {'sdtype': 'numerical'}, 'V7': {'sdtype': 'numerical'}, 'V8': {'sdtype': 'numerical'}, 'V9': {'sdtype': 'numerical'}, 'V10': {'sdtype': 'numerical'}, 'V11': {'sdtype': 'numerical'}, 'V12': {'sdtype': 'numerical'}, 'V13': {'sdtype': 'numerical'}, 'V14': {'sdtype': 'numerical'}, 'V15': {'sdtype': 'numerical'}, 'V16': {'sdtype': 'numerical'}, 'V17': {'sdtype': 'numerical'}, 'V18': {'sdtype': 'numerical'}, 'V19': {'sdtype': 'numerical'}, 'V20': {'sdtype': 'numerical'}, 'V21': {'sdtype': 'numerical'}, 'V22': {'sdtype': 'numerical'}, 'V23': {'sdtype': 'numerical'}, 'V24': {'sdtype': 'numerical'}, 'V25': {'sdtype': 'numerical'}, 'V26': {'sdtype': 'numerical'}, 'V27': {'sdtype': 'numerical'}, 'V28': {'sdtype': 'numerical'}, 'Amount': {'sdtype': 'numerical'}, 'Class': {'sdtype': 'categorical




Synthetic Dataset:
       Time        V1        V2        V3        V4         V5        V6  \
0  150859.0  1.954852  1.106931 -0.237584  6.117475   2.538700 -0.158353   
1  154309.0  0.800028 -0.899847  2.250210 -0.782297 -18.289895  1.273332   
2   73125.0  1.954852  4.601522 -9.047530  2.960841 -20.057929  6.474115   
3   47174.0 -3.073272  5.864059  2.026087  4.247740 -16.703965  2.352225   
4   27307.0 -2.996481  1.474847  2.250210  1.251406 -16.310828  2.824746   

          V7         V8        V9  ...       V21       V22       V23  \
0  -1.875683   0.348647 -2.356220  ...  3.632716  2.002991  0.391782   
1   1.743124   2.069502  3.353525  ...  2.670679 -0.457824 -0.549299   
2   0.928442 -13.317025  2.977494  ...  1.949921 -0.810971 -0.702235   
3 -10.782134   2.375719  1.954820  ...  2.592331 -0.868907  0.178452   
4  -7.713930   0.804326  2.281498  ...  1.851998  0.714447  0.343489   

        V24       V25       V26       V27       V28  Amount  Class  
0  0.013778 -0.845379

In [None]:
# First five real fraud rows, shown for side-by-side comparison with the
# synthetic sample.
sampled_df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1.0
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,1.0
4920,4462.0,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,239.93,1.0
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.0,1.0
6329,7519.0,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.0,1.0


In [None]:
# First five rows sampled from the CTGAN synthesizer.
synthetic_data.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,150859.0,1.954852,1.106931,-0.237584,6.117475,2.5387,-0.158353,-1.875683,0.348647,-2.35622,...,3.632716,2.002991,0.391782,0.013778,-0.845379,-0.477818,0.006137,0.446901,0.0,1.0
1,154309.0,0.800028,-0.899847,2.25021,-0.782297,-18.289895,1.273332,1.743124,2.069502,3.353525,...,2.670679,-0.457824,-0.549299,-0.50215,-0.481778,-0.205623,1.004343,-0.272335,155.4,1.0
2,73125.0,1.954852,4.601522,-9.04753,2.960841,-20.057929,6.474115,0.928442,-13.317025,2.977494,...,1.949921,-0.810971,-0.702235,-0.450462,1.883609,-0.546968,-0.016495,0.299422,82.76,1.0
3,47174.0,-3.073272,5.864059,2.026087,4.24774,-16.703965,2.352225,-10.782134,2.375719,1.95482,...,2.592331,-0.868907,0.178452,-0.80378,-0.188387,0.351873,0.140702,-0.294371,0.0,1.0
4,27307.0,-2.996481,1.474847,2.25021,1.251406,-16.310828,2.824746,-7.71393,0.804326,2.281498,...,1.851998,0.714447,0.343489,0.596107,-1.024279,-0.361405,-0.041438,-0.114612,0.0,1.0


### Synthetic Data Generation using GaussianCopulaSynthesizer

In [None]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

# Re-detect the column metadata from the fraud-only frame. This rebinds the
# notebook-level `metadata` name, which later cells also read.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=sampled_df)

# Build the Gaussian copula synthesizer from the metadata and fit it on the
# fraud rows.
gaussian_copula_synthesizer = GaussianCopulaSynthesizer(metadata)
gaussian_copula_synthesizer.fit(sampled_df)

# Draw as many synthetic rows as the training frame contains.
synthetic_data_gc = gaussian_copula_synthesizer.sample(num_rows=len(sampled_df))

# Header line followed by the plain-text rendering of the first rows.
print("\nSynthetic Dataset using GaussianCopula:", synthetic_data_gc.head(), sep="\n")





Synthetic Dataset using GaussianCopula:
      Time         V1         V2         V3        V4         V5        V6  \
0  40671.0   0.876130   0.562716   0.406737  2.236430  -2.407821 -0.861088   
1  50675.0  -6.226480  -0.162247  -3.441600  4.436247  -2.258624 -2.416462   
2  77189.0 -18.220829   3.557674 -16.101493  6.514801  -2.800846 -1.809800   
3   2429.0  -1.185816   2.241316  -1.932020  3.225587   0.325577 -0.603278   
4  15148.0 -25.124440  12.507314 -25.217403  8.613355 -20.000736  1.575998   

          V7        V8        V9  ...        V21       V22       V23  \
0   0.323997  0.950598 -0.369842  ...  -2.842003  1.159995  0.144229   
1  -3.798705  7.347969 -1.839337  ...   4.903314 -0.937074 -1.771584   
2  -7.745304 -3.559820 -0.744415  ...   8.857641 -2.523978  0.159449   
3  -6.132528 -9.177507 -0.370092  ...   9.468602 -2.492531  2.618035   
4 -19.429707 -6.859409 -9.190055  ... -12.557775  3.584492  0.815975   

        V24       V25       V26       V27       V28  Amou

### Synthetic Data Generation using CopulaGANSynthesizer

In [None]:
from sdv.single_table import CopulaGANSynthesizer

# Build the CopulaGAN synthesizer and fit it on the fraud rows.
# NOTE: `metadata` is not created in this cell; it is whatever an earlier
# cell last bound, so those cells must have been run first.
copula_gan_synthesizer = CopulaGANSynthesizer(metadata)
copula_gan_synthesizer.fit(sampled_df)

# Draw as many synthetic rows as the training frame contains.
synthetic_data_cg = copula_gan_synthesizer.sample(num_rows=len(sampled_df))

# Header line followed by the plain-text rendering of the first rows.
print("\nSynthetic Dataset using CopulaGAN:", synthetic_data_cg.head(), sep="\n")



Synthetic Dataset using CopulaGAN:
       Time        V1        V2         V3         V4         V5        V6  \
0   38966.0  0.476058  2.447668   0.891529   7.221449 -22.105532 -0.846497   
1  154260.0 -0.979163  3.444669  -6.025667   6.336175  -8.294853 -3.384852   
2  154290.0 -0.149862  4.641427 -20.234333   9.108890 -11.273140 -0.336924   
3  145743.0  0.207422  5.322252 -29.823527  12.114672 -22.105532 -1.443946   
4  149309.0 -1.324580  4.538496 -18.925340   7.856072  -9.951732 -4.688143   

          V7        V8        V9  ...       V21       V22       V23       V24  \
0   0.626213  7.236997  2.431189  ... -0.996824  0.260237 -1.360439 -0.511285   
1 -11.457401  4.552875 -0.726318  ...  1.513046  1.236607 -2.923806  0.245264   
2   1.009954  3.262468 -2.262001  ... -0.702134  1.899354  0.046715  0.321225   
3  -2.126435  0.319897 -0.544034  ... -1.654011  0.880911 -1.333413 -0.567382   
4  -5.182783  3.404987  2.870583  ...  0.776646  0.088354 -2.959839  0.003696   

        

### Conclusion

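When generating synthetic data for bank or card transaction datasets, the choice of synthesizer depends on the specific business case and data characteristics. Note that this notebook only previews a few synthetic rows from each model, so the guidance below is general rather than a measured comparison on this dataset.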

For scenarios focusing on numerical transaction data whose dependencies are well described by correlations between columns, such as modeling customer spending behavior or fraud detection patterns, the GaussianCopulaSynthesizer is a suitable choice. It effectively captures correlations between numerical variables and can generate realistic data for statistical analyses and anomaly detection.

If the dataset includes both numerical and categorical features, such as transaction types or merchant categories, and the goal is to generate data that preserves these mixed data types while capturing intricate patterns, the CopulaGANSynthesizer offers a robust solution. This model leverages GANs to generate data that maintains the joint distribution of mixed types, making it ideal for use cases like credit risk modeling or personalized marketing.

For cases requiring highly realistic synthetic data that closely mimics the structure and variability of the original dataset, especially when capturing subtle patterns in both fraud detection and customer segmentation, the CTGANSynthesizer is recommended. Its GAN-based architecture is particularly effective at learning complex data distributions, making it well-suited for enhancing model training in machine learning applications while limiting direct exposure of the real records.