In [1]:
import datetime
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer

PROJECT_ROOT = Path(__name__).resolve().parent.parent.parent
INPUT_FOLDER = PROJECT_ROOT / "data/input"
OUTPUT_FOLDER = PROJECT_ROOT / "data/output"
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [2]:
# Kaggle credit card fraud data
# Read data and check
ifolder = INPUT_FOLDER / "Kaggle_creditcardfraud"
ofolder = OUTPUT_FOLDER / "Kaggle_creditcardfraud"
ofolder.mkdir(parents=True, exist_ok=True)
df = pd.read_csv(ifolder / "creditcard.csv")
print(df.head())
print(df.shape)

# Set up metadata for GAN
df_meta = Metadata.detect_from_dataframe(df)
gen = CTGANSynthesizer(
    metadata=df_meta,
    epochs=250, # 500 takes a while to train, so half it
    verbose=True,
)

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [None]:
# Train the GAN - keep track of the time to execute
# Check if GAN exists - these take a while to fit, so only refit if necessary
duration = None
pkl_exists = Path(ofolder / "ctgan.pkl").is_file()
if pkl_exists:
    gen.load(ofolder / "ctgan.pkl")
else:
    # Fit a new model if it doesn't exist
    tstart = time.time()
    gen.fit(df)
    tend = time.time()
    duration = tend - tstart


CUDA initialization: The NVIDIA driver on your system is too old (found version 11040). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:109.)

Gen. (0.14) | Discrim. (-0.30): 100%|██████████| 250/250 [2:18:36<00:00, 33.26s/it]   


In [None]:
# Save the results, plot the loss function, and print the time to train the GAN
if not pkl_exists:
    gen.save(ofolder / "ctgan.pkl")
md = ofolder / "ctgan_metadata.json"
if not md.is_file():
    df_meta.save_to_json(md)
df.to_pickle(ofolder / "real_df.pkl")

# Units are seconds, so display minutes
print(f"Time to fit: {(duration / 60):.2f} min.")

Time to fit: 151.14 min.


In [5]:
# Due to the imbalanced data, change the modeling procedure
# Highly imbalanced
print(df.Class.value_counts())
print(df.Class.value_counts() / df.Class.count())

Class
0    284315
1       492
Name: count, dtype: int64
Class
0    0.998273
1    0.001727
Name: count, dtype: float64


In [6]:
# Create two GANs, and create a holdout for testing later
np.random.seed(100)
df["rand10"] = np.random.randint(1, 10, len(df))
df_model = df[df["rand10"] <= 8]
df_valid = df[df["rand10"] >= 9]

# Resave
df.to_pickle(ofolder / "real_df.pkl")
df_model.to_pickle(ofolder / "real_df_model.pkl")
df_valid.to_pickle(ofolder / "real_df_validation.pkl")

# Check imbalance - about the same
print(df_model.Class.value_counts())
print(df_model.Class.value_counts() / df_model.Class.count())

print(df_valid.Class.value_counts())
print(df_valid.Class.value_counts() / df_valid.Class.count())

Class
0    252682
1       433
Name: count, dtype: int64
Class
0    0.998289
1    0.001711
Name: count, dtype: float64
Class
0    31633
1       59
Name: count, dtype: int64
Class
0    0.998138
1    0.001862
Name: count, dtype: float64


In [7]:
# Split up the data
df_model_Class1 = df_model[df_model["Class"] == 1]
df_model_Class0 = df_model[df_model["Class"] == 0]

print(f"Class = 1 df shape: {df_model_Class1.shape}")
print(f"Class = 0 df shape: {df_model_Class0.shape}")

Class = 1 df shape: (433, 32)
Class = 0 df shape: (252682, 32)


In [None]:
# Now create the GANs

# Class = 1

# Set up metadata for GAN
df_meta = Metadata.detect_from_dataframe(df_model_Class1)
gen_Class1 = CTGANSynthesizer(
    metadata=df_meta,
    epochs=250,
    verbose=True,
)

# Train the GAN - keep track of the time to execute
# Check if GAN exists - these take a while to fit, so only refit if necessary
duration = None
pkl_exists = False
try:
    gen_Class1.load(ofolder / "ctgan_Class1.pkl")
    pkl_exists = True
except FileNotFoundError:
    # fit a new model if it doesn't exist
    tstart = time.time()
    gen_Class1.fit(df_model_Class1)
    tend = time.time()
    duration = tend - tstart

# Save the results, plot the loss function, and print the time to train the GAN
if not pkl_exists:
    gen_Class1.save(ofolder / "ctgan_Class1.pkl")
md = ofolder / "ctgan_Class1_metadata.json"
if not md.is_file():
    df_meta.save_to_json(md)

# Units are seconds, so display minutes
print(f"Time to fit: {(duration / 60):.2f} min.")


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.

Gen. (-2.62) | Discrim. (0.22): 100%|██████████| 250/250 [00:13<00:00, 18.55it/s] 

Time to fit: 0.28 min.





In [None]:
# Now create the GANs

# Class = 0

# Set up metadata for GAN
df_meta = Metadata.detect_from_dataframe(df_model_Class0)
gen_Class0 = CTGANSynthesizer(
    metadata=df_meta,
    epochs=250,
    verbose=True,
)

# Train the GAN - keep track of the time to execute
# Check if GAN exists - these take a while to fit, so only refit if necessary
duration = None
pkl_exists = False
try:
    gen_Class0.load(ofolder / "ctgan_Class0.pkl")
    pkl_exists = True
except FileNotFoundError:
    # fit a new model if it doesn't exist
    tstart = time.time()
    gen_Class0.fit(df_model_Class0)
    tend = time.time()
    duration = tend - tstart

# Save the results, plot the loss function, and print the time to train the GAN
if not pkl_exists:
    gen_Class0.save(ofolder / "ctgan_Class0.pkl")
md = ofolder / "ctgan_Class0_metadata.json"
if not md.is_file():
    df_meta.save_to_json(md)

# Units are seconds, so display minutes
print(f"Time to fit: {(duration / 60):.2f} min.")


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.

Gen. (-0.95) | Discrim. (-0.35): 100%|██████████| 250/250 [2:11:01<00:00, 31.44s/it]  

Time to fit: 142.17 min.



