In [None]:
import datetime
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer

# Project root: two levels above the current working directory.
# Notebooks have no __file__, so the paths are anchored explicitly on the working
# directory instead of resolving the module name string as if it were a file path.
PROJECT_ROOT = Path.cwd().resolve().parent.parent
INPUT_FOLDER = PROJECT_ROOT / "data/input"
OUTPUT_FOLDER = PROJECT_ROOT / "data/output"
# Inputs are expected to exist already; only the output tree is created on demand.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [None]:
# Kaggle credit card fraud data.
# Input and output use the same dataset sub-folder name; make sure the output side exists.
dataset_name = "Kaggle_creditcardfraud"
ifolder = INPUT_FOLDER / dataset_name
ofolder = OUTPUT_FOLDER / dataset_name
ofolder.mkdir(parents=True, exist_ok=True)

# Load the raw CSV and sanity-check it: first rows, then (rows, columns).
df = pd.read_csv(ifolder / "creditcard.csv")
for preview in (df.head(), df.shape):
    print(preview)

# Detect column metadata from the frame and configure (but do not fit yet) the GAN.
df_meta = Metadata.detect_from_dataframe(df)
ctgan_options = {
    "epochs": 250,  # 500 takes a while to train, so use half of that
    "verbose": True,
}
gen = CTGANSynthesizer(metadata=df_meta, **ctgan_options)

In [None]:
# Train the GAN, or reuse a previously saved one - fitting takes a while, so only
# refit when no pickle is on disk. `duration` records the fit time in seconds and
# stays None when the model was loaded instead of fit.
duration = None
model_path = ofolder / "ctgan.pkl"
pkl_exists = model_path.is_file()
if pkl_exists:
    # load() is a classmethod that returns the stored synthesizer; the result must be
    # bound to `gen`, otherwise `gen` remains the fresh, unfitted instance.
    gen = CTGANSynthesizer.load(model_path)
else:
    # Fit a new model if it doesn't exist
    tstart = time.time()
    gen.fit(df)
    tend = time.time()
    duration = tend - tstart

In [None]:
# Persist the artefacts of this run: the fitted model (only when it was newly fit),
# the detected metadata (only when no JSON exists yet) and the real data frame.
if not pkl_exists:
    gen.save(ofolder / "ctgan.pkl")
md = ofolder / "ctgan_metadata.json"
if not md.is_file():
    df_meta.save_to_json(md)
df.to_pickle(ofolder / "real_df.pkl")

# `duration` is None when the model was loaded from disk rather than fit, so only
# report a fit time when one was actually measured.
if duration is not None:
    # Units are seconds, so display minutes
    print(f"Time to fit: {(duration / 60):.2f} min.")
else:
    print("Model loaded from disk; no fit time to report.")

In [None]:
# The target is highly imbalanced, which is why the modeling procedure changes below.
# Show the absolute count per class, then each class's share of the non-null rows.
class_counts = df["Class"].value_counts()
print(class_counts)
print(class_counts / df["Class"].count())

In [None]:
# Create a holdout for testing later; the remaining rows are used to fit the two GANs.
# Every row gets a reproducible random bucket in 1..10: buckets 1-8 (~80%) form the
# modeling set and buckets 9-10 (~20%) form the validation holdout.
np.random.seed(100)
# randint's upper bound is exclusive, so it must be 11 for bucket 10 to occur
# (an upper bound of 10 only yields 1..9, i.e. an 8/9 vs 1/9 split).
df["rand10"] = np.random.randint(1, 11, len(df))
df_model = df[df["rand10"] <= 8]
df_valid = df[df["rand10"] >= 9]
# NOTE(review): the helper column "rand10" stays in df, df_model and df_valid, so
# anything fit on these frames also sees it - confirm that is intended.
# NOTE(review): models cached from a different split should be deleted and refit,
# otherwise validation rows may overlap with the rows they were trained on.

# Resave (df now includes the bucket column)
df.to_pickle(ofolder / "real_df.pkl")
df_model.to_pickle(ofolder / "real_df_model.pkl")
df_valid.to_pickle(ofolder / "real_df_validation.pkl")

# Check imbalance - should be about the same in both splits
print(df_model.Class.value_counts())
print(df_model.Class.value_counts() / df_model.Class.count())

print(df_valid.Class.value_counts())
print(df_valid.Class.value_counts() / df_valid.Class.count())

In [None]:
# Partition the modeling rows by target value so each class gets its own frame.
class1_mask = df_model["Class"] == 1
class0_mask = df_model["Class"] == 0
df_model_Class1 = df_model[class1_mask]
df_model_Class0 = df_model[class0_mask]

# Report how many rows (and columns) ended up in each partition.
print(f"Class = 1 df shape: {df_model_Class1.shape}")
print(f"Class = 0 df shape: {df_model_Class0.shape}")

In [None]:
# Now create the GANs

# Class = 1: a CTGAN for the Class == 1 modeling rows only.

# Set up metadata for GAN
df_meta = Metadata.detect_from_dataframe(df_model_Class1)
gen_Class1 = CTGANSynthesizer(
    metadata=df_meta,
    epochs=250,
    verbose=True,
)

# Train the GAN, or reuse a saved one - fitting takes a while, so only refit if
# necessary. `duration` is the fit time in seconds and stays None on a cache hit.
model_path = ofolder / "ctgan_Class1.pkl"
duration = None
pkl_exists = False
try:
    # load() is a classmethod that returns the stored synthesizer; bind the result,
    # otherwise gen_Class1 remains the fresh, unfitted instance.
    gen_Class1 = CTGANSynthesizer.load(model_path)
    pkl_exists = True
except FileNotFoundError:
    # fit a new model if it doesn't exist
    tstart = time.time()
    gen_Class1.fit(df_model_Class1)
    tend = time.time()
    duration = tend - tstart

# Save the model (only when newly fit) and the metadata (only when not yet on disk)
if not pkl_exists:
    gen_Class1.save(model_path)
md = ofolder / "ctgan_Class1_metadata.json"
if not md.is_file():
    df_meta.save_to_json(md)

# Only report a fit time when one was measured; dividing None would raise TypeError.
if duration is not None:
    # Units are seconds, so display minutes
    print(f"Time to fit: {(duration / 60):.2f} min.")
else:
    print("Model loaded from disk; no fit time to report.")

In [None]:
# Now create the GANs

# Class = 0: a CTGAN for the Class == 0 modeling rows only.

# Set up metadata for GAN
df_meta = Metadata.detect_from_dataframe(df_model_Class0)
gen_Class0 = CTGANSynthesizer(
    metadata=df_meta,
    epochs=250,
    verbose=True,
)

# Train the GAN, or reuse a saved one - fitting takes a while, so only refit if
# necessary. `duration` is the fit time in seconds and stays None on a cache hit.
model_path = ofolder / "ctgan_Class0.pkl"
duration = None
pkl_exists = False
try:
    # load() is a classmethod that returns the stored synthesizer; bind the result,
    # otherwise gen_Class0 remains the fresh, unfitted instance.
    gen_Class0 = CTGANSynthesizer.load(model_path)
    pkl_exists = True
except FileNotFoundError:
    # fit a new model if it doesn't exist
    tstart = time.time()
    gen_Class0.fit(df_model_Class0)
    tend = time.time()
    duration = tend - tstart

# Save the model (only when newly fit) and the metadata (only when not yet on disk)
if not pkl_exists:
    gen_Class0.save(model_path)
md = ofolder / "ctgan_Class0_metadata.json"
if not md.is_file():
    df_meta.save_to_json(md)

# Only report a fit time when one was measured; dividing None would raise TypeError.
if duration is not None:
    # Units are seconds, so display minutes
    print(f"Time to fit: {(duration / 60):.2f} min.")
else:
    print("Model loaded from disk; no fit time to report.")