In [47]:
import pickle
from pathlib import Path

import pandas as pd
import plotly.io as pio
from sdv.evaluation.single_table import (
    evaluate_quality,
    get_column_plot,
    run_diagnostic,
)

# Make "vscode" the default Plotly renderer for figures shown in this notebook.
pio.renderers.default = "vscode"

# Project layout: the kernel's working directory is expected to sit two levels
# below the project root. The path is anchored explicitly on the current working
# directory (a notebook kernel normally has no __file__ to anchor on), instead of
# resolving the module-name string "__main__" as if it were a file.
PROJECT_ROOT = Path.cwd().resolve().parent.parent
INPUT_FOLDER = PROJECT_ROOT / "data/input"
OUTPUT_FOLDER = PROJECT_ROOT / "data/output"
# Create the output tree up front so later cells can write into it.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [None]:
# Kaggle credit card fraud data:
# load the saved model and the original frame, then draw synthetic rows
ifolder = INPUT_FOLDER / "Kaggle_creditcardfraud"
ofolder = OUTPUT_FOLDER / "Kaggle_creditcardfraud"

# NOTE: unpickling executes arbitrary code - only load files produced by this project.
with (ofolder / "ctgan.pkl").open("rb") as model_file:
    gen = pickle.load(model_file)

# Original data
real_df = pd.read_pickle(ofolder / "real_df.pkl")

# Fake data: twice as many rows as the original
n_fake_rows = 2 * real_df.shape[0]
fake_df = gen.sample(num_rows=n_fake_rows)

In [49]:
# Plot the loss values recorded for the model
loss_fig = gen.get_loss_values_plot()
loss_fig.show()

In [50]:
# rand10 is not needed for the evaluation.
# The guard makes the cell safe to re-run: once the column is gone, nothing happens
# (previously a second run raised KeyError from the in-place drop).
if "rand10" in real_df.columns:
    # Keep an independent snapshot of the frame as loaded. A plain assignment
    # would only alias real_df, so dropping the column would strip it from
    # real_dfm as well.
    real_dfm = real_df.copy()
    # Non-mutating drop: rebinding real_df leaves the snapshot untouched.
    real_df = real_df.drop(columns=["rand10"])

In [51]:
# Run the built-in diagnostic report on real vs. fake data
diagnostic_metadata = gen.get_metadata()
gen_diagnostics = run_diagnostic(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=diagnostic_metadata,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 31/31 [00:00<00:00, 46.38it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 299.94it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [52]:
# Visualize the "Data Validity" property of the diagnostic report
validity_fig = gen_diagnostics.get_visualization(property_name="Data Validity")
validity_fig

In [53]:
# Run the built-in quality report on real vs. fake data
quality_metadata = gen.get_metadata()
gen_quality = evaluate_quality(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=quality_metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 31/31 [00:02<00:00, 12.63it/s]|
Column Shapes Score: 84.96%




In [54]:
# A notebook cell only renders the value of its last expression, so each figure
# is shown explicitly; otherwise the "Column Shapes" plot is built and discarded.
gen_quality.get_visualization(property_name="Column Shapes").show()
gen_quality.get_visualization(property_name="Column Pair Trends").show()

In [55]:
# Distributions do not match well, this is why the correlation is so poor
col = "Class"
# Metadata is obtained through get_metadata(), the same accessor every other
# evaluation cell in this notebook uses, rather than reading the attribute directly.
get_column_plot(real_df, fake_df, column_name=col, metadata=gen.get_metadata())

In [56]:
# The single GAN performs poorly, so look at the individual per-class models
real_df_model = pd.read_pickle(ofolder / "real_df_model.pkl")

# Class 1: load its model, select the matching real rows, sample 2x as many fake rows
with (ofolder / "ctgan_Class1.pkl").open("rb") as class1_file:
    gen_Class1 = pickle.load(class1_file)

is_class1 = real_df_model["Class"] == 1
real_df_model_Class1 = real_df_model[is_class1]
fake_df_Class1 = gen_Class1.sample(num_rows=2 * real_df_model_Class1.shape[0])

# Class 0: same steps with the other model
with (ofolder / "ctgan_Class0.pkl").open("rb") as class0_file:
    gen_Class0 = pickle.load(class0_file)

is_class0 = real_df_model["Class"] == 0
real_df_model_Class0 = real_df_model[is_class0]
fake_df_Class0 = gen_Class0.sample(num_rows=2 * real_df_model_Class0.shape[0])

(2/2) Evaluating Column Pair Trends: |▌         | 30/496 [2:40:25<41:31:56, 320.85s/it]|
(2/2) Evaluating Column Pair Trends: |▌         | 30/496 [2:35:10<40:10:24, 310.35s/it]|
(2/2) Evaluating Column Pair Trends: |▌         | 30/496 [2:34:18<39:56:55, 308.62s/it]|


In [57]:
# Plot the loss values of both per-class models (Class 1 first, then Class 0)
for class_gen in (gen_Class1, gen_Class0):
    class_gen.get_loss_values_plot().show()

In [58]:
# Class 1 quality: compare the real Class 1 rows with the rows sampled from the Class 1 model
gen_quality_Class1 = evaluate_quality(
    real_data=real_df_model_Class1,
    synthetic_data=fake_df_Class1,
    metadata=gen_Class1.get_metadata(),
)

# A notebook cell only renders the value of its last expression, so each figure
# is shown explicitly; otherwise the "Column Shapes" plot is built and discarded.
gen_quality_Class1.get_visualization(property_name="Column Shapes").show()
gen_quality_Class1.get_visualization(property_name="Column Pair Trends").show()

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 32/32 [00:00<00:00, 636.57it/s]|
Column Shapes Score: 77.12%

(2/2) Evaluating Column Pair Trends: |██████████| 496/496 [00:01<00:00, 355.71it/s]|
Column Pair Trends Score: 84.26%

Overall Score (Average): 80.69%



In [59]:
# Class 0 quality: compare the real Class 0 rows with the rows sampled from the Class 0 model
gen_quality_Class0 = evaluate_quality(
    real_data=real_df_model_Class0,
    synthetic_data=fake_df_Class0,
    metadata=gen_Class0.get_metadata(),
)

# A notebook cell only renders the value of its last expression, so each figure
# is shown explicitly; otherwise the "Column Shapes" plot is built and discarded.
gen_quality_Class0.get_visualization(property_name="Column Shapes").show()
gen_quality_Class0.get_visualization(property_name="Column Pair Trends").show()

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 32/32 [00:03<00:00,  9.38it/s]|
Column Shapes Score: 94.9%

(2/2) Evaluating Column Pair Trends: |██████████| 496/496 [00:44<00:00, 11.18it/s]|
Column Pair Trends Score: 90.56%

Overall Score (Average): 92.73%



In [None]:
# Create the simulated dataset: the real Class 0 rows plus the same number of
# rows sampled from the Class 1 model.
n = real_df_model_Class0.shape[0]
# Reset the sampler before drawing (presumably so repeated runs yield the same
# sample - confirm against the SDV documentation for reset_sampling).
gen_Class1.reset_sampling()
syn_df = gen_Class1.sample(num_rows=n)
# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it the
# row labels kept by the filtered real frame and the labels of the sampled frame
# are carried over as-is and can collide, leaving duplicate labels in the result.
final_df = pd.concat([real_df_model_Class0, syn_df], ignore_index=True)

count    505364.0
mean          0.5
std           0.5
min           0.0
25%           0.0
50%           0.5
75%           1.0
max           1.0
Name: Class, dtype: float64

In [62]:
# Confirm both classes have the same number of rows
final_df["Class"].value_counts()

Class
0    252682
1    252682
Name: count, dtype: int64

In [None]:
# Persist the balanced dataset next to the other outputs
balanced_path = ofolder / "syn_df_balanced.pkl"
final_df.to_pickle(balanced_path)