In [12]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.io as pio
from sdv.evaluation.single_table import (
    evaluate_quality,
    get_column_plot,
    run_diagnostic,
)

# Use plotly's "vscode" renderer for every figure shown in this notebook.
pio.renderers.default = "vscode"

# A notebook has no __file__, so all paths are anchored on the kernel's
# working directory: the project root is the directory two levels above it.
PROJECT_ROOT = Path.cwd().resolve().parent.parent
INPUT_FOLDER = PROJECT_ROOT / "data/input"
OUTPUT_FOLDER = PROJECT_ROOT / "data/output"
# Make sure the output folder exists (no error if it is already there).
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [13]:
# UCI Adult data
# Load the pickled generator and the real table, then sample a synthetic
# table with the same number of rows.
ifolder = INPUT_FOLDER / "UCI_adult"  # not read in this cell
ofolder = OUTPUT_FOLDER / "UCI_adult"

# Reset first so that a failed load leaves `gen` as None rather than whatever
# an earlier run of this cell bound to it.
gen = None
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load model files that were produced by a trusted source.
with open(ofolder / "ctgan.pkl", "rb") as model_file:
    gen = pickle.load(model_file)

# Get original data
real_df = pd.read_pickle(ofolder / "real_df.pkl")
# Generate as many fake rows as there are real rows
fake_df = gen.sample(num_rows=real_df.shape[0])

In [14]:
# Build the generator's loss-values figure, then render it explicitly.
loss_fig = gen.get_loss_values_plot()
loss_fig.show()

In [15]:
# Run the built-in diagnostic report: real table vs. sampled table, using the
# metadata carried by the generator.
diagnostic_metadata = gen.get_metadata()
gen_diagnostics = run_diagnostic(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=diagnostic_metadata,
)

Generating report ...


(1/2) Evaluating Data Validity: |██████████| 16/16 [00:00<00:00, 206.66it/s]|
Data Validity Score: 100.0%


(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 330.89it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [16]:
# Visualize the "Data Validity" property of the diagnostic report; the figure
# is displayed because it is the cell's last expression.
validity_fig = gen_diagnostics.get_visualization(property_name="Data Validity")
validity_fig

In [17]:
# Run the built-in quality report: real table vs. sampled table, using the
# metadata carried by the generator.
quality_metadata = gen.get_metadata()
gen_quality = evaluate_quality(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=quality_metadata,
)

Generating report ...


(1/2) Evaluating Column Shapes: |██████████| 16/16 [00:00<00:00, 76.44it/s]|
Column Shapes Score: 89.41%


(2/2) Evaluating Column Pair Trends: |██████████| 120/120 [00:02<00:00, 59.60it/s]|
Column Pair Trends Score: 81.89%

Overall Score (Average): 85.65%



In [None]:
# A notebook cell only renders the value of its last expression, so two bare
# get_visualization() calls would display just the second figure. Show each
# figure explicitly so both quality properties are rendered.
for property_name in ("Column Shapes", "Column Pair Trends"):
    gen_quality.get_visualization(property_name=property_name).show()

