In [None]:
import pickle
from pathlib import Path

import pandas as pd
import plotly.io as pio
from sdv.evaluation.single_table import (
    evaluate_quality,
    get_column_plot,
    run_diagnostic,
)

# Render plotly figures through the VS Code notebook renderer.
pio.renderers.default = "vscode"

# A notebook has no `__file__`, and `Path(__name__)` is only the relative path
# "__main__": the previous expression reached the right folder solely because
# resolving that non-existent path happened to yield <cwd>/__main__.
# Anchor on the working directory explicitly instead. The project root is
# taken to be one level above the folder the notebook runs from.
PROJECT_ROOT = Path.cwd().resolve().parent
INPUT_FOLDER = PROJECT_ROOT / "data" / "input"
OUTPUT_FOLDER = PROJECT_ROOT / "data" / "output"
# Create the output tree up front so later cells can write into it.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [39]:
# UCI Adult dataset: restore the saved synthesizer and the real table,
# then sample a synthetic table and persist it next to them.
ifolder = INPUT_FOLDER / "UCI_adult"
ofolder = OUTPUT_FOLDER / "UCI_adult"

# Cleared before loading, so a failed load leaves None rather than an old model.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
gen = None
with (ofolder / "ctgan.pkl").open("rb") as model_file:
    gen = pickle.load(model_file)

# Same pattern for the original (real) data.
real_df = None
with (ofolder / "real_df.pkl").open("rb") as data_file:
    real_df = pickle.load(data_file)

# Sample twice as many synthetic rows as there are real rows, then save them.
n_real_rows = real_df.shape[0]
fake_df = gen.sample(num_rows=n_real_rows * 2)
fake_df.to_pickle(ofolder / "fake_df.pkl")

In [40]:
# Inspect the loss values stored on the loaded synthesizer.
loss_fig = gen.get_loss_values_plot()
loss_fig.show()

In [41]:
# Run the built-in diagnostic report (real vs. synthetic data).
gen_diagnostics = run_diagnostic(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=gen.metadata,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 135.79it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 203.61it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [42]:
# Per-column "Data Validity" scores as text, followed by the matching chart
# (rendered because it is the last expression of the cell).
validity_details = gen_diagnostics.get_details(property_name="Data Validity")
print(validity_details)
gen_diagnostics.get_visualization(property_name="Data Validity")

              Column             Metric  Score
0                age  BoundaryAdherence    1.0
1          workclass  CategoryAdherence    1.0
2             fnlwgt  BoundaryAdherence    1.0
3          education  CategoryAdherence    1.0
4      education_num  BoundaryAdherence    1.0
5     marital_status  CategoryAdherence    1.0
6         occupation  CategoryAdherence    1.0
7       relationship  CategoryAdherence    1.0
8               race  CategoryAdherence    1.0
9                sex  CategoryAdherence    1.0
10      capital_gain  BoundaryAdherence    1.0
11      capital_loss  BoundaryAdherence    1.0
12  minutes_per_week  BoundaryAdherence    1.0
13    native_country  CategoryAdherence    1.0
14   Income_Category  CategoryAdherence    1.0


In [43]:
# Run the built-in quality report (real vs. synthetic data).
gen_quality = evaluate_quality(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=gen.metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 50.70it/s]|
Column Shapes Score: 90.22%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:02<00:00, 44.45it/s]|
Column Pair Trends Score: 85.92%

Overall Score (Average): 88.07%



In [44]:
# Per-column and per-column-pair quality scores as plain-text tables.
print(gen_quality.get_details(property_name="Column Shapes"))
print(gen_quality.get_details(property_name="Column Pair Trends"))

# A notebook cell only auto-renders its LAST expression, so the
# "Column Shapes" figure used to be built and then silently discarded.
# Show every figure explicitly so both charts appear.
for property_name in ("Column Shapes", "Column Pair Trends"):
    gen_quality.get_visualization(property_name=property_name).show()

              Column        Metric     Score
0                age  KSComplement  0.913040
1          workclass  TVComplement  0.959353
2             fnlwgt  KSComplement  0.960382
3          education  TVComplement  0.928718
4      education_num  KSComplement  0.945640
5     marital_status  TVComplement  0.933740
6         occupation  TVComplement  0.893415
7       relationship  TVComplement  0.886137
8               race  TVComplement  0.857329
9                sex  TVComplement  0.969288
10      capital_gain  KSComplement  0.564525
11      capital_loss  KSComplement  0.932189
12  minutes_per_week  KSComplement  0.932696
13    native_country  TVComplement  0.919397
14   Income_Category  TVComplement  0.936535
             Column 1         Column 2                 Metric     Score  \
0                 age        workclass  ContingencySimilarity  0.890636   
1                 age           fnlwgt  CorrelationSimilarity  0.987290   
2                 age        education  ContingencySimi

In [None]:
# Real vs. synthetic distribution of the income label - the two match well.
col = "Income_Category"
get_column_plot(
    real_df,
    fake_df,
    metadata=gen.metadata,
    column_name=col,
)

In [32]:
# UCI wine quality dataset: restore the saved synthesizer and the real table,
# then sample a synthetic table and persist it next to them.
ifolder = INPUT_FOLDER / "UCI_winequality"
ofolder = OUTPUT_FOLDER / "UCI_winequality"

# Cleared before loading, so a failed load leaves None rather than an old model.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
gen = None
with (ofolder / "ctgan.pkl").open("rb") as model_file:
    gen = pickle.load(model_file)

# Same pattern for the original (real) data.
real_df = None
with (ofolder / "real_df.pkl").open("rb") as data_file:
    real_df = pickle.load(data_file)

# Sample twice as many synthetic rows as there are real rows, then save them.
n_real_rows = real_df.shape[0]
fake_df = gen.sample(num_rows=n_real_rows * 2)
fake_df.to_pickle(ofolder / "fake_df.pkl")

In [33]:
# Inspect the loss values stored on the loaded synthesizer.
loss_fig = gen.get_loss_values_plot()
loss_fig.show()

In [34]:
# Run the built-in diagnostic report (real vs. synthetic data).
gen_diagnostics = run_diagnostic(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=gen.metadata,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 12/12 [00:00<00:00, 524.79it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 202.07it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [35]:
# Per-column "Data Validity" scores as text, followed by the matching chart
# (rendered because it is the last expression of the cell).
validity_details = gen_diagnostics.get_details(property_name="Data Validity")
print(validity_details)
gen_diagnostics.get_visualization(property_name="Data Validity")

                  Column             Metric  Score
0          fixed acidity  BoundaryAdherence    1.0
1       volatile acidity  BoundaryAdherence    1.0
2            citric acid  BoundaryAdherence    1.0
3         residual sugar  BoundaryAdherence    1.0
4              chlorides  BoundaryAdherence    1.0
5    free sulfur dioxide  BoundaryAdherence    1.0
6   total sulfur dioxide  BoundaryAdherence    1.0
7                density  BoundaryAdherence    1.0
8                     pH  BoundaryAdherence    1.0
9              sulphates  BoundaryAdherence    1.0
10               alcohol  BoundaryAdherence    1.0
11               quality  CategoryAdherence    1.0


In [36]:
# Run the built-in quality report (real vs. synthetic data).
gen_quality = evaluate_quality(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=gen.metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 12/12 [00:00<00:00, 112.16it/s]|
Column Shapes Score: 86.57%

(2/2) Evaluating Column Pair Trends: |██████████| 66/66 [00:00<00:00, 89.42it/s]|
Column Pair Trends Score: 87.13%

Overall Score (Average): 86.85%



In [37]:
# Per-column and per-column-pair quality scores as plain-text tables.
print(gen_quality.get_details(property_name="Column Shapes"))
print(gen_quality.get_details(property_name="Column Pair Trends"))

# A notebook cell only auto-renders its LAST expression, so the
# "Column Shapes" figure used to be built and then silently discarded.
# Show every figure explicitly so both charts appear.
for property_name in ("Column Shapes", "Column Pair Trends"):
    gen_quality.get_visualization(property_name=property_name).show()

                  Column        Metric     Score
0          fixed acidity  KSComplement  0.823380
1       volatile acidity  KSComplement  0.946052
2            citric acid  KSComplement  0.852932
3         residual sugar  KSComplement  0.901801
4              chlorides  KSComplement  0.818609
5    free sulfur dioxide  KSComplement  0.897260
6   total sulfur dioxide  KSComplement  0.884947
7                density  KSComplement  0.859243
8                     pH  KSComplement  0.950977
9              sulphates  KSComplement  0.893028
10               alcohol  KSComplement  0.916885
11               quality  TVComplement  0.643451
         Column 1             Column 2                 Metric     Score  \
0   fixed acidity     volatile acidity  CorrelationSimilarity  0.927916   
1   fixed acidity          citric acid  CorrelationSimilarity  0.748447   
2   fixed acidity       residual sugar  CorrelationSimilarity  0.937334   
3   fixed acidity            chlorides  CorrelationSimilarity  

In [None]:
# Real vs. synthetic distribution of the wine quality label -
# the two distributions do not match well.
col = "quality"
get_column_plot(
    real_df,
    fake_df,
    metadata=gen.metadata,
    column_name=col,
)

In [47]:
# Kaggle credit card fraud dataset: restore the saved synthesizer and the real
# table, then sample a synthetic table and persist it next to them.
ifolder = INPUT_FOLDER / "Kaggle_creditcardfraud"
ofolder = OUTPUT_FOLDER / "Kaggle_creditcardfraud"

# Cleared before loading, so a failed load leaves None rather than an old model.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
gen = None
with (ofolder / "ctgan.pkl").open("rb") as model_file:
    gen = pickle.load(model_file)

# Same pattern for the original (real) data.
real_df = None
with (ofolder / "real_df.pkl").open("rb") as data_file:
    real_df = pickle.load(data_file)

# Sample twice as many synthetic rows as there are real rows, then save them.
n_real_rows = real_df.shape[0]
fake_df = gen.sample(num_rows=n_real_rows * 2)
fake_df.to_pickle(ofolder / "fake_df.pkl")

In [48]:
# Inspect the loss values stored on the loaded synthesizer.
loss_fig = gen.get_loss_values_plot()
loss_fig.show()

In [49]:
# Run the built-in diagnostic report (real vs. synthetic data).
gen_diagnostics = run_diagnostic(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=gen.metadata,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 31/31 [00:00<00:00, 48.35it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 470.79it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [50]:
# Per-column "Data Validity" scores as text, followed by the matching chart
# (rendered because it is the last expression of the cell).
validity_details = gen_diagnostics.get_details(property_name="Data Validity")
print(validity_details)
gen_diagnostics.get_visualization(property_name="Data Validity")

    Column             Metric  Score
0     Time  BoundaryAdherence    1.0
1       V1  BoundaryAdherence    1.0
2       V2  BoundaryAdherence    1.0
3       V3  BoundaryAdherence    1.0
4       V4  BoundaryAdherence    1.0
5       V5  BoundaryAdherence    1.0
6       V6  BoundaryAdherence    1.0
7       V7  BoundaryAdherence    1.0
8       V8  BoundaryAdherence    1.0
9       V9  BoundaryAdherence    1.0
10     V10  BoundaryAdherence    1.0
11     V11  BoundaryAdherence    1.0
12     V12  BoundaryAdherence    1.0
13     V13  BoundaryAdherence    1.0
14     V14  BoundaryAdherence    1.0
15     V15  BoundaryAdherence    1.0
16     V16  BoundaryAdherence    1.0
17     V17  BoundaryAdherence    1.0
18     V18  BoundaryAdherence    1.0
19     V19  BoundaryAdherence    1.0
20     V20  BoundaryAdherence    1.0
21     V21  BoundaryAdherence    1.0
22     V22  BoundaryAdherence    1.0
23     V23  BoundaryAdherence    1.0
24     V24  BoundaryAdherence    1.0
25     V25  BoundaryAdherence    1.0
2

In [51]:
# Run the built-in quality report (real vs. synthetic data).
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt
# part-way through the 465 column pairs, so the run did not complete.
gen_quality = evaluate_quality(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=gen.metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 31/31 [00:02<00:00, 12.90it/s]|
Column Shapes Score: 85.94%

(2/2) Evaluating Column Pair Trends: |████▋     | 216/465 [00:27<00:29,  8.50it/s]|

KeyboardInterrupt: 

In [None]:
# NOTE(review): the saved output of the evaluate_quality cell for this dataset
# shows a KeyboardInterrupt, so `gen_quality` here may come from an earlier
# run - re-run that cell to completion before trusting these numbers.

# Per-column and per-column-pair quality scores as plain-text tables.
print(gen_quality.get_details(property_name="Column Shapes"))
print(gen_quality.get_details(property_name="Column Pair Trends"))

# A notebook cell only auto-renders its LAST expression, so the
# "Column Shapes" figure used to be built and then silently discarded.
# Show every figure explicitly so both charts appear.
for property_name in ("Column Shapes", "Column Pair Trends"):
    gen_quality.get_visualization(property_name=property_name).show()

    Column        Metric     Score
0     Time  KSComplement  0.922825
1       V1  KSComplement  0.834937
2       V2  KSComplement  0.782679
3       V3  KSComplement  0.811788
4       V4  KSComplement  0.746362
5       V5  KSComplement  0.871104
6       V6  KSComplement  0.870031
7       V7  KSComplement  0.803664
8       V8  KSComplement  0.881348
9       V9  KSComplement  0.830828
10     V10  KSComplement  0.768289
11     V11  KSComplement  0.812794
12     V12  KSComplement  0.777577
13     V13  KSComplement  0.967517
14     V14  KSComplement  0.752826
15     V15  KSComplement  0.961699
16     V16  KSComplement  0.836946
17     V17  KSComplement  0.843527
18     V18  KSComplement  0.888941
19     V19  KSComplement  0.881602
20     V20  KSComplement  0.895115
21     V21  KSComplement  0.859930
22     V22  KSComplement  0.967359
23     V23  KSComplement  0.919854
24     V24  KSComplement  0.932563
25     V25  KSComplement  0.951267
26     V26  KSComplement  0.962764
27     V27  KSComple

In [52]:
# Real vs. synthetic distribution of the fraud label - the two distributions
# do not match well, which is why the correlation is so poor.
col = "Class"
get_column_plot(
    real_df,
    fake_df,
    metadata=gen.metadata,
    column_name=col,
)

In [54]:
# Count how often each value of the "Class" label occurs in the real data.
real_df["Class"].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64