In [17]:
import os
import json
import pandas as pd
from sdv.utils import display_tables
from sdv.evaluation import evaluate

In [15]:
def read_tables(path):
    """Load every ``.csv`` file found directly inside ``path`` into a DataFrame.

    The table name is the file name without its ``.csv`` extension, keeping
    only the part after the last ``-`` (``zurich-claims.csv`` -> ``claims``).
    If several files map to the same table name, the one that sorts last wins.

    Args:
        path: Directory that contains the CSV files. A trailing path
            separator is optional.

    Returns:
        dict mapping each table name to the ``pandas.DataFrame`` read from
        the corresponding file.
    """
    tables = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the table
    # order (and the winner of a name collision) stable between runs.
    for file in sorted(os.listdir(path)):
        if file.endswith(".csv"):
            table_name = file[:-4].split("-")[-1]
            # os.path.join instead of `path + file`: plain concatenation only
            # worked when `path` already ended with a separator.
            tables[table_name] = pd.read_csv(os.path.join(path, file))
    return tables

In [16]:
# Load both datasets with read_tables, each as a {table name: DataFrame} dict:
# first the source tables, then the synthetic ones.
real_data = read_tables(path="../data/zurich_source/")
synthetic_data = read_tables(path="../data/zurich_synthetic/")

In [18]:
# Read the metadata from ../models/zurich-sdv-1.0-metadata.json
metadata_name = '../models/zurich-sdv-1.0-metadata.json'
# Explicit UTF-8: without it open() falls back to the platform's locale
# encoding, which can mis-decode non-ASCII characters in the JSON file.
with open(metadata_name, encoding='utf-8') as metadata_file:
    metadata = json.load(metadata_file)

The complete list of possible metrics is:

-   `cstest`: This metric compares the distributions of all the
    categorical columns of the table by using a Chi-squared test and
    returns the average of the `p-values` obtained across all the
    columns. If the tables that you are evaluating do not contain any
    categorical columns the result will be `nan`.
-   `kscomplement`: This metric compares the distributions of all the
    numerical columns of the table with a two-sample Kolmogorov-Smirnov
    test using the empirical CDF and returns the average, across all the
    columns, of 1 minus the KS D statistic (the "inverted" D statistic),
    so higher values are better. If the tables that you are evaluating do
    not contain any numerical columns the result will be `nan`.
-   `logistic_detection`: This metric tries to use a Logistic Regression
    classifier to detect whether each row is real or synthetic and then
    evaluates its performance using an Area under the ROC curve metric.
    The returned score is 1 minus the ROC AUC score obtained by the
    classifier.
-   `svc_detection`: This metric tries to use a Support Vector
    Classifier to detect whether each row is real or synthetic and then
    evaluates its performance using an Area under the ROC curve metric.
    The returned score is 1 minus the ROC AUC score obtained by the
    classifier.

In [21]:
# Score the synthetic tables against the real ones. With aggregate=False the
# recorded output is a table with one row per metric (raw and normalized score).
evaluate(synthetic_data, real_data, metadata=metadata, aggregate=False)


Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,KSComplement,Inverted Kolmogorov-Smirnov D statistic,0.547242,0.547242,0.0,1.0,MAXIMIZE,
1,CSTest,Chi-Squared,0.975976,0.975976,0.0,1.0,MAXIMIZE,
