## Evaluating Smartnoise Synthesizers with SDMetrics
Below we will show a simple example of how to evaluate Smartnoise synthetic data using the existing Multi-table metrics from SDV's SDMetrics. 

In [1]:
import numpy as np
import pandas as pd

import sdmetrics

from snsynth import MWEMSynthesizer
from snsynth.pytorch.nn import DPCTGAN, PATECTGAN
from snsynth.preprocessors import GeneralTransformer
from snsynth.pytorch import PytorchDPSynthesizer

import utils

In [2]:
df, df_non_continuous, sample_size = utils.retrieve_PUMS_data_categorical()

In [3]:
patectgan = PytorchDPSynthesizer(1.0, PATECTGAN(), None)
dpctgan = PytorchDPSynthesizer(1.0, DPCTGAN(), None)

dpctgan.fit(df_non_continuous, categorical_columns=['sex','educ','race','married'])
patectgan.fit(df_non_continuous, categorical_columns=['sex','educ','race','married'])

synth_data_dpctgan = dpctgan.sample(sample_size)
synth_data_patectgan = patectgan.sample(sample_size)

# Note that most SDMetrics metrics require a unique sample id. Here we add this
# back to the data after synthesizing.
synth_data_dpctgan['pid'] = df['pid']
synth_data_patectgan['pid'] = df['pid']
df_non_continuous['pid'] = df['pid']

data_real = {'pums': df_non_continuous}
data_synth_patectgan = {'pums': synth_data_patectgan}
data_synth_dpctgan = {'pums': synth_data_dpctgan}



Epoch 1, Loss G: 0.6681, Loss D: 1.3895
epsilon is 0.08429801659035999, alpha is 63.0
Epoch 2, Loss G: 0.6693, Loss D: 1.3889
epsilon is 0.8159572645684117, alpha is 17.0


### Metadata
Please refer to the following SDV documentation when creating the SDMetric metadata for your dataset: https://sdv.dev/SDV/developer_guides/sdv/metadata.html

Note that if your metadata is incorrect, you are likely to see an error when running your experiments.

In [4]:
meta = utils.return_PUMS_metadata()

In [5]:
metrics = sdmetrics.multi_table.MultiTableMetric.get_subclasses()

# Run all the compatible metrics and get a report
sdmetrics.compute_metrics(metrics, data_real, data_synth_patectgan, metadata=meta)
sdmetrics.compute_metrics(metrics, data_real, data_synth_dpctgan, metadata=meta)

Unnamed: 0,metric,name,raw_score,normalized_score,min_value,max_value,goal,error
0,CSTest,Chi-Squared,0.732815,0.732815,0.0,1.0,MAXIMIZE,
1,KSTest,Inverted Kolmogorov-Smirnov D statistic,,,0.0,1.0,MAXIMIZE,"Cannot find fields of types ('numerical',)"
2,KSTestExtended,Inverted Kolmogorov-Smirnov D statistic,0.82225,0.82225,0.0,1.0,MAXIMIZE,
3,LogisticDetection,LogisticRegression Detection,0.486655,0.486655,0.0,1.0,MAXIMIZE,
4,SVCDetection,SVC Detection,0.367662,0.367662,0.0,1.0,MAXIMIZE,
5,BNLikelihood,BayesianNetwork Likelihood,0.002601,0.002601,0.0,1.0,MAXIMIZE,
6,BNLogLikelihood,BayesianNetwork Log Likelihood,,,-inf,0.0,MAXIMIZE,
7,LogisticParentChildDetection,LogisticRegression Detection,,,0.0,1.0,MAXIMIZE,No foreign keys given
8,SVCParentChildDetection,SVC Detection,,,0.0,1.0,MAXIMIZE,No foreign keys given


In [6]:
from sdv.metrics.relational import KSTestExtended

KSTestExtended.compute(data_real, data_synth_patectgan, metadata=meta)

0.8365