## This jupyter-notebook contains the evaluation of synthetic data generated using CTAB-GAN for the Adult dataset 

In [1]:
# Importing the model
from model.stacked_ctabgan2 import StackedCTABGAN2
from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics
# Importing standard libraries
import numpy as np
import pandas as pd
import glob


# Specifying the replication number 
num_exp = 1 
# Specifying the name of the dataset used 
dataset = "Adult" 
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Adult.csv" 
# Specifying the root directory for storing generated data
fake_file_root = "Fake_Datasets" 


In [2]:

# Initializing the synthesizer object and specifying input parameters
# Notice: If you have continuous variable, you do not need to explicitly assign it. It will be treated like 
# that by default
synthesizer =  StackedCTABGAN2(raw_csv_path = real_path,
                 test_ratio = 0.20,  
                 categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                                        'relationship', 'race', 'gender', 'native-country', 'income'], 
                 log_columns = [],
                 mixed_columns= {'capital-loss':[0.0],'capital-gain':[0.0]}, 
                 integer_columns = ['age', 'fnlwgt','capital-gain', 'capital-loss','hours-per-week'],
                 problem_type= {"Classification": 'income'},
                 epochs = 30)

# Fitting the synthesizer to the training dataset and generating synthetic data
for i in range(num_exp):
    synthesizer.fit()
    syn1, syn2 = synthesizer.generate_samples()
    syn1.to_csv(fake_file_root+"/"+dataset+"/"+ "Stacked_"+ dataset+"_fake_{exp}_08_06_firstgen.csv".format(exp=i), index= False)
    syn2.to_csv(fake_file_root+"/"+dataset+"/"+ "Stacked_"+ dataset+"_fake_{exp}_08_06_stacked.csv".format(exp=i), index= False)


  0%|          | 0/30 [00:00<?, ?it/s]

torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-0.8103, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-1.3461, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.2471, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.1192, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.1226, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.1658, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.4778, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.4842, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151

  3%|▎         | 1/30 [45:58<22:13:21, 2758.69s/it]

tensor(-1.2917, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-3.0019, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.4437, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-1.8356, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.8590, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.1449, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.5931, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.4261, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.3248, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
to

  7%|▋         | 2/30 [7:39:39<122:23:47, 15736.69s/it]

tensor(-1.0238, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-3.1604, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-0.8597, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.9684, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-1.4568, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.0690, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-1.7037, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.5309, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-1.0899, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
to

 10%|█         | 3/30 [7:52:34<66:46:59, 8904.43s/it]  

tensor(-1.3681, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-4.2465, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-0.7516, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-4.3085, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-1.9854, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-1.9626, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.8849, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.1818, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.4883, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
to

 13%|█▎        | 4/30 [8:10:05<41:55:08, 5804.16s/it]

tensor(-2.0571, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-3.0215, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.9356, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.3552, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.7951, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.5305, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-3.0267, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.5612, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-3.0141, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
to

 17%|█▋        | 5/30 [12:03:06<60:47:00, 8752.81s/it]

tensor(-3.2355, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.5353, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-3.1099, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-2.1160, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-3.4257, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-1.4986, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-4.3758, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-1.4707, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
torch.Size([500, 256])
torch.Size([500, 151])
tensor(-3.4089, grad_fn=<MeanBackward0>)
torch.Size([500, 1, 16, 16])
to

In [10]:
# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob(fake_file_root+"/"+dataset+"/Stacked/"+"*")

#### ML Utility Evaluation

In [11]:
# Specifying the list of classifiers to conduct ML utility evaluation
classifiers_list = ["lr","dt","rf","mlp","svm"]

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
result_df

Unnamed: 0,Acc,AUC,F1_Score
lr,2.222222,0.294559,0.161362
dt,10.833333,0.218766,0.209497
rf,4.722222,0.312426,0.275859
mlp,4.722222,0.436004,0.249651
svm,2.5,0.270837,0.116938


#### Statistical Similarity Evaluation

In [8]:
# Specifying the categorical columns of the dataset used
adult_categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,adult_categorical)
    stat_res_avg.append(stat_res)

stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]
stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
stat_results

Unnamed: 0,Average WD (Continuous Columns,Average JSD (Categorical Columns),Correlation Distance
0,0.034557,0.142179,2.609594


#### Nearest Neighbour Privacy Analysis

In [9]:
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
privacy_results

Unnamed: 0,DCR between Real and Fake (5th perc),DCR within Real(5th perc),DCR within Fake (5th perc),NNDR between Real and Fake (5th perc),NNDR within Real (5th perc),NNDR within Fake (5th perc)
0,1.290884,0.680375,1.249378,0.674498,0.545009,0.6731


In [10]:
# Storing generated data for future use if needed
syn.to_csv(fake_file_root+"/"+dataset+"/"+ dataset+"_fake_{exp}.csv".format(exp=i), index= False)

NameError: name 'syn' is not defined