## This jupyter-notebook contains the evaluation of synthetic data generated using CTAB-GAN for the Adult dataset 

In [1]:
# Importing the model
from model.stacked_ctabgan import StackedCTABGAN
from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics
# Importing standard libraries
import numpy as np
import pandas as pd
import glob



# Specifying the replication number 
num_exp = 1 
# Specifying the name of the dataset used 
dataset = "Adult" 
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Adult.csv" 
# Specifying the root directory for storing generated data
fake_file_root = "Fake_Datasets" 

In [2]:


# Initializing the synthesizer object and specifying input parameters
# Notice: If you have continuous variable, you do not need to explicitly assign it. It will be treated like 
# that by default
synthesizer =  StackedCTABGAN(raw_csv_path = real_path,
                 test_ratio = 0.20,  
                 categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                                        'relationship', 'race', 'gender', 'native-country', 'income'], 
                 log_columns = [],
                 mixed_columns= {'capital-loss':[0.0],'capital-gain':[0.0]}, 
                 integer_columns = ['age', 'fnlwgt','capital-gain', 'capital-loss','hours-per-week'],
                 problem_type= {"Classification": 'income'},
                 epochs = 25) 

# Fitting the synthesizer to the training dataset and generating synthetic data
for i in range(num_exp):
    synthesizer.fit()
    syn = synthesizer.generate_samples()
    syn.to_csv(fake_file_root+"/"+dataset+"/"+ "Stacked_"+ dataset+"_fake_{exp}_29_05.csv".format(exp=i), index= False)


100%|██████████| 25/25 [2:43:32<00:00, 392.51s/it]  


Finished training in 9827.527757644653  seconds.
Stacked condvec factory n_opt first layer: 146


100%|██████████| 25/25 [2:46:52<00:00, 400.50s/it]  


[[-6.9462284e-03  1.1134755e-18  1.0310468e-10 ...  9.2238079e-06
   1.0000000e+00  1.4887029e-19]
 [ 3.4027867e-02  7.2774415e-30  1.2158782e-27 ...  2.9004484e-37
   1.0000000e+00  1.3599813e-25]
 [ 6.5952107e-02  3.4557629e-10  9.9995315e-01 ...  0.0000000e+00
   3.4790006e-17  1.0000000e+00]
 ...
 [ 1.3537489e-02  4.8393889e-14  1.0000000e+00 ...  4.1754829e-37
   6.0670404e-03  9.9393290e-01]
 [ 1.0448255e-02  2.0151813e-06  9.9674290e-01 ...  3.8216742e-19
   1.0000000e+00  2.7149456e-37]
 [ 9.9645384e-02  1.8257536e-10  2.0239463e-09 ...  2.0534628e-41
   9.4197178e-01  5.8028243e-02]]
Stacked condvec factory n_opt first layer: 146
Stacked condvec factory n_opt first layer: 146


In [3]:
# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob(fake_file_root+"/"+dataset+"/Stacked/"+"*")

#### ML Utility Evaluation

In [4]:
# Specifying the list of classifiers to conduct ML utility evaluation
classifiers_list = ["lr","dt","rf","mlp","svm"]

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
result_df

Unnamed: 0,Acc,AUC,F1_Score
lr,4.370969,0.398907,0.220331
dt,15.897226,0.260008,0.260501
rf,10.297881,0.468219,0.358956
mlp,7.595455,0.443108,0.326035
svm,5.435561,0.432087,0.22749


#### Statistical Similarity Evaluation

In [5]:
# Specifying the categorical columns of the dataset used
adult_categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,adult_categorical)
    stat_res_avg.append(stat_res)

stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]
stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
stat_results

Unnamed: 0,Average WD (Continuous Columns,Average JSD (Categorical Columns),Correlation Distance
0,0.028795,0.110153,2.480955


#### Nearest Neighbour Privacy Analysis

In [22]:
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
privacy_results

Unnamed: 0,DCR between Real and Fake (5th perc),DCR within Real(5th perc),DCR within Fake (5th perc),NNDR between Real and Fake (5th perc),NNDR within Real (5th perc),NNDR within Fake (5th perc)
0,0.5076,0.216545,0.119058,0.670375,0.442052,0.352506


In [10]:
# Storing generated data for future use if needed
syn.to_csv(fake_file_root+"/"+dataset+"/"+ dataset+"_fake_{exp}.csv".format(exp=i), index= False)

NameError: name 'syn' is not defined