## This jupyter-notebook contains the evaluation of synthetic data generated using CTAB-GAN for the Adult dataset 

In [2]:
# Importing the model
from model.stacked_ctabgan3 import StackedCTABGAN3
from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics
# Importing standard libraries
import numpy as np
import pandas as pd
import glob



# Specifying the replication number 
num_exp = 1 
# Specifying the name of the dataset used 
dataset = "Adult" 
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Adult.csv" 
# Specifying the root directory for storing generated data
fake_file_root = "Fake_Datasets" 


In [None]:

# Initializing the synthesizer object and specifying input parameters
# Notice: If you have continuous variable, you do not need to explicitly assign it. It will be treated like 
# that by default
synthesizer =  StackedCTABGAN3(raw_csv_path = real_path,
                 test_ratio = 0.20,  
                 categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                                        'relationship', 'race', 'gender', 'native-country', 'income'], 
                 log_columns = [],
                 mixed_columns= {'capital-loss':[0.0],'capital-gain':[0.0]}, 
                 integer_columns = ['age', 'fnlwgt','capital-gain', 'capital-loss','hours-per-week'],
                 problem_type= {"Classification": 'income'},
                 epochs = 30)

# Fitting the synthesizer to the training dataset and generating synthetic data
for i in range(num_exp):
    synthesizer.fit()
    syn1, syn2 = synthesizer.generate_samples()
    syn1.to_csv(fake_file_root+"/"+dataset+"/"+ "Stacked_"+ dataset+"_fake_{exp}_08_06_firstgen.csv".format(exp=i), index= False)
    syn2.to_csv(fake_file_root+"/"+dataset+"/"+ "Stacked_"+ dataset+"_fake_{exp}_08_06_stacked.csv".format(exp=i), index= False)


#### ML Utility Evaluation

In [1]:
fake_paths = glob.glob("FullFakeRuns/*")
# Specifying the list of classifiers to conduct ML utility evaluation
classifiers_list = ["lr","dt","rf","mlp","svm"]
for fake_path in fake_paths:
    # Storing and presenting the results as a dataframe
    result_mat = get_utility_metrics(real_path,fake_path,"MinMax",classifiers_list, test_ratio = 0.20)
    result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
    result_df.index = classifiers_list
    result_df.to_csv(fake_path+"_output")

NameError: name 'glob' is not defined

#### Statistical Similarity Evaluation

In [8]:
# Specifying the categorical columns of the dataset used
adult_categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,adult_categorical)
    stat_res_avg.append(stat_res)

stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]
stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
stat_results

Unnamed: 0,Average WD (Continuous Columns,Average JSD (Categorical Columns),Correlation Distance
0,0.034557,0.142179,2.609594


#### Nearest Neighbour Privacy Analysis

In [9]:
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
privacy_results

Unnamed: 0,DCR between Real and Fake (5th perc),DCR within Real(5th perc),DCR within Fake (5th perc),NNDR between Real and Fake (5th perc),NNDR within Real (5th perc),NNDR within Fake (5th perc)
0,1.290884,0.680375,1.249378,0.674498,0.545009,0.6731


In [10]:
# Storing generated data for future use if needed
syn.to_csv(fake_file_root+"/"+dataset+"/"+ dataset+"_fake_{exp}.csv".format(exp=i), index= False)

NameError: name 'syn' is not defined