## This Jupyter notebook evaluates synthetic data generated using CTAB-GAN for the Adult dataset (note: the cells below load `cicids2017.csv` and `iotid20.csv`, so the dataset named in this title may be outdated)

In [1]:
# Importing standard libraries
import glob
import os

import numpy as np
import pandas as pd

# Importing the model
from model.ctabgan import CTABGAN
# Importing the evaluation metrics 
from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd
import torch

In [None]:
# Load the raw CSV, coerce the feature columns to float, encode the last column
# with a LabelEncoder, drop rows with missing/infinite values and save the result.

# The scaler is instantiated but not applied here (the scaling line below is disabled).
scaler = StandardScaler()
label = LabelEncoder()

data = pd.read_csv("cicids2017.csv")
# data[data.columns[:-1]] = scaler.fit_transform(data[data.columns[:-1]])
# Cast every column except the last one to float.
data[data.columns[:-1]] = np.array(data[data.columns[:-1]], dtype=float)
# Turn +/-inf into NaN so that the dropna below removes those rows as well.
data.replace([np.inf, -np.inf], np.nan, inplace=True)
# Encode the last column (presumably the class label) as integer ids.
data.iloc[:, -1] = label.fit_transform(data.iloc[:, -1])
data.dropna(inplace=True)
# index=False keeps the row index out of the file, so re-reading it does not
# produce a spurious "Unnamed: 0" column (same convention as the other to_csv
# calls in this notebook).
data.to_csv("Clean_2017data", index=False)
# Preview goes last: only the final expression of a cell is rendered.
data.head(3).T

In [None]:
import pandas as pd

def get_continuous_columns(dataframe, max_unique_values=20):
    """
    Identify the continuous columns of a DataFrame.

    A column is reported as continuous when its dtype is an integer or
    floating-point type (any width, e.g. int32/float32 as well as
    int64/float64) and it holds more than `max_unique_values` distinct
    non-NaN values. Columns with only a few discrete values (such as 0/1
    flags) are therefore excluded.

    - dataframe: DataFrame containing the data
    - max_unique_values: a column needs strictly more distinct values than
      this threshold to be treated as continuous (default is 20)

    Returns a tuple (continuous_columns, continuous_indices): the column
    labels and their positional indices, both in column order.
    """
    continuous_columns = []
    continuous_indices = []

    # Walk the columns by position: this yields the index directly (no
    # per-column get_loc lookup) and stays correct with duplicate labels,
    # where dataframe[col] would return a DataFrame instead of a Series.
    for position, col in enumerate(dataframe.columns):
        series = dataframe.iloc[:, position]
        is_number = (
            pd.api.types.is_integer_dtype(series.dtype)
            or pd.api.types.is_float_dtype(series.dtype)
        )
        # Skip columns with only a handful of discrete values (e.g. 0, 1 or 0, 1, 2).
        if is_number and series.nunique() > max_unique_values:
            continuous_columns.append(col)
            continuous_indices.append(position)

    return continuous_columns, continuous_indices

# Example: detect the continuous columns of the dataset loaded above
continuous_columns, continuous_indices = get_continuous_columns(dataframe=data)

# Report the detected column names first, then their positional indices
for heading, found in (
    ("Continuous Columns:", continuous_columns),
    ("Indices of Continuous Columns:", continuous_indices),
):
    print(heading, found)


In [None]:
# Number of replications: how many times the fit/generate loop below is run
num_exp = 1 
# Name of the dataset used; it becomes the output sub-folder and the prefix
# of every generated file name
dataset = "2020" 
# Path of the real dataset; passed to CTABGAN and to the evaluation functions
# NOTE(review): the cleaning cell above reads "cicids2017.csv" and writes
# "Clean_2017data", while this points at "iotid20.csv" — confirm which file is intended.
real_path = "iotid20.csv" 
# Root directory for storing generated (synthetic) data
fake_file_root = "Fake_Datasets" 

In [None]:
# Initializing the synthesizer object and specifying input parameters.
# Notice: continuous variables do not need to be declared explicitly; any column
# not listed below is treated as continuous by default.
# NOTE(review): the column names below ('workclass', 'income', ...) look like the
# Adult schema, while real_path points at "iotid20.csv" — confirm they exist in that file.
synthesizer = CTABGAN(
    raw_csv_path=real_path,
    test_ratio=0.20,
    categorical_columns=['workclass', 'education', 'marital-status', 'occupation',
                         'relationship', 'race', 'gender', 'native-country', 'income'],
    log_columns=[],
    mixed_columns={'capital-loss': [0.0], 'capital-gain': [0.0]},
    integer_columns=['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'],
    problem_type={"Classification": 'income'},
    epochs=150,
)

# Create the output directory before training: DataFrame.to_csv does not create
# missing parent directories, and failing after the fit would waste the whole run.
fake_dir = fake_file_root+"/"+dataset
os.makedirs(fake_dir, exist_ok=True)

# Fitting the synthesizer to the training dataset and generating synthetic data;
# one CSV is written per replication.
for i in range(num_exp):
    synthesizer.fit()
    syn = synthesizer.generate_samples()
    syn.to_csv(fake_dir+"/"+dataset+"_fake_{exp}.csv".format(exp=i), index=False)

In [None]:
# Collecting the paths to all corresponding generated datasets for evaluation.
# glob returns matches in arbitrary order, so sort them to get the same file
# order on every run and platform.
fake_paths = sorted(glob.glob(fake_file_root+"/"+dataset+"/"+"*"))

#### ML Utility Evaluation

In [None]:
# Specifying the list of classifiers to conduct ML utility evaluation.
# These must be classifier names, not dataset column names: the same list labels
# the rows of the result table (lr, dt, rf, mlp, svm in the recorded output).
classifiers_list = ["lr","dt","rf","mlp","svm"]

# Storing and presenting the results as a dataframe (one row per classifier)
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
result_df

Unnamed: 0,Acc,AUC,F1_Score
lr,1.064592,0.009517,0.061383
dt,6.285188,0.063739,0.071529
rf,2.589825,0.027153,0.040049
mlp,2.763845,0.013811,0.126618
svm,2.896919,0.049234,0.124052


#### Statistical Similarity Evaluation

In [None]:
# # Specifying the categorical columns of the dataset used
# adult_categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

# # Storing and presenting the results as a dataframe
# stat_res_avg = []
# for fake_path in fake_paths:
#     stat_res = stat_sim(real_path,fake_path,adult_categorical)
#     stat_res_avg.append(stat_res)

# stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]
# stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
# stat_results

Unnamed: 0,Average WD (Continuous Columns,Average JSD (Categorical Columns),Correlation Distance
0,0.009362,0.1204,0.761534


#### Nearest Neighbour Privacy Analysis

In [None]:
# # Storing and presenting the results as a dataframe
# priv_res_avg = []
# for fake_path in fake_paths:
#     priv_res = privacy_metrics(real_path,fake_path)
#     priv_res_avg.append(priv_res)
    
# privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
# privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
# privacy_results

Unnamed: 0,DCR between Real and Fake (5th perc),DCR within Real(5th perc),DCR within Fake (5th perc),NNDR between Real and Fake (5th perc),NNDR within Real (5th perc),NNDR within Fake (5th perc)
0,0.485676,0.216545,0.22867,0.632722,0.442052,0.431408


In [None]:
# Persist the most recent synthetic sample again for future use
# (relies on `syn` and `i` left over from the generation loop).
final_csv = "/".join([fake_file_root, dataset, dataset+"_fake_{exp}.csv".format(exp=i)])
syn.to_csv(final_csv, index=False)