In [104]:
# Importing the model
from model.ctabgan import CTABGAN
# Importing the evaluation metrics 
from model.eval.evalFidel import evaluate_fidelity
# Importing standard libraries
import numpy as np
import pandas as pd
import glob
import os

In [105]:
# --- Experiment configuration -------------------------------------------
# Short dataset identifier; reused as the sub-folder name and file prefix
# of the generated CSV files.
dataset = "car"
# Number of replications, i.e. how many fit + generate rounds are run.
num_exp = 1
# Location of the real dataset (CSV).
real_path = "Real_Datasets/Datasets/Car/car.csv"
# Root directory under which the generated datasets are stored.
fake_file_root = "Fake_Datasets"

In [106]:
# Initializing the synthesizer object and specifying input parameters.
# Notice: continuous variables do not need to be listed explicitly; any column
# not named below is treated as continuous by default.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: Input y contains NaN." - presumably the target column holds
# missing values after loading; verify the CSV header/contents before re-running.
synthesizer =  CTABGAN(raw_csv_path = real_path,
                 test_ratio = 0.20,
                 categorical_columns = ['Buying','Maint','Doors','Persons','Lug_boot','Safety','Class'],
                 log_columns = [],
                 mixed_columns= {},
                 integer_columns = [],
                 problem_type= {"Classification":'Class'},
                 epochs = 300)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# per-dataset output folder exists before the first file is written.
output_dir = fake_file_root + "/" + dataset
os.makedirs(output_dir, exist_ok=True)

# Fitting the synthesizer to the training dataset and generating synthetic data.
# One CSV is written per replication: <output_dir>/<dataset>_fake_<i>.csv
for i in range(num_exp):
    synthesizer.fit()
    # Keep at most the first 1000 generated rows
    syn = synthesizer.generate_samples().iloc[:1000]
    syn.to_csv(output_dir + "/" + dataset + "_fake_{exp}.csv".format(exp=i), index=False)

ValueError: Input y contains NaN.

In [88]:
# Collecting the paths to all corresponding generated datasets for evaluation.
# glob returns entries in file-system order, which is not guaranteed to be stable;
# sorting makes the list deterministic across machines and runs.
fake_paths = sorted(glob.glob(fake_file_root+"/"+dataset+"/"+"*"))

In [89]:
# Path of the first generated dataset (replication 0), used by the evaluation below.
# Built from fake_file_root rather than a hard-coded "Fake_Datasets/" literal so the
# path follows the configured output root.
fake_paths = fake_file_root + "/" + dataset + "/" + dataset + "_fake_0.csv"

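Evaluation: classifiers are trained on the synthetic data and tested on a held-out 20% of the real data (80/20 split).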

In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Read the real table and the generated table from disk
real_data = pd.read_csv(real_path)
synthetic_data = pd.read_csv(fake_paths)

# Label-encode every string-typed column. Each encoder is fitted on the real
# data and then applied to both tables, so they share one integer mapping.
encoders = {}
object_cols = [c for c in real_data.columns if real_data[c].dtype == 'object']
for col in object_cols:
    le = LabelEncoder().fit(real_data[col].astype(str))
    real_data[col] = le.transform(real_data[col].astype(str))
    synthetic_data[col] = le.transform(synthetic_data[col].astype(str))
    encoders[col] = le

# The last column of the real table is the prediction target
target_col = real_data.columns[-1]
y_real = real_data[target_col]
X_real = real_data.drop(columns=[target_col])

# 80/20 split of the real data with a fixed seed
X_real_train, X_real_test, y_real_train, y_real_test = train_test_split(
    X_real, y_real, test_size=0.2, random_state=42
)

# Features / target of the synthetic table
y_syn = synthetic_data[target_col]
X_syn = synthetic_data.drop(columns=[target_col])

# Standardise features: statistics come from the real training split only and
# are then applied to the real test split and to the synthetic features.
scaler = StandardScaler().fit(X_real_train)
X_real_train_scaled = scaler.transform(X_real_train)
X_real_test_scaled = scaler.transform(X_real_test)
X_syn_scaled = scaler.transform(X_syn)

# Train classifiers on synthetic data and test on real test set.
# random_state is pinned for the stochastic models (same value as the
# train/test split) so that re-running this cell reproduces the same scores.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=1000, random_state=42)
}

for name, clf in classifiers.items():
    clf.fit(X_syn_scaled, y_syn)
    preds = clf.predict(X_real_test_scaled)
    print(f"\n{name} (trained on synthetic, tested on real 20%)")
    print("  Accuracy :", accuracy_score(y_real_test, preds))
    # zero_division=0 scores a class that is never predicted (or never present)
    # as 0 - the same value as the default, but without an UndefinedMetricWarning.
    print("  F1 Score :", f1_score(y_real_test, preds, average='weighted', zero_division=0))
    print("  Precision:", precision_score(y_real_test, preds, average='weighted', zero_division=0))
    print("  Recall   :", recall_score(y_real_test, preds, average='weighted', zero_division=0))

# DNN with Keras (trained on synthetic, tested on real)
print("\nDeep Neural Network (Keras)")

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable. Note that this resets the global seeds.
tf.keras.utils.set_random_seed(42)

# Size the output layer from the largest label id seen in either table.
# Counting only the distinct labels of the synthetic sample breaks as soon as
# the generator misses a class: to_categorical then receives a label that is
# >= num_classes and raises, or the softmax layer is too narrow for the test set.
num_classes = int(max(y_syn.max(), y_real.max())) + 1
y_syn_cat = to_categorical(y_syn, num_classes=num_classes)
# One-hot test labels; not used by the metrics below, kept for interactive use
y_real_test_cat = to_categorical(y_real_test, num_classes=num_classes)

# Two hidden layers with dropout, softmax over the classes
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_syn_scaled.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_syn_scaled, y_syn_cat, epochs=20, batch_size=32, verbose=0)

# Class probabilities on the real test split -> most likely class per row
preds = model.predict(X_real_test_scaled)
pred_labels = np.argmax(preds, axis=1)

print("  Accuracy :", accuracy_score(y_real_test, pred_labels))
# zero_division=0 matches the default value while suppressing the warning
# raised when a class is never predicted.
print("  F1 Score :", f1_score(y_real_test, pred_labels, average='weighted', zero_division=0))
print("  Precision:", precision_score(y_real_test, pred_labels, average='weighted', zero_division=0))
print("  Recall   :", recall_score(y_real_test, pred_labels, average='weighted', zero_division=0))



Logistic Regression (trained on synthetic, tested on real 20%)
  Accuracy : 0.6791907514450867
  F1 Score : 0.5494314168316536
  Precision: 0.4613000768485415
  Recall   : 0.6791907514450867

Random Forest (trained on synthetic, tested on real 20%)
  Accuracy : 0.49710982658959535
  F1 Score : 0.492355100233369
  Precision: 0.4924972961265665
  Recall   : 0.49710982658959535

XGBoost (trained on synthetic, tested on real 20%)
  Accuracy : 0.4421965317919075
  F1 Score : 0.4655108294042282
  Precision: 0.49459168422379485
  Recall   : 0.4421965317919075

MLP Classifier (trained on synthetic, tested on real 20%)
  Accuracy : 0.5173410404624278
  F1 Score : 0.48152628780757023
  Precision: 0.4545107648487895
  Recall   : 0.5173410404624278

Deep Neural Network (Keras)
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
  Accuracy : 0.6791907514450867
  F1 Score : 0.5494314168316536
  Precision: 0.4613000768485415
  Recall   : 0.6791907514450867



===== MODEL EVALUATION SUMMARY =====
                 Model  Accuracy  F1 Score  Precision  Recall
0  Logistic Regression    0.6936    0.6119     0.6087  0.6936
1        Random Forest    0.7139    0.6844     0.6723  0.7139
2              XGBoost    0.6040    0.5852     0.5784  0.6040
3       MLP Classifier    0.7543    0.7284     0.7195  0.7543
4          DNN (Keras)    0.7630    0.7162     0.7405  0.7630
5              Average    0.7058    0.6652     0.6639  0.7058

===== AVERAGE METRICS =====
Average Accuracy: 0.7058
Average F1 Score: 0.6652
Average Precision: 0.6639
Average Recall: 0.7058

===== BEST MODEL =====
Best model: MLP Classifier
Accuracy: 0.7543
F1 Score: 0.7284
Precision: 0.7195
Recall: 0.7543
