In [8]:
# Importing the model
from model.ctabgan import CTABGAN
# Importing the evaluation metrics 
from model.eval.evalFidel import evaluate_fidelity
# Importing standard libraries
import numpy as np
import pandas as pd
import glob
import os

In [9]:
# Specifying the replication number 
num_exp = 1 
# Specifying the name of the dataset used 
dataset = "letter" 
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Datasets/letter-recognition.csv" 
# Specifying the root directory for storing generated data
fake_file_root = "Fake_Datasets" 

In [10]:
# Initializing the synthesizer object and specifying input parameters
# Notice: If you have continuous variable, you do not need to explicitly assign it. It will be treated like 
# that by default
synthesizer =  CTABGAN(raw_csv_path = real_path,
                 test_ratio = 0.20,  
                 categorical_columns = ['letter', 'xbox ', 'ybox ', 'width ', 'height', 'onpix ', 'xbar ', 'ybar ', 'x2bar', 'y2bar ', 'xybar ', 'x2ybar', 'xy2bar', 'xedge ', 'xedgey', 'yedge ', 'yedgex'], 
                 log_columns = [],
                 mixed_columns= {}, 
                 integer_columns = [],
                 problem_type= {"Classification":'letter'},
                 epochs = 10) 


for i in range(num_exp):
    synthesizer.fit()
    syn = synthesizer.generate_samples().iloc[:1000]
    syn.to_csv(fake_file_root+"/"+dataset+"/"+ dataset+"_fake_{exp}.csv".format(exp=i), index= False)

282


100%|██████████| 10/10 [19:42<00:00, 118.20s/it]


Finished training in 1182.3428618907928 seconds.


In [11]:
# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob(fake_file_root+"/"+dataset+"/"+"*")

In [12]:
fake_paths="Fake_Datasets/"+dataset+"/"+dataset+"_fake_0.csv"

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress warnings for cleaner output (optional)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Configuration ---
SYNTHETIC_DATA_PATH = "Fake_Datasets/loan/loan_fake_0.csv"
REAL_DATA_PATH = "Real_Datasets/Datasets/loan.csv" # Adjust filename if needed
TARGET_COLUMN = 'Online' # Change this if your target column has a different name
TEST_SIZE = 0.2
RANDOM_STATE = 123
# ---------------------

# --- Data Loading ---
try:
    synthetic_data = pd.read_csv(SYNTHETIC_DATA_PATH)
    real_data = pd.read_csv(REAL_DATA_PATH)
except FileNotFoundError as e:
    print(f"Error loading data: {e}")
    exit()

# --- Preprocessing ---
# Target Encoding
le = LabelEncoder()
real_data[TARGET_COLUMN] = le.fit_transform(real_data[TARGET_COLUMN])
synthetic_data[TARGET_COLUMN] = le.transform(synthetic_data[TARGET_COLUMN])

# Feature Separation
X_synth_raw = synthetic_data.drop(columns=[TARGET_COLUMN])
y_synth = synthetic_data[TARGET_COLUMN]
X_real_raw = real_data.drop(columns=[TARGET_COLUMN])
y_real = real_data[TARGET_COLUMN]

# Feature One-Hot Encoding
categorical_features = list(set(X_real_raw.select_dtypes(include=['object', 'category']).columns) |
                            set(X_synth_raw.select_dtypes(include=['object', 'category']).columns))
X_real_encoded = pd.get_dummies(X_real_raw, columns=categorical_features, dummy_na=False)
X_synth_encoded = pd.get_dummies(X_synth_raw, columns=categorical_features, dummy_na=False)

# Align Columns
X_real_aligned, X_synth_aligned = X_real_encoded.align(X_synth_encoded, join='inner', axis=1, fill_value=0)

# --- Train/Test Split ---
X_real_train, X_real_test, y_real_train, y_real_test = train_test_split(
    X_real_aligned, y_real, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_real
)

# --- Define Classifiers ---
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM_STATE),
    "MLP Classifier": MLPClassifier(max_iter=500, random_state=RANDOM_STATE)
}

# --- Train on Synthetic, Test on Real (Accuracy Only) ---
print("===== Model Test Accuracies (Train on Synthetic, Test on Real) =====")
model_accuracies = []

for name, model in classifiers.items():
    try:
        # Ensure consistent training data alignment - Fit on full aligned synthetic data
        if X_synth_aligned.shape[0] == len(y_synth):
            model.fit(X_synth_aligned, y_synth)
            preds = model.predict(X_real_test)
            acc = accuracy_score(y_real_test, preds)
            print(f"{name}: {acc:.4f}")
            model_accuracies.append({"Model": name, "Accuracy": acc})
        else:
            print(f"{name}: Skipped (data row mismatch)")

    except Exception as e:
        print(f"{name}: Failed - {e}")


# --- Optional: Save Summary ---
if model_accuracies:
    evaluation_df = pd.DataFrame(model_accuracies)
    evaluation_df.to_csv("evaluation_accuracy_only.csv", index=False)
    # print("\nAccuracy summary saved to evaluation_accuracy_only.csv")
    # print("\n--- Summary ---")
    # print(evaluation_df.sort_values(by="Accuracy", ascending=False))
else:
    print("\nNo models were successfully evaluated.")

print("\nScript finished.")

===== Model Test Accuracies (Train on Synthetic, Test on Real) =====
Logistic Regression: 0.4860
Random Forest: 0.4820
XGBoost: 0.5060
MLP Classifier: 0.4680

Script finished.
