In [1]:
import os
import pandas as pd
import warnings

# Suppress all warnings
# NOTE(review): this silences every warning category for the whole kernel
# session, not only the ones this notebook is known to trigger; consider
# narrowing it with the `category=` / `module=` arguments.
warnings.filterwarnings("ignore")


# Define paths
# Root folder holding one sub-directory per real dataset.
prepared_data_path = "PreparedData1"
# Folders with synthetic data; each is compared against the real data separately.
synthetic_data_paths = ["t1_SyntheticData", "t1_SyntheticData_noalpha"]

# Initialize results dictionary: one list of per-dataset result rows per synthetic folder
results = {path: [] for path in synthetic_data_paths}

# Get dataset names (every sub-directory of the prepared-data folder).
# os.listdir returns entries in arbitrary order, so sort them to keep the
# row order of the output CSVs reproducible between runs and machines.
datasets = sorted(
    d for d in os.listdir(prepared_data_path) if os.path.isdir(os.path.join(prepared_data_path, d))
)

def _exact_copy_percentage(synthetic_data, real_data):
    """Return the percentage of synthetic rows that also occur in the real data.

    Rows are compared on the columns the two frames have in common (the same
    columns ``pd.merge`` joins on by default).

    Parameters
    ----------
    synthetic_data : pandas.DataFrame
        Generated rows to check.
    real_data : pandas.DataFrame
        Reference rows the synthetic rows are matched against.

    Returns
    -------
    float or int
        ``matches / len(synthetic_data) * 100``; ``0`` when ``synthetic_data``
        has no rows.
    """
    total_synthetic = len(synthetic_data)
    if total_synthetic == 0:
        return 0

    shared_columns = [col for col in synthetic_data.columns if col in real_data.columns]
    if shared_columns:
        # De-duplicate the real rows on the join columns. Without this, a
        # synthetic row that matches k identical real rows appears k times in
        # the merge result and the "percentage" can exceed 100.
        real_unique = real_data[shared_columns].drop_duplicates()
    else:
        # No common columns: pass the frame through unchanged so pd.merge
        # reports the problem itself.
        real_unique = real_data

    merged = pd.merge(synthetic_data, real_unique, how="inner")
    return (len(merged) / total_synthetic) * 100


# Loop through each dataset
for dataset in datasets:
    real_data_path = os.path.join(prepared_data_path, dataset, "supervised", "training_data.csv")

    # Load real data; datasets without a training file are skipped entirely
    if not os.path.exists(real_data_path):
        print(f"Real data not found for dataset: {dataset}")
        continue
    real_data = pd.read_csv(real_data_path)

    # Compare with synthetic data for each synthetic data folder
    for synthetic_data_path in synthetic_data_paths:
        result_row = {"Dataset": dataset}

        # Every sub-directory of the synthetic folder is treated as one model.
        # Sorted so the model columns come out in a stable order.
        model_paths = sorted(
            m for m in os.listdir(synthetic_data_path) if os.path.isdir(os.path.join(synthetic_data_path, m))
        )

        # Loop through models
        for model in model_paths:
            synthetic_data_file = os.path.join(
                synthetic_data_path, model, dataset, "supervised", "synthetic_data.csv"
            )

            # Check if synthetic data exists
            if not os.path.exists(synthetic_data_file):
                print(f"Synthetic data not found for model: {model}, dataset: {dataset}")
                # NOTE(review): a missing file is recorded as 0, which is
                # indistinguishable from "0% exact copies" in the output CSV.
                result_row[model] = 0
                continue

            # Load synthetic data and record the share of rows copied from the real data
            synthetic_data = pd.read_csv(synthetic_data_file)
            result_row[model] = _exact_copy_percentage(synthetic_data, real_data)

        # Append row to results for the current synthetic data folder
        results[synthetic_data_path].append(result_row)

# Write one "<folder>_comparison_results.csv" per synthetic data folder,
# with one row per dataset and one column per model.
for synthetic_data_path in synthetic_data_paths:
    folder_name = os.path.basename(synthetic_data_path)
    output_file = f"{folder_name}_comparison_results.csv"

    # Rows collected for this folder by the comparison loop
    rows = results[synthetic_data_path]
    results_df = pd.DataFrame(rows)

    results_df.to_csv(output_file, index=False)
    print(f"Comparison results for {folder_name} saved to {output_file}")


Real data not found for dataset: T2DM
Real data not found for dataset: StudentsMentalHealth
Comparison results for t1_SyntheticData saved to t1_SyntheticData_comparison_results.csv
Comparison results for t1_SyntheticData_noalpha saved to t1_SyntheticData_noalpha_comparison_results.csv


In [29]:
# Which family of result files to average:
#   ""          -> t*_SyntheticData_comparison_results.csv
#   "_noalpha"  -> t*_SyntheticData_noalpha_comparison_results.csv
VARIANT = ""
# Number of trial files (t1 ... tN) expected per variant
N_TRIALS = 5

# List of CSV file names, one per trial
csv_files = [
    f"t{trial}_SyntheticData{VARIANT}_comparison_results.csv"
    for trial in range(1, N_TRIALS + 1)
]


def average_comparison_results(csv_files, output_file):
    """Average per-trial comparison tables and write the result to ``output_file``.

    Each input CSV is expected to have a "Dataset" column plus one numeric
    column per model. Files that do not exist are reported and skipped.

    The tables are stacked and averaged per dataset, so a dataset (or model
    column) that is absent from some files is averaged over the files that do
    contain it instead of turning into NaN or being dropped. Datasets that do
    not appear exactly once in every loaded file are listed in a warning.

    Parameters
    ----------
    csv_files : list of str
        Paths of the per-trial result files.
    output_file : str
        Path the averaged table is written to (without the index).

    Returns
    -------
    pandas.DataFrame or None
        The averaged table with "Dataset" as a regular column, or ``None``
        when none of the input files exists (nothing is written then).
    """
    # Read each CSV file that exists
    dataframes = []
    for file in csv_files:
        if os.path.exists(file):
            dataframes.append(pd.read_csv(file))
        else:
            print(f"File {file} does not exist. Skipping.")

    # Check if we have files to process
    if len(dataframes) == 0:
        print("No valid files found. Exiting.")
        return None

    # Stack all trials and group by dataset; sort=False keeps the datasets in
    # the order they first appear instead of sorting them alphabetically.
    stacked = pd.concat(dataframes, ignore_index=True)
    grouped = stacked.groupby("Dataset", sort=False)

    # Make incomplete coverage visible instead of silently averaging fewer trials
    rows_per_dataset = grouped.size()
    incomplete = rows_per_dataset[rows_per_dataset != len(dataframes)]
    if len(incomplete) > 0:
        print(f"Warning: datasets not present exactly once in every file: {list(incomplete.index)}")

    # Per-dataset mean of every model column; bring "Dataset" back as a column
    averaged_df = grouped.mean().reset_index()

    # Save the averaged result to a new CSV file
    averaged_df.to_csv(output_file, index=False)
    print(f"Averaged results saved to {output_file}.")
    return averaged_df


# The variant suffix is part of the name so the two families do not overwrite
# each other; with VARIANT == "" this is "averaged_comparison_results.csv".
output_file = f"averaged{VARIANT}_comparison_results.csv"
averaged_df = average_comparison_results(csv_files, output_file)


Averaged results saved to averaged_comparison_results.csv.
