In [1]:
!pip install sdv




In [12]:
import os
import time
import pandas as pd
import numpy as np
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.metrics import mean_absolute_error
from scipy.spatial.distance import cdist

def run_gaussian_experiment(run_id=1, default_distribution='norm'):
    # === File paths ===
    real_path = "Data/Real_Datasets/wine_processed_dedu.csv"
    fake_dir = "Data/Synthetic_Datasets/GaussianCopula"
    fake_path = os.path.join(fake_dir, f"fake_run_{run_id}.csv")
    log_path = "gaussian_experiment_log.csv"
    
    os.makedirs(fake_dir, exist_ok=True)

    # === Load data
    df = pd.read_csv(real_path)
    real_data = df.drop(columns=['type'], errors='ignore')  # optional: drop type

    # === Detect metadata
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)

    # === Instantiate Synthesizer
    synthesizer = GaussianCopulaSynthesizer(
        metadata,
        default_distribution=default_distribution
    )

    # === Train
    t0 = time.time()
    synthesizer.fit(df)
    training_time = time.time() - t0

    # === Sample
    t1 = time.time()
    fake_data = synthesizer.sample(num_rows=len(df))
    sampling_time = time.time() - t1

    # === Save synthetic data
    fake_data.to_csv(fake_path, index=False)

    # === Drop type column for numeric comparison
    fake_data_numeric = fake_data.drop(columns=['type'], errors='ignore')

    # === Correlation metrics
    pearson_mae = mean_absolute_error(
        real_data.corr().values.flatten(),
        fake_data_numeric.corr().values.flatten()
    )
    spearman_mae = mean_absolute_error(
        real_data.corr(method='spearman').values.flatten(),
        fake_data_numeric.corr(method='spearman').values.flatten()
    )

    # === Distance metrics
    dist_rf = cdist(real_data.values, fake_data_numeric.values)
    dist_rr = cdist(real_data.values, real_data.values)
    np.fill_diagonal(dist_rr, np.inf)
    min_rf = np.min(dist_rf)
    min_rr = np.min(dist_rr)

    # === Log results
    row = {
        'run_id': run_id,
        'distribution': default_distribution,
        'pearson_mae': round(pearson_mae, 4),
        'spearman_mae': round(spearman_mae, 4),
        'min_real_fake_dist': round(min_rf, 4),
        'min_real_real_dist': round(min_rr, 4),
        'training_time': round(training_time, 2),
        'total_runtime': round(sampling_time, 2)
    }

    if not os.path.exists(log_path):
        pd.DataFrame([row]).to_csv(log_path, index=False)
    else:
        pd.DataFrame([row]).to_csv(log_path, mode='a', index=False, header=False)

    print("Finished GaussianCopula run", run_id)
    print(row)

    return fake_data, row


In [22]:
fake_data, log = run_gaussian_experiment(run_id=7, default_distribution='norm')



The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



Finished GaussianCopula run 7
{'run_id': 7, 'distribution': 'norm', 'pearson_mae': 0.0131, 'spearman_mae': 0.0386, 'min_real_fake_dist': np.float64(0.6492), 'min_real_real_dist': np.float64(0.0078), 'training_time': 0.09, 'total_runtime': 0.02}


In [9]:
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import Metadata
import pandas as pd

In [29]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# 1) Loading data
real_df  = pd.read_csv("Data/Real_Datasets/wine_processed_dedu.csv")   
synth_df = pd.read_csv('Data/Synthetic_Datasets/GaussianCopula/fake_run_5.csv')    


# 2) Dropping 'type' and keeping numeric columns only 
for df in (real_df, synth_df):
    if "type" in df.columns:
        df.drop(columns=["type"], inplace=True)

# Align columns/order to the real data
numeric_cols = real_df.select_dtypes(include=[np.number]).columns.tolist()
synth_df = synth_df[numeric_cols]

Xr = real_df[numeric_cols].to_numpy(dtype=float)
Xf = synth_df[numeric_cols].to_numpy(dtype=float)

if Xr.shape[0] < 3 or Xr.shape[1] == 0:
    raise ValueError("Not enough rows/columns to compute distances.")

# 3) Real→Real baseline: nearest OTHER real (k=2, take index 1) 
nn_rr = NearestNeighbors(n_neighbors=2, metric="euclidean").fit(Xr)
dist_rr, _ = nn_rr.kneighbors(Xr, return_distance=True)
rr_other = dist_rr[:, 1]

RR_median = float(np.median(rr_other))
RR_p05    = float(np.percentile(rr_other, 5))
RR_p01    = float(np.percentile(rr_other, 1))

# 4) Fake→Real distances (k=2 for NNDR too) 
nn_fr = NearestNeighbors(n_neighbors=2, metric="euclidean").fit(Xr)
dist_fr, _ = nn_fr.kneighbors(Xf, return_distance=True)
fr1, fr2 = dist_fr[:, 0], dist_fr[:, 1]

FR_median = float(np.median(fr1))
FR_p05    = float(np.percentile(fr1, 5))
pct_FR_below_RR1pct = float(np.mean(fr1 < RR_p01) * 100.0)

# Adaptive near-duplicate threshold = 1% of typical RR spacing
eps = 0.01 * RR_median if RR_median > 0 else 0.0
near_dup_rate = float(np.mean(fr1 < eps) * 100.0)


# 5) Print results 
print({
    "RR_median": RR_median,
    "RR_p05": RR_p05,
    "RR_p01": RR_p01,
    "FR_median": FR_median,
    "FR_p05": FR_p05,
    "pct_FR_below_RR1pct": pct_FR_below_RR1pct,  # <= ~2% ok; >5% red flag
    "near_dup_rate": near_dup_rate,              # should be ~0%
    "eps_used": eps, 
})


{'RR_median': 0.9774631429422356, 'RR_p05': 0.23142579653857615, 'RR_p01': 0.07714061450996036, 'FR_median': 1.6664879305859701, 'FR_p05': 1.1013692670950037, 'pct_FR_below_RR1pct': 0.0, 'near_dup_rate': 0.0, 'eps_used': 0.009774631429422356}


In [10]:
df = pd.read_csv("Data/Real_Datasets/wine_processed.csv")

In [16]:
# Define your metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)

# (Optional) Save for reproducibility
metadata.save_to_json("my_metadata.json")

In [17]:
synthesizer = GaussianCopulaSynthesizer(metadata)


The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.



In [19]:
# 4. Fit model
synthesizer.fit(df)

In [20]:
# Generate and save 5 synthetic datasets
for i in range(1, 6):
    synthetic_data = synthesizer.sample(num_rows=len(df))
    path = f"Data/Synthetic_Datasets/GaussianCopula/fake_gaussian_data_{i}.csv"
    synthetic_data.to_csv(path, index=False)
    print(f"✅ Saved: {path}")

✅ Saved: Data/Synthetic_Datasets/GaussianCopula/fake_gaussian_data_1.csv
✅ Saved: Data/Synthetic_Datasets/GaussianCopula/fake_gaussian_data_2.csv
✅ Saved: Data/Synthetic_Datasets/GaussianCopula/fake_gaussian_data_3.csv
✅ Saved: Data/Synthetic_Datasets/GaussianCopula/fake_gaussian_data_4.csv
✅ Saved: Data/Synthetic_Datasets/GaussianCopula/fake_gaussian_data_5.csv


In [None]:
# 5. Generate synthetic data
synthetic_data = synthesizer.sample(num_rows=len(df))

In [None]:
# 6. Save synthetic dataset
synthetic_data.to_csv("Data/Synthetic_Datasets/GaussianCopula/fake_gaussian_data.csv", index=False)