In [14]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata
import logging
from pathlib import Path
import pandas as pd
import time

# Project layout: this notebook is expected to be run from a sub-folder of the
# project (the working directory), so the project root is the working
# directory's parent.
# The previous `Path(__name__)` turned the module name ("__main__") into a
# relative filename and only reached the same place by accident; anchoring on
# the working directory says what is actually meant.
PROJECT_ROOT = Path.cwd().resolve().parent
INPUT_FOLDER = PROJECT_ROOT / "data/input"
OUTPUT_FOLDER = PROJECT_ROOT / "data/output"
# Create the output tree up front so later cells can write into it.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [None]:
# UCI Adult data
# Read data and check
data_path = INPUT_FOLDER / "UCI_adult/adult.data"
# adult.data has no header row, so column names are supplied here and must
# follow the field order of the file. The income label is the LAST field
# (raw rows end in "<=50K"), so "Income_Category" goes last; listing it first
# shifts every label by one column (e.g. "age" would hold the workclass values).
colnames = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "Income_Category",
]
df = pd.read_csv(data_path, names=colnames)
# NOTE(review): the raw file is presumably ", "-delimited, which would leave a
# leading space on every string value; consider skipinitialspace=True - confirm
# that nothing downstream relies on the padded values first.
print(df.head())
print(df.shape)

# set up metadata for GAN (column types are auto-detected from the dataframe)
df_meta = Metadata.detect_from_dataframe(df)
gen = CTGANSynthesizer(
    metadata=df_meta,
    epochs=500,
    verbose=True,
)


   39          State-gov   77516   Bachelors  ...   0  40   United-States   <=50K
0  50   Self-emp-not-inc   83311   Bachelors  ...   0  13   United-States   <=50K
1  38            Private  215646     HS-grad  ...   0  40   United-States   <=50K
2  53            Private  234721        11th  ...   0  40   United-States   <=50K
3  28            Private  338409   Bachelors  ...   0  40            Cuba   <=50K
4  37            Private  284582     Masters  ...   0  40   United-States   <=50K

[5 rows x 15 columns]
(32560, 15)


In [None]:
# Train the GAN - keep track of the time to execute.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments during a long training run.
tstart = time.perf_counter()
gen.fit(df)
tend = time.perf_counter()
duration = tend - tstart  # elapsed training time in seconds

In [None]:
# Save the fitted synthesizer, its metadata and the real training data, then
# print the time it took to train the GAN.
saveto = OUTPUT_FOLDER / "UCI_adult"
saveto.mkdir(parents=True, exist_ok=True)
gen.save(saveto / "ctgan.pkl")
# save_to_json refuses to overwrite an existing file, which would make a second
# run of this cell fail after the model pickle has already been replaced.
# Remove any stale copy first so all three artifacts are refreshed together.
# The "metdata" spelling is kept so existing consumers of this file still find it.
meta_path = saveto / "ctgan_metdata.json"
meta_path.unlink(missing_ok=True)
df_meta.save_to_json(meta_path)
df.to_pickle(saveto / "real_df.pkl")
# duration is in seconds; report it in minutes
print(f"Time to fit: {(duration / 60):.2f} min.")

In [None]:
# UCI wine quality data
# Read the red and white wine files (both semicolon-delimited) and stack them
data_path = INPUT_FOLDER / "UCI_winequality/winequality-red.csv"
df1 = pd.read_csv(data_path, delimiter=";")
data_path = INPUT_FOLDER / "UCI_winequality/winequality-white.csv"
df2 = pd.read_csv(data_path, delimiter=";")
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two files overlap, leaving duplicate index values in
# both the training frame and the pickled copy of the real data.
df = pd.concat([df1, df2], ignore_index=True)
print(df.head())
print(df.shape)

# set up metadata for GAN (column types are auto-detected from the dataframe)
df_meta = Metadata.detect_from_dataframe(df)
gen = CTGANSynthesizer(
    metadata=df_meta,
    epochs=500,
    verbose=True,
)


   fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
0            7.4              0.70         0.00  ...       0.56      9.4        5
1            7.8              0.88         0.00  ...       0.68      9.8        5
2            7.8              0.76         0.04  ...       0.65      9.8        5
3           11.2              0.28         0.56  ...       0.58      9.8        6
4            7.4              0.70         0.00  ...       0.56      9.4        5

[5 rows x 12 columns]
(6497, 12)


In [None]:
# Train the GAN - keep track of the time to execute.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments during a long training run.
tstart = time.perf_counter()
gen.fit(df)
tend = time.perf_counter()
duration = tend - tstart  # elapsed training time in seconds

In [None]:
# Save the fitted synthesizer, its metadata and the real training data, then
# print the time it took to train the GAN.
saveto = OUTPUT_FOLDER / "UCI_winequality"
saveto.mkdir(parents=True, exist_ok=True)
gen.save(saveto / "ctgan.pkl")
# save_to_json refuses to overwrite an existing file, which would make a second
# run of this cell fail after the model pickle has already been replaced.
# Remove any stale copy first so all three artifacts are refreshed together.
# The "metdata" spelling is kept so existing consumers of this file still find it.
meta_path = saveto / "ctgan_metdata.json"
meta_path.unlink(missing_ok=True)
df_meta.save_to_json(meta_path)
df.to_pickle(saveto / "real_df.pkl")
# duration is in seconds; report it in minutes
print(f"Time to fit: {(duration / 60):.2f} min.")

Traceback (most recent call last):
  File "/home/nick16180/.vscode/extensions/ms-python.python-2025.0.0-linux-x64/python_files/python_server.py", line 133, in exec_user_input
    retval = callable_(user_input, user_globals)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 5, in <module>
  File "/home/nick16180/workspace/cas rpm 2025 synthetic data/.venv/lib/python3.12/site-packages/sdv/single_table/ctgan.py", line 73, in get_loss_values_plot
    raise NotFittedError(err_msg)
sdv.errors.NotFittedError: Loss values are not available yet. Please fit your synthesizer first.



In [16]:
# Kaggle credit card fraud data
# Load the CSV and show a quick preview plus its dimensions
data_path = INPUT_FOLDER / "Kaggle_creditcardfraud/creditcard.csv"
df = pd.read_csv(data_path)
print(df.head(), df.shape, sep="\n")

# Auto-detect the table metadata and build the CTGAN synthesizer from it
df_meta = Metadata.detect_from_dataframe(df)
gen = CTGANSynthesizer(metadata=df_meta, epochs=500, verbose=True)

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [None]:
# Train the GAN - keep track of the time to execute.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments during a long training run.
tstart = time.perf_counter()
gen.fit(df)
tend = time.perf_counter()
duration = tend - tstart  # elapsed training time in seconds

In [None]:
# Save the fitted synthesizer, its metadata and the real training data, then
# print the time it took to train the GAN.
saveto = OUTPUT_FOLDER / "Kaggle_creditcardfraud"
saveto.mkdir(parents=True, exist_ok=True)
gen.save(saveto / "ctgan.pkl")
# save_to_json refuses to overwrite an existing file, which would make a second
# run of this cell fail after the model pickle has already been replaced.
# Remove any stale copy first so all three artifacts are refreshed together.
# The "metdata" spelling is kept so existing consumers of this file still find it.
meta_path = saveto / "ctgan_metdata.json"
meta_path.unlink(missing_ok=True)
df_meta.save_to_json(meta_path)
df.to_pickle(saveto / "real_df.pkl")
# duration is in seconds; report it in minutes
print(f"Time to fit: {(duration / 60):.2f} min.")