In [1]:
# https://docs.sdv.dev/sdv/single-table-data/modeling/synthesizers

import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer, GaussianCopulaSynthesizer, CTGANSynthesizer
from sklearn.model_selection import train_test_split

In [2]:
# Load the ACS extract; the file's "id" column becomes the row index.
acs_csv_path = "data/ACS_50k_RACE_3.csv"
df = pd.read_csv(acs_csv_path, index_col="id")

# Create synthetic data using only training data to prevent data leakage
def split_data(df, target='PINCP', threshold=50_000, test_size=0.3, random_state=42):
    """Split ``df`` into train/test features and targets.

    The ``target`` column is separated from the remaining columns and both
    are passed to ``train_test_split``.

    Only the *test* target is binarised (1 where the value is strictly
    greater than ``threshold``, else 0); the training target is returned
    with its original values.
    NOTE(review): presumably the raw training target is kept so that the
    synthesizers model the original income values -- confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str, default 'PINCP'
        Name of the column to predict.
    threshold : number, default 50_000
        Cut-off used to binarise the test target (strict ``>``).
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where ``y_train`` holds the
        original target values and ``y_test`` holds 0/1 integers.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"target column {target!r} not found in DataFrame")

    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Binarise only the held-out target; y_train is returned unchanged.
    y_test = (y_test > threshold).astype(int)

    return X_train, X_test, y_train, y_test

# Split once, then rebuild `df` from the training portion only: the training
# features plus the training PINCP values appended as the last column.
X_train, X_test, y_train, y_test = split_data(df)
df = X_train.assign(PINCP=y_train)



# Using CTGAN

In [19]:
# Describe the table: start from empty metadata and let SDV detect it from df.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)

# Fit a CTGAN synthesizer on df, then draw 364 synthetic rows from it.
# NOTE(review): no random seed is set in this cell, so results may differ
# between runs -- confirm whether reproducibility is required.
synthesizer = CTGANSynthesizer(metadata)
synthesizer.fit(df)
synthetic_data = synthesizer.sample(num_rows=364)

# Optional export (disabled):
#synthetic_data.to_csv("data/CTGAN_data.csv")




# Using Random OverSampling (ROS)

In [24]:
# Resample 364 rows of df with replacement; the fixed random_state makes the
# draw repeatable.
additional_samples = df.sample(
    n=364,
    replace=True,
    random_state=42,
)
additional_samples.head()

# Optional export (disabled):
#additional_samples.to_csv("data/ROS_data.csv", index=False)

# Using a tabular variational autoencoder (TVAE)

In [11]:
# Fit a TVAE synthesizer on df (using the existing `metadata`), then draw
# 364 synthetic rows from it.
synthesizer_vae = TVAESynthesizer(metadata)
synthesizer_vae.fit(df)
synthetic_data_vae = synthesizer_vae.sample(num_rows=364)

# Optional export (disabled):
#synthetic_data_vae.to_csv("data/TVAE_data.csv")

# Using Gaussian Copula

In [12]:
# Fit a Gaussian Copula synthesizer on df (using the existing `metadata`),
# then draw 364 synthetic rows from it.
synthesizer_gc = GaussianCopulaSynthesizer(metadata)
synthesizer_gc.fit(df)
synthetic_data_gc = synthesizer_gc.sample(num_rows=364)

# Optional export (disabled):
#synthetic_data_gc.to_csv("data/GC_data.csv", index=False)