In [1]:
# https://docs.sdv.dev/sdv/single-table-data/modeling/synthesizers

import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer, GaussianCopulaSynthesizer, CTGANSynthesizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [17]:
# Load the ACS extract from disk; the "id" column is used as the row index.
df = pd.read_csv(
    "data/ACS_50k_RACE_3.csv",
    index_col="id",
)

# Split first, then fit the synthesizers on the training portion only, so that no test rows leak into the synthetic data
def split_data(df, target='PINCP', threshold=50_000, test_size=0.3, random_state=42):
    """Split ``df`` into train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns together with the ``target`` column.
    target : str, default 'PINCP'
        Name of the column that is separated out as the label.
    threshold : number, default 50_000
        Test labels become 1 where the target is strictly greater than this
        value and 0 otherwise.
    test_size : float, default 0.3
        Passed through to ``train_test_split``.
    random_state : int, default 42
        Passed through to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. Only ``y_test`` is binarized;
        ``y_train`` keeps the original target values.
    """
    features = df.drop(columns=[target])
    labels = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # NOTE(review): y_train is intentionally left untouched here -- presumably
    # because the synthesizers are fitted on the raw values; confirm.
    y_test = (y_test > threshold).astype(int)

    return X_train, X_test, y_train, y_test

# Re-assemble one training frame (features plus the PINCP column from y_train)
# and rebind `df` to it; the test split stays in X_test / y_test.
X_train, X_test, y_train, y_test = split_data(df)
df = X_train.assign(PINCP=y_train)



In [18]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0_level_0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,ST,PINCP
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
966926,58.0,3.0,19.0,1.0,9150.0,35.0,1.0,40.0,2.0,3.0,35.0,30000.0
39565,40.0,5.0,19.0,1.0,5000.0,4.0,0.0,40.0,2.0,3.0,4.0,60000.0
1594634,56.0,5.0,17.0,1.0,1970.0,53.0,1.0,40.0,1.0,3.0,53.0,40300.0
208305,58.0,1.0,16.0,4.0,310.0,303.0,0.0,60.0,2.0,3.0,6.0,30000.0
905567,24.0,1.0,16.0,5.0,5400.0,32.0,0.0,40.0,2.0,3.0,32.0,39000.0


# Using CTGAN

In [19]:
# Build the SDV metadata object by auto-detecting it from the training frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)

# Fit CTGAN on the training frame, then draw 364 synthetic rows.
synthesizer = CTGANSynthesizer(metadata=metadata)
synthesizer.fit(data=df)
synthetic_data = synthesizer.sample(num_rows=364)

# NOTE(review): unlike the Gaussian Copula / ROS exports, this one has no
# index=False, so the row index would be written as an extra column -- confirm.
#synthetic_data.to_csv("data/CTGAN_data.csv")




# Using Tabular Variational Autoencoder (TVAE)

In [11]:
# Fit TVAE on the training frame, then draw 364 synthetic rows.
# Relies on the `metadata` object created in the CTGAN cell.
synthesizer_vae = TVAESynthesizer(metadata=metadata)
synthesizer_vae.fit(data=df)
synthetic_data_vae = synthesizer_vae.sample(num_rows=364)

# NOTE(review): unlike the Gaussian Copula / ROS exports, this one has no
# index=False, so the row index would be written as an extra column -- confirm.
#synthetic_data_vae.to_csv("data/TVAE_data.csv")

# Using Gaussian Copula

In [12]:
# Fit a Gaussian Copula model on the training frame, then draw 364 synthetic rows.
# Relies on the `metadata` object created in the CTGAN cell.
synthesizer_gc = GaussianCopulaSynthesizer(metadata=metadata)
synthesizer_gc.fit(data=df)
synthetic_data_gc = synthesizer_gc.sample(num_rows=364)

#synthetic_data_gc.to_csv("data/GC_data.csv", index=False)

# Using Random OverSampling (ROS)

In [24]:
# Draw 364 rows from the training frame with replacement (seeded, so the draw is
# repeatable) and preview the first five.
additional_samples = df.sample(364, replace=True, random_state=42)
additional_samples.head(n=5)

#additional_samples.to_csv("data/ROS_data.csv", index=False)