# **Synthetic Data Generation Using CTGAN**

In [None]:
!pip install ctgan


In [None]:
import pandas as pd
from ctgan import CTGAN

# Train a CTGAN model on the real training table and write a synthetic table
# to CSV. Reads `/content/real_train.csv`, fits on every column, samples a
# fixed number of rows, and saves the result next to the notebook.

# Load the real training data.
df = pd.read_csv('/content/real_train.csv')

# Columns that CTGAN should model as continuous values.
continuous_columns = [
    'careplan_length', 'Diastolic Blood Pressure', 'Systolic Blood Pressure', 'Body Mass Index',
    'Total Cholesterol', 'High Density Lipoprotein Cholesterol', 'Triglycerides',
    'Low Density Lipoprotein Cholesterol', 'Glucose', 'Hemoglobin A1c/Hemoglobin.total in Blood',
    'Sodium', 'Chloride', 'Potassium', 'Carbon Dioxide', 'Calcium', 'Urea Nitrogen',
    'Estimated Glomerular Filtration Rate'
]

# Every column not listed above is passed to CTGAN as discrete.
# NOTE(review): this assumes all remaining columns are binary; a misspelled
# name in `continuous_columns` would silently put the real column in this list.
all_cols = df.columns.tolist()
binary_columns = [col for col in all_cols if col not in continuous_columns]

# Configure the CTGAN model.
ctgan = CTGAN(
    epochs=500,                   # number of training epochs
    generator_lr=0.0001,          # learning rate for the generator
    discriminator_lr=0.0001,      # learning rate for the discriminator
    batch_size=700,               # rows per training batch
    generator_dim=(256, 256),     # generator layer sizes
    discriminator_dim=(256, 256)  # discriminator layer sizes
)

ctgan.fit(df, discrete_columns=binary_columns)

# Generate synthetic rows.
# NOTE(review): 60592 is hard-coded; presumably it matches the size of the
# real training set -- confirm.
synthetic_data = ctgan.sample(60592)

# Save the synthetic dataset. The path is kept in one variable so the
# confirmation message below always names the file that was actually written
# (the message previously named a different file from the one saved).
output_path = 'synthetic_CTGAN_real_train1.csv'
synthetic_data.to_csv(output_path, index=False)

print(f"Synthetic dataset generated and saved to {output_path}.")
