<a href="https://colab.research.google.com/github/nikhil6553/studious-waffle/blob/main/CTGAN_Synthetic_Data_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ctgan
!pip install pandas


Collecting ctgan
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.14.0 (from ctgan)
  Downloading rdt-1.18.2-py3-none-any.whl.metadata (10 kB)
Collecting Faker!=37.11.0,>=17 (from rdt>=1.14.0->ctgan)
  Downloading faker-37.12.0-py3-none-any.whl.metadata (15 kB)
Downloading ctgan-0.11.0-py3-none-any.whl (24 kB)
Downloading rdt-1.18.2-py3-none-any.whl (74 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faker-37.12.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker, rdt, ctgan
Successfully installed Faker-37.12.0 ctgan-0.11.0 rdt-1.18.2


In [None]:
import pandas as pd
from ctgan import CTGAN
from ctgan import load_demo
import numpy as np

# --- 1. Load Data ---
# The demo table bundled with ctgan (Adult Census) stands in for real input.
# To train on your own table, swap the load_demo() call for something like:
#   real_data = pd.read_csv('your_data_file.csv')
print("Loading demo data...")
real_data = load_demo()

# Inspect the loaded data: dimensions, a five-row preview, then a divider.
# A single print call joined with newlines emits exactly the same text as
# printing each item on its own.
print(
    f"Shape of REAL data: {real_data.shape}",
    "\nFirst 5 rows of REAL data:",
    real_data.head(),
    "-" * 50,
    sep="\n",
)

# --- 2. Define Discrete Columns ---
# CTGAN needs to know which columns are categorical (discrete) vs. continuous.
# You MUST correctly identify these columns for your own dataset.
discrete_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'income'
]

# --- 3. Initialize and Fit the CTGAN Model ---
# 'epochs': How many training iterations. Higher epochs generally means better
#           synthetic data quality but longer training time. We use 10 for a quick demo.
# 'batch_size': The number of rows processed in each training step.
# 'verbose': Show training progress while fitting.
print("Initializing and fitting CTGAN model...")
ctgan = CTGAN(
    epochs=10,  # Use more epochs (e.g., 300) for high-quality data
    batch_size=500,
    verbose=True
)

# Fit the model to the real data.
# This is the training step where the Generator and Discriminator learn.
try:
    ctgan.fit(real_data, discrete_columns)
except Exception as e:
    print(f"An error occurred during fitting: {e}")
    print("If you encounter errors, ensure your data is clean (no NaNs) and column types are correct.")
    # Re-raise instead of calling exit(): inside a notebook, exit() shuts down
    # the kernel, which throws away every variable and hides the traceback.
    # Re-raising still stops this cell (and a "Run All") right here, but keeps
    # the kernel alive and shows the full error for debugging.
    raise
else:
    # Only reached when fit() finished without raising.
    print("CTGAN fitting complete.")


# --- 4. Generate Synthetic Data ---
# Draw NUM_SAMPLES rows from the fitted model.
NUM_SAMPLES = 1000
print(f"\nGenerating {NUM_SAMPLES} synthetic rows...")
synthetic_data = ctgan.sample(NUM_SAMPLES)

# --- 5. Output and Verification ---
# Report the generated table's dimensions next to the requested row count,
# then preview the first rows. The items are joined with newlines in one
# print call, which produces the same text as printing them one by one.
print(
    "-" * 50,
    f"Shape of SYNTHETIC data: {synthetic_data.shape}",
    f"Number of rows requested: {NUM_SAMPLES}",
    f"Number of columns: {synthetic_data.shape[1]}",
    "-" * 50,
    "\nFirst 5 rows of SYNTHETIC data:",
    synthetic_data.head(),
    sep="\n",
)

# Optional: uncomment to write the synthetic rows to a CSV file
# synthetic_data.to_csv('synthetic_census_data.csv', index=False)
# print("\nSynthetic data saved to 'synthetic_census_data.csv'")

# Optional: uncomment to persist the trained model for later reuse
# ctgan.save('my_ctgan_model.pkl')
# print("\nModel saved to 'my_ctgan_model.pkl'")

Loading demo data...
Shape of REAL data: (32561, 15)

First 5 rows of REAL data:
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0  

Gen. (-1.00) | Discrim. (0.04): 100%|██████████| 10/10 [01:27<00:00,  8.80s/it]

CTGAN fitting complete.

Generating 1000 synthetic rows...
--------------------------------------------------
Shape of SYNTHETIC data: (1000, 15)
Number of rows requested: 1000
Number of columns: 15
--------------------------------------------------

First 5 rows of SYNTHETIC data:
   age     workclass  fnlwgt     education  education-num marital-status  \
0   56             ?  218561          11th              5  Never-married   
1   54  Self-emp-inc   49927  Some-college              9      Separated   
2   38       Private   52978          11th             14        Widowed   
3   32       Private  198055       Masters              8  Never-married   
4   30       Private  253643     Bachelors             16        Widowed   

        occupation   relationship   race     sex  capital-gain  capital-loss  \
0     Adm-clerical           Wife  White  Female            82            -4   
1  Exec-managerial      Unmarried  White  Female         15931             1   
2     Craft-repair  


