# Assignment 4: Generative Models with GAN/cGAN

**Course:** Deep Learning  
**Dataset:** Adult (Census Income)  
**Task:** Implement GAN and Conditional GAN for synthetic tabular data generation

---

## 1. Imports and Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Using device: {device}")

# Set random seeds for reproducibility
def set_seed(seed):
    """Seed the NumPy and PyTorch random generators with ``seed``.

    When CUDA is available, every CUDA device is seeded as well.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

## 2. Data Loading and Preprocessing

### 2.1 Load ARFF File

In [None]:
# Read the Adult dataset; loadarff returns the records together with the ARFF metadata
data, meta = arff.loadarff('adult.arff')
df = pd.DataFrame(data)

# Object-typed columns hold byte strings after loading; decode them to regular strings
object_cols = [col for col in df.columns if df[col].dtype == object]
for col in object_cols:
    df[col] = df[col].str.decode('utf-8')

print(f"Dataset shape: {df.shape}")
print(f"\nColumn types:")
print(df.dtypes)
print(f"\nFirst few rows:")
df.head()

In [None]:
# Report, for every string column, how many entries are the '?' missing-value marker
print("Missing values ('?') per column:")
n_rows = len(df)
for col in df.columns:
    if df[col].dtype != object:
        continue  # only string columns can hold the '?' marker
    missing_count = (df[col] == '?').sum()
    if missing_count > 0:
        print(f"  {col}: {missing_count} ({missing_count/n_rows*100:.2f}%)")

### 2.2 Preprocessing

**Preprocessing decisions:**
1. **Missing values**: Impute using mode for categorical features (most frequent value)
2. **Categorical features**: One-hot encoding
3. **Continuous features**: Min-Max scaling to [0, 1] range

In [None]:
# Column roles: numeric features, string-valued features, and the prediction target
CONTINUOUS_COLS = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
CATEGORICAL_COLS = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
TARGET_COL = 'income'

print(f"Continuous features ({len(CONTINUOUS_COLS)}): {CONTINUOUS_COLS}")
print(f"Categorical features ({len(CATEGORICAL_COLS)}): {CATEGORICAL_COLS}")
print(f"Target: {TARGET_COL}")

In [None]:
class DataPreprocessor:
    """Preprocessor for the Adult dataset: imputation, scaling and one-hot encoding.

    Categorical '?' entries are replaced by the per-column mode learned in
    ``fit``; continuous columns go through a ``MinMaxScaler``; categorical
    columns are one-hot encoded using the sorted vocabulary seen in ``fit``;
    the target column is label-encoded.

    The encoded feature matrix is laid out as
    ``[continuous columns | one-hot group for each categorical column]``,
    in the order the column lists were given.
    """

    def __init__(self, continuous_cols, categorical_cols, target_col):
        self.continuous_cols = continuous_cols
        self.categorical_cols = categorical_cols
        self.target_col = target_col
        self.scaler = MinMaxScaler()
        self.label_encoder = LabelEncoder()
        self.category_mappings = {}  # column -> {category value: one-hot index}
        self.category_dims = {}  # column -> number of one-hot slots
        self.mode_values = {}  # column -> most frequent non-'?' value (used for imputation)

    def _impute(self, df):
        """Return a copy of ``df`` with '?' in categorical columns replaced by the fitted mode."""
        df = df.copy()
        for col in self.categorical_cols:
            df[col] = df[col].replace('?', self.mode_values[col])
        return df

    def _one_hot_encode(self, values, col):
        """One-hot encode ``values`` with the vocabulary fitted for ``col``.

        Values that were not seen during ``fit`` produce an all-zero row.
        Returns a float array of shape ``(len(values), category_dims[col])``.
        """
        # Series.map yields NaN for values missing from the mapping
        codes = pd.Series(values).map(self.category_mappings[col]).to_numpy(dtype=float)
        one_hot = np.zeros((len(codes), self.category_dims[col]))
        known = ~np.isnan(codes)
        one_hot[np.flatnonzero(known), codes[known].astype(int)] = 1
        return one_hot

    def fit(self, df):
        """Learn imputation modes, scaling ranges, target labels and category vocabularies.

        Args:
            df: DataFrame holding the continuous, categorical and target columns.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: If a categorical column contains no value other than '?'.
        """
        # Most frequent value per categorical column, ignoring the '?' marker
        for col in self.categorical_cols:
            modes = df.loc[df[col] != '?', col].mode()
            if modes.empty:
                raise ValueError(
                    f"Categorical column {col!r} has no non-missing values; "
                    "cannot determine a mode for imputation"
                )
            self.mode_values[col] = modes.iloc[0]

        df = self._impute(df)

        self.scaler.fit(df[self.continuous_cols])
        self.label_encoder.fit(df[self.target_col])

        # Sorted vocabulary -> stable one-hot index per category
        for col in self.categorical_cols:
            unique_values = sorted(df[col].unique())
            self.category_mappings[col] = {v: i for i, v in enumerate(unique_values)}
            self.category_dims[col] = len(unique_values)

        return self

    def transform(self, df):
        """Encode ``df`` with the fitted preprocessor.

        Args:
            df: DataFrame with the same columns used in ``fit`` (target included).

        Returns:
            (X, y): ``X`` is a float32 array of scaled continuous columns followed
            by the one-hot groups; ``y`` is the int64 label-encoded target.
        """
        df = self._impute(df)

        continuous_data = self.scaler.transform(df[self.continuous_cols])
        categorical_data = np.hstack(
            [self._one_hot_encode(df[col], col) for col in self.categorical_cols]
        )
        target = self.label_encoder.transform(df[self.target_col])

        X = np.hstack([continuous_data, categorical_data])
        return X.astype(np.float32), target.astype(np.int64)

    def inverse_transform(self, X):
        """Convert an encoded feature matrix back to original-format columns.

        Continuous columns are un-scaled; each one-hot (or probability) group is
        decoded to the category with the highest value.

        Args:
            X: 2-D array laid out like the ``X`` returned by ``transform``.

        Returns:
            DataFrame with the continuous columns followed by the categorical columns.
        """
        n_continuous = len(self.continuous_cols)
        continuous_df = pd.DataFrame(
            self.scaler.inverse_transform(X[:, :n_continuous]),
            columns=self.continuous_cols
        )

        decoded = {}
        idx = n_continuous
        for col in self.categorical_cols:
            dim = self.category_dims[col]
            mapping = self.category_mappings[col]
            # Mapping indices are 0..dim-1, so ordering the keys by index gives a
            # lookup table that argmax results can index directly.
            categories = np.array(sorted(mapping, key=mapping.get), dtype=object)
            decoded[col] = categories[np.argmax(X[:, idx:idx + dim], axis=1)]
            idx += dim

        categorical_df = pd.DataFrame(decoded, index=continuous_df.index)
        return pd.concat([continuous_df, categorical_df], axis=1)

    def get_output_dim(self):
        """Get total dimension of processed features."""
        return len(self.continuous_cols) + sum(self.category_dims.values())

    def get_continuous_dim(self):
        """Get dimension of continuous features."""
        return len(self.continuous_cols)

    def get_categorical_dims(self):
        """Get list of dimensions for each categorical feature."""
        return [self.category_dims[col] for col in self.categorical_cols]

In [None]:
# Build a preprocessor and fit it on the full frame to inspect the encoded feature layout
preprocessor = DataPreprocessor(CONTINUOUS_COLS, CATEGORICAL_COLS, TARGET_COL).fit(df)

print(f"Total feature dimension: {preprocessor.get_output_dim()}")
print(f"Continuous dimension: {preprocessor.get_continuous_dim()}")
print(f"Categorical dimensions: {preprocessor.get_categorical_dims()}")

### 2.3 Train-Test Split

In [None]:
def prepare_data(df, preprocessor, seed=42, test_size=0.2):
    """Split ``df`` into stratified train/test parts and encode both.

    The split is done on the raw frame first, so the preprocessor is fitted on
    the training rows only and nothing from the test rows leaks into it.

    Args:
        df: Raw DataFrame containing the feature columns and ``TARGET_COL``.
        preprocessor: Not used by this function; a new ``DataPreprocessor`` is
            created, fitted on the training split and returned.
        seed: ``random_state`` handed to ``train_test_split``.
        test_size: Fraction of rows placed in the test split.

    Returns:
        X_train, X_test, y_train, y_test and the preprocessor fitted on the
        training split.
    """
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=df[TARGET_COL],
    )

    # Imputation modes, scaling ranges and vocabularies come from training rows only
    preprocessor_local = DataPreprocessor(CONTINUOUS_COLS, CATEGORICAL_COLS, TARGET_COL)
    preprocessor_local.fit(train_df)

    (X_train, y_train), (X_test, y_test) = (
        preprocessor_local.transform(part) for part in (train_df, test_df)
    )

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")
    print(f"Label distribution (train): {np.bincount(y_train) / len(y_train)}")
    print(f"Label distribution (test): {np.bincount(y_test) / len(y_test)}")

    return X_train, X_test, y_train, y_test, preprocessor_local

# Try the split with seed 42; the train-fitted preprocessor replaces the earlier one
X_train, X_test, y_train, y_test, preprocessor = prepare_data(
    df,
    preprocessor,
    seed=42,
)

---

## 3. GAN Architecture

We implement a GAN with:
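- **Generator**: MLP with BatchNorm and LeakyReLU and a single output layer; the continuous slice of the output passes through a sigmoid (matching the [0, 1] Min-Max scaling) and each categorical slice passes through its own softmax (one per feature group)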
- **Discriminator**: MLP with LeakyReLU and Dropout

In [None]:
class Generator(nn.Module):
    """Unconditional generator for mixed-type tabular rows.

    An MLP maps a latent vector to ``continuous_dim + sum(categorical_dims)``
    raw outputs. The first ``continuous_dim`` columns are squashed with a
    sigmoid; every following group of ``categorical_dims[k]`` columns is
    normalised with its own softmax.
    """

    def __init__(self, latent_dim, continuous_dim, categorical_dims, hidden_dim=256):
        super().__init__()

        self.latent_dim = latent_dim
        self.continuous_dim = continuous_dim
        self.categorical_dims = categorical_dims
        self.output_dim = continuous_dim + sum(categorical_dims)

        wide = hidden_dim * 2
        layers = [
            # block 1: latent -> hidden
            nn.Linear(latent_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(0.2),
            # block 2: hidden -> 2 * hidden
            nn.Linear(hidden_dim, wide),
            nn.BatchNorm1d(wide),
            nn.LeakyReLU(0.2),
            # block 3: 2 * hidden -> 2 * hidden
            nn.Linear(wide, wide),
            nn.BatchNorm1d(wide),
            nn.LeakyReLU(0.2),
            # projection to the raw (pre-activation) feature vector
            nn.Linear(wide, self.output_dim),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, z):
        """Map latent vectors ``z`` to generated rows (continuous part, then categorical groups)."""
        raw = self.main(z)

        # Continuous columns live in [0, 1] after the sigmoid
        continuous_out = torch.sigmoid(raw[:, :self.continuous_dim])

        # Column boundaries of the categorical groups
        bounds = [self.continuous_dim]
        for width in self.categorical_dims:
            bounds.append(bounds[-1] + width)

        # One softmax per categorical group
        group_probs = [
            torch.softmax(raw[:, lo:hi], dim=1)
            for lo, hi in zip(bounds[:-1], bounds[1:])
        ]
        categorical_out = torch.cat(group_probs, dim=1)

        return torch.cat([continuous_out, categorical_out], dim=1)


class Discriminator(nn.Module):
    """MLP discriminator for tabular rows.

    Three LeakyReLU + Dropout(0.3) blocks of width ``2 * hidden_dim``,
    ``hidden_dim`` and ``hidden_dim // 2`` feed a single sigmoid output unit.
    """

    def __init__(self, input_dim, hidden_dim=256):
        super().__init__()

        wide, narrow = hidden_dim * 2, hidden_dim // 2
        layers = [
            nn.Linear(input_dim, wide),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(wide, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, narrow),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            # single score, squashed to (0, 1)
            nn.Linear(narrow, 1),
            nn.Sigmoid(),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid score of the network for each row of ``x``."""
        score = self.main(x)
        return score

---

## 4. Conditional GAN (cGAN) Architecture

The cGAN extends the GAN by conditioning on the target label:
- Generator receives noise + label
- Discriminator receives data + label

In [None]:
class ConditionalGenerator(nn.Module):
    """Label-conditioned generator for mixed-type tabular rows.

    The latent vector is concatenated with a one-hot class label before going
    through the MLP. As in the unconditional generator, the first
    ``continuous_dim`` outputs get a sigmoid and every categorical group gets
    its own softmax.
    """

    def __init__(self, latent_dim, num_classes, continuous_dim, categorical_dims, hidden_dim=256):
        super().__init__()

        self.latent_dim = latent_dim
        self.num_classes = num_classes
        self.continuous_dim = continuous_dim
        self.categorical_dims = categorical_dims
        self.output_dim = continuous_dim + sum(categorical_dims)

        # The network sees noise and the one-hot label side by side
        input_dim = latent_dim + num_classes
        wide = hidden_dim * 2

        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, wide),
            nn.BatchNorm1d(wide),
            nn.LeakyReLU(0.2),
            nn.Linear(wide, wide),
            nn.BatchNorm1d(wide),
            nn.LeakyReLU(0.2),
            nn.Linear(wide, self.output_dim),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, z, labels):
        """Generate rows from noise ``z`` conditioned on ``labels``.

        ``labels`` may be a 1-D tensor of class indices (converted to one-hot
        here) or an already-encoded 2-D tensor, which is used as given.
        """
        if labels.dim() != 1:
            condition = labels
        else:
            condition = torch.zeros(labels.size(0), self.num_classes, device=z.device)
            condition.scatter_(1, labels[:, None], 1)

        raw = self.main(torch.cat([z, condition], dim=1))

        continuous_out = torch.sigmoid(raw[:, :self.continuous_dim])

        # Column boundaries of the categorical groups
        bounds = [self.continuous_dim]
        for width in self.categorical_dims:
            bounds.append(bounds[-1] + width)

        group_probs = [
            torch.softmax(raw[:, lo:hi], dim=1)
            for lo, hi in zip(bounds[:-1], bounds[1:])
        ]
        categorical_out = torch.cat(group_probs, dim=1)

        return torch.cat([continuous_out, categorical_out], dim=1)


class ConditionalDiscriminator(nn.Module):
    """Label-conditioned MLP discriminator for tabular rows.

    Each data row is concatenated with a one-hot class label and scored by
    three LeakyReLU + Dropout(0.3) blocks followed by a sigmoid output unit.
    """

    def __init__(self, input_dim, num_classes, hidden_dim=256):
        super().__init__()

        self.num_classes = num_classes

        # Data row and one-hot label are fed in side by side
        total_input_dim = input_dim + num_classes
        wide, narrow = hidden_dim * 2, hidden_dim // 2

        layers = [
            nn.Linear(total_input_dim, wide),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(wide, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, narrow),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(narrow, 1),
            nn.Sigmoid(),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, x, labels):
        """Score rows ``x`` given ``labels``.

        ``labels`` may be a 1-D tensor of class indices (converted to one-hot
        here) or an already-encoded 2-D tensor, which is used as given.
        """
        if labels.dim() != 1:
            condition = labels
        else:
            condition = torch.zeros(labels.size(0), self.num_classes, device=x.device)
            condition.scatter_(1, labels[:, None], 1)

        return self.main(torch.cat([x, condition], dim=1))

---

## 5. Training Functions

In [None]:
def train_gan(X_train, preprocessor, latent_dim=128, hidden_dim=256,
              batch_size=64, epochs=200, lr_g=0.0002, lr_d=0.0002, seed=42,
              log_every=50):
    """
    Train a standard GAN on tabular data.

    Each iteration performs one discriminator update (real batch plus a
    detached fake batch) followed by one generator update on a freshly
    sampled fake batch, both using binary cross-entropy.

    Args:
        X_train: Encoded training rows (2-D array accepted by torch.FloatTensor).
        preprocessor: Fitted preprocessor; supplies the continuous dimension,
            the categorical group sizes and the total feature dimension.
        latent_dim: Size of the noise vector fed to the generator.
        hidden_dim: Base hidden width of both networks.
        batch_size: Mini-batch size; a trailing partial batch is dropped.
        epochs: Number of passes over the training data.
        lr_g: Adam learning rate for the generator.
        lr_d: Adam learning rate for the discriminator.
        seed: Seed passed to set_seed before the models are created.
        log_every: Print the mean losses every this many epochs; a falsy
            value (0 or None) disables the periodic print.

    Returns:
        generator: Trained generator model
        g_losses: List of generator losses per epoch
        d_losses: List of discriminator losses per epoch

    Raises:
        ValueError: If X_train has fewer rows than batch_size, in which case
            no batch would ever be produced.
    """
    set_seed(seed)

    # The loader below uses drop_last=True, so with fewer rows than one batch
    # every epoch would run zero iterations and the per-epoch mean would
    # divide by zero. Fail early with a clear message instead.
    if len(X_train) < batch_size:
        raise ValueError(
            f"X_train has {len(X_train)} rows but batch_size is {batch_size}; "
            "at least one full batch is required"
        )

    # Get dimensions
    continuous_dim = preprocessor.get_continuous_dim()
    categorical_dims = preprocessor.get_categorical_dims()
    data_dim = preprocessor.get_output_dim()

    # Create models
    generator = Generator(latent_dim, continuous_dim, categorical_dims, hidden_dim).to(device)
    discriminator = Discriminator(data_dim, hidden_dim).to(device)

    # Optimizers
    optimizer_g = optim.Adam(generator.parameters(), lr=lr_g, betas=(0.5, 0.999))
    optimizer_d = optim.Adam(discriminator.parameters(), lr=lr_d, betas=(0.5, 0.999))

    # Loss function (the discriminator already ends in a sigmoid)
    criterion = nn.BCELoss()

    # Create dataloader
    dataset = TensorDataset(torch.FloatTensor(X_train))
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    # Training history
    g_losses = []
    d_losses = []

    # Training loop
    for epoch in tqdm(range(epochs), desc="Training GAN"):
        epoch_g_loss = 0
        epoch_d_loss = 0
        n_batches = 0

        for (real_data,) in dataloader:
            batch_size_actual = real_data.size(0)
            real_data = real_data.to(device)

            # Targets for the BCE loss: 1 = real, 0 = fake
            real_labels = torch.ones(batch_size_actual, 1, device=device)
            fake_labels = torch.zeros(batch_size_actual, 1, device=device)

            # ---------------------
            # Train Discriminator
            # ---------------------
            optimizer_d.zero_grad()

            # Real data
            d_real = discriminator(real_data)
            d_loss_real = criterion(d_real, real_labels)

            # Fake data (detached so this step does not update the generator)
            z = torch.randn(batch_size_actual, latent_dim, device=device)
            fake_data = generator(z)
            d_fake = discriminator(fake_data.detach())
            d_loss_fake = criterion(d_fake, fake_labels)

            d_loss = d_loss_real + d_loss_fake
            d_loss.backward()
            optimizer_d.step()

            # ---------------------
            # Train Generator
            # ---------------------
            optimizer_g.zero_grad()

            # The generator is rewarded when its samples are scored as real
            z = torch.randn(batch_size_actual, latent_dim, device=device)
            fake_data = generator(z)
            d_fake = discriminator(fake_data)
            g_loss = criterion(d_fake, real_labels)

            g_loss.backward()
            optimizer_g.step()

            epoch_g_loss += g_loss.item()
            epoch_d_loss += d_loss.item()
            n_batches += 1

        g_losses.append(epoch_g_loss / n_batches)
        d_losses.append(epoch_d_loss / n_batches)

        if log_every and (epoch + 1) % log_every == 0:
            print(f"Epoch [{epoch+1}/{epochs}] D_loss: {d_losses[-1]:.4f} G_loss: {g_losses[-1]:.4f}")

    return generator, g_losses, d_losses

In [None]:
def train_cgan(X_train, y_train, preprocessor, latent_dim=128, hidden_dim=256,
               batch_size=64, epochs=200, lr_g=0.0002, lr_d=0.0002, seed=42,
               log_every=50):
    """
    Train a Conditional GAN on tabular data.

    Each iteration performs one discriminator update (real batch plus a
    detached fake batch, both paired with the batch's real labels) followed
    by one generator update on a freshly sampled fake batch conditioned on
    the same labels. Both steps use binary cross-entropy.

    Args:
        X_train: Encoded training rows (2-D array accepted by torch.FloatTensor).
        y_train: Integer class label per training row; the number of classes
            is the number of distinct values.
        preprocessor: Fitted preprocessor; supplies the continuous dimension,
            the categorical group sizes and the total feature dimension.
        latent_dim: Size of the noise vector fed to the generator.
        hidden_dim: Base hidden width of both networks.
        batch_size: Mini-batch size; a trailing partial batch is dropped.
        epochs: Number of passes over the training data.
        lr_g: Adam learning rate for the generator.
        lr_d: Adam learning rate for the discriminator.
        seed: Seed passed to set_seed before the models are created.
        log_every: Print the mean losses every this many epochs; a falsy
            value (0 or None) disables the periodic print.

    Returns:
        generator: Trained conditional generator model
        g_losses: List of generator losses per epoch
        d_losses: List of discriminator losses per epoch

    Raises:
        ValueError: If X_train has fewer rows than batch_size, in which case
            no batch would ever be produced.
    """
    set_seed(seed)

    # The loader below uses drop_last=True, so with fewer rows than one batch
    # every epoch would run zero iterations and the per-epoch mean would
    # divide by zero. Fail early with a clear message instead.
    if len(X_train) < batch_size:
        raise ValueError(
            f"X_train has {len(X_train)} rows but batch_size is {batch_size}; "
            "at least one full batch is required"
        )

    # Get dimensions
    continuous_dim = preprocessor.get_continuous_dim()
    categorical_dims = preprocessor.get_categorical_dims()
    data_dim = preprocessor.get_output_dim()
    num_classes = len(np.unique(y_train))

    # Create models
    generator = ConditionalGenerator(latent_dim, num_classes, continuous_dim,
                                     categorical_dims, hidden_dim).to(device)
    discriminator = ConditionalDiscriminator(data_dim, num_classes, hidden_dim).to(device)

    # Optimizers
    optimizer_g = optim.Adam(generator.parameters(), lr=lr_g, betas=(0.5, 0.999))
    optimizer_d = optim.Adam(discriminator.parameters(), lr=lr_d, betas=(0.5, 0.999))

    # Loss function (the discriminator already ends in a sigmoid)
    criterion = nn.BCELoss()

    # Create dataloader
    dataset = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train))
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    # Training history
    g_losses = []
    d_losses = []

    # Training loop
    for epoch in tqdm(range(epochs), desc="Training cGAN"):
        epoch_g_loss = 0
        epoch_d_loss = 0
        n_batches = 0

        for real_data, labels in dataloader:
            batch_size_actual = real_data.size(0)
            real_data = real_data.to(device)
            labels = labels.to(device)

            # Targets for the BCE loss: 1 = real, 0 = fake
            real_targets = torch.ones(batch_size_actual, 1, device=device)
            fake_targets = torch.zeros(batch_size_actual, 1, device=device)

            # ---------------------
            # Train Discriminator
            # ---------------------
            optimizer_d.zero_grad()

            # Real data with real labels
            d_real = discriminator(real_data, labels)
            d_loss_real = criterion(d_real, real_targets)

            # Fake data with the same labels (detached so this step does not
            # update the generator)
            z = torch.randn(batch_size_actual, latent_dim, device=device)
            fake_data = generator(z, labels)
            d_fake = discriminator(fake_data.detach(), labels)
            d_loss_fake = criterion(d_fake, fake_targets)

            d_loss = d_loss_real + d_loss_fake
            d_loss.backward()
            optimizer_d.step()

            # ---------------------
            # Train Generator
            # ---------------------
            optimizer_g.zero_grad()

            # The generator is rewarded when its samples are scored as real
            z = torch.randn(batch_size_actual, latent_dim, device=device)
            fake_data = generator(z, labels)
            d_fake = discriminator(fake_data, labels)
            g_loss = criterion(d_fake, real_targets)

            g_loss.backward()
            optimizer_g.step()

            epoch_g_loss += g_loss.item()
            epoch_d_loss += d_loss.item()
            n_batches += 1

        g_losses.append(epoch_g_loss / n_batches)
        d_losses.append(epoch_d_loss / n_batches)

        if log_every and (epoch + 1) % log_every == 0:
            print(f"Epoch [{epoch+1}/{epochs}] D_loss: {d_losses[-1]:.4f} G_loss: {g_losses[-1]:.4f}")

    return generator, g_losses, d_losses

---

## 6. Synthetic Data Generation

In [None]:
def generate_synthetic_data(generator, n_samples, latent_dim, device):
    """Sample ``n_samples`` rows from a trained (unconditional) generator.

    The generator is switched to eval mode, fed standard-normal noise of
    width ``latent_dim`` created on ``device``, and its output is returned as
    a NumPy array on the CPU.
    """
    generator.eval()
    with torch.no_grad():
        noise = torch.randn(n_samples, latent_dim, device=device)
        rows = generator(noise).cpu().numpy()
    return rows


def generate_conditional_synthetic_data(generator, n_samples, latent_dim, label_ratios, device):
    """Sample rows from a trained cGAN so that classes follow ``label_ratios``.

    Args:
        generator: Trained conditional generator, called as ``generator(z, labels)``
        n_samples: Total number of rows to generate
        latent_dim: Width of the noise vector
        label_ratios: Share of each class, indexed by class id (e.g., [0.75, 0.25])
        device: torch device on which noise and labels are created

    Returns:
        synthetic_data: Generated rows as a NumPy array, grouped by class id
        synthetic_labels: Class id of every generated row
    """
    generator.eval()

    # Truncate the per-class counts, then let the last class take up the
    # remainder so that the counts add up to n_samples
    counts = (np.array(label_ratios) * n_samples).astype(int)
    counts[-1] = n_samples - counts[:-1].sum()

    data_chunks, label_chunks = [], []

    with torch.no_grad():
        for class_id, count in enumerate(counts):
            if count <= 0:
                continue  # nothing to generate for this class
            noise = torch.randn(count, latent_dim, device=device)
            condition = torch.full((count,), class_id, dtype=torch.long, device=device)
            data_chunks.append(generator(noise, condition).cpu().numpy())
            label_chunks.append(np.full(count, class_id))

    return np.vstack(data_chunks), np.concatenate(label_chunks)

---

## 7. Evaluation Metrics

In [None]:
def compute_detection_metric(X_real, X_synthetic, n_folds=4, seed=42):
    """
    Detection metric: how well a Random Forest tells real rows from synthetic ones.

    Real rows are labelled 0 and synthetic rows 1; a Random Forest is trained
    and scored with stratified k-fold cross-validation. An AUC close to 0.5
    means the synthetic rows are hard to distinguish from the real ones.

    Args:
        X_real: Real data rows
        X_synthetic: Synthetic data rows (intended to match X_real in size;
            this is not checked here)
        n_folds: Number of cross-validation folds
        seed: Random seed for both the fold shuffling and the forests

    Returns:
        mean_auc: Mean AUC over the folds
        std_auc: Standard deviation of the AUC over the folds
    """
    # Stack both sources; label 0 = real, 1 = synthetic
    X_combined = np.vstack([X_real, X_synthetic])
    y_combined = np.concatenate([np.zeros(len(X_real)), np.ones(len(X_synthetic))])

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    fold_aucs = []
    for fit_idx, eval_idx in splitter.split(X_combined, y_combined):
        forest = RandomForestClassifier(n_estimators=100, random_state=seed, n_jobs=-1)
        forest.fit(X_combined[fit_idx], y_combined[fit_idx])

        # Probability of the "synthetic" class on the held-out fold
        synthetic_proba = forest.predict_proba(X_combined[eval_idx])[:, 1]
        fold_aucs.append(roc_auc_score(y_combined[eval_idx], synthetic_proba))

    return np.mean(fold_aucs), np.std(fold_aucs)


def compute_efficacy_metric(X_train_real, y_train_real, X_synthetic, y_synthetic, 
                           X_test, y_test, seed=42):
    """
    Efficacy metric: usefulness of synthetic data for training a classifier.

    Two identically configured Random Forests are trained, one on the real
    training data and one on the synthetic data, and both are scored by AUC
    on the same real test set. A ratio close to 1.0 means the synthetic data
    trains a classifier about as well as the real data.

    Args:
        X_train_real: Real training data
        y_train_real: Real training labels
        X_synthetic: Synthetic training data
        y_synthetic: Synthetic labels
        X_test: Real test data
        y_test: Real test labels
        seed: Random seed used for both forests

    Returns:
        efficacy_ratio: AUC_synthetic / AUC_real
        auc_real: Test AUC of the forest trained on real data
        auc_synthetic: Test AUC of the forest trained on synthetic data
    """
    def _test_auc(X_fit, y_fit):
        # Fit a forest on the given training set and score it on the real test set
        forest = RandomForestClassifier(n_estimators=100, random_state=seed, n_jobs=-1)
        forest.fit(X_fit, y_fit)
        positive_proba = forest.predict_proba(X_test)[:, 1]
        return roc_auc_score(y_test, positive_proba)

    auc_real = _test_auc(X_train_real, y_train_real)
    auc_synthetic = _test_auc(X_synthetic, y_synthetic)

    return auc_synthetic / auc_real, auc_real, auc_synthetic

---

## 8. Visualization Functions

In [None]:
def plot_training_losses(g_losses, d_losses, title="GAN Training Losses"):
    """Draw the per-epoch generator and discriminator loss curves on one figure and show it."""
    fig, ax = plt.subplots(figsize=(10, 5))
    for series, name in ((g_losses, 'Generator Loss'), (d_losses, 'Discriminator Loss')):
        ax.plot(series, label=name, alpha=0.8)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()


def plot_feature_distributions(X_real, X_synthetic, preprocessor, n_features=6):
    """Plot histograms comparing real vs synthetic continuous feature distributions.

    Args:
        X_real: Encoded real rows; the continuous features are the leading columns.
        X_synthetic: Encoded synthetic rows with the same column layout.
        preprocessor: Fitted preprocessor; supplies the number and names of
            the continuous features.
        n_features: Maximum number of continuous features to plot.
    """
    continuous_dim = preprocessor.get_continuous_dim()
    n_plots = max(0, min(n_features, continuous_dim))
    # Take titles from the preprocessor that produced the data rather than a
    # module-level list that may not match it
    feature_names = list(preprocessor.continuous_cols)

    # Size the grid to the number of plots (three per row) instead of a fixed
    # 2x3 layout, so more than six features do not run past the axes array.
    # Six features still give the 2x3 grid at figsize (15, 8).
    n_cols = 3
    n_rows = max(1, -(-n_plots // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for i in range(n_plots):
        ax = axes[i]
        ax.hist(X_real[:, i], bins=30, alpha=0.5, label='Real', density=True)
        ax.hist(X_synthetic[:, i], bins=30, alpha=0.5, label='Synthetic', density=True)
        ax.set_title(f'{feature_names[i]}')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Hide grid cells that received no histogram
    for ax in axes[n_plots:]:
        ax.set_visible(False)

    plt.suptitle('Feature Distributions: Real vs Synthetic', fontsize=14)
    plt.tight_layout()
    plt.show()


def plot_correlation_matrices(X_real, X_synthetic, preprocessor):
    """Show real, synthetic, and difference correlation heatmaps side by side.

    Only the first ``preprocessor.get_continuous_dim()`` columns of each array
    are used; axis ticks are labelled with ``CONTINUOUS_COLS``.

    Args:
        X_real: 2-D array of real samples.
        X_synthetic: 2-D array of synthetic samples with the same column order.
        preprocessor: Object exposing ``get_continuous_dim()``.
    """
    n_cont = preprocessor.get_continuous_dim()

    # np.corrcoef expects one variable per row, so the column slices are
    # transposed before being passed in.
    corr_real = np.corrcoef(X_real[:, :n_cont].T)
    corr_synth = np.corrcoef(X_synthetic[:, :n_cont].T)

    panels = (
        (corr_real, 'Real Data Correlation'),
        (corr_synth, 'Synthetic Data Correlation'),
        (corr_real - corr_synth, 'Difference (Real - Synthetic)'),
    )

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for panel_ax, (matrix, panel_title) in zip(axes, panels):
        sns.heatmap(
            matrix,
            ax=panel_ax,
            cmap='coolwarm',
            center=0,
            xticklabels=CONTINUOUS_COLS,
            yticklabels=CONTINUOUS_COLS,
            annot=True,
            fmt='.2f',
            square=True,
        )
        panel_ax.set_title(panel_title)

    plt.tight_layout()
    plt.show()

---

## 9. Main Experiment

Run experiments with 3 different random seeds and report average results.

In [None]:
# Hyperparameters shared by the GAN and cGAN experiments below
SEEDS = [42, 123, 456]   # one full train/evaluate run per seed
LATENT_DIM = 128
HIDDEN_DIM = 256
BATCH_SIZE = 64
EPOCHS = 200
LR_G = LR_D = 2e-4       # generator and discriminator use the same learning rate

# Echo the configuration so it is recorded alongside the results
print(
    "=" * 60,
    "EXPERIMENT CONFIGURATION",
    "=" * 60,
    f"Seeds: {SEEDS}",
    f"Latent dimension: {LATENT_DIM}",
    f"Hidden dimension: {HIDDEN_DIM}",
    f"Batch size: {BATCH_SIZE}",
    f"Epochs: {EPOCHS}",
    f"Learning rate (G): {LR_G}",
    f"Learning rate (D): {LR_D}",
    "=" * 60,
    sep="\n",
)

### 9.1 GAN Experiments

In [None]:
# Per-seed metric history for the unconditional GAN (one list entry per seed)
gan_results = {
    metric_name: []
    for metric_name in ('detection_auc', 'efficacy_ratio', 'auc_real', 'auc_synthetic')
}

for seed in SEEDS:
    print(f"\n{'='*60}", f"GAN Experiment with seed {seed}", f"{'='*60}", sep="\n")

    # Build this seed's split; the returned preprocessor replaces the current one
    X_train, X_test, y_train, y_test, preprocessor = prepare_data(df, preprocessor, seed=seed)

    # Fit the GAN on the training features (labels are not passed to train_gan)
    generator, g_losses, d_losses = train_gan(
        X_train,
        preprocessor,
        latent_dim=LATENT_DIM,
        hidden_dim=HIDDEN_DIM,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        lr_g=LR_G,
        lr_d=LR_D,
        seed=seed,
    )
    plot_training_losses(g_losses, d_losses, f"GAN Training Losses (Seed {seed})")

    # Request len(X_train) synthetic rows from the trained generator
    X_synthetic = generate_synthetic_data(generator, len(X_train), LATENT_DIM, device)

    # The unconditional generator produces no labels, so labels for the
    # efficacy metric are sampled from the class frequencies of y_train.
    label_ratios = np.bincount(y_train) / len(y_train)
    y_synthetic = np.random.choice(len(label_ratios), size=len(X_synthetic), p=label_ratios)

    # Visual comparison of real vs synthetic features
    plot_feature_distributions(X_train, X_synthetic, preprocessor)
    plot_correlation_matrices(X_train, X_synthetic, preprocessor)

    # Quantitative evaluation
    detection_auc, detection_std = compute_detection_metric(X_train, X_synthetic, seed=seed)
    efficacy, auc_real, auc_synth = compute_efficacy_metric(
        X_train, y_train, X_synthetic, y_synthetic, X_test, y_test, seed=seed
    )

    print(
        "\nMetrics:",
        f"  Detection AUC: {detection_auc:.4f} (+/- {detection_std:.4f})",
        f"  Efficacy Ratio: {efficacy:.4f}",
        f"  AUC (Real): {auc_real:.4f}",
        f"  AUC (Synthetic): {auc_synth:.4f}",
        sep="\n",
    )

    # Record this seed's scores for the cross-seed summary
    for _metric, _score in (
        ('detection_auc', detection_auc),
        ('efficacy_ratio', efficacy),
        ('auc_real', auc_real),
        ('auc_synthetic', auc_synth),
    ):
        gan_results[_metric].append(_score)

In [None]:
# Summary for GAN: mean (and spread) of each metric across the recorded runs.
# The run count is taken from the results themselves rather than hard-coded,
# so the header stays correct if SEEDS is changed.
print("\n" + "="*60)
print(f"GAN RESULTS SUMMARY (Average over {len(gan_results['detection_auc'])} seeds)")
print("="*60)
print(f"Detection AUC: {np.mean(gan_results['detection_auc']):.4f} (+/- {np.std(gan_results['detection_auc']):.4f})")
print(f"Efficacy Ratio: {np.mean(gan_results['efficacy_ratio']):.4f} (+/- {np.std(gan_results['efficacy_ratio']):.4f})")
print(f"AUC (Real): {np.mean(gan_results['auc_real']):.4f}")
print(f"AUC (Synthetic): {np.mean(gan_results['auc_synthetic']):.4f}")

### 9.2 Conditional GAN Experiments

In [None]:
# Per-seed metric history for the conditional GAN (one list entry per seed)
cgan_results = {
    metric_name: []
    for metric_name in ('detection_auc', 'efficacy_ratio', 'auc_real', 'auc_synthetic')
}

for seed in SEEDS:
    print(f"\n{'='*60}", f"cGAN Experiment with seed {seed}", f"{'='*60}", sep="\n")

    # Build this seed's split; the returned preprocessor replaces the current one
    X_train, X_test, y_train, y_test, preprocessor = prepare_data(df, preprocessor, seed=seed)

    # Fit the conditional GAN on training features together with their labels
    cond_generator, g_losses, d_losses = train_cgan(
        X_train,
        y_train,
        preprocessor,
        latent_dim=LATENT_DIM,
        hidden_dim=HIDDEN_DIM,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        lr_g=LR_G,
        lr_d=LR_D,
        seed=seed,
    )
    plot_training_losses(g_losses, d_losses, f"cGAN Training Losses (Seed {seed})")

    # Generate len(X_train) labelled rows, passing the training class frequencies
    label_ratios = np.bincount(y_train) / len(y_train)
    X_synthetic, y_synthetic = generate_conditional_synthetic_data(
        cond_generator, len(X_train), LATENT_DIM, label_ratios, device
    )

    print(
        f"Generated label distribution: {np.bincount(y_synthetic.astype(int)) / len(y_synthetic)}",
        f"Original label distribution: {label_ratios}",
        sep="\n",
    )

    # Visual comparison of real vs synthetic features
    plot_feature_distributions(X_train, X_synthetic, preprocessor)
    plot_correlation_matrices(X_train, X_synthetic, preprocessor)

    # Quantitative evaluation (synthetic labels are cast to int for the classifier)
    detection_auc, detection_std = compute_detection_metric(X_train, X_synthetic, seed=seed)
    efficacy, auc_real, auc_synth = compute_efficacy_metric(
        X_train, y_train, X_synthetic, y_synthetic.astype(int), X_test, y_test, seed=seed
    )

    print(
        "\nMetrics:",
        f"  Detection AUC: {detection_auc:.4f} (+/- {detection_std:.4f})",
        f"  Efficacy Ratio: {efficacy:.4f}",
        f"  AUC (Real): {auc_real:.4f}",
        f"  AUC (Synthetic): {auc_synth:.4f}",
        sep="\n",
    )

    # Record this seed's scores for the cross-seed summary
    for _metric, _score in (
        ('detection_auc', detection_auc),
        ('efficacy_ratio', efficacy),
        ('auc_real', auc_real),
        ('auc_synthetic', auc_synth),
    ):
        cgan_results[_metric].append(_score)

In [None]:
# Summary for cGAN: mean (and spread) of each metric across the recorded runs.
# The run count is taken from the results themselves rather than hard-coded,
# so the header stays correct if SEEDS is changed.
print("\n" + "="*60)
print(f"cGAN RESULTS SUMMARY (Average over {len(cgan_results['detection_auc'])} seeds)")
print("="*60)
print(f"Detection AUC: {np.mean(cgan_results['detection_auc']):.4f} (+/- {np.std(cgan_results['detection_auc']):.4f})")
print(f"Efficacy Ratio: {np.mean(cgan_results['efficacy_ratio']):.4f} (+/- {np.std(cgan_results['efficacy_ratio']):.4f})")
print(f"AUC (Real): {np.mean(cgan_results['auc_real']):.4f}")
print(f"AUC (Synthetic): {np.mean(cgan_results['auc_synthetic']):.4f}")

---

## 10. Final Comparison and Analysis

In [None]:
# Side-by-side table: one row per model, mean and std of each metric across seeds
comparison_df = pd.DataFrame({
    'Model': ['GAN', 'cGAN'],
    'Detection AUC (mean)': [np.mean(res['detection_auc']) for res in (gan_results, cgan_results)],
    'Detection AUC (std)': [np.std(res['detection_auc']) for res in (gan_results, cgan_results)],
    'Efficacy Ratio (mean)': [np.mean(res['efficacy_ratio']) for res in (gan_results, cgan_results)],
    'Efficacy Ratio (std)': [np.std(res['efficacy_ratio']) for res in (gan_results, cgan_results)],
})

# Print the table followed by a reminder of how to read each metric
print(
    "\n" + "="*60,
    "FINAL COMPARISON: GAN vs cGAN",
    "="*60,
    comparison_df.to_string(index=False),
    "\nNote: For Detection AUC, lower is better (closer to 0.5 means indistinguishable)",
    "      For Efficacy Ratio, higher is better (closer to 1.0 means useful substitute)",
    sep="\n",
)

In [None]:
# Bar-chart comparison of the two models: detection AUC (left), efficacy ratio (right)
models = ['GAN', 'cGAN']

# Mean and std across seeds for each metric, ordered to match `models`
detection_means = [np.mean(res['detection_auc']) for res in (gan_results, cgan_results)]
detection_stds = [np.std(res['detection_auc']) for res in (gan_results, cgan_results)]
efficacy_means = [np.mean(res['efficacy_ratio']) for res in (gan_results, cgan_results)]
efficacy_stds = [np.std(res['efficacy_ratio']) for res in (gan_results, cgan_results)]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: detection AUC, with the 0.5 reference line
axes[0].bar(models, detection_means, yerr=detection_stds, capsize=5, color=['steelblue', 'coral'])
axes[0].axhline(y=0.5, color='green', linestyle='--', label='Ideal (0.5)')
axes[0].set(
    ylabel='Detection AUC',
    title='Detection Metric (Lower is Better)',
    ylim=(0, 1),
)
axes[0].legend()

# Right panel: efficacy ratio, with the 1.0 reference line
axes[1].bar(models, efficacy_means, yerr=efficacy_stds, capsize=5, color=['steelblue', 'coral'])
axes[1].axhline(y=1.0, color='green', linestyle='--', label='Ideal (1.0)')
axes[1].set(
    ylabel='Efficacy Ratio',
    title='Efficacy Metric (Higher is Better)',
    ylim=(0, 1.2),
)
axes[1].legend()

plt.tight_layout()
# Save before show(): the file is written while the figure is still open
plt.savefig('comparison_results.png', dpi=150, bbox_inches='tight')
plt.show()

---

## 11. Conclusions

### Analysis of Results

**Detection Metric:**
- A detection AUC close to 0.5 indicates that a Random Forest classifier cannot distinguish between real and synthetic data
- Higher AUC values suggest that the synthetic data has detectable patterns that differ from real data

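**Efficacy Metric:**
- The efficacy ratio is `AUC(classifier trained on synthetic data) / AUC(classifier trained on real data)`, with both classifiers evaluated on the same held-out test set
- An efficacy ratio close to 1.0 indicates that a classifier trained on synthetic data performs similarly to one trained on real data
- This measures the utility of synthetic data as a substitute for real data
- Caveat for the unconditional GAN: its synthetic labels are sampled at random from the training label distribution, independently of the generated features, so its synthetic-data AUC is expected to be near chance (0.5). Its efficacy ratio is therefore a baseline rather than a measure of how useful the generated features are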

**Observations:**
1. [To be filled based on actual results]
2. [Compare GAN vs cGAN performance]
3. [Discuss any mode collapse or training instabilities observed]
4. [Analyze distribution visualizations]

**Limitations and Future Work:**
- Tabular data with mixed types is challenging for GANs
- Consider using more advanced architectures like CTGAN or TVAE
- Wasserstein loss with gradient penalty may improve training stability