In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

# Configuration
# We can keep most hyperparameters the same for a fair comparison
SEED = 42    # Fixed RNG seed so weight init, dropout and batch shuffling repeat across runs
LATENT_DIM = 128
BATCH_SIZE = 256
EPOCHS = 50
LEARNING_RATE = 1e-4
W_ADV = 1    # Weight for Adversarial Loss
W_CON = 50   # Weight for Contextual (Reconstruction) Loss
W_ENC = 1    # Weight for Encoder Loss

# --- Reproducibility ---
# Seed before any model or DataLoader is built so a "Restart & Run All"
# reproduces the same initial weights and shuffle order.
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Device Configuration ---
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Load the Datasets for CBIGAN ---
# Feature tables are kept as DataFrames; from each label file only the
# 'label' column is retained.
X_train = pd.read_parquet('processed_data/cbigan_X_train.parquet')
X_test = pd.read_parquet('processed_data/cbigan_X_test.parquet')
y_train = pd.read_parquet('processed_data/cbigan_y_train.parquet')['label']
y_test = pd.read_parquet('processed_data/cbigan_y_test.parquet')['label']

# --- One-Hot Encode the Labels ---
# The condition 'y' needs to be numeric for the model. The encoder learns its
# categories from the training labels only; both splits are then transformed
# with that same fitted encoder.
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoder.fit(y_train.values.reshape(-1, 1))
y_train_one_hot, y_test_one_hot = (
    one_hot_encoder.transform(split_labels.values.reshape(-1, 1))
    for split_labels in (y_train, y_test)
)

# Width of the one-hot matrix, i.e. the size of the conditioning vector.
NUM_CLASSES = y_train_one_hot.shape[-1]
print(f"Number of classes (for conditioning): {NUM_CLASSES}")

# Number of feature columns in the training table.
INPUT_DIM = X_train.shape[-1]

# --- PyTorch Dataset and DataLoader for CBIGAN ---
class ConditionalDataset(Dataset):
    """Map-style dataset that pairs each feature row with its one-hot label row.

    Both inputs are copied into float32 tensors once, at construction time.

    Args:
        features: Numeric feature table with one row per sample. A pandas
            DataFrame, or anything ``np.asarray`` can turn into a numeric array.
        labels_one_hot: Array-like of one-hot label rows, aligned row-for-row
            with ``features``.

    Raises:
        ValueError: If ``features`` and ``labels_one_hot`` do not have the same
            number of rows.
    """

    def __init__(self, features, labels_one_hot):
        # np.asarray yields the DataFrame's underlying values and also lets
        # plain ndarrays / nested lists through, which `.values` would reject.
        self.features = torch.tensor(np.asarray(features), dtype=torch.float32)
        self.labels = torch.tensor(labels_one_hot, dtype=torch.float32)

        # Fail fast: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently drop the surplus labels.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features has {len(self.features)} rows but labels_one_hot has "
                f"{len(self.labels)}; they must match."
            )

    def __len__(self):
        """Return the number of samples (rows of the feature tensor)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, one_hot_label)`` tensor pair stored at ``idx``."""
        return self.features[idx], self.labels[idx]

# Build each split end to end: wrap the tensors, then batch them.
# Only the training stream is reshuffled; the test stream keeps its row order.
train_dataset_cbigan = ConditionalDataset(X_train, y_train_one_hot)
train_loader_cbigan = DataLoader(
    train_dataset_cbigan,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

test_dataset_cbigan = ConditionalDataset(X_test, y_test_one_hot)
test_loader_cbigan = DataLoader(
    test_dataset_cbigan,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# Report (rows, columns) of both feature tables.
print(f"CBIGAN training data shape: {X_train.shape}")
print(f"CBIGAN testing data shape: {X_test.shape}")

Using device: cuda
Number of classes (for conditioning): 2
CBIGAN training data shape: (96822, 190)
CBIGAN testing data shape: (49971, 190)


In [6]:
class Generator(nn.Module):
    """Encoder/decoder network that compresses an input to a latent code and rebuilds it.

    Args:
        input_dim: Size of the last dimension of the input (and of the reconstruction).
        latent_dim: Size of the latent code produced between encoder and decoder.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        slope, drop_p = 0.2, 0.3

        # Encoder trunk: input_dim -> 512 -> 256, dropout after the first activation.
        encoder_layers = [
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(slope),
            nn.Dropout(drop_p),
            nn.Linear(512, 256),
            nn.LeakyReLU(slope),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Linear projection to the latent code (no activation).
        self.latent_layer = nn.Linear(256, latent_dim)

        # Decoder: latent_dim -> 256 -> 512 -> input_dim; the closing Sigmoid
        # keeps every reconstructed value between 0 and 1.
        decoder_layers = [
            nn.Linear(latent_dim, 256),
            nn.LeakyReLU(slope),
            nn.Dropout(drop_p),
            nn.Linear(256, 512),
            nn.LeakyReLU(slope),
            nn.Linear(512, input_dim),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and decode it again.

        Returns:
            Tuple ``(reconstruction, z)``: the decoder output and the latent
            code it was decoded from.
        """
        z = self.latent_layer(self.encoder(x))
        return self.decoder(z), z

class Discriminator(nn.Module):
    """MLP that scores an input with a sigmoid output and exposes its penultimate features.

    Args:
        input_dim: Size of the last dimension of the input tensor.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Shared trunk: input_dim -> 512 -> 256 with dropout after the first activation.
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
        )
        # 128-d intermediate representation returned alongside the score.
        self.feature_layer = nn.Linear(256, 128)
        # Single logit, squashed to (0, 1) in forward().
        self.final_layer = nn.Linear(128, 1)

    def forward(self, x):
        """Score ``x`` and return the features the score was computed from.

        Returns:
            Tuple ``(prediction, feature_vector)``. ``prediction`` is the sigmoid
            score with the trailing size-1 dimension removed, so a ``(B, input_dim)``
            input gives shape ``(B,)``. ``feature_vector`` is the 128-d output of
            ``feature_layer``.
        """
        features = self.model(x)
        feature_vector = self.feature_layer(features)
        prediction = torch.sigmoid(self.final_layer(feature_vector))
        # Drop only the trailing dimension. A bare squeeze() would also remove
        # the batch dimension when B == 1, yielding a 0-dim tensor whose shape
        # no longer matches a (1,) target in the loss.
        return prediction.squeeze(-1), feature_vector

# --- Instantiate the models and move to device ---
# Build each network first, then relocate it; nn.Module.to() moves the
# module's parameters in place, so the names keep pointing at the same objects.
generator = Generator(INPUT_DIM, LATENT_DIM)
generator.to(device)
discriminator = Discriminator(INPUT_DIM)
discriminator.to(device)

# Print a banner followed by the layer-by-layer summary of each network.
print("--- Generator Architecture ---", generator, sep="\n")
print("\n--- Discriminator Architecture ---", discriminator, sep="\n")

--- Generator Architecture ---
Generator(
  (encoder): Sequential(
    (0): Linear(in_features=190, out_features=512, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): LeakyReLU(negative_slope=0.2)
  )
  (latent_layer): Linear(in_features=256, out_features=128, bias=True)
  (decoder): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=512, bias=True)
    (4): LeakyReLU(negative_slope=0.2)
    (5): Linear(in_features=512, out_features=190, bias=True)
    (6): Sigmoid()
  )
)

--- Discriminator Architecture ---
Discriminator(
  (model): Sequential(
    (0): Linear(in_features=190, out_features=512, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=512, out_features=256,

In [None]:
# Configuration for Re-balanced Training
D_LEARNING_RATE = 1e-5   # Make the Discriminator learn slowly
GE_LEARNING_RATE = 2e-4  # Allow the Generator/Encoder to learn faster

# Use the config-cell constants INPUT_DIM / LATENT_DIM: no lowercase
# `input_dim` / `latent_dim` variables are defined at notebook level in the
# visible cells, so those names would raise NameError on a fresh kernel.
# NOTE(review): these three-argument constructors (and `Encoder` itself) do not
# match the two-argument Generator/Discriminator classes defined in the earlier
# architecture cell; presumably the conditional CBIGAN versions are defined in a
# cell that must run before this one — confirm.
encoder = Encoder(INPUT_DIM, LATENT_DIM, NUM_CLASSES).to(device)
generator = Generator(LATENT_DIM, INPUT_DIM, NUM_CLASSES).to(device)
discriminator = Discriminator(INPUT_DIM, LATENT_DIM, NUM_CLASSES).to(device)

# --- Loss Function for the Adversarial Game ---
# Binary cross-entropy on probabilities (expects inputs already in [0, 1]).
adversarial_loss = nn.BCELoss().to(device)

# Optimizers with different learning rates
# One for the Discriminator with a low LR
optimizer_d = optim.Adam(discriminator.parameters(), lr=D_LEARNING_RATE, betas=(0.5, 0.999))

# A second one for the Generator and Encoder team with a higher LR;
# both networks' parameters are updated together by this single optimizer.
optimizer_ge = optim.Adam(
    list(generator.parameters()) + list(encoder.parameters()),
    lr=GE_LEARNING_RATE,
    betas=(0.5, 0.999)
)



print("Corrected Optimizers and Loss function for CBIGAN have been set up.")
print(f"Discriminator LR: {D_LEARNING_RATE}, Generator/Encoder LR: {GE_LEARNING_RATE}")