# Tabular GAN from Scratch using Pytorch

The below code is a modifications of the [Pytorch DCGAN tutorial](https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html).

In [59]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
import os

In [41]:
df = pd.read_csv('../datasets/income/adult.csv')

In [42]:
df_dropped = df.drop('income', axis=1)

In [43]:
categorical_columns = [col for col in df_dropped.columns if df_dropped[col].dtype == 'object']
numerical_columns = [col for col in df_dropped.columns if df_dropped[col].dtype in ['int64', 'float64']]

In [46]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(sparse_output=False), categorical_columns),
    ])

In [47]:
X = preprocessor.fit_transform(df_dropped)
y = df['income'].apply(lambda x: 1 if x == '>50K' else 0).values

In [48]:
X_torch = torch.tensor(X, dtype=torch.float32)
y_torch = torch.tensor(y, dtype=torch.float32)

In [49]:
dataset = TensorDataset(X_torch, y_torch)

In [50]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [51]:
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, output_dim),
            nn.Tanh()
        )
    
    def forward(self, x):
        return self.net(x)

In [52]:
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        return self.net(x)

In [53]:
noise_dim = 100
generator = Generator(input_dim=noise_dim, output_dim=X.shape[1])
discriminator = Discriminator(input_dim=X.shape[1])

In [54]:
criterion = nn.BCELoss()
optimizer_generator = optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
optimizer_discriminator = optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

The basic training loop for the GAN should look as following

```epochs = 50

for epoch in range(epochs):
    for batch in dataloader:
        # Update Discriminator with Real Data
        discriminator.zero_grad()
        real_data, _ = batch
        real_labels = torch.ones(real_data.size(0), 1)
        output_real = discriminator(real_data)
        loss_real = criterion(output_real, real_labels)
        
        # Update Discriminator with Fake Data
        noise = torch.randn(real_data.size(0), noise_dim)
        fake_data = generator(noise)
        fake_labels = torch.zeros(real_data.size(0), 1)
        output_fake = discriminator(fake_data.detach())
        loss_fake = criterion(output_fake, fake_labels)
        
        # Combine Losses for discriminator and Update
        loss_disc = loss_real + loss_fake
        loss_disc.backward()
        optimizer_discriminator.step()
        
        # Update Generator
        generator.zero_grad()
        output = discriminator(fake_data)
        loss_gen = criterion(output, real_labels)
        loss_gen.backward()
        optimizer_generator.step()

    print(f'Epoch {epoch+1}/{epochs}, Loss D: {loss_disc.item()}, Loss G: {loss_gen.item()}')
```     

Initial experiments on 50 epochs with some initial parameters show a lot of the expected behaviours such as early fluctations in losses.

However, the discriminator loss seems to decrease, while the generator loss seems to increase most of the times (though, not necessarily in an oscillation manner, which would somewhat be expected). The discriminator loss reaches a point where it becomes really small, like the generator loss increases towards a loss of 5.

As a way to experiment with other parameters to see its performance, we'll utilize Optuma's Hyperparameter optimizer to potentially find better hyperparameters.

In [60]:
def optuna_optimize(trial, X, dataloader):
    # Define Hyperparameters to optimize
    lr_gen = trial.suggest_float('lr_gen', 1e-5, 1e-3)
    lr_disc = trial.suggest_float('lr_disc', 1e-5, 1e-3)

    # Initialize Models
    generator = Generator(input_dim=100, output_dim=X.shape[1])
    discriminator = Discriminator(input_dim=X.shape[1])
    
    # Optimizers
    criterion = nn.BCELoss()
    optimizer_gen = optim.Adam(generator.parameters(), lr=lr_gen)
    optimizer_disc = optim.Adam(discriminator.parameters(), lr=lr_disc)

    epochs = 10

    for epoch in range(epochs):
        for i, (data, _) in enumerate(dataloader):
            # Number of data points
            n_data = data.size(0)

            # Train Discriminator
            real_data = data
            real_labels = torch.ones(n_data, 1)
            fake_labels = torch.zeros(n_data, 1)

            discriminator.zero_grad()
            output_real = discriminator(real_data)
            loss_real = criterion(output_real, real_labels)

            noise = torch.randn(n_data, 100)
            fake_data = generator(noise)
            output_fake = discriminator(fake_data.detach())
            loss_fake = criterion(output_fake, fake_labels)

            loss_d = loss_real + loss_fake
            loss_d.backward()
            optimizer_disc.step()

            generator.zero_grad()
            output = discriminator(fake_data)
            loss_g = criterion(output, real_labels)
            loss_g.backward()
            optimizer_gen.step()

    # Example metric to optimize could be the last generator loss
    metric_to_optimize = loss_g.item()

    # Save models for each trial temporarily
    trial_model_dir = f'../experiments/vanilla_gan/hyperparam/trial_{trial.number}_models'
    os.makedirs(trial_model_dir, exist_ok=True)
    torch.save(generator.state_dict(), os.path.join(trial_model_dir, 'generator.pth'))
    torch.save(discriminator.state_dict(), os.path.join(trial_model_dir, 'discriminator.pth'))

    return metric_to_optimize

In [61]:
# After optimization, save best models more permanently
def save_best_models(study, temporary_dir_base='../experiments/vanilla_gan/hyperparam/trial_'):
    best_trial = study.best_trial.number
    best_model_dir = f'{temporary_dir_base}{best_trial}_models'
    permanent_model_dir = '../experiments/vanilla_gan/hyperparam/best_models'
    os.makedirs(permanent_model_dir, exist_ok=True)

    for filename in ['generator.pth', 'discriminator.pth']:
        os.rename(os.path.join(best_model_dir, filename), os.path.join(permanent_model_dir, filename))

In [58]:
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: optuna_optimize(trial, X, dataloader), n_trials=20)

[I 2024-04-09 11:31:52,075] A new study created in memory with name: no-name-07f05e47-b23b-4897-a49e-d0f8df07b5b5
[W 2024-04-09 11:32:44,450] Trial 0 failed with parameters: {'lr_gen': 9.273696896165869e-05, 'lr_disc': 0.0001591486947786221} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\NScha\.virtualenvs\TabuGAN\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\NScha\AppData\Local\Temp\ipykernel_5352\1899854309.py", line 2, in <lambda>
    study.optimize(lambda trial: optuna_optimize(trial, X, dataloader), n_trials=20)
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\NScha\AppData\Local\Temp\ipykernel_5352\1564926688.py", line 32, in optuna_optimize
    fake_data = generator(noise)
                ^^^^^^^^^^^^^^^^
  File "C:\Users\NScha\.virtualenvs\TabuGAN\Lib\site-packages\torch\nn\m

KeyboardInterrupt: 

In [None]:
# Save the best models after optimization
save_best_models(study)