# VAE Demo with Chocolate

This was adapted from the VAE demo available from PyTorch: https://github.com/pytorch/examples/blob/master/vae/main.py

**Chocolate**: https://github.com/AIworx-Labs/chocolate


In [39]:
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image

import chocolate as choco

## Setting up a connection to MongoDB

Establish a connection to a running MongoDB instance.

In [40]:
# Location of the MongoDB server and the database used by this experiment
DATABASE_URL = '140.160.139.44:27017'
DATABASE_NAME = 'choco_demo'

# Open the Chocolate connection, then clear the database so that a new
# experiment run starts without results from earlier runs
choco_conn = choco.MongoDBConnection(
    url=DATABASE_URL,
    database=DATABASE_NAME,
)
choco_conn.clear()

## Torch Setup

In [41]:
# Set to True to run on a CUDA device
on_gpu = False

if on_gpu:
    device = torch.device("cuda")
    # extra DataLoader options, only used for GPU runs
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    device = torch.device("cpu")
    kwargs = {}

# Seed torch's random number generator
torch.manual_seed(1)

<torch._C.Generator at 0x7fbd37db5b90>

## VAE Setup

This is setting up the model and loss function.

In [42]:
# Model
class VAE(nn.Module):
    """Fully connected variational autoencoder.

    The encoder is one hidden ReLU layer followed by two linear heads that
    produce the mean and log-variance of the latent distribution. The decoder
    is one hidden ReLU layer followed by a sigmoid output layer.

    Args:
        input_dim: Number of features per flattened input sample
            (784 for 28x28 images).
        hidden_dim: Width of the hidden layer in both encoder and decoder.
        latent_dim: Dimensionality of the latent code.

    The defaults reproduce the original fixed 784-400-20 architecture.
    """

    def __init__(self, input_dim=784, hidden_dim=400, latent_dim=20):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim

        # Encoder: shared hidden layer, then separate mean / log-variance heads
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, latent_dim)
        self.fc22 = nn.Linear(hidden_dim, latent_dim)
        # Decoder
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Map flattened inputs to the ``(mu, logvar)`` pair of the latent distribution."""
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps`` sampled from a standard normal.

        Expressing the sample this way keeps it differentiable with respect
        to ``mu`` and ``logvar``.
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent codes to reconstructions with values in ``[0, 1]``."""
        h3 = F.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        """Encode, sample and decode a batch.

        Args:
            x: Input batch; it is flattened to ``(-1, input_dim)``.

        Returns:
            Tuple ``(reconstruction, mu, logvar)``.
        """
        mu, logvar = self.encode(x.view(-1, self.input_dim))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

In [43]:
# Build the network, then move its parameters onto the selected device
model = VAE()
model = model.to(device)

In [44]:
# Loss function:
# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(recon_x, x, mu, logvar):
    """Return the VAE loss: summed reconstruction error plus KL divergence.

    Args:
        recon_x: Reconstructions with 784 values per sample.
        x: Original inputs; flattened here to ``(-1, 784)``.
        mu: Latent means.
        logvar: Latent log-variances.

    Returns:
        Scalar tensor; both terms are summed (not averaged) over the batch.
    """
    target = x.view(-1, 784)
    reconstruction = F.binary_cross_entropy(recon_x, target, reduction='sum')

    # KL divergence term, following Appendix B of
    # Kingma and Welling, "Auto-Encoding Variational Bayes", ICLR 2014
    # (https://arxiv.org/abs/1312.6114):
    #   -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    variance = logvar.exp()
    kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - variance)

    return reconstruction + kl_divergence

In [45]:
def load_datasets(batch_size):
    """Create the MNIST training and test data loaders.

    The training split is downloaded to ``../data`` if needed. Both loaders
    shuffle and also receive the module-level ``kwargs`` options.

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        Tuple ``(train_loader, test_loader)``.
    """
    # options shared by both loaders
    loader_options = dict(batch_size=batch_size, shuffle=True, **kwargs)

    train_set = datasets.MNIST('../data', train=True, download=True,
                               transform=transforms.ToTensor())
    train_loader = torch.utils.data.DataLoader(train_set, **loader_options)

    test_set = datasets.MNIST('../data', train=False,
                              transform=transforms.ToTensor())
    test_loader = torch.utils.data.DataLoader(test_set, **loader_options)

    return train_loader, test_loader

## Training Loop and Test Method

In [46]:
def train(epoch):
    """Run one training pass over ``train_loader`` and print the mean loss.

    Operates on the module-level ``model``, ``optimizer``, ``train_loader``
    and ``device``.

    Args:
        epoch: Epoch number, used only in the printed summary.
    """
    model.train()
    running_loss = 0
    for data, _ in train_loader:  # labels are not needed
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        batch_loss = loss_function(recon_batch, data, mu, logvar)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    # the loss is summed per batch, so divide by the number of samples
    mean_loss = running_loss / len(train_loader.dataset)
    print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, mean_loss))

In [47]:
def test(epoch):
    """Evaluate the model on ``test_loader`` and save a reconstruction image.

    Operates on the module-level ``model``, ``test_loader`` and ``device``.
    For the first batch, up to eight inputs and their reconstructions are
    written to ``results/reconstruction_<epoch>.png``.

    Args:
        epoch: Epoch number, used in the output file name.

    Returns:
        The loss summed over the test set divided by the number of test
        samples.
    """
    import os

    model.eval()
    test_loss = 0
    with torch.no_grad():
        for i, (data, _) in enumerate(test_loader):
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_function(recon_batch, data, mu, logvar).item()
            if i == 0:
                n = min(data.size(0), 8)
                # Let view() infer the batch dimension instead of reading the
                # global batch_size, which fails whenever this batch holds a
                # different number of samples.
                comparison = torch.cat([data[:n],
                                        recon_batch.view(-1, 1, 28, 28)[:n]])
                # save_image does not create missing directories
                os.makedirs('results', exist_ok=True)
                save_image(comparison.cpu(),
                           'results/reconstruction_' + str(epoch) + '.png', nrow=n)

    test_loss /= len(test_loader.dataset)
    print('====> Test set loss: {:.4f}'.format(test_loss))
    return test_loss

## Hyperparams the worst way

Normally, we might use argparse or some other mechanism to drive our experiment.

Here, I'm just going to hardcode the hyperparams, which is probably the worst way to do this.

In [48]:
# Hand-picked baseline hyperparameters
batch_size = 128        # samples per mini-batch
epochs = 10             # passes over the training set
log_interval = 200      # intended spacing, in batches, between progress prints
learning_rate = 1e-3    # learning rate passed to the optimizer

In [49]:
# Build the train/test loaders for the chosen batch size
train_loader, test_loader = load_datasets(batch_size=batch_size)



In [50]:
# Baseline run: train and evaluate once per epoch, then save a grid of
# images decoded from random latent vectors
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(1, epochs + 1):
    train(epoch)
    test(epoch)
    with torch.no_grad():
        # decode 64 standard-normal latent vectors of size 20
        sample = model.decode(torch.randn(64, 20).to(device)).cpu()
        save_image(sample.view(64, 1, 28, 28),
                   'results/sample_' + str(epoch) + '.png')

====> Epoch: 1 Average loss: 164.1720
====> Test set loss: 127.7911
====> Epoch: 2 Average loss: 121.6052
====> Test set loss: 115.7066
====> Epoch: 3 Average loss: 114.4285
====> Test set loss: 111.5441
====> Epoch: 4 Average loss: 111.4300
====> Test set loss: 109.3296
====> Epoch: 5 Average loss: 109.6984
====> Test set loss: 108.2240
====> Epoch: 6 Average loss: 108.5445
====> Test set loss: 107.5437
====> Epoch: 7 Average loss: 107.6886
====> Test set loss: 106.7517
====> Epoch: 8 Average loss: 107.0719
====> Test set loss: 106.1587
====> Epoch: 9 Average loss: 106.5827
====> Test set loss: 105.8837
====> Epoch: 10 Average loss: 106.1514
====> Test set loss: 105.7077


## Now, let's have some Chocolate

We'll start by creating our hyperparameter space.

Note: Chocolate provides several distribution types for defining the hyperparameter space, including:
* `uniform`: Uniform continuous distribution.
* `quantized_uniform`: Uniform discrete distribution.
* `log`: Logarithmic uniform continuous distribution.
* `quantized_log`: Logarithmic uniform discrete distribution.
* `choice`: Uniform choice distribution between non-numeric samples.

For more details on how these spaces are computed: https://chocolate.readthedocs.io/api/space.html#module-chocolate.space

In [51]:
def create_space():
    """Return the hyperparameter search space used by the Chocolate sampler.

    Returns:
        Dict mapping each hyperparameter name to a Chocolate distribution:
        a base-10 log range for the learning rate, a fixed set of choices
        for the batch size, and a quantized range in steps of 5 for the
        number of epochs.
    """
    return dict(
        learning_rate=choco.log(low=-4, high=-1, base=10),
        batch_size=choco.choice([16, 32, 64, 128, 256, 512]),
        epochs=choco.quantized_uniform(low=10, high=100, step=5),
    )

### Next, setup the space and define the tuning algorithm

Chocolate offers the following sampling algorithms:
* `Grid`: Regular cartesian grid sampler.
* `Random`: Random sampler.
* `QuasiRandom`: Quasi-Random sampler. Samples the search space using the generalized Halton low-discrepancy sequence. 

Chocolate offers the following search algorithms:
* `Bayes`: Bayesian minimization method with gaussian process regressor.
* `CMAES`: Covariance Matrix Adaptation Evolution Strategy minimization method.
* `MOCMAES`: Multi-Objective Covariance Matrix Adaptation Evolution Strategy.

For this demonstration, we'll use `Bayes`

In [52]:
# Search space explored by the tuner
choco_space = create_space()

# Bayesian search backed by the MongoDB connection; clear_db=False keeps
# whatever is already stored in the database
sampler = choco.Bayes(
    choco_conn,
    choco_space,
    clear_db=False,
)

### Now let's run a simple training loop with a configuration requested from Chocolate

In [55]:
# Ask the sampler for the next configuration to evaluate
tokens, params = sampler.next()

# Show the trial token and the sampled hyperparameters, one per line
print(tokens, params, sep='\n')

{'_chocolate_id': 1}
{'learning_rate': 0.00023468606397270453, 'epochs': 45, 'batch_size': 32}


In [None]:
# Use our new hyperparameters from chocolate
learning_rate = params['learning_rate']
epochs = params['epochs']
batch_size = params['batch_size']

# Load the datasets
train_loader, test_loader = load_datasets(batch_size)

# Start from freshly initialised weights. Otherwise this trial continues
# training the model fitted earlier in the notebook, and the loss reported to
# Chocolate would not reflect the sampled hyperparameters alone.
model = VAE().to(device)

# set the optimizer (must be created after the model so it tracks the new
# parameters)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# run the training loop
for epoch in range(1, epochs + 1):
    train(epoch)
    test_loss = test(epoch)
    with torch.no_grad():
        sample = torch.randn(64, 20).to(device)
        sample = model.decode(sample).cpu()
        save_image(sample.view(64, 1, 28, 28),
                   'results/sample_' + str(epoch) + '.png')


# IMPORTANT!!!!
# Record the loss of the final epoch against the token returned by
# sampler.next(); that variable is named `tokens`, not `token`.
sampler.update(tokens, test_loss)

====> Epoch: 1 Average loss: 125.9546
====> Test set loss: 112.1922
====> Epoch: 2 Average loss: 111.3176
====> Test set loss: 109.3802
====> Epoch: 3 Average loss: 109.4607
====> Test set loss: 108.1772
====> Epoch: 4 Average loss: 108.4805
====> Test set loss: 107.4520
====> Epoch: 5 Average loss: 107.7983
====> Test set loss: 106.9426
====> Epoch: 6 Average loss: 107.2659
====> Test set loss: 106.4481
====> Epoch: 7 Average loss: 106.8209
====> Test set loss: 106.2418
====> Epoch: 8 Average loss: 106.5144
====> Test set loss: 105.8120
====> Epoch: 9 Average loss: 106.2268
====> Test set loss: 105.6413
====> Epoch: 10 Average loss: 106.0182
====> Test set loss: 105.4409
====> Epoch: 11 Average loss: 105.7907
====> Test set loss: 105.2756
====> Epoch: 12 Average loss: 105.6666
====> Test set loss: 105.1655
====> Epoch: 13 Average loss: 105.5220
====> Test set loss: 105.0749
====> Epoch: 14 Average loss: 105.3564
====> Test set loss: 104.8799
====> Epoch: 15 Average loss: 105.2073
====

## How about a batch of experiments?!?

In [None]:
# I do not recommend running this code in Jupyter with on_gpu=False,
# so on CPU the number of experiments is zero and the loop is skipped.
num_experiments = 10 if on_gpu else 0


for i in range(num_experiments):

    # get next configuration from chocolate
    tokens, params = sampler.next()

    # Use our new hyperparameters from chocolate
    learning_rate = params['learning_rate']
    epochs = params['epochs']
    batch_size = params['batch_size']

    # Load the datasets
    train_loader, test_loader = load_datasets(batch_size)

    # Re-create the model for every trial. Otherwise each configuration would
    # continue training the weights left by the previous one, and the recorded
    # losses would not be comparable between configurations.
    model = VAE().to(device)

    # set the optimizer (after the model, so it tracks the new parameters)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # run the training loop
    for epoch in range(1, epochs + 1):
        train(epoch)
        test_loss = test(epoch)
        with torch.no_grad():
            sample = torch.randn(64, 20).to(device)
            sample = model.decode(sample).cpu()
            save_image(sample.view(64, 1, 28, 28),
                       'results/sample_' + str(epoch) + '.png')

    # Record the final-epoch loss against this trial's token
    # (the variable is `tokens`; `token` was undefined)
    sampler.update(tokens, test_loss)

## Extra info

### Chocolate and Sacred
Chocolate works nicely with Sacred: because the params are returned as a dictionary, you can pass them directly to an experiment run as its `config_updates`:
```python
token, params = sampler.next()
loss = ex.run(config_updates=params).result
sampler.update(token, loss)

```

### Database Caveats
If you change the `create_space` method, you must either set the `clear_db` flag to `True` **or** point the connection at a new `DATABASE_NAME`:

```python
sampler = choco.Bayes(choco_conn, choco_space, clear_db=True)
```

```python
# database connection info
DATABASE_URL = '140.160.139.44:27017'
DATABASE_NAME = 'choco_experiment2'  # note the new database name here

# chocolate setup
choco_conn = choco.MongoDBConnection(url=DATABASE_URL, database=DATABASE_NAME)
choco_conn.clear()  # clear the database for new experiment runs
```

One of the downsides of Chocolate is that each space is unique and requires its own database. While the footprint of a single database is relatively small, this can add up if you need to keep a long history of different experiments.