In [None]:
# Switch to True to run the Colab-only dependency installation below.
googlecolab = False

if googlecolab:
    from os.path import exists
    from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
    # Wheel tag of the running interpreter: implementation + version + ABI
    platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
    # Shell pipeline: look up libcudart in the linker cache and rewrite its
    # trailing "major.minor" version into a "cuXY" tag
    cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
    # Use the CUDA tag only when an NVIDIA device node exists, otherwise the CPU build
    accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

    # Install the torch 0.4.1 wheel matching the accelerator/platform, plus torchvision
    !pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
    !pip install Pillow==4.1.1

In [None]:
%matplotlib inline
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
# Seed PyTorch's global random number generator with a fixed value
torch.manual_seed(0)

## Hyperparameters

Define the hyperparameters. You can play with those later.

In [None]:
## Data loading

workers = 4 # Number of workers for dataloader (/!\ set to 4 when you're done debugging)

## Architecture

arch = 'cDCGAN' # 'cDCGAN' or 'cGAN'
nz = 100 # Size of z latent vector (i.e. size of generator input)
ndf = 32 # Base size of feature maps in discriminator
ngf = 32 # Base size of feature maps in generator

## Optimization

lrD = 0.0002 # Learning rate for the discriminator
lrG = 0.0002 # Learning rate for the generator
beta1G = 0.5 # Momentum beta1 for the generator
beta1D = 0.5 # Momentum beta1 for the discriminator

## Training

batch_size = 128 # Images per batch
nb_update_D = 1 # Number of sub-steps of discriminator optim. at each step
nb_update_G = 1 # Number of sub-steps of generator optim. at each step
steps = None # Number of global steps in the training loop (e.g. 8000); only read when nb_epochs is None
nb_epochs = 20 # Number of epochs, leave "None" if you want to set the number of "steps" (i.e. batches)

# Number of samples counted as one epoch in the steps <-> epochs conversion.
# NOTE(review): the MNIST training split is usually 60000 images -- confirm
# whether 50000 is intentional before changing it.
samples_per_epoch = 50000

# Derive whichever of (nb_epochs, steps) was not provided from the other one.
if nb_epochs is None:
    if steps is None:
        raise ValueError("Set either `nb_epochs` or `steps` in the hyperparameters")
    nb_epochs = (steps * batch_size) / (nb_update_D * samples_per_epoch)
else:
    steps = int(nb_epochs * nb_update_D * samples_per_epoch / batch_size)
print("Doing %.1f epochs in %d steps" % (nb_epochs, steps))
steps_per_epoch = int(steps / nb_epochs)

# Dataset

Download and load the dataset. Nothing to do here.

In [None]:
# Load the MNIST training split; it is downloaded into `dataroot` if missing.
dataroot = '/tmp/mnist'
transform = transforms.Compose([
        transforms.Pad(2),  # pad every border by 2 pixels
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,))  # (x - 0.5) / 0.5
])
dataset = dset.MNIST(dataroot, train=True, download=True, transform=transform)
# Create the dataloader
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                         shuffle=True, num_workers=workers)

# Decide which device we want to run on
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Plot some training images
real_batch = next(iter(dataloader))
plt.figure(figsize=(8,8))
plt.axis("off")
plt.title("Training Images")
# The grid is only displayed, so it is built straight from the CPU batch
# instead of being moved to `device` and back.
plt.imshow(np.transpose(vutils.make_grid(real_batch[0][:64], padding=2, normalize=True),(1,2,0)))
plt.show()

# Model architectures

## Discriminator

Input: Image $x \in \mathbb{R}^{32\times 32\times 1}$ and its one-hot class label $y \in \{0,1\}^{10}$  
Output: "Real" image probability $\in [0,1]$

## Generator

Input: Random "noise" $z \in \mathbb{R}^{\text{nz}}$ and a one-hot class label $y \in \{0,1\}^{10}$  
Output: Generated image $\tilde x \in \mathbb{R}^{32\times 32\times 1}$


In [None]:
# cDCGAN: convolutional conditional GAN

if arch == 'cDCGAN':

    class Discriminator(nn.Module):
        """Convolutional conditional discriminator.

        The image and its label are embedded by two separate strided
        convolutions, concatenated along the channel axis and reduced to one
        sigmoid score per sample.

        forward(x, y):
            x: image batch, (B, 1, 32, 32) in this notebook.
            y: label batch with 10 channels, (B, 10, 1, 1).
            returns: tensor of shape (B, 1, 1, 1) with values in [0, 1].
        """

        def __init__(self):
            super(Discriminator, self).__init__()
            # Image branch: halves the spatial size, 1 -> 2*ndf channels
            self.emb_x = nn.Sequential(
                nn.Conv2d(in_channels=1, out_channels=2*ndf, kernel_size=4, stride=2, padding=1, bias=True),
                nn.LeakyReLU(negative_slope = 0.2),
            )
            # Label branch: same geometry as the image branch, 10 -> 2*ndf channels
            self.emb_y = nn.Sequential(
                nn.Conv2d(in_channels=10, out_channels=2*ndf, kernel_size=4, stride=2, padding=1, bias=True),
                nn.LeakyReLU(negative_slope = 0.2),
            )
            # Joint trunk on the concatenated 4*ndf channels; the last 4x4
            # convolution (no padding) collapses a 4x4 map into a single score.
            self.emb_xy = nn.Sequential(
                nn.Conv2d(in_channels=4*ndf, out_channels=8*ndf,kernel_size=4, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(8*ndf),
                nn.LeakyReLU(negative_slope = 0.2),

                nn.Conv2d(8*ndf,16*ndf,4,2,1,bias=False),
                nn.BatchNorm2d(16*ndf),
                nn.LeakyReLU(negative_slope = 0.2),

                nn.Conv2d(16*ndf,1,4,1,0,bias=True),
                nn.Sigmoid()
            )

        def forward(self, x, y):
            # Broadcast the label over the image's spatial grid. `expand`
            # returns a view, so nothing is allocated and the result lives on
            # the same device as `y` (no hard-coded 32x32 / global device).
            y = y.expand(-1, -1, x.shape[2], x.shape[3])
            x = self.emb_x(x)
            y = self.emb_y(y)
            o = self.emb_xy(torch.cat((x,y),dim=1))
            return o



    class Generator(nn.Module):
        """Convolutional conditional generator.

        The noise vector and the label are each projected to a 4x4 feature map
        by a transposed convolution, concatenated along the channel axis and
        upsampled three times by a factor of 2.

        forward(z, y):
            z: noise batch of shape (B, nz, 1, 1).
            y: label batch of shape (B, 10, 1, 1).
            returns: tensor of shape (B, 1, 32, 32) with values in [-1, 1] (tanh).
        """

        def __init__(self):
            super(Generator, self).__init__()
            # Feature-map widths are driven by `ngf` (the generator base size),
            # not by the discriminator's `ndf`.
            # Noise branch: (B, nz, 1, 1) -> (B, 8*ngf, 4, 4)
            self.emb_z = nn.Sequential(
                nn.ConvTranspose2d(in_channels=nz, out_channels=8*ngf,kernel_size=4, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(8*ngf),
                nn.ReLU()
            )
            # Label branch: (B, 10, 1, 1) -> (B, 8*ngf, 4, 4)
            self.emb_y = nn.Sequential(
                nn.ConvTranspose2d(in_channels=10, out_channels=8*ngf,kernel_size=4, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(8*ngf),
                nn.ReLU()
            )
            # Joint trunk: 4x4 -> 8x8 -> 16x16 -> 32x32
            self.emb_zy = nn.Sequential(
                nn.ConvTranspose2d(16*ngf,8*ngf,4,2,1,bias=False),
                nn.BatchNorm2d(8*ngf),
                nn.ReLU(),

                nn.ConvTranspose2d(8*ngf,4*ngf,4,2,1,bias=False),
                nn.BatchNorm2d(4*ngf),
                nn.ReLU(),

                nn.ConvTranspose2d(4*ngf,1,4,2,1,bias=True),
                nn.Tanh()
            )

        def forward(self, z, y):
            z = self.emb_z(z)
            y = self.emb_y(y)
            o = self.emb_zy(torch.cat((z,y),dim=1))
            return o
        

In [None]:
# cGAN: fully connected conditional GAN

if arch == 'cGAN':
    class Discriminator(nn.Module):
        """Fully connected conditional discriminator.

        forward(x, y):
            x: image batch with 1024 values per sample (e.g. (B, 1, 32, 32)).
            y: label batch with 10 values per sample (e.g. (B, 10, 1, 1)).
            returns: tensor of shape (B, 1, 1, 1) with values in [0, 1],
                the same layout as the convolutional variant.
        """

        def __init__(self):
            super(Discriminator, self).__init__()
            # Image branch on the flattened 32*32 = 1024 pixels
            self.emb_x = nn.Sequential(
                nn.Linear(1024,1024, bias = True),
                nn.LeakyReLU(negative_slope = 0.2)
            )
            # Label branch
            self.emb_y = nn.Sequential(
                nn.Linear(10,1024, bias = True),
                nn.LeakyReLU(negative_slope = 0.2)
            )
            # Joint trunk on the 1024 + 1024 concatenated features
            self.emb_xy = nn.Sequential(
                nn.Linear(2048,1024, bias=False),
                nn.BatchNorm1d(1024),
                nn.LeakyReLU(negative_slope = 0.2),
                
                nn.Linear(1024,512, bias=False),
                nn.BatchNorm1d(512),
                nn.LeakyReLU(negative_slope = 0.2),
                
                nn.Linear(512,1, bias=True),
                nn.Sigmoid()
            )

        def forward(self, x, y):
            batch = x.shape[0]
            # Flatten everything but the batch dimension for the linear layers
            x = x.view(batch, -1)
            y = y.view(y.shape[0], -1)
            x = self.emb_x(x)
            y = self.emb_y(y)
            o = self.emb_xy(torch.cat((x,y),dim=1))
            # Restore the (B, 1, 1, 1) layout; flattening it to (B,) still works
            return o.view(batch, 1, 1, 1)



    class Generator(nn.Module):
        """Fully connected conditional generator.

        forward(z, y):
            z: noise batch with nz values per sample (e.g. (B, nz, 1, 1)).
            y: label batch with 10 values per sample (e.g. (B, 10, 1, 1)).
            returns: image batch of shape (B, 1, 32, 32) with values in
                [-1, 1] (tanh), the same layout as the convolutional variant.
        """

        def __init__(self):
            super(Generator, self).__init__()
            # Noise branch
            self.emb_z = nn.Sequential(
                nn.Linear(nz,256, bias=False),
                nn.BatchNorm1d(256),
                nn.LeakyReLU(negative_slope = 0.2),
            )
            # Label branch
            self.emb_y = nn.Sequential(
                nn.Linear(10,256, bias=False),
                nn.BatchNorm1d(256),
                nn.LeakyReLU(negative_slope = 0.2),
            )
            # Joint trunk ending on 1024 = 32*32 pixel values
            self.emb_zy = nn.Sequential(
                nn.Linear(512,512, bias=False),
                nn.BatchNorm1d(512),
                nn.ReLU(),
                
                nn.Linear(512,1024, bias=False),
                nn.BatchNorm1d(1024),
                nn.ReLU(),
                
                nn.Linear(1024,1024, bias=True),
                nn.Tanh()
            )

        def forward(self, z, y):
            # Flatten everything but the batch dimension for the linear layers
            z = z.view(z.shape[0], -1)
            y = y.view(y.shape[0], -1)
            
            z = self.emb_z(z)
            y = self.emb_y(y)
            o = self.emb_zy(torch.cat((z,y),dim=1))
            # Reshape the 1024 outputs into a single-channel 32x32 image
            return o.view(o.shape[0], 1, 32, 32)

In [None]:
# Create the models and move them to the selected device
netG = Generator().to(device)
netD = Discriminator().to(device)

# Custom initialisation, meant to be passed to `Module.apply`:
# convolution weights ~ N(0, 0.02), batch-norm scales ~ N(1, 0.02) with zero bias.
def weights_init(m):
    """Re-initialise `m` in place according to its class name.

    Modules whose class name contains 'Conv' get normally distributed weights
    (mean 0, std 0.02). Modules whose class name contains 'BatchNorm' get
    weights drawn around 1 (std 0.02) and a zero bias. Any other module is
    left untouched.
    """
    layer_type = type(m).__name__
    if 'Conv' in layer_type:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
        return
    if 'BatchNorm' in layer_type:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)
# Run weights_init on every sub-module of both networks
netG.apply(weights_init)
netD.apply(weights_init)

# Print the models
print(netG)
print(netD)

In [None]:
# Sanity check: push dummy inputs through both models and print the output shapes
z = torch.zeros(10, nz, 1, 1).to(device)
x = torch.zeros(10, 1, 32, 32).to(device)
y = torch.randn(10, 10, 1, 1).to(device)
print(netG(z, y).shape) # expected: [10, 1, 32, 32]
print(netD(x, y).shape) # expected: [10, 1, 1, 1]

# Optimization and training

Here we will define:
* Our prior $P(z)$ that we use to sample random "noise". We will use a Gaussian distribution.
* The criterion that will be used to train the discriminator, and indirectly the generator. We will use the binary cross-entropy.
* The optimizers of both models. We will use the Adam optimizer.

In [None]:
# Prior P(z): standard Gaussian samples shaped for the generator input
def get_noise(batch_size):
    """Return a (batch_size, nz, 1, 1) tensor of N(0, 1) samples on `device`."""
    shape = (batch_size, nz, 1, 1)
    return torch.randn(shape).to(device)

# Returns a deterministic batch of one-hot labels of shape (batch_size, nclass, 1, 1)
def get_fixed_y(batch_size, nclass=10):
    """Build a fixed batch of one-hot labels sorted by class.

    Rows are split into `nclass` consecutive groups, group i being labelled
    with class i. When `batch_size` is a multiple of `nclass`, every class
    gets exactly `batch_size // nclass` rows. Other batch sizes are supported
    too: every row still receives exactly one class (group sizes then differ
    by at most one) instead of being left all-zero.

    Args:
        batch_size: number of label rows to create (may be 0).
        nclass: number of classes, i.e. size of the one-hot dimension.

    Returns:
        Float tensor of shape (batch_size, nclass, 1, 1) on `device`.
    """
    y = torch.zeros(batch_size, nclass, 1, 1, device=device)
    for row in range(batch_size):
        # Integer scaling of the row index onto [0, nclass)
        y[row, row * nclass // batch_size, :, :] = 1
    return y

# Create the criterion function that will take (y_hat, y) as input
criterion = nn.BCELoss() # binary cross-entropy on probabilities

# Setup Adam optimizers for D and G
optimizerD = optim.Adam(netD.parameters(),lr=lrD, betas=(beta1D,0.999)) # discriminator parameters only
optimizerG = optim.Adam(netG.parameters(),lr=lrG, betas=(beta1G,0.999))  # generator parameters only

### Data format / batch creation functions

`r_real` and `r_fake` are the targets for the discriminator's criterion.

In [None]:
# Data format / batch creation functions

fixed_noise = get_noise(100) # Create a fixed random vector sampled from a Gaussian, will be used during train for viz
fixed_y = get_fixed_y(100) # Fixed labels fed to the generator together with fixed_noise
real_label = 1 # Target value denoting real images
fake_label = 0 # Target value denoting generated ("fake") images
iterator = iter(dataloader) # Module-level iterator consumed by get_batch_real()

def onehot(data, nclass=10):
    """One-hot encode a 1-D tensor of integer class indices.

    Args:
        data: integer (long) tensor of shape (B,) holding class indices.
        nclass: size of the one-hot dimension.

    Returns:
        Float tensor of shape (B, nclass, 1, 1) with a single 1 per row.
    """
    n_rows = data.shape[0]
    encoded = torch.zeros(n_rows, nclass)
    # Write a 1 at column `data[i]` of every row i
    encoded.scatter_(1, data.unsqueeze(1), 1)
    # Append two singleton spatial dimensions
    return encoded.unsqueeze(2).unsqueeze(3)

# returns a batch of real images from the dataset (iterates infinitely on the dataset)
def get_batch_real():
    """Fetch the next batch of real images, restarting the dataloader when exhausted.

    Returns:
        x_real: image batch moved to `device`.
        y_real: one-hot labels of shape (B, 10, 1, 1) on `device`.
        r_real: float tensor of shape (B,) filled with `real_label`, the
            discriminator target for real samples.
    """
    global iterator
    try:
        x_real, y_real = next(iterator)
    except StopIteration:
        # Only the end of a pass restarts the iterator; any other error
        # (e.g. a failing data-loading worker) is left to propagate.
        iterator = iter(dataloader)
        x_real, y_real = next(iterator)
    x_real = x_real.to(device)
    y_real = onehot(y_real).to(device)
    # Explicit float dtype: BCELoss needs float targets, whereas torch.full
    # would otherwise derive the dtype from the integer fill value.
    r_real = torch.full((x_real.size(0),), real_label, dtype=torch.float, device=device)
    return x_real, y_real, r_real

# returns a batch of generated images and the training targets r_fake
# Note that the targets r_fake differ depending on whether train_G is True or False
def get_batch_fake(y_real, train_G=False):
    """Generate one fake image per label in `y_real`.

    Args:
        y_real: one-hot labels of shape (B, 10, 1, 1) used to condition the generator.
        train_G: True when the batch is used for a generator update.

    Returns:
        x_fake: generated images. When `train_G` is False they are detached
            from the generator's graph, so a discriminator update does not
            back-propagate through the generator.
        y_real: the labels, returned unchanged.
        r_fake: float tensor of shape (B,) filled with `real_label` when
            `train_G` is True (the generator wants its samples classified as
            real) and with `fake_label` otherwise.
    """
    batch = y_real.shape[0]
    z = torch.randn(batch, nz, 1, 1, device=device)
    x_fake = netG(z, y_real).to(device)
    if train_G:
        target = real_label
    else:
        target = fake_label
        x_fake = x_fake.detach()
    r_fake = torch.full((batch,), target, dtype=torch.float, device=device)
    return x_fake, y_real, r_fake

### Training Loop

**Reminder:** when your training loop starts to work, change the `workers` variable to 4 and rerun your notebook

In [None]:
# Accumulators filled during training: generated image grids and per-update losses
img_list, G_losses, D_losses = [], [], []

In [None]:
# Single training loop. Each step runs nb_update_D discriminator updates, then
# nb_update_G generator updates, prints the losses every 100 steps and saves a
# grid of generated samples every 500 steps.

# Epochs at whose first step both learning rates are divided by 10
lr_decay_epochs = (11, 16)

for i in range(steps):

    # Step-wise learning-rate schedule
    for decay_epoch in lr_decay_epochs:
        if i == steps_per_epoch * decay_epoch:
            optimizerG.param_groups[0]['lr'] /= 10
            optimizerD.param_groups[0]['lr'] /= 10
            print("learning rate change!")

    ########
    # Update D network: maximize log(D(x)) + log(1 - D(G(z)))
    for _ in range(nb_update_D):
        netD.zero_grad()

        # Create batches (the fake batch reuses the labels of the real one)
        x_real, y_real, r_real = get_batch_real()
        x_fake, y_real, r_fake = get_batch_fake(y_real)

        # Forward: flatten the scores to (B,) to match the targets
        r_hat_real = netD(x_real,y_real).view((y_real.shape[0],))
        r_hat_fake = netD(x_fake,y_real).view((y_real.shape[0],))
        # Sum of the criterion on real and fake samples
        errD = criterion(r_hat_real,r_real)
        errD += criterion(r_hat_fake,r_fake)

        # Backward & optimization step on D
        errD.backward()
        optimizerD.step()

        # Compute / save metrics
        avg_output_for_real = r_hat_real.mean().item()
        avg_output_for_fake = r_hat_fake.mean().item()
        D_losses.append(errD.item())

    ########
    # Update G network: maximize log(D(G(z)))
    for _ in range(nb_update_G):
        netG.zero_grad()

        # Create batches: only the labels of the real batch are used here
        x_real, y_real, r_real = get_batch_real()
        x_fake, y_real, r_fake = get_batch_fake(y_real,train_G=True)

        # Forward: the targets are "real", so G is rewarded for fooling D
        r_hat_fake = netD(x_fake,y_real).view((y_real.shape[0],))
        errG = criterion(r_hat_fake,r_fake)

        # Backward & optimization step on G
        errG.backward()
        optimizerG.step()

        # Compute / save metrics
        G_losses.append(errG.item())

    ########
    # Logs
    if i % 100 == 0:
        print('[%5d/%5d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f'
              % (i, steps, errD.item(), errG.item(), avg_output_for_real, avg_output_for_fake))

    # Snapshot of the generator on the fixed noise/labels, kept for the animation
    if i % 500 == 0:
        with torch.no_grad():
            x_fake = netG(fixed_noise, fixed_y).detach().cpu()
            if arch == 'cGAN':
                # The fully connected generator may return flat images
                x_fake = x_fake.view((x_fake.shape[0],1,32,32))
        img_list.append(vutils.make_grid(x_fake, padding=2, normalize=True, nrow=10))
        plt.figure(figsize=(10,10))
        plt.imshow(np.transpose(img_list[-1],(1,2,0)))
        plt.axis("off")
        plt.show()

# Display training evolution

In [None]:
# Animate the saved sample grids, one frame per snapshot
fig = plt.figure(figsize=(8, 8))
plt.axis("off")
# Each frame is a one-artist list; the axes are reordered with (1, 2, 0) before imshow
ims = [
    [plt.imshow(np.transpose(grid, (1, 2, 0)), animated=True)]
    for grid in img_list
]
ani = animation.ArtistAnimation(
    fig,
    ims,
    interval=1000,
    repeat_delay=1000,
    blit=True,
)

HTML(ani.to_jshtml())

In [None]:
# Loss evolution: generator on top, discriminator below
plt.figure(figsize=(10,8))
plt.subplot(2,1,1)
plt.title("Generator Training Loss")
plt.plot(G_losses)
plt.xlabel("iterations")
plt.ylabel("Loss")

plt.subplot(2,1,2)
# This panel plots D_losses, so it is titled accordingly
plt.title("Discriminator Training Loss")
plt.plot(D_losses)
plt.xlabel("iterations")
plt.ylabel("Loss")
# Keep the upper panel's x-label from overlapping the lower panel's title
plt.tight_layout()
plt.show()