In [None]:
%%HTML
<!-- Improve readability when the notebook is shown on a projector -->
<style>
.rendered_html {font-size: 1.2em; line-height: 150%;}
div.prompt {min-width: 0ex; padding: 0px;}
.container {width:95% !important;}
</style>

In [None]:
%matplotlib notebook
%autosave 0
import numpy as np
import matplotlib.pyplot as plt
import torch

The Kullback-Leibler divergence between two continuous distributions is

$$
D_\text{KL}\left[q (x) || p(x)\right] = \mathbb{E}_{x\sim q(x)} \left[ \log \frac{q(x)}{p(x)}\right] = \int q(x) \log \frac{q(x)}{p(x)} \,dx 
$$


Our generative model with latent variable $z$ and observed variable $x$ is described by the joint density

$$
p_\theta(x, z) = p_\theta(x|z) p(z)
$$

Typically, we find the parameters $\theta$ of our generator using a maximum likelihood approach over the **evidence** or **marginal likelihood** $p_\theta (x)$

$$
\max_\theta p_\theta (x) = \max_\theta \int p_\theta(x|z) p(z) \,dz
$$

but the integral is in general intractable.

Instead, we will optimize a lower bound of the evidence.

We define a simpler, approximate posterior $q_\phi(z|x)$ and write the KL divergence between it and the true posterior of the generative model

$$
p_\theta(z|x) = \frac{p_\theta(x|z) p(z)}{p_\theta(x)}, 
$$
where $p(z)$ is a prior specified by the user


$$
\begin{align}
D_\text{KL}\left[q_\phi(z|x) || p_\theta(z|x)\right] &=
\mathbb{E}_{z\sim q_\phi(z|x)} \left [ \log \frac{q_\phi(z|x)}{p_\theta(z|x)}\right ]\nonumber \\
&= \mathbb{E}_{z\sim q_\phi(z|x)} \left [ \log \frac{p_\theta(x)}{p_\theta(x|z)}\frac{q_\phi(z|x)}{p(z)} \right ] \nonumber \\
&= \log p_\theta(x) + \mathbb{E}_{z\sim q_\phi(z|x)} \left [ - \log p_\theta(x|z) + \log \frac{q_\phi(z|x)}{p(z)} \right ] \nonumber \\
&= \log p_\theta(x) - \mathbb{E}_{z\sim q_\phi(z|x)} \left [\log p_\theta(x|z)\right ] + D_\text{KL}\left[q_\phi(z|x) || p(z)\right] \nonumber \\
&= \log p_\theta(x) - \mathcal{L}_{\theta, \phi} (x) \nonumber
\end{align}
$$




From here we can do
$$
\begin{align}
\log p_\theta(x) &= \mathcal{L}_{\theta, \phi} (x) + D_\text{KL}\left[q_\phi(z|x) || p_\theta(z|x)\right] \nonumber \\
&\geq \mathcal{L}_{\theta, \phi} (x)
\end{align}
$$

because the KL divergence is non-negative

The term

$$
\mathcal{L}_{\theta, \phi} (x) = \mathbb{E}_{z\sim q_\phi(z|x)} \left [\log p_\theta(x|z)\right ] - D_\text{KL}\left[q_\phi(z|x) || p(z)\right],
$$

is known as the *Evidence Lower Bound* (ELBO)


The ELBO can alternatively be obtained by applying Jensen's inequality to

$$
\log p_\theta(x) = \log \mathbb{E}_{z\sim p(z)}[p_\theta(x|z)]
$$

## Assumptions


TODO: state the modelling assumptions (log-likelihood, prior, posterior, etc.)

In [None]:
class VAE(torch.nn.Module):
    """Fully connected variational autoencoder for 1x28x28 images.

    The encoder maps a flattened image to the mean and log standard
    deviation of a diagonal Gaussian over the latent code; the decoder maps
    a latent code back to an image-shaped tensor of unnormalised outputs
    (no final activation is applied, so callers pass the output through a
    sigmoid or a logits-based loss).

    Args:
        n_hidden1: width of the first encoder layer / last hidden decoder layer.
        n_hidden2: width of the second encoder layer / first decoder layer.
        n_latent: dimensionality of the latent code.
    """

    def __init__(self, n_hidden1=100, n_hidden2=100, n_latent=2):
        # Zero-argument super(): the previous call named a class that does
        # not exist, so the model could not be instantiated.
        super().__init__()
        # encoder layers
        self.enc_hidden1 = torch.nn.Linear(28*28, n_hidden1)
        self.enc_hidden2 = torch.nn.Linear(n_hidden1, n_hidden2)
        self.enc_mu = torch.nn.Linear(n_hidden2, n_latent)
        self.enc_logsigma = torch.nn.Linear(n_hidden2, n_latent)
        # decoder layers (mirror of the encoder)
        self.dec_hidden1 = torch.nn.Linear(n_latent, n_hidden2)
        self.dec_hidden2 = torch.nn.Linear(n_hidden2, n_hidden1)
        self.dec_hidden3 = torch.nn.Linear(n_hidden1, 28*28)
        self.activation = torch.nn.ReLU()

    def encode(self, x):
        """Return ``(mu, logsigma)``, each of shape ``(batch, n_latent)``.

        ``x`` is flattened to ``(-1, 784)``, so a single ``(1, 28, 28)``
        image is treated as a batch of one.
        """
        x = x.reshape(-1, 1*28*28)
        z = self.activation(self.enc_hidden1(x))
        z = self.activation(self.enc_hidden2(z))
        return self.enc_mu(z), self.enc_logsigma(z)

    def sample(self, mu, logsigma):
        """Draw ``z = mu + exp(logsigma) * eps`` with ``eps`` standard normal.

        Writing the sample as a deterministic function of ``mu`` and
        ``logsigma`` plus external noise keeps it differentiable with
        respect to both.
        """
        return mu + torch.exp(logsigma)*torch.randn_like(logsigma)

    def decode(self, z):
        """Map latent codes to unnormalised outputs of shape ``(-1, 1, 28, 28)``."""
        x = self.activation(self.dec_hidden1(z))
        x = self.activation(self.dec_hidden2(x))
        return (self.dec_hidden3(x)).reshape(-1, 1, 28, 28)

    def forward(self, x):
        """Encode ``x``, sample a latent code and decode it.

        Returns:
            Tuple ``(decoded, mu, logsigma)``.
        """
        mu, logsigma = self.encode(x)
        z = self.sample(mu, logsigma)
        return self.decode(z), mu, logsigma


# Alias kept so that code referring to the model by the name `RedEjemplo`
# keeps working.
RedEjemplo = VAE
    
# Reconstruction term: binary cross-entropy computed from unnormalised
# outputs (logits), summed over every element of the batch.
Elogpxz = torch.nn.BCEWithLogitsLoss(reduction='sum')


def KLqzxpz(mu, logsigma):
    """Return the NEGATIVE KL divergence between q(z|x) and a standard normal.

    For a diagonal Gaussian with mean ``mu`` and log standard deviation
    ``logsigma`` the closed form of the divergence to N(0, I) is
    ``-0.5 * sum(1 + 2*logsigma - mu**2 - exp(2*logsigma))``; this function
    returns that quantity without the leading minus sign, so its value is
    always <= 0. The sum runs over all elements (batch and latent dims).
    """
    return 0.5*torch.sum(1 + 2*logsigma - torch.pow(mu, 2) - torch.exp(2*logsigma))


def ELBO(x, xhat, mu, logsigma):
    """Return the two terms of the negative ELBO for a batch.

    Args:
        x: target images, same shape as ``xhat``.
        xhat: decoder outputs as logits (no sigmoid applied).
        mu: posterior means.
        logsigma: posterior log standard deviations.

    Returns:
        Tuple ``(rec, kl)``: the summed reconstruction cross-entropy and the
        (non-negative) KL divergence. ``rec + kl`` is the negative ELBO, i.e.
        the loss to minimise.
    """
    rec = Elogpxz(xhat, x)
    kl = -KLqzxpz(mu, logsigma)
    return rec, kl

In [None]:
import torchvision
from torch.utils.data import DataLoader, SubsetRandomSampler

# MNIST train and test splits, stored under ./dataset (downloaded on first use).
mnist_train_data = torchvision.datasets.MNIST(
    'dataset', train=True, download=True,
    transform=torchvision.transforms.ToTensor())
mnist_test_data = torchvision.datasets.MNIST(
    'dataset', train=False, download=True,
    transform=torchvision.transforms.ToTensor())

# Shuffle the first 10000 training indices with a fixed seed.
# (For the full training set use: idx = list(range(len(mnist_train_data))))
np.random.seed(0)
idx = list(range(10000))
np.random.shuffle(idx)
split = int(0.7*len(idx))

# First 70% of the shuffled indices train the model, the rest validate it.
train_sampler = SubsetRandomSampler(idx[:split])
valid_sampler = SubsetRandomSampler(idx[split:])

train_loader = DataLoader(mnist_train_data, batch_size=128, sampler=train_sampler)
valid_loader = DataLoader(mnist_train_data, batch_size=256, sampler=valid_sampler)

In [None]:
# Train the VAE by gradient descent with the Adam optimiser.
# `hl` is the HiddenLayer library: its History/Canvas objects record the
# metrics and redraw the live loss curves after every epoch.
import hiddenlayer as hl

N_EPOCHS = 100

mi_red_neuronal = VAE()
optimizer = torch.optim.Adam(mi_red_neuronal.parameters(), lr=1e-3)

history1 = hl.History()
canvas1 = hl.Canvas()
# GPU: to train on a GPU, move the model with
#mi_red_neuronal = mi_red_neuronal.cuda()
for epoch in range(N_EPOCHS):
    # ---- training pass ----
    mi_red_neuronal.train()
    epoch_loss = 0.0
    KL_loss = 0.0
    for image, _ in train_loader:
        # GPU: the images must be moved as well
        #image = image.cuda()
        optimizer.zero_grad()
        prediction, mu, logsigma = mi_red_neuronal(image)
        rec, KLdiv = ELBO(image, prediction, mu, logsigma)
        loss = rec + KLdiv  # negative ELBO
        loss.backward()
        optimizer.step()
        epoch_loss += rec.item()
        KL_loss += KLdiv.item()
    # Both terms are sums over samples, so normalise by the number of samples
    # actually drawn; len(loader) * batch_size over-counts when the last
    # batch is smaller than batch_size.
    den = len(train_loader.sampler)
    history1.log(epoch, loss_train=epoch_loss/den, kl_train=KL_loss/den)

    # ---- validation pass (no parameter updates, no autograd graph) ----
    mi_red_neuronal.eval()
    epoch_loss = 0.0
    KL_loss = 0.0
    with torch.no_grad():
        for image, _ in valid_loader:
            prediction, mu, logsigma = mi_red_neuronal(image)
            rec, KLdiv = ELBO(image, prediction, mu, logsigma)
            epoch_loss += rec.item()
            KL_loss += KLdiv.item()
    den = len(valid_loader.sampler)
    history1.log(epoch, loss_valid=epoch_loss/den, kl_valid=KL_loss/den)

    with canvas1:  # so that both plots render together
        canvas1.draw_plot([history1["loss_train"], history1["loss_valid"]],
                          labels=["Train loss", "Validation loss"])
        canvas1.draw_plot([history1["kl_train"], history1["kl_valid"]],
                          labels=["Train KL", "Validation KL"])

In [None]:
# If the model was trained on a GPU and inference should run on the CPU, move it back with:
#mi_red_neuronal = mi_red_neuronal.cpu()

In [None]:
# Top row: the first ten training digits. Bottom row: the output of the
# network for each of them, squashed through a sigmoid.
fig, ax = plt.subplots(2, 10, figsize=(12, 2.5), tight_layout=True)
sig = torch.nn.Sigmoid()

for i, (ax_input, ax_output) in enumerate(zip(ax[0], ax[1])):
    image, label = mnist_train_data[i]

    ax_input.imshow(image[0].numpy())
    ax_input.set_title(label)
    ax_input.axis('off')

    # forward() returns a tuple; its first element is the decoded image.
    decoded = mi_red_neuronal.forward(image)[0]
    image_hat = sig(decoded).detach().numpy()
    ax_output.imshow(image_hat[0, 0])
    ax_output.axis('off')

In [None]:
# Encode every test image and scatter the posterior means in the latent
# plane, one colour per digit, with the posterior standard deviations drawn
# as error bars.
N = len(mnist_test_data)
n_latent = mi_red_neuronal.enc_mu.out_features
# Row layout: [mu_1 .. mu_d, logsigma_1 .. logsigma_d]
codes = np.zeros(shape=(N, 2*n_latent))
labels = np.zeros(shape=(N, ))
with torch.no_grad():  # inference only: no autograd graph needed
    for k in range(N):
        # Index the dataset once per sample instead of once per field.
        image, label = mnist_test_data[k]
        labels[k] = label
        codes[k, :] = torch.cat(mi_red_neuronal.encode(image), dim=1).numpy()

fig, ax = plt.subplots(figsize=(9, 9), tight_layout=True)
for k, color in enumerate(plt.cm.tab10.colors):
    mask = labels == k
    ax.scatter(codes[mask, 0], codes[mask, 1],
               c=np.array(color).reshape(1, -1), s=10, alpha=0.5, label=str(k))
    # errorbar's positional order is (x, y, yerr, xerr); pass the errors by
    # keyword so the spread of the first latent dimension lands on the x axis
    # and the spread of the second on the y axis.
    ax.errorbar(codes[mask, 0], codes[mask, 1],
                xerr=np.exp(codes[mask, n_latent]),
                yerr=np.exp(codes[mask, n_latent + 1]),
                fmt='none', ecolor=color, alpha=0.5)
ax.set_xlabel('$z_1$')
ax.set_ylabel('$z_2$')
ax.legend();

In [None]:
# Decode a regular M x M grid of latent codes into one large mosaic and
# overlay a density contour of the encoded test set.
M = 30
z_lim = 4.0  # the grid covers [-z_lim, z_lim] in both latent dimensions
z_plot = np.linspace(-z_lim, z_lim, num=M)
big_imag = np.zeros(shape=(28*M, 28*M))
sig = torch.nn.Sigmoid()
with torch.no_grad():  # inference only
    for i in range(M):
        for j in range(M):
            # Row 0 of the mosaic is drawn at the top, so it takes the
            # largest value of the second latent coordinate.
            z = torch.tensor(np.array([z_plot[j], z_plot[M-1-i]]), dtype=torch.float32)
            xhat = sig(mi_red_neuronal.decode(z)).numpy()
            big_imag[i*28:(i+1)*28, j*28:(j+1)*28] = xhat[0, 0]

fig, ax = plt.subplots(figsize=(9, 9), tight_layout=True)
ax.matshow(big_imag, vmin=0.0, vmax=1.0, cmap=plt.cm.gray,
           extent=[-z_lim, z_lim, -z_lim, z_lim])
H, xedge, yedge = np.histogram2d(codes[:, 0], codes[:, 1], bins=M,
                                 range=[[-z_lim, z_lim], [-z_lim, z_lim]])
# Each count in H belongs to a bin, so it is placed at the bin centre.
Z_plot1, Z_plot2 = np.meshgrid(0.5*(xedge[:-1] + xedge[1:]),
                               0.5*(yedge[:-1] + yedge[1:]))
# histogram2d indexes H as [x, y] while contour expects [row=y, col=x].
ax.contour(Z_plot1, Z_plot2, H.T, linewidths=3, levels=[5], cmap=plt.cm.Reds);

https://github.com/RuiShu/vae-experiments/blob/master/modality/README.md