In [None]:
%%HTML
<!-- Improve readability when the notebook is shown on a projector -->
<style>
/* Larger text with more generous line spacing */
.rendered_html {font-size: 1.2em; line-height: 150%;}
/* Remove the minimum width and the padding of the prompt divs */
div.prompt {min-width: 0ex; padding: 0px;}
/* Let the container take 95% of the available width */
.container {width:95% !important;}
</style>

In [None]:
%matplotlib notebook
%autosave 0
import numpy as np
import matplotlib.pyplot as plt
import torch
#import pyro

# Latent Variable Models (LVM)


Let's say we want to model a dataset $X = (x_1, x_2, \ldots, x_N)$ with $x_i \in \mathbb{R}^D$ 

> We are looking for $p(x)$

Each sample has D attributes

> These are the **observed variables** (visible space)

To model the data we have to propose dependency relationships between variables

> Modeling correlation is difficult

One alternative is to assume that what we observe is correlated due to *hidden causes*

> These are the **latent variables** (hidden space)

Models with latent variables are called **Latent Variable Models** (LVM)

Then we get the marginal using

$$
\begin{align}
p(x) &= \int_z p(x, z) \,dz \nonumber \\
&= \int_z p(x|z) p(z) \,dz \nonumber
\end{align}
$$

Did we gain anything? 

> The integral can be hard to solve (in some cases it is tractable)

The answer is YES

> We can propose simple $p(x|z)$ and $p(z)$ and get complex $p(x)$



# Probabilistic Principal Component Analysis (PPCA)


## Classical PCA

PCA is an algorithm to reduce the dimensionality of continuous data

Let's say we have $X = (x_1, x_2, \ldots, x_N)$ with $x_i \in \mathbb{R}^D$

In classical PCA we 

1. Compute the covariance matrix $C = \frac{1}{N} X^T X$ (assuming $X$ has been centered)
1. Solve the eigenvalue problem $(C - \lambda I)W = 0$

This comes from 

$$
\max_W \text{tr}\left(W^T C W\right), \text{s.t.} ~ W^T W = I
$$

> PCA finds an **orthogonal transformation** $W$ that **maximizes the variance** of the projected data $XW$

Then we can reduce the amount of columns of $W$ to reduce the dimensionality of $XW$


In [None]:
import torchvision
from torch.utils.data import DataLoader, SubsetRandomSampler

# MNIST train/test splits, downloaded to ./dataset on first use and
# converted to tensors on access.
to_tensor = torchvision.transforms.ToTensor()
mnist_train_data = torchvision.datasets.MNIST(root='dataset', train=True,
                                              download=True, transform=to_tensor)
mnist_test_data = torchvision.datasets.MNIST(root='dataset', train=False,
                                             download=True, transform=to_tensor)

# Work with the first 10000 training images only (use len(mnist_train_data)
# instead to take the full set), shuffled reproducibly and split 70/30.
np.random.seed(0)
idx = list(range(10000))
np.random.shuffle(idx)
split = int(0.7*len(idx))
train_idx, valid_idx = idx[:split], idx[split:]

# Both loaders read from the training set; the samplers keep the two index
# subsets disjoint.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
train_loader = DataLoader(mnist_train_data, batch_size=128, sampler=train_sampler)
valid_loader = DataLoader(mnist_train_data, batch_size=256, sampler=valid_sampler)

## Probabilistic interpretation

We can give a probabilistic interpretation to PCA as an LVM

An observed sample $x_i \in \mathbb{R}^D$ is modeled as 

$$
x_i = W z_i + B + \epsilon
$$

> Observed variable is related to the latent variable via a **linear mapping**

where 
- $B \in \mathbb{R}^D$ is the mean of $X$
- $W \in \mathbb{R}^{D\times K}$ is a linear transformation matrix
- $\epsilon$ is noise

> $z_i \in  \mathbb{R}^K$ is a continuous latent variable with $K<D$

#### Assumption: The noise is independent and Gaussian distributed with variance $\sigma^2$

Then

$$
p(x_i | z_i) = \mathcal{N}(B + W z_i, I \sigma^2)
$$

Note: In general factor analysis the noise has a diagonal covariance

#### Assumption: The latent variable has a standard Gaussian prior

$$
p(z_i) = \mathcal{N}(0, I)
$$


#### Marginal likelihood

The Gaussian is conjugate to itself (convolution of Gaussians is Gaussian)
$$
\begin{align}
p(x) &= \int p(x|z) p(z) \,dz \nonumber \\
&= \mathcal{N}(x|B, W W^T + I\sigma^2 ) \nonumber
\end{align}
$$

> We have parametrized a normal with full covariance from two normals with diagonal covariance

The parameters are calculated from 
- $\mathbb{E}[x] = W\mathbb{E}[z] + B + \mathbb{E}[\epsilon]$
- $\mathbb{E}[(Wz + \epsilon)(Wz + \epsilon)^T] = W \mathbb{E}[zz^T] W^T + \mathbb{E}[\epsilon \epsilon^T]$

#### Posterior

Using Bayes we can obtain the posterior to go from observed to latent

$$
p(z|x) = \mathcal{N}(z|M^{-1}W^T(x-B), \sigma^2 M^{-1} )
$$

where

$$
M = W^T W + I\sigma^2
$$

#### Training

We fit the model to find $W$, $B$ and $\sigma^2$ by maximizing the marginal likelihood

$$
\max \log L(W, B, \sigma^2) = \sum_{i=1}^N \log p(x_i)
$$

From here we can do derivatives and obtain closed form solutions of the parameters

> Solution for $W$ is equivalent to conventional PCA ($\sigma^2 \to 0$)

> Now we have estimated $\sigma$, we have error-bars for $z$ and the model is generative


## Self-study
- Barber, Chapter 21 and Murphy, Chapter 12

In [None]:
import pymc3 as pm
import theano.tensor as tt
from mpl_toolkits.mplot3d import Axes3D  # imported for the '3d' projection used below
from sklearn.decomposition import PCA

N = 1000  # number of samples
M = 3     # dimensions of the data
D = 2     # dimensions of the projection

# Draw N correlated Gaussian samples: a random square matrix gives a
# positive semi-definite covariance through C^T C.
np.random.seed(10)
C = np.random.randn(M, M)
C = C.T @ C
X_raw = np.random.multivariate_normal(np.zeros(M), C, size=N)

# Standardize every column: remove the mean first, then divide by the
# standard deviation of the centered data.
X_centered = X_raw - X_raw.mean(axis=0)
X = X_centered / X_centered.std(axis=0)

# Classical PCA projection onto D components, used later as a reference.
pca = PCA(n_components=D, whiten=False)
R = pca.fit_transform(X)

# Left: the 3-D data cloud. Right: its 2-D PCA projection.
fig = plt.figure(figsize=(7, 3))
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], s=2)
ax = fig.add_subplot(1, 2, 2)
ax.scatter(R[:, 0], R[:, 1], s=1)
_ = ax.set_title('PCA projection')


In [None]:
# Probabilistic PCA written as a PyMC3 model and fitted with ADVI.
# N, M and D are the sample count, data dimension and latent dimension
# defined together with the synthetic data X.
with pm.Model() as PPCA:
    # Noise standard deviation, a single value shared by every observed entry
    s = pm.HalfCauchy('s', beta=5, shape=[1,])
    # Linear map from latent to observed space, shape (D, M), standard normal prior
    w = pm.Normal('w', mu=tt.zeros([D, M]), sd=tt.ones([D, M]), shape=[D, M])
    # One D-dimensional latent vector per sample, standard normal prior
    z = pm.Normal('z', mu=tt.zeros([N, D]), sd=tt.ones([N, D]), shape=[N, D])
    # Likelihood: each observation is Gaussian around z.dot(w) with standard deviation s
    x = pm.Normal('x', mu=z.dot(w), sd=s*tt.ones([N, M]), shape=[N, M], observed=X)  
    # Variational fit: 2000 iterations, Adam optimizer with learning rate 0.1
    inference = pm.ADVI()
    approx = pm.fit(n=2000, method=inference, obj_optimizer=pm.adam(learning_rate=1e-1))
# The bare string literal below is a disabled snippet that would plot the
# optimization history; as a string it is never executed.
"""
_ = plt.plot(-inference.hist)
plt.ylabel('Evidence lower bound (ELBO)')
plt.xlabel('Iteration')
plt.grid()
"""
with PPCA:
    # 1000 draws from the fitted approximation, then 100 posterior predictive samples
    trace = approx.sample(draws=1000)
    # NOTE(review): sample_ppc and the `varnames` keyword were renamed in later
    # PyMC3 releases (sample_posterior_predictive / var_names) - confirm against
    # the installed version.
    ppc = pm.sample_ppc(trace=trace, samples=100)
_ = pm.traceplot(trace=trace, varnames=['w', 's'])


In [None]:
# Average the sampled mapping matrix and noise standard deviation over the draws.
W_avg = np.mean(trace['w'], axis=0)
s_avg = np.mean(trace['s'], axis=0)
print("Average W")
print(W_avg)
# s_avg is a one-element array ('s' is declared with shape [1]). Extract the
# scalar explicitly: formatting the array with %f relies on implicit
# array-to-float conversion, which recent NumPy releases deprecate.
print("Average sigma: %f" % s_avg.item())

# First posterior predictive draw of the observed variable
x_reconstructed = ppc['x'][0, :, :] 

# Left: the input data. Right: data sampled from the fitted model, drawn with
# the same axis bounds so both clouds can be compared directly.
fig = plt.figure(figsize=(8, 3))
ax = fig.add_subplot(121, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], s=2)
ax.set_title('Input data')
bx, by, bz = ax.get_xbound(), ax.get_ybound(), ax.get_zbound()
ax = fig.add_subplot(122, projection='3d')
ax.set_title("Sampled data")
ax.scatter(x_reconstructed[:, 0], x_reconstructed[:, 1], x_reconstructed[:, 2], s=1, alpha=0.5)
# Not used in this cell; kept in case a later cell relies on it.
t = np.linspace(-4, 4, num=100)
ax.set_xbound(bx)
ax.set_ybound(by)
ax.set_zbound(bz)

# Per-sample mean and spread of the latent variable across the draws
z_trace_avg = np.mean(trace['z'], axis=0)
z_trace_std = np.std(trace['z'], axis=0)
# Variance of each latent axis along axis 1 (the sample axis, given that z is
# declared with shape [N, D]), averaged over the draws
z_trace_var = np.mean(np.var(trace['z'], axis=1), axis=0)
# Sort the new axis in decreasing order of variance
axis_order = np.argsort(z_trace_var)[::-1]

In [None]:
# Compare three estimates of the latent coordinates side by side.
fig = plt.figure(figsize=(8, 3), tight_layout=True)

# 1) Mean of the sampled z with one-standard-deviation error bars.
# The error arrays are passed by keyword: positionally errorbar expects
# (x, y, yerr, xerr), which would attach each latent axis' spread to the
# wrong direction.
ax = fig.add_subplot(1, 3, 1)
ax.errorbar(z_trace_avg[:, axis_order[0]], z_trace_avg[:, axis_order[1]],
            xerr=z_trace_std[:, axis_order[0]], yerr=z_trace_std[:, axis_order[1]],
            fmt='none', alpha=0.5)
ax.set_title('Average z from trace')

# 2) Latent coordinates computed in closed form from the averaged parameters:
# Z = X (W^T W + sigma^2 I)^-1 W^T, with W_avg of shape (D, M).
Z_test = np.dot(X, np.dot(np.linalg.inv(np.dot(W_avg.T, W_avg) + np.eye(M)*s_avg**2 ), W_avg.T))
ax = fig.add_subplot(1, 3, 2)
ax.scatter(Z_test[:, axis_order[0]], Z_test[:, axis_order[1]], s=1, alpha=0.5)
ax.set_title('Average z by hand')

# 3) Projection obtained earlier with scikit-learn's PCA.
ax = fig.add_subplot(1, 3, 3)
ax.scatter(R[:, 0], R[:, 1], s=1, alpha=0.5)
ax.set_title('z from sklearn PCA')
ax.invert_xaxis()
ax.invert_yaxis()
# SKLEARN gives you the new axis already sorted by variance, also axis might appear rotated

# Self-study

**Gaussian Mixture Model:** Model with categorical latent variables


# Autoencoders

# Variational Autoencoder

Our generative model with latent variable $z$ and observed variable $x$ is described by the joint density

$$
p_\theta(x, z) = p_\theta(x|z) p(z)
$$

Typically, we find the parameters $\theta$ of our generator using a maximum likelihood approach over the **evidence** or **marginal likelihood** $p_\theta (x)$

$$
\max_\theta p_\theta (x) = \int p_\theta(x|z) p(z) \,dz
$$

but the integral is in general intractable.

Instead we will optimize a lower bound of the evidence

We define a simpler posterior $q_\phi(z|x)$ and write the divergence between this and the generator posterior

$$
p_\theta(z|x) = \frac{p_\theta(x|z) p(z)}{p_\theta(x)}, 
$$
where $p(z)$ is a prior specified by the user


$$
\begin{align}
D_\text{KL}\left[q_\phi(z|x) || p_\theta(z|x)\right] &=
\mathbb{E}_{z\sim q_\phi(z|x)} \left [ \log \frac{q_\phi(z|x)}{p_\theta(z|x)}\right ]\nonumber \\
&= \mathbb{E}_{z\sim q_\phi(z|x)} \left [ \log \frac{p_\theta(x)}{p_\theta(x|z)}\frac{q_\phi(z|x)}{p(z)} \right ] \nonumber \\
&= \log p_\theta(x) + \mathbb{E}_{z\sim q_\phi(z|x)} \left [ - \log p_\theta(x|z) + \log \frac{q_\phi(z|x)}{p(z)} \right ] \nonumber \\
&= \log p_\theta(x) - \mathbb{E}_{z\sim q_\phi(z|x)} \left [\log p_\theta(x|z)\right ] + D_\text{KL}\left[q_\phi(z|x) || p(z)\right] \nonumber \\
&= \log p_\theta(x) - \mathcal{L}_{\theta, \phi} (x) \nonumber
\end{align}
$$




From here we can do
$$
\begin{align}
\log p_\theta(x) &= \mathcal{L}_{\theta, \phi} (x) + D_\text{KL}\left[q_\phi(z|x) || p_\theta(z|x)\right] \nonumber \\
&\geq \mathcal{L}_{\theta, \phi} (x)
\end{align}
$$

because the KL divergence is non-negative

The term

$$
\mathcal{L}_{\theta, \phi} (x) = \mathbb{E}_{z\sim q_\phi(z|x)} \left [\log p_\theta(x|z)\right ] - D_\text{KL}\left[q_\phi(z|x) || p(z)\right],
$$

is known as the *Evidence Lower Bound* (ELBO)


The ELBO can alternatively be obtained by applying Jensen's inequality to

$$
\log p_\theta(x) = \log \mathbb{E}_{z\sim p(z)}[p_\theta(x|z)]
$$

In [None]:
# Train the network by gradient descent with the Adam optimizer.
# Each epoch runs one pass over train_loader (with parameter updates) and one
# pass over valid_loader (evaluation only), logs the per-sample reconstruction
# and KL terms, and redraws the live plots.
mi_red_neuronal = RedEjemplo()
optimizer = torch.optim.Adam(mi_red_neuronal.parameters(), lr=1e-3)

history1 = hl.History()
canvas1 = hl.Canvas()
# GPU: to train on a GPU, move the model with
#mi_red_neuronal = mi_red_neuronal.cuda()
#for epoch in tqdm_notebook(range(10)):
for epoch in range(100):
    # ---- Training pass ----
    epoch_loss = 0.0
    KL_loss = 0.0
    n_samples = 0
    for image, label in train_loader:
        # GPU: the images have to be moved as well
        #image = image.cuda()
        #label = label.cuda()
        optimizer.zero_grad()
        prediction, mu, logsigma = mi_red_neuronal.forward(image)
        rec, KLdiv = ELBO(image, prediction, mu, logsigma)
        # Quantity being minimized: reconstruction term plus KL term
        loss = rec + KLdiv
        loss.backward()
        optimizer.step()
        epoch_loss += rec.item()
        KL_loss += KLdiv.item()
        n_samples += image.size(0)
    # Normalize by the number of samples actually seen. The product
    # len(loader) * batch_size over-counts whenever the last batch is partial,
    # and by a different amount for each loader.
    den = max(n_samples, 1)
    history1.log(epoch, loss_train=epoch_loss/den, kl_train=KL_loss/den)

    # ---- Validation pass ----
    epoch_loss = 0.0
    KL_loss = 0.0
    n_samples = 0
    # No parameter update happens here, so skip autograd bookkeeping
    with torch.no_grad():
        for image, label in valid_loader:
            prediction, mu, logsigma = mi_red_neuronal.forward(image)
            rec, KLdiv = ELBO(image, prediction, mu, logsigma)
            epoch_loss += rec.item()
            KL_loss += KLdiv.item()
            n_samples += image.size(0)
    den = max(n_samples, 1)
    history1.log(epoch, loss_valid=epoch_loss/den, kl_valid=KL_loss/den)

    with canvas1: # So that they render together
        canvas1.draw_plot([history1["loss_train"], history1["loss_valid"]],
                          labels=["Train loss", "Validation loss"])
        canvas1.draw_plot([history1["kl_train"], history1["kl_valid"]],
                          labels=["Train KL", "Validation KL"])
        #canvas1.draw_plot([history1["loss_valid"]])