In [None]:
%%HTML
<!-- Improve readability when the notebook is shown on a projector -->
<style>
.rendered_html {font-size: 1.2em; line-height: 150%;}
div.prompt {min-width: 0ex; padding: 0px;}
.container {width:95% !important;}
</style>

In [None]:
%matplotlib notebook
%autosave 0
import numpy as np
import matplotlib.pyplot as plt
import torch
import pyro

# Latent Variable Models


Let's say we want to model a dataset $X = (x_1, x_2, \ldots, x_N)$ with $x_i \in \mathbb{R}^D$ 

> We are looking for $p(x)$

Each sample has D attributes

> These are the **observed variables** (visible space)

To model the data we have to propose dependency relationships between variables

> Modeling correlation is difficult

One alternative is to assume that what we observe is correlated due to *hidden causes*

> These are the **latent variables** (hidden space)

Models with latent variables are called Latent Variable Models (LVM)

Then we get the marginal using

$$
\begin{align}
p(x) &= \int_z p(x, z) \,dz \nonumber \\
&= \int_z p(x|z) p(z) \,dz \nonumber
\end{align}
$$

Did we gain anything? 

> The integral can be hard to solve (in some cases it is tractable)

The answer is YES

> We can propose simple $p(x|z)$ and $p(z)$ and get complex $p(x)$



# Probabilistic Principal Component Analysis (PPCA)


## Classical PCA

PCA is an algorithm to reduce the dimensionality of continuous data

Let's say we have $X = (x_1, x_2, \ldots, x_N)$ with $x_i \in \mathbb{R}^D$

In classical PCA we 

1. Compute covariance matrix $C = \frac{1}{N} X^T X$
1. Solve the eigenvalue problem $(C - \lambda I)W = 0$

This comes from 

$$
\max_W \operatorname{tr}\left(W^T C W\right) \quad \text{s.t.} ~ W^T W = I
$$

> PCA finds an **orthogonal transformation** $W$ that **maximizes the variance** of the projected data $XW$

Then we can reduce the number of columns of $W$ to reduce the dimensionality of $XW$


## Probabilistic interpretation

We can give a probabilistic interpretation to PCA as an LVM

An observed sample $x_i \in \mathbb{R}^D$ is modeled as 

$$
x_i = W z_i + \mu + \epsilon
$$

> Observed variable is related to the latent variable via a **linear mapping**

where 
- $\mu \in \mathbb{R}^D$ is the mean of $X$
- $W \in \mathbb{R}^{D\times K}$ is a linear transformation matrix
- $\epsilon$ is noise

> $z_i \in  \mathbb{R}^K$ is a continuous latent variable with $K<D$

#### Assumption: The noise is independent and gaussian distributed with variance $\sigma^2$

Then

$$
p(x_i | z_i) = \mathcal{N}(\mu + W z_i, I \sigma^2)
$$

Note: In general factor analysis the noise has a diagonal covariance

#### Assumption: The latent variable has a standard gaussian prior

$$
p(z_i) = \mathcal{N}(0, I)
$$


#### Marginal likelihood

The Gaussian is conjugate to itself (the convolution of Gaussians is Gaussian)
$$
\begin{align}
p(x) &= \int p(x|z) p(z) \,dz \nonumber \\
&= \mathcal{N}(x|\mu, W W^T + I\sigma^2 ) \nonumber
\end{align}
$$

> We have parametrized a normal with full covariance from two normals with diagonal covariance

The parameters are calculated from 
- $\mathbb{E}[x] = W\mathbb{E}[z] + \mu + \mathbb{E}[\epsilon]$
- $\mathbb{E}[(Wz + \epsilon)(Wz + \epsilon)^T] = W \mathbb{E}[zz^T] W^T + \mathbb{E}[\epsilon \epsilon^T]$

#### Posterior

Using Bayes we can obtain the posterior to go from observed to latent

$$
p(z|x) = \mathcal{N}(z|M^{-1}W^T(x-\mu), \sigma^{2} M^{-1} )
$$

where

$$
M = W^T W + I\sigma^2
$$

#### Training

We fit the model to find $W$, $\mu$ and $\sigma$ by maximizing the marginal likelihood

$$
\max \log L(W,\mu, \sigma^2) = \sum_{i=1}^N \log p(x_i)
$$

From here we can take derivatives and obtain closed-form solutions for the parameters

> Solution for $W$ is equivalent to conventional PCA ($\sigma^2 \to 0$)

> Now we have estimated $\sigma$, we have errorbars for $z$ and the model is generative


## Self-study
- Barber, Chapter 21 and Murphy, Chapter 12

In [None]:
import pymc3 as pm
from mpl_toolkits.mplot3d import Axes3D
import theano.tensor as tt
from sklearn.decomposition import PCA
# Problem sizes
N = 1000
M = 3  # dimensions of the data
D = 2  # dimensions of the projection

# Draw N correlated Gaussian samples; the covariance is a random
# positive semi-definite matrix built as A^T A
np.random.seed(10)
mixing = np.random.randn(M, M)
C = np.dot(mixing.T, mixing)
X_raw = np.random.multivariate_normal(np.zeros(M), C, size=N)

# Standardise every column to zero mean and unit standard deviation
X_centered = X_raw - X_raw.mean(axis=0)
X = X_centered / X_centered.std(axis=0)

# Left panel: the 3D data cloud. Right panel: its 2D projection by classical PCA
fig = plt.figure(figsize=(7, 3))
ax = fig.add_subplot(121, projection='3d')
ax.scatter(*X.T, s=2)

pca = PCA(n_components=2, whiten=False)
R = pca.fit_transform(X)

ax = fig.add_subplot(122)
ax.scatter(R[:, 0], R[:, 1], s=1)
_ = ax.set_title('PCA projection')


In [None]:
# Probabilistic PCA written as a PyMC3 model: the observed X (N x M) is modelled
# as z.dot(w) plus independent Gaussian noise with a single standard deviation s.
with pm.Model() as PPCA:
    # Noise standard deviation (one value shared by every entry of x)
    s = pm.HalfCauchy('s', beta=5, shape=[1,])
    # Projection matrix of shape [D, M] with a standard normal prior on each entry
    w = pm.Normal('w', mu=tt.zeros([D, M]), sd=tt.ones([D, M]), shape=[D, M])
    # One D-dimensional latent vector per sample, standard normal prior
    z = pm.Normal('z', mu=tt.zeros([N, D]), sd=tt.ones([N, D]), shape=[N, D])
    # Likelihood of the observed data given the latent vectors and the projection
    x = pm.Normal('x', mu=z.dot(w), sd=s*tt.ones([N, M]), shape=[N, M], observed=X)  
    # Fit a variational approximation with ADVI: 2000 iterations of Adam, learning rate 0.1
    inference = pm.ADVI()
    approx = pm.fit(n=2000, method=inference, obj_optimizer=pm.adam(learning_rate=1e-1))
# The bare string below holds a disabled snippet for plotting the ADVI loss history;
# because it is only a string literal, its contents are not run.
"""
_ = plt.plot(-inference.hist)
plt.ylabel('Evidence lower bound (ELBO)')
plt.xlabel('Iteration')
plt.grid()
"""
with PPCA:
    # Draw 1000 samples from the fitted approximation, then 100 posterior predictive samples
    trace = approx.sample(draws=1000)
    # NOTE(review): sample_ppc and the varnames keyword look like older PyMC3 API
    # names — confirm they exist in the installed version
    ppc = pm.sample_ppc(trace=trace, samples=100)
_ = pm.traceplot(trace=trace, varnames=['w', 's'])


In [None]:
# Average the sampled projection matrix and noise level over the draws
W_avg = np.mean(trace['w'], axis=0)
s_avg = np.mean(trace['s'], axis=0)
print("Average W")
print(W_avg)
# 's' was declared with shape [1], so s_avg is a one-element array; pull out the
# scalar explicitly instead of relying on implicit array-to-float conversion
print("Average sigma: %f" % s_avg.item())

# First posterior predictive sample, used as an example of data generated by the model
x_reconstructed = ppc['x'][0, :, :]

fig = plt.figure(figsize=(8, 3))
ax = fig.add_subplot(121, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], s=2)
ax.set_title('Input data')
# Remember the axis bounds of the input data so both panels share the same view
bx, by, bz = ax.get_xbound(), ax.get_ybound(), ax.get_zbound()
ax = fig.add_subplot(122, projection='3d')
ax.set_title("Sampled data")
ax.scatter(x_reconstructed[:, 0], x_reconstructed[:, 1], x_reconstructed[:, 2], s=1, alpha=0.5)
ax.set_xbound(bx)
ax.set_ybound(by)
ax.set_zbound(bz)

# Per-sample mean and standard deviation of the latent vectors across the draws
z_trace_avg = np.mean(trace['z'], axis=0)
z_trace_std = np.std(trace['z'], axis=0)
# Variance of each latent axis across the samples, averaged over the draws
z_trace_var = np.mean(np.var(trace['z'], axis=1), axis=0)
# Sort the new axis in decreasing order of variance
axis_order = np.argsort(z_trace_var)[::-1]

In [None]:
fig = plt.figure(figsize=(8, 3), tight_layout=True)

# Panel 1: latent means from the variational trace with one-standard-deviation error bars.
# errorbar's positional order is (x, y, yerr, xerr); the errors are passed by keyword
# so that each axis receives the spread of the latent dimension plotted on it.
ax = fig.add_subplot(1, 3, 1)
ax.errorbar(z_trace_avg[:, axis_order[0]], z_trace_avg[:, axis_order[1]],
            xerr=z_trace_std[:, axis_order[0]], yerr=z_trace_std[:, axis_order[1]],
            fmt='none', alpha=0.5)
ax.set_title('Average z from trace')

# Panel 2: latent coordinates computed in closed form from the averaged parameters
# (X was already centered when it was generated, so no mean is subtracted here)
projection = np.dot(np.linalg.inv(np.dot(W_avg.T, W_avg) + np.eye(M)*s_avg**2), W_avg.T)
Z_test = np.dot(X, projection)
ax = fig.add_subplot(1, 3, 2)
ax.scatter(Z_test[:, axis_order[0]], Z_test[:, axis_order[1]], s=1, alpha=0.5)
ax.set_title('Average z by hand')

# Panel 3: classical PCA projection from scikit-learn, with both axes flipped for comparison
ax = fig.add_subplot(1, 3, 3)
ax.scatter(R[:, 0], R[:, 1], s=1, alpha=0.5)
ax.set_title('z from sklearn PCA')
ax.invert_xaxis()
ax.invert_yaxis()
# scikit-learn returns the new axes already sorted by variance; they may also appear rotated

# Gaussian Mixture Model

Model with categorical latent variables


FUTURE!

In [None]:
import scipy.stats
n_draws = 1000
# Component indicator: 1 with probability 0.6, 0 otherwise
p = scipy.stats.bernoulli(0.6).rvs(n_draws)
# Candidate draws from each component (scale is the standard deviation)
G1 = scipy.stats.norm(loc=5., scale=2.).rvs(n_draws)    # mean 5, std 2
G2 = scipy.stats.norm(loc=-2., scale=1.5).rvs(n_draws)  # mean -2, std 1.5
# Keep the G1 draw where the indicator is 1 and the G2 draw where it is 0
from_g1 = G1[p == 1]
from_g2 = G2[p == 0]
data = np.concatenate((from_g1, from_g2))  # Gaussian mixture

In [None]:
def model():
    """Two-component, one-dimensional Gaussian mixture with fixed parameters.

    The means (5, -2), standard deviations (2, 1.5) and weights (0.6, 0.4) are
    hard-coded to the values used to generate the synthetic mixture data.

    Returns:
        The value drawn at (or conditioned on) the "obs" sample site.
    """
    # Disabled draft with priors on the mean and the standard deviation:
    #mu_prior = pyro.sample("mean", pyro.distributions.Normal(0, 10))
    #sd_prior = pyro.sample("sigma", pyro.distributions.HalfNormal(5, 10))
    #return pyro.sample("obs", pyro.distributions.Normal(mu_prior, sd_prior))
    mu = torch.tensor([[5.], [-2.]])
    sd = torch.tensor([[2.], [1.5]])
    # Take the log in torch so the logits share the float32 dtype of mu and sd
    pi = torch.tensor([0.6, 0.4]).log()
    # The second argument is a scale (standard deviation), not a variance
    return pyro.sample("obs", pyro.distributions.MixtureOfDiagNormals(mu, sd, pi))
    

def guide():
    """Variational family with independent Normal distributions over "mean" and "sigma".

    Registers four learnable parameters in the Pyro parameter store (mu_loc,
    mu_scale, sd_loc, sd_scale); all but mu_loc are constrained to be positive.
    """
    positive = torch.distributions.constraints.positive
    normal = pyro.distributions.Normal

    # Location and scale of the approximation for the "mean" site
    mu_loc = pyro.param("mu_loc", torch.zeros(1, dtype=torch.float32))
    mu_scale = pyro.param("mu_scale", torch.ones(1, dtype=torch.float32), constraint=positive)
    # Location and scale of the approximation for the "sigma" site
    sd_loc = pyro.param("sd_loc", torch.ones(1, dtype=torch.float32), constraint=positive)
    sd_scale = pyro.param("sd_scale", torch.ones(1, dtype=torch.float32), constraint=positive)

    pyro.sample("mean", normal(mu_loc, mu_scale))
    pyro.sample("sigma", normal(sd_loc, sd_scale))

from pyro.contrib.autoguide import AutoDiagonalNormal
#guide = AutoDiagonalNormal(model)

In [None]:
# Start from an empty parameter store so re-running this cell restarts the optimisation
pyro.clear_param_store()

#data = 5*np.random.randn(1000)
# Condition the model's "obs" site on the synthetic mixture data (as float32 tensors)
data_torch = torch.from_numpy(data.astype('float32'))
conditioned_model = pyro.condition(model, data={"obs": data_torch})

svi = pyro.infer.SVI(model=conditioned_model,
                     guide=guide,
                     optim=pyro.optim.Adam({"lr": 1e-3}),
                     loss=pyro.infer.Trace_ELBO(),
                     num_samples=10)

# a and b are meant to hold per-step parameter snapshots for a later plotting cell;
# the recording lines below are disabled, so both lists stay empty
losses, a, b = [], [], []
num_steps = 2000
for step in range(num_steps):
    #a.append(pyro.param("mu").detach().numpy())
    #b.append(pyro.param("cov").detach().numpy())
    losses.append(svi.step())

# svi.step() returns the loss being minimised, i.e. the negative ELBO
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_title("Negative ELBO")
ax.set_xlabel("step")
ax.set_ylabel("loss");

# Show the learned variational parameters
for name, _value in pyro.get_param_store().items():
    print(name, pyro.param(name))

In [None]:
fig, ax = plt.subplots(figsize=(6, 3))
# Density-normalised histogram of the mixture data
ax.hist(data, bins=30, density=True, alpha=0.75);
# Evaluation grid spanning the observed range of the data
data_min, data_max = np.amin(data), np.amax(data)
x_plot = torch.linspace(data_min, data_max, steps=1000)
#line = ax.plot(x_plot.numpy(), model.pdf(x_plot).detach().numpy(), lw=2)

#ax.plot(x_plot, np.exp(-0.5*(x_plot - 2.15)**2/3.7580**2)/np.sqrt(2.0*np.pi*3.75**2))

#anim = animation.FuncAnimation(fig, update_plot, frames=100, interval=20, 
#                               repeat=True, blit=False)

In [None]:
from matplotlib.patches import Ellipse
fig, ax = plt.subplots()

# Training steps whose recorded state is drawn as a dashed ellipse:
# a[i] gives the centre and b[i] the width/height
snapshot_steps = [0, 100, 1000, -1]
# a and b are filled during training; if recording was disabled or training was
# short, some (or all) of the requested steps do not exist and are skipped
n_recorded = min(len(a), len(b))
if n_recorded == 0:
    print("No parameter snapshots were recorded in a/b; nothing to draw")

for i in snapshot_steps:
    if not -n_recorded <= i < n_recorded:
        continue
    ellipse = Ellipse((a[i][0], a[i][1]),
                      width=b[i][0], height=b[i][1],
                      facecolor='none', ls='--', edgecolor='k')
    ax.add_patch(ellipse)

ax.set_xlim([-5, 5])
ax.set_ylim([-5, 5])