In [None]:
%%HTML
<!-- Improve readability when the notebook is shown on a projector -->
<style>
.rendered_html {font-size: 1.2em; line-height: 150%;}
div.prompt {min-width: 0ex; padding: 0px;}
.container {width:95% !important;}
</style>

In [None]:
%matplotlib notebook
#%autosave 0
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import torch
import pyro
print(pyro.__version__)

# Non-parametric Bayesian Methods

Before training a neural network we select the number of layers and the number of neurons per layer

> The structure of the neural network is fixed

In contrast non-parametric models have no fixed structure

> Non-parametric models automatically infer a model size from the complexity of the data



# Gaussian Process (GP)

A GP is a distribution over functions $f: \mathcal{X} \rightarrow \mathbb{R}$

## GP with pyro

Let's start by creating some synthetic data for regression

In [None]:
# Synthetic 1-D regression data
se = 0.1  # scale of the Gaussian noise added to the targets
np.random.seed(0)


def f(x):
    """Noise-free target function: x * sin(10 x)."""
    return x * np.sin(10 * x)


# 20 evenly spaced inputs on [0, 1]; removing indices 9..13 leaves
# 15 training points with a gap in the middle of the interval
x = np.delete(np.linspace(0, 1, num=20), slice(9, 14))
y = f(x) + se * np.random.randn(len(x))

fig, ax = plt.subplots(figsize=(7, 3), tight_layout=True)
ax.scatter(x, y);

# float32 torch versions of the training data, plus a 200-point test grid
# on [-0.1, 1.1] that extends slightly beyond the training range
x_torch = torch.from_numpy(x.astype(np.float32))
y_torch = torch.from_numpy(y.astype(np.float32))
x_test = torch.from_numpy(np.linspace(-0.1, 1.1, num=200).astype(np.float32))

We will use `pyro.contrib.gp` to implement our first GP

Let's start by creating a kernel from `gp.kernels`

We will use a Radial Basis Function (RBF) aka Squared Exponential aka Gaussian kernel as our covariance

We can specify the initial value of the variance and the lengthscale

In [None]:
import pyro.contrib.gp as gp

pyro.enable_validation(True)
pyro.set_rng_seed(0)

# RBF kernel over 1-D inputs with its initial variance and lengthscale.
# Both values are written as float literals: torch.tensor(1) would create an
# integer tensor, which is not a valid trainable parameter and is inconsistent
# with the kernel built in the training cell.
K = gp.kernels.RBF(input_dim=1,
                   variance=torch.tensor(1.0),
                   lengthscale=torch.tensor(0.1))

How does this model look before fitting the data?

Let's inspect the prior $\mathcal{N}(0, K)$ on the test data

**Activity:** Increase/decrease the lengthscale and repeat to get a notion of its influence

In [None]:
# Prior covariance on the test grid; a small value is added to the
# diagonal for numerical stability
n_test = len(x_test)
C = K.forward(x_test) + torch.eye(n_test)*1e-4

# Draw 50 functions from the zero-mean multivariate normal prior N(0, C)
prior = pyro.distributions.MultivariateNormal(torch.zeros(n_test),
                                              covariance_matrix=C)
samples = prior.sample(sample_shape=(50,))

# One line per sampled function
fig, ax = plt.subplots(figsize=(6, 3))
x_grid = x_test.detach().numpy()
for draw in samples.detach().numpy():
    ax.plot(x_grid, draw, linestyle='-', c='tab:blue', alpha=0.5)

Then we create a model from `gp.models`

> In this case we pick a model for regression `GPRegression`

This model expects the train data, the kernel and the initial value of the noise variance

Then we select an optimizer and a cost function

> We will use Adam and the Trace_ELBO, respectively

The training is very similar to how we train neural networks in pytorch

> The only remarkable difference is that the cost function expects the `model` and `guide` from our GP

In [None]:
pyro.clear_param_store()

N_EPOCHS = 2000    # number of optimisation steps
PLOT_EVERY = 100   # refresh the live figure every PLOT_EVERY steps

# Kernel
K = gp.kernels.RBF(input_dim=1, 
                   variance=torch.tensor(1.0), 
                   lengthscale=torch.tensor(0.1))
# Model: GP regression on the training data with an initial noise variance
gpr_model = gp.models.GPRegression(x_torch, y_torch, 
                                   kernel=K, 
                                   noise=torch.tensor(2.))
# Optimizer
optimizer = torch.optim.Adam(gpr_model.parameters(), lr=1e-2)
# Criterion
criterion = pyro.infer.Trace_ELBO().differentiable_loss

# We train the model and visualize the results on the fly:
# left panel = data and current prediction, right panel = loss curve
fig, ax = plt.subplots(1, 2, figsize=(7, 3), tight_layout=True)
line_loss = ax[1].plot([], [])
ax[0].scatter(x, y)
epoch_loss = np.zeros(shape=(N_EPOCHS,))
x_plot = x_test.detach().numpy()

for k in tqdm_notebook(range(N_EPOCHS)):
    optimizer.zero_grad()
    loss = criterion(gpr_model.model, gpr_model.guide)
    loss.backward()
    optimizer.step()
    epoch_loss[k] = loss.item()

    # Also refresh on the very last step so the figure shows the final model
    if k % PLOT_EVERY == 0 or k == N_EPOCHS - 1:
        # Prediction is only used for plotting: no autograd graph needed
        with torch.no_grad():
            mu, cov = gpr_model.forward(x_test, full_cov=True, noiseless=False)
        mu = mu.numpy()
        sd = cov.diag().sqrt().numpy()
        ax[0].cla()
        ax[0].scatter(x, y, c='k')
        ax[0].plot(x_plot, mu)
        ax[0].fill_between(x_plot, mu-sd, mu+sd, alpha=0.5)
        # k + 1 losses have been recorded so far (indices 0..k)
        line_loss[0].set_xdata(np.arange(k + 1))
        line_loss[0].set_ydata(epoch_loss[:k + 1])
        ax[1].relim()
        ax[1].autoscale_view()
        fig.canvas.draw()

The learned parameters are

In [None]:
# Show each learned hyperparameter as a (label, scalar value) pair
learned_params = (
    ("RBF variance:", gpr_model.kernel.variance),
    ("RBF length scale:", gpr_model.kernel.lengthscale),
    ("Noise variance:", gpr_model.noise),
)
for label, param in learned_params:
    display(label, param.item())

We can also sample from the posterior

In [None]:
# Noise-free posterior mean and full covariance on the test grid
mu, Sigma = gpr_model.forward(x_test, full_cov=True, noiseless=True)
# We sum a small value to the diagonal for numerical stability
Sigma = Sigma + torch.eye(len(x_test))*1e-5

# Then we sample 50 functions from the multivariate normal posterior
posterior = pyro.distributions.MultivariateNormal(mu, covariance_matrix=Sigma)
samples = posterior.sample(sample_shape=(50,))

# Sampled functions in blue, training points drawn on top in black
fig, ax = plt.subplots(figsize=(6, 3))
x_grid = x_test.detach().numpy()
for draw in samples.detach().numpy():
    ax.plot(x_grid, draw, linestyle='-', c='tab:blue', alpha=0.25)
ax.scatter(x, y, c='k', zorder=100);

# Self-study

- Mackay chapter 45
- Barber chapter 19
- [Zoubin Ghahramani tutorial](http://mlg.eng.cam.ac.uk/zoubin/talks/uai05tutorial-b.pdf)

http://pyro.ai/examples/gp.html

## Link dump 

https://www.inference.vc/maximum-likelihood-for-representation-learning-2/

https://www.reddit.com/r/MachineLearning/comments/9g1rxs/d_how_is_the_log_marginal_likelihood_of/

https://colinraffel.com/blog/gans-and-divergence-minimization.html

https://www.inference.vc/maximum-likelihood-for-representation-learning-2/

https://medium.com/@jonathan_hui/gan-why-it-is-so-hard-to-train-generative-advisory-networks-819a86b3750b

https://pyro.ai/, https://pyro.ai/examples/intro_part_i.html

https://www.tuananhle.co.uk/notes/reverse-forward-kl.html

https://blog.evjang.com/2016/08/variational-bayes.html

https://dibyaghosh.com/blog/probability/kldivergence.html

https://wiseodd.github.io/techblog/2016/12/21/forward-reverse-kl/

http://willwolf.io/2018/03/31/gaussian-algebra-to-gaussian-processes-part-1/

http://andymiller.github.io/2016/11/23/vb.html

http://mlg.eng.cam.ac.uk/zoubin/talks/uai05tutorial-b.pdf