# Bayesian Inference Modelling
## Goal: estimation of both evidence and posterior in one go.

# 1. Design the Gaussian process surrogate model.
- We use a zero-mean GP with an RBF kernel as the surrogate (vanilla BQ).
- The hyperparameters are optimised by type-II maximum likelihood estimation using the BoTorch optimiser (L-BFGS-B).

In [1]:
import torch
import time
import gpytorch
from gpytorch.kernels import ScaleKernel, RBFKernel
from gpytorch.means import ZeroMean
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch.constraints import Interval
from botorch.models import SingleTaskGP
from botorch.fit import fit_gpytorch_model

def set_model(X, Y):
    """Build the GP surrogate: zero mean, scaled RBF kernel, fixed tiny noise.

    Args:
        X: training inputs, handed to ``SingleTaskGP`` unchanged.
        Y: training targets; flattened and turned into a single column.

    Returns:
        A ``SingleTaskGP`` whose likelihood noise is initialised to 1e-10 and
        whose raw noise parameter has ``requires_grad`` switched off. No
        hyperparameter fitting is performed here.
    """
    # Gaussian likelihood with an Interval(1e-12, 1e-9) constraint registered
    # on its noise parameter.
    noise_model = GaussianLikelihood()
    noise_model.noise_covar.register_constraint("raw_noise", Interval(1e-12, 1e-9))

    # SingleTaskGP is given the targets as an (n, 1) column.
    targets = Y.view(-1).unsqueeze(-1)
    gp = SingleTaskGP(
        X,
        targets,
        likelihood=noise_model,
        mean_module=ZeroMean(),
        covar_module=ScaleKernel(RBFKernel()),
    )

    # Pin the noise to a fixed value and exclude it from gradient updates.
    gp.initialize(**{'likelihood.noise_covar.noise': torch.tensor(1e-10)})
    gp.likelihood.raw_noise.requires_grad = False
    return gp

def optimise_model(model):
    """Fit the GP hyperparameters against the exact marginal log likelihood.

    Args:
        model: GP model exposing a ``likelihood`` attribute.

    Returns:
        The ``model`` object that was passed in, after the fitting call.
    """
    fit_gpytorch_model(ExactMarginalLogLikelihood(model.likelihood, model))
    return model

def set_and_opt_gp(X, Y):
    """Construct the GP surrogate for ``(X, Y)`` and fit its hyperparameters.

    Args:
        X: training inputs.
        Y: training targets.

    Returns:
        The model produced by ``set_model`` after ``optimise_model`` has run.
    """
    return optimise_model(set_model(X, Y))

# 2. Set up the problems we wish to solve
- true_likelihood: a likelihood modelled as a Gaussian mixture. We wish to estimate this function using only queries to it.
- prior: a unimodal multivariate normal distribution. mean: mu_pi, covariance matrix: cov_pi
- true evidence: 1

In [2]:
from SOBER._utils import TensorManager
from SOBER.BASQ._basq import BASQ
from BASQ.experiment.gmm import GMM
#import warnings
# tm supplies the device and dtype used for the tensors created below.
tm = TensorManager()
#warnings.filterwarnings("ignore")


num_dim = 10  # Number of dimensions of the true likelihood to be estimated
mu_pi = torch.zeros(num_dim).to(tm.device, tm.dtype)  # mean vector of the Gaussian prior (all zeros)
cov_pi = 2 * torch.eye(num_dim).to(tm.device, tm.dtype)  # covariance matrix of the Gaussian prior (2 * identity)

from SOBER._prior import Gaussian
prior = Gaussian(mu_pi, cov_pi)  # Gaussian prior built from mu_pi and cov_pi
true_likelihood = GMM(num_dim, mu_pi, cov_pi, tm.device)  # true likelihood to be estimated

# 3. Set up experimental condition
We set up the experimental conditions.
- initial dataset: (X, Y) = (train_x, train_y). X is sampled from the prior and Y is the true likelihood evaluated at X.

In [3]:
# Experimental settings used by the run below.
n_init = 2           # number of initial points sampled from the prior
n_iterations = 10    # number of iterations (one batch is queried per iteration)
n_cand = 20000       # number of candidates (passed to BASQ)
n_nys = 500          # number of Nyström samples (passed to BASQ)
n_batch = 100        # batch size (points selected per iteration)

# 4. Run!
### 4.1 Metric for posterior estimation
- the KL divergence between true and estimated posterior.
- posterior = E[GP-modelled-likelihood] * prior / marginal-likelihood

### 4.2 Metric for integral estimation
- logarithmic mean absolute error between true and estimated evidence.
- logMAE = (Z_estimated - Z_true).abs().log()

In [4]:
Z_true = 1                             # true integral (evidence)

# Seed before every random draw so the whole run is reproducible. The test
# set gets its own seed; the seed is then reset to 0 immediately before the
# initial design is drawn, exactly where the original seeding happened.
# NOTE(review): assumes prior.sample draws from torch's global RNG - confirm.
torch.manual_seed(1)
x_test = prior.sample(10000)           # test data for evaluating posterior using KL divergence

torch.manual_seed(0)                   # fix random seed for reproducibility
X = prior.sample(n_init)               # initial dataset X
Y = true_likelihood(X).to(tm.dtype)    # initial observations Y
basq = BASQ(n_cand, n_nys, prior)      # set up BASQ instance
model = set_and_opt_gp(X, Y)           # set up the GP surrogate model

for ith_round in range(n_iterations):
    # Time only the batch selection: this is the overhead of the method.
    tik = time.monotonic()
    X_batch, _ = basq.batch_uncertainty_sampling(model, n_batch)  # run BASQ to select n_batch points
    tok = time.monotonic()
    overhead = tok - tik               # overhead of batch query

    # Parallel query to the true likelihood; cast like the initial Y so that
    # both tensors share a dtype before concatenation.
    Y_batch = true_likelihood(X_batch).to(tm.dtype)
    X = torch.cat([X, X_batch])        # concatenate the observations for X
    Y = torch.cat([Y, Y_batch])        # concatenate the observations for Y

    # Evaluation for integral
    model = set_and_opt_gp(X, Y)       # retrain GP model on all observations so far
    integral_estimated = basq.quadrature(model, 500)  # integral estimation
    # NOTE: despite its name, logMAE holds the absolute error; the logarithm
    # is applied only when printing below.
    logMAE = (Z_true - integral_estimated).abs()
    # EZ, VZ = basq.full_quadrature(model, 500)       # You can estimate integral variance (but takes more time)

    # Evaluation for the posterior
    KL = basq.KLdivergence(Z_true, x_test, true_likelihood, model)  # compute the KL divergence
    print('Iter %d - overhead: %.3f [s]  logMAE of Integral: %.3f   logKL of posterior: %.3f' % (
        ith_round, overhead, logMAE.log().item(), KL.log().item()
    ))

Iter 0 - overhead: 0.612 [s]  logMAE of Integral: -2.464   logKL of posterior: -8.234
Iter 1 - overhead: 0.683 [s]  logMAE of Integral: -4.188   logKL of posterior: -9.943
Iter 2 - overhead: 1.085 [s]  logMAE of Integral: -3.324   logKL of posterior: -9.166
Iter 3 - overhead: 1.186 [s]  logMAE of Integral: -4.013   logKL of posterior: -11.666
Iter 4 - overhead: 1.130 [s]  logMAE of Integral: -2.723   logKL of posterior: -8.678
Iter 5 - overhead: 1.103 [s]  logMAE of Integral: -3.592   logKL of posterior: -11.177
Iter 6 - overhead: 1.156 [s]  logMAE of Integral: -4.558   logKL of posterior: -9.773
Iter 7 - overhead: 1.221 [s]  logMAE of Integral: -4.137   logKL of posterior: -11.323
Iter 8 - overhead: 1.391 [s]  logMAE of Integral: -4.951   logKL of posterior: -10.951
Iter 9 - overhead: 1.680 [s]  logMAE of Integral: -3.583   logKL of posterior: -9.448
