In [95]:
import jax
import jax.nn
import numpy as np
import numpyro
import optax
from numpyro.handlers import block, trace, seed
from jax import random, vmap
import matplotlib.pyplot as plt
from numpyro.infer import Predictive
from typing import Optional

In [3]:
%load_ext autoreload
%autoreload 2

from experiments.src.experiment import *
from experiments.src.data import *
from experiments.src.model import BNNRegressor

In [6]:
# %matplotlib inline
# import matplotlib
# matplotlib.use("nbAgg")  # noqa: E402

# Notebook-wide matplotlib defaults, collected in one mapping and applied in a single call.
_rc_overrides = {
    # Grid: switched on for every axes, drawn as very thin solid lines.
    "axes.grid": True,
    "grid.linestyle": "-",
    "grid.linewidth": 0.1,
    # "grid.color": "0.5",  # (disabled) gray gridlines
    # Text: bold font and enlarged tick labels on both axes.
    "font.weight": "bold",
    "xtick.labelsize": 15,
    "ytick.labelsize": 15,
    # Plotted lines: width 1, black.
    "lines.linewidth": 1,
    "lines.color": "k",
    # Figures written to disk are rendered at 300 dpi.
    "savefig.dpi": 300,
}
plt.rcParams.update(_rc_overrides)

In [175]:
# --- Experiment configuration -------------------------------------------------
D_X = 10  # number of input features (used for both the dataset and the BNN)
BNN_SIZE = [16, 16]  # passed to BNNRegressor as D_H; presumably hidden-layer widths
VI_ITER = 100000  # max_iter handed to the delta-VI experiment

# --- Backend selection ----------------------------------------------------------
DEVICE = "cpu"
numpyro.set_platform(DEVICE)
# numpyro.set_host_device_count(NUM_CHAINS)

## Spurious correlation

We hypothesise that MAP is more effective at finding a sparse solution when encountering correlated covariates, resulting in better generalisation performance than that of the Bayesian model-averaged posterior mean.

To test this, we create a training dataset with two features $x_1, x_2$ with correlation $\rho$, where the response is drawn from $\mathcal{N}(x_1, \sigma^2)$.
In the test dataset, however, the covariates are sampled independently with the same marginals, and the response is still drawn from $\mathcal{N}(x_1, \sigma^2)$.

We generalise this to multiple features too.

With independent Laplace priors on the weights, MAP estimation corresponds to Lasso regression in the linear-model case, which is known to induce sparsity in the solution.
According to our hypothesis, BMA considers models which are less sparse, or picks the wrong feature due to correlation.
This might not be the fault of Bayesian inference per se -- we might have the prior belief that exactly one of the correlated features is explanatory, but any one of them can be -- but it is a simple demonstration that BMA performs poorly under distribution shift.

Note: rho=0.9999, D_X=10, train_size=20, run key=0 works for the identity (id) non-linearity, a single hidden *unit*, and an iid Laplace prior with scale $\sqrt{0.002}$.

In [176]:
class SpuriouslyCorrelatedData(Data):
    """Synthetic regression data: features correlated in train, independent in test.

    Every training feature is the sum of a per-row shared draw ``N(0, rho)`` and an
    independent per-feature draw ``N(0, 1 - rho)``. Each feature therefore has unit
    variance and every pair of features has correlation ``rho``. Test features are
    drawn i.i.d. from ``N(0, 1)``, i.e. the same marginals without the correlation.
    In both splits the response is the first feature plus ``N(0, sigma_obs**2)`` noise.

    Args:
        rho: Pairwise correlation between training features; must lie in ``[0, 1]``.
        sigma_obs: Standard deviation of the observation noise added to the response.
        train_size: Number of training rows.
        test_size: Number of test rows.
        D_X: Number of features.
        seed: Seed for the private NumPy generator. The default ``0`` yields the
            same draws as seeding the global generator with ``np.random.seed(0)``.

    Raises:
        ValueError: If ``rho`` is outside ``[0, 1]`` (the square roots below would
            otherwise silently produce NaN data).
    """

    def __init__(self, rho=0.90, sigma_obs=0.05, train_size=100, test_size=500, D_X=2, seed=0):
        if not 0.0 <= rho <= 1.0:
            raise ValueError(f"rho must be in [0, 1], got {rho!r}")

        # A private legacy generator gives the same stream as np.random.seed(seed)
        # followed by np.random.normal(...), but leaves the process-wide NumPy RNG
        # untouched, so building a dataset no longer reseeds unrelated code.
        rng = np.random.RandomState(seed)

        # NOTE: the order of the draws below determines the generated data; keep it.
        common = rng.normal(scale=np.sqrt(rho), size=(train_size, 1))
        self.X_train = rng.normal(scale=np.sqrt(1. - rho), size=(train_size, D_X)) + common
        # Alternative target (mean of all features + noise), currently unused:
        # self.Y_train = np.mean(self.X_train, axis=1)[:, np.newaxis]
        # y = x_1 + noise. Indexing with [0] keeps the column axis and returns a copy,
        # so the response never aliases X_train.
        self.Y_train = self.X_train[:, [0]] + rng.normal(scale=sigma_obs, size=(train_size, 1))

        self.X_test = rng.normal(size=(test_size, D_X))
        self.Y_test = self.X_test[:, [0]] + rng.normal(scale=sigma_obs, size=(test_size, 1))

    @property
    def train(self) -> tuple[jax.Array, jax.Array]:
        """Training split as ``(X_train, Y_train)`` (stored as NumPy arrays)."""
        return self.X_train, self.Y_train

    @property
    def test(self) -> tuple[jax.Array, Optional[jax.Array]]:
        """Test split as ``(X_test, Y_test)`` (stored as NumPy arrays)."""
        return self.X_test, self.Y_test

    def true_predictive(self, X: jax.Array) -> dist.Distribution:
        """Not implemented for this dataset; always raises ``NotImplementedError``."""
        raise NotImplementedError()

In [177]:
# Nearly collinear training features (rho = 0.9999) and only 20 training rows;
# the remaining constructor arguments keep their defaults.
data = SpuriouslyCorrelatedData(
    D_X=D_X,
    train_size=20,
    rho=0.9999,
)

In [195]:
# Constant handed to `obs_model`; 0.05 equals the dataset's default sigma_obs, so this
# is presumably the known observation precision 1 / sigma_obs^2 — confirm against BNNRegressor.
obs_precision = 1 / 0.05 ** 2

bnn = BNNRegressor(
    D_X=D_X,
    D_Y=1,
    D_H=BNN_SIZE,
    nonlin=jax.nn.silu,
    biases=True,
    prior_scale=np.sqrt(0.002),
    obs_model=obs_precision,
    # prior_type='xavier',
)

# Displayed as the cell output: total number of weights in the network.
bnn.get_weight_dim()

465

In [187]:
# Swap the first prior component for a Laplace of the same scale (the Lasso analogue,
# classically inducing sparsity); the second component is passed through unchanged.
# NOTE(review): in the saved run this cell's execution count (187) is lower than that of
# the cell constructing `bnn` (195), so the recorded results that follow were produced
# without this replacement applied to the rebuilt `bnn` — re-run top to bottom to confirm.
weight_prior, remaining_prior = bnn.prior[0], bnn.prior[1]
laplace_weight_prior = dist.Laplace(scale=weight_prior.base_dist.scale).to_event(1)
bnn = bnn.with_prior(laplace_weight_prior, remaining_prior)

In [196]:
# Constant step size for the delta-guide (point-estimate) VI fit.
# NOTE(review): the value is negative; presumably the experiment applies the schedule as a
# raw update scale where the sign encodes descent — confirm in AutoDeltaVIExperiment.
lr_schedule = optax.constant_schedule(-0.01)
delta = AutoDeltaVIExperiment(bnn, data, lr_schedule=lr_schedule, max_iter=VI_ITER)

In [197]:
# Separate fixed keys for fitting and for prediction keep both steps reproducible.
delta_train_key = random.PRNGKey(0)
delta_predict_key = random.PRNGKey(1)

delta.train(delta_train_key)
delta.make_predictions(delta_predict_key)

Initial eval loss: 12942.4912 (lik: -6857.3438, kl: 6085.1475)


100%|██████████| 50/50 [00:13<00:00,  3.82it/s, init loss: 12942.4912, avg. train loss / eval. loss [98000-100000]: -402.7476 / -402.7123]



SVI elapsed time: 13.262001037597656


In [190]:
# Inspect the fitted parameters stored on the experiment's private `_params` attribute.
# NOTE(review): this cell's execution count (190) is lower than the training cell's (197),
# so the output saved below comes from an earlier fit, not the latest one — re-run before
# relying on the displayed values.
delta._params

{'prec_obs_loc': DeviceArray(400.00003, dtype=float32),
 'w_loc': DeviceArray([ 1.8338560e-05, -1.5930017e-03, -2.8805146e-03,
               2.1726576e-04,  9.3807932e-04,  1.5049449e-03,
               4.6406838e-04,  7.5329829e-04,  9.9020032e-04,
               8.0157275e-04,  8.8605827e-01,  6.2554370e-04,
               2.1706017e-04,  1.7920779e-03, -6.6106296e-01,
               1.4567844e-03, -3.0970681e-04,  3.9255875e-03,
              -1.2768168e-03,  2.3593577e-03,  4.7313952e-04,
               1.2444295e-03, -7.5252249e-04,  6.6034653e-04,
               7.7791797e-04, -4.2006294e-03,  6.6264961e-03,
              -1.4343015e-03,  1.0075604e-03, -5.6614680e-04,
              -5.1607609e-02,  4.5500608e-04,  3.4343170e-03,
              -6.0255517e-04,  1.1063849e-04, -2.9683244e-04,
               2.1183601e-04, -6.3420594e-04,  1.2230133e-03,
               1.3135200e-04,  8.8151410e-04,  2.7877477e-03,
               3.6967739e-03,  1.4538117e-04, -1.3878767e-03,
     

In [198]:
# Take the first entry along the leading axis of the delta fit's 'Y_mean' predictions
# (presumably a sample axis with a single point estimate — TODO confirm).
map_posterior = delta._predictions['Y_mean'][0]

# Mean squared error against the test responses.
map_residuals = map_posterior - data.test[1]
map_mse = np.mean(np.square(map_residuals))
map_mse

0.90022594

In [199]:
# Start the sampler from the weight location found by the delta fit.
hmc_init_params = {'w': delta._params['w_loc']}
hmc = BasicHMCExperiment(
    bnn,
    data,
    num_warmup=300,
    num_samples=400,
    init_params=hmc_init_params,
)

In [200]:
# Same fixed keys as used for the delta fit: 0 for sampling, 1 for prediction.
hmc_train_key = random.PRNGKey(0)
hmc_predict_key = random.PRNGKey(1)

hmc.train(hmc_train_key)
hmc.make_predictions(hmc_predict_key)

sample: 100%|██████████| 700/700 [00:08<00:00, 79.84it/s, 31 steps of size 1.09e-01. acc. prob=0.91] 



MCMC elapsed time: 9.943346977233887


In [201]:
# Average the 'Y_mean' predictions over the leading axis (presumably the posterior-sample
# axis — TODO confirm) to get the model-averaged prediction.
hmc_mean_predictions = hmc._predictions['Y_mean'].mean(axis=0)

# Mean squared error against the test responses.
hmc_residuals = hmc_mean_predictions - data.test[1]
hmc_mse = np.mean(np.square(hmc_residuals))
hmc_mse

0.89950585

In [174]:
# Show the last five entries of the 'w' samples held on the experiment's private `_samples`.
# NOTE(review): this cell's execution count (174) predates the current model and HMC run
# (195-201); the saved output has 13 values per row, whereas the current network reports
# 465 weights — the display is stale and should be regenerated.
hmc._samples['w'][-5:]

DeviceArray([[-0.33152878, -0.02803851, -0.04128244, -0.13686892,
              -0.02575255, -0.06942135,  0.01600564, -0.07470808,
              -0.148497  , -0.07571253,  0.04045138, -1.0937977 ,
               0.0415034 ],
             [-0.41514605, -0.03892874, -0.16183186, -0.38947535,
               0.02460151, -0.03073461,  0.01686918, -0.07670984,
              -0.12511697, -0.16551675, -0.06777012, -0.71585566,
              -0.05705013],
             [-0.4078125 ,  0.00680308, -0.15236731, -0.38793436,
               0.0086543 , -0.05372803,  0.00497035, -0.07178487,
               0.01372926, -0.15573306, -0.05072388, -0.831438  ,
              -0.06757354],
             [-0.45889187, -0.00375076,  0.01590823, -0.4541376 ,
              -0.22374429, -0.00281199, -0.10143273,  0.00992812,
              -0.18039647, -0.06191424,  0.00412107, -0.67887485,
              -0.01307255],
             [-0.21803291, -0.04129405, -0.09465925, -0.45696926,
              -0.01365899, -0.