# [ASI Project] Weight Uncertainty in Neural Networks  
**Authors**: Miriam Lamari, Francesco Giannuzzo  


In [29]:
import csv
import math
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torchvision import datasets
from torchvision import transforms
from kaggle_secrets import UserSecretsClient
from torch.utils.data import DataLoader, random_split
import wandb

In [30]:
user_secrets = UserSecretsClient()
key = user_secrets.get_secret('wandb-api-key')

wandb.login(key=key)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Minibatches
**minibatch_weight**(batch_idx: int, num_batches: int)

In [31]:
def minibatch_weight(batch_idx: int, num_batches: int) -> float:
    return 2 ** (num_batches - batch_idx) / (2 ** num_batches - 1)#batch_idx)

## Variational Approximation

In [32]:
from typing import Any, Optional

import torch.nn as nn
from torch import Tensor


class BayesianModule(nn.Module):

    """Base class for BNN to enable certain behaviour."""

    def __init__(self):
        super().__init__()

    def kld(self, *args):
        raise NotImplementedError('BayesianModule::kld()')


def variational_approximator(model: nn.Module) -> nn.Module:

    def kl_divergence(self) -> Tensor:

        kl = 0
        for module in self.modules():
            if isinstance(module, BayesianModule):
                kl += module.kl_divergence

        return kl

    # add `kl_divergence` to the model
    setattr(model, 'kl_divergence', kl_divergence)

    def elbo(self,
             inputs: Tensor,
             targets: Tensor,
             criterion: Any,
             n_samples: int,
             w_complexity: Optional[float] = 1.0) -> Tensor:

        loss = 0
        for sample in range(n_samples):
            outputs = self(inputs)
            loss += criterion(outputs, targets)
            loss += self.kl_divergence() * w_complexity

        return loss / n_samples

    # add `elbo` to the model
    setattr(model, 'elbo', elbo)

    return model


## Scale Mixture Prior

In [33]:
import functools as ft

import torch
import torch.nn as nn
from torch import Tensor


class ScaleMixture(nn.Module):

    """Scale Mixture Prior.

    Section 3.3 of the 'Weight Uncertainty in Neural Networks' paper
    proposes the use of a Scale Mixture prior for use in variational
    inference - this being a fixed-form prior.

    The authors note that, should the parameters be allowed to adjust
    during training, the prior changes rapidly and attempts to capture
    the empirical distribution of the weights. As a result the prior
    learns to fit poor initial parameters and struggles to improve.
    """

    def __init__(self, pi: float, sigma1: float, sigma2: float) -> None:

        """Scale Mixture Prior.

        The authors of 'Weight Uncertainty in Neural Networks' note:

            sigma1 > sigma2:
                provides a heavier tail in the prior density than is
                seen in a plain Gaussian prior.
            sigma2 << 1.0:
                causes many of the weights to a priori tightly
                concentrate around zero.

        Parameters
        ----------
        pi : float
            Parameter used to scale the two Gaussian distributions.
        sigma1 : float
            Standard deviation of the first normal distribution.
        sigma2 : float
            Standard deviation of the second normal distribution.
        """

        super().__init__()

        self.pi = pi
        self.sigma1 = sigma1
        self.sigma2 = sigma2

        self.normal1 = torch.distributions.Normal(0, sigma1)
        self.normal2 = torch.distributions.Normal(0, sigma2)

    def log_prior(self, w: Tensor) -> Tensor:

        """Log Likelihood of the weight according to the prior.

        Calculates the log likelihood of the supplied weight given the
        prior distribution - the scale mixture of two Gaussians.

        Parameters
        ----------
        w : Tensor
            Weight to be used to calculate the log likelihood.

        Returns
        -------
        Tensor
            Log likelihood of the weights from the prior distribution.
        """

        likelihood_n1 = torch.exp(self.normal1.log_prob(w))
        likelihood_n2 = torch.exp(self.normal2.log_prob(w))

        p_scalemixture = self.pi * likelihood_n1 + (1 - self.pi) * likelihood_n2
        log_prob = torch.log(p_scalemixture).sum()

        return log_prob

## Gaussian Variational Inference

In [34]:
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor


class GaussianVariational(nn.Module):
    def __init__(self, mu: Tensor, rho: Tensor) -> None:

        super().__init__()

        self.mu = nn.Parameter(mu)
        self.rho = nn.Parameter(rho)

        self.w = None
        self.sigma = None

        self.normal = torch.distributions.Normal(0, 1)

    def sample(self) -> Tensor:
        device = self.mu.device
        epsilon = self.normal.sample(self.mu.size()).to(device)
        self.sigma = torch.log(1 + torch.exp(self.rho)).to(device)
        self.w = self.mu + self.sigma * epsilon

        return self.w

    def log_posterior(self) -> Tensor:

        if self.w is None:
            raise ValueError('self.w must have a value.')

        log_const = np.log(np.sqrt(2 * np.pi))
        log_exp = ((self.w - self.mu) ** 2) / (2 * self.sigma ** 2)
        log_posterior = -log_const - torch.log(self.sigma) - log_exp

        return log_posterior.sum()

## Bayesian Linear Layer ##

In [35]:
from typing import Optional

import torch
import torch.nn.functional as F
from torch import Tensor

#from .base_bayesian import BayesianModule
#from .samplers.gaussian_variational import GaussianVariational
#from .samplers.scale_mixture import ScaleMixture


class BayesLinear(BayesianModule):

    """Bayesian Linear Layer.

    Implementation of a Bayesian Linear Layer as described in the
    'Weight Uncertainty in Neural Networks' paper.
    """

    def __init__(self,
                 in_features: int,
                 out_features: int,
                 prior_pi: Optional[float] = 0.5,
                 prior_sigma1: Optional[float] = 1.0,
                 prior_sigma2: Optional[float] = 0.0025) -> None:

        super().__init__()

        w_mu = torch.empty(out_features, in_features).uniform_(-0.2, 0.2)
        w_rho = torch.empty(out_features, in_features).uniform_(-5.0, -4.0)

        bias_mu = torch.empty(out_features).uniform_(-0.2, 0.2)
        bias_rho = torch.empty(out_features).uniform_(-5.0, -4.0)

        self.w_posterior = GaussianVariational(w_mu, w_rho)
        self.bias_posterior = GaussianVariational(bias_mu, bias_rho)

        self.w_prior = ScaleMixture(prior_pi, prior_sigma1, prior_sigma2)
        self.bias_prior = ScaleMixture(prior_pi, prior_sigma1, prior_sigma2)

        self.kl_divergence = 0.0

    def forward(self, x: Tensor) -> Tensor:

        w = self.w_posterior.sample()
        b = self.bias_posterior.sample()

        w_log_prior = self.w_prior.log_prior(w)
        b_log_prior = self.bias_prior.log_prior(b)

        w_log_posterior = self.w_posterior.log_posterior()
        b_log_posterior = self.bias_posterior.log_posterior()

        total_log_prior = w_log_prior + b_log_prior
        total_log_posterior = w_log_posterior + b_log_posterior
        self.kl_divergence = self.kld(total_log_prior, total_log_posterior)

        return F.linear(x, w, b)

    def kld(self, log_prior: Tensor, log_posterior: Tensor) -> Tensor:
        return log_posterior - log_prior

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
kwargs = {'num_workers': 1, 'pin_memory': True} if device == 'cuda' else {}

In [37]:
def train_wrapper():
    run = wandb.init(project="Project-ASI")

    return train_loop(
        learning_rate = wandb.config.learning_rate,
        prior_pi = wandb.config.prior_pi,
        prior_sigma1=wandb.config.prior_sigma1,
        prior_sigma2=wandb.config.prior_sigma2
    )

## Tuning Hyperparamters

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.metrics import mean_squared_error

def generate_data(n_samples=200):
    x = np.linspace(-0.5, 2.5, n_samples)
    eps = np.random.normal(0, 0.02, size=n_samples)
    y = x + 0.3 * np.sin(2 * np.pi * (x + eps)) + 0.3 * np.sin(4 * np.pi * (x + eps)) + eps
    x = torch.tensor(x, dtype=torch.float32).unsqueeze(1)
    y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
    return x, y

In [39]:
# === TRAIN LOOP ===
def train_loop(learning_rate, prior_pi, prior_sigma1, prior_sigma2, epochs=100):
    run = wandb.init(project="Project-ASI")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


    @variational_approximator
    class BayesianNetwork(nn.Module):
        def __init__(self, input_dim, output_dim):
            super().__init__()
            self.bl1 = BayesLinear(input_dim, 1200, prior_pi, prior_sigma1, prior_sigma2)
            self.bl2 = BayesLinear(1200, 1200, prior_pi, prior_sigma1, prior_sigma2)
            self.bl3 = BayesLinear(1200, output_dim, prior_pi, prior_sigma1, prior_sigma2)
    
        def forward(self, x):
            x = F.relu(self.bl1(x))
            x = F.relu(self.bl2(x))
            x = self.bl3(x)
            
            # Accumula KL divergence da tutti i layer
            self.kl_divergence = (
                self.bl1.kl_divergence +
                self.bl2.kl_divergence +
                self.bl3.kl_divergence
            )
            
            return x  # output: (batch_size, 2) → [mean, raw_variance]
        
    # Data prep
    x, y = generate_data()
    full_dataset = TensorDataset(x, y)
   
    test_size = int(0.2 * len(full_dataset))
    val_size = int(0.3 * (len(full_dataset) - test_size))
    train_size = len(full_dataset) - test_size - val_size
    
    train_set, val_set, test_set = random_split(full_dataset, [train_size, val_size, test_size])
    trainloader = DataLoader(train_set, batch_size=32, shuffle=True) #explain in the report
    valloader = DataLoader(val_set, batch_size=32)
    testloader = DataLoader(test_set, batch_size=32)

    # Model
    model = BayesianNetwork(1, 2).to(device)  # Output: [mean, logvar]
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.GaussianNLLLoss(full=True, reduction='sum')

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for batch_idx, (x_batch, y_batch) in enumerate(trainloader):
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            pi_weight = minibatch_weight(batch_idx=batch_idx, num_batches=len(trainloader))

            output = model(x_batch)
            mean = output[:, 0:1]
            variance = F.softplus(output[:, 1:2]) + 1e-6
            
            nll = criterion(mean, y_batch, variance)
            kld = model.kl_divergence  # updated during forward
            loss = nll + pi_weight * kld

            train_loss += loss.item()
            loss.backward()
            optimizer.step()

        # === Validation ===
        model.eval()
        val_loss = 0.0
        val_rmse = 0.0

        with torch.no_grad():
            for batch_idx, (x_batch, y_batch) in enumerate(valloader):
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                output = model(x_batch)
                mean = output[:, 0:1]
                variance = F.softplus(output[:, 1:2]) + 1e-6

                nll = criterion(mean, y_batch, variance)
                kld = model.kl_divergence
                loss = nll + pi_weight * kld

                val_loss += loss.item()
                val_rmse += torch.sum((mean - y_batch) ** 2).item()

        val_rmse = np.sqrt(val_rmse / len(val_set))

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.2f} | "
              f"Val Loss: {val_loss:.2f} | Val RMSE: {val_rmse:.4f}")

    metrics = {'train_loss' : train_loss, 'val_loss' : val_loss, 'val_rmse' :  val_rmse}
    wandb.log(metrics)

"""""
    # === Test set evaluation ===
    model.eval()
    test_preds = []
    test_targets = []

    with torch.no_grad():
        for x_batch, y_batch in testloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            output = model(x_batch)
            mean = output[:, 0:1]
            test_preds.append(mean.cpu())
            test_targets.append(y_batch.cpu())

    test_preds = torch.cat(test_preds)
    test_targets = torch.cat(test_targets)
    test_rmse = torch.sqrt(F.mse_loss(test_preds, test_targets)).item()
    print(f"Test RMSE: {test_rmse:.4f}")
"""

'""\n    # === Test set evaluation ===\n    model.eval()\n    test_preds = []\n    test_targets = []\n\n    with torch.no_grad():\n        for x_batch, y_batch in testloader:\n            x_batch, y_batch = x_batch.to(device), y_batch.to(device)\n            output = model(x_batch)\n            mean = output[:, 0:1]\n            test_preds.append(mean.cpu())\n            test_targets.append(y_batch.cpu())\n\n    test_preds = torch.cat(test_preds)\n    test_targets = torch.cat(test_targets)\n    test_rmse = torch.sqrt(F.mse_loss(test_preds, test_targets)).item()\n    print(f"Test RMSE: {test_rmse:.4f}")\n'

In [40]:
sweep_configuration = {
     "method": "grid",
     "metric": {"goal": "minimize", "name": "val_loss"},
     'name': "sweep-BBB-Gaussian_regression",
     "parameters": {
         "learning_rate": {'values': [1e-3, 1e-4]},
         "prior_pi": {'values': [0.25, 0.5]},
         "prior_sigma1": {'values': [1, math.exp(-1)]},
         "prior_sigma2": {'values': [math.exp(-6), math.exp(-7)]},
     },
}

sweep_id = wandb.sweep(sweep=sweep_configuration, project="Project-ASI")
wandb.agent(sweep_id, function=train_wrapper);

Create sweep with ID: x7fwbh87
Sweep URL: https://wandb.ai/miriam-lamari2-eurecom/Project-ASI/sweeps/x7fwbh87


[34m[1mwandb[0m: Agent Starting Run: hhocwy0h with config:
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	prior_pi: 0.25
[34m[1mwandb[0m: 	prior_sigma1: 1
[34m[1mwandb[0m: 	prior_sigma2: 0.0024787521766663585


Epoch 1/2 | Train Loss: 14938158.19 | Val Loss: 1982334.75 | Val RMSE: 2.6505
Epoch 2/2 | Train Loss: 14856628.06 | Val Loss: 1973423.06 | Val RMSE: 1.9034


0,1
train_loss,▁
val_loss,▁
val_rmse,▁

0,1
train_loss,14856628.0625
val_loss,1973423.0625
val_rmse,1.90344


[34m[1mwandb[0m: Agent Starting Run: ou9zhf91 with config:
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	prior_pi: 0.25
[34m[1mwandb[0m: 	prior_sigma1: 0.36787944117144233
[34m[1mwandb[0m: 	prior_sigma2: 0.0024787521766663585


Epoch 1/2 | Train Loss: 12297374.69 | Val Loss: 1629663.56 | Val RMSE: 0.3763
Epoch 2/2 | Train Loss: 12210770.94 | Val Loss: 1620745.38 | Val RMSE: 0.5176


0,1
train_loss,▁
val_loss,▁
val_rmse,▁

0,1
train_loss,12210770.9375
val_loss,1620745.375
val_rmse,0.51759


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [41]:
#wandb.init(project="Project-ASI")

#train_loop(
#    learning_rate = best_hyperparameters['learning_rate'],
#    prior_pi = best_hyperparameters['prior_pi'],
#    prior_sigma1 = best_hyperparameters['prior_sigma1'],
#    prior_sigma2 = best_hyperparameters['prior_sigma2'],
#    epochs = 2
#)