In [9]:
import os
import typing

import numpy as np
import torch
import torch.optim
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, average_precision_score
from torch import nn
from torch.nn import functional as F
from tqdm import trange

import abc
import warnings

In [201]:
torch.manual_seed(0)

<torch._C.Generator at 0x7f707c02b3b0>

In [6]:
torch.Tensor([1.0]).item()

1.0

In [11]:
class ParameterDistribution(torch.nn.Module, metaclass=abc.ABCMeta):
    """
    Abstract class that models a distribution over model parameters,
    usable for Bayes by backprop.
    You can implement this class using any distribution you want
    and try out different priors and variational posteriors.
    All torch.nn.Parameter that you add in the __init__ method of this class
    will automatically be registered and know to PyTorch.
    """

    def __init__(self):
        super().__init__()

    @abc.abstractmethod
    def log_likelihood(self, values: torch.Tensor) -> torch.Tensor:
        """
        Calculate the log-likelihood of the given values
        :param values: Values to calculate the log-likelihood on
        :return: Log-likelihood
        """
        pass

    @abc.abstractmethod
    def sample(self) -> torch.Tensor:
        """
        Sample from this distribution.
        Note that you only need to implement this method for variational posteriors, not priors.

        :return: Sample from this distribution. The sample shape depends on your semantics.
        """
        pass

    def forward(self, values: torch.Tensor) -> torch.Tensor:
        # DO NOT USE THIS METHOD
        # We only implement it since torch.nn.Module requires a forward method
        warnings.warn('ParameterDistribution should not be called! Use its explicit methods!')
        return self.log_likelihood(values)

In [176]:
class UnivariateGaussian(ParameterDistribution):
    """
    Univariate Gaussian distribution.
    For multivariate data, this assumes all elements to be i.i.d.
    """

    def __init__(self, mu: torch.Tensor, sigma: torch.Tensor):
        super(
            UnivariateGaussian, self
        ).__init__()  # always make sure to include the super-class init call!
        assert mu.size() == () and sigma.size() == ()
        assert sigma > 0
        self.mu = mu
        self.sigma = sigma

    def log_likelihood(self, values: torch.Tensor) -> torch.Tensor:
        ll = torch.sum(torch.log(((1. / np.sqrt(2.0 * np.pi))/self.sigma) * torch.exp(- (values - self.mu) ** 2 / (2.0 * self.sigma ** 2))))
        return ll  # clip to avoid numerical issues

    def sample(self) -> torch.Tensor:
        return torch.distributions.Normal(self.mu, self.sigma).sample()

In [125]:
test_norm = UnivariateGaussian(torch.tensor(0.0), torch.tensor(1.0))

In [126]:
test_values = torch.Tensor([1.1])

In [133]:
test_norm.log_likelihood(weight)

tensor(-18.4838, grad_fn=<SumBackward0>)

In [127]:
test_norm.log_likelihood(weight)

tensor(-18.4838, grad_fn=<SumBackward0>)

In [135]:
torch.sum(torch.distributions.Normal(0,1).log_prob(weight))

tensor(-18.4838, grad_fn=<SumBackward0>)

In [136]:
test_norm.log_likelihood(weight)

tensor(-18.4838, grad_fn=<SumBackward0>)

In [89]:
out_features = 4
in_features = 5
weight_rho = nn.Parameter(torch.Tensor(out_features, in_features).uniform_(-3., -3.))
weight_mu = nn.Parameter(torch.Tensor(out_features, in_features).normal_(0., .1))
bias_mu = nn.Parameter(torch.Tensor(out_features).normal_(0., .1))
bias_rho = nn.Parameter(torch.Tensor(out_features).uniform_(-3., -3.))

weight_sigma = torch.log(1. + torch.exp(weight_rho))
bias_sigma = torch.log(1. + torch.exp(bias_rho))
epsilon_weight = torch.autograd.Variable(torch.Tensor(out_features, in_features).normal_(0., 1.))
epsilon_bias = torch.autograd.Variable(torch.Tensor(out_features).normal_(0., 1.))
weight = weight_mu + weight_sigma * epsilon_weight
bias = bias_mu + bias_sigma * epsilon_bias

In [118]:
torch.log(gaussian(weight, 0, 1.0)).sum()

tensor(-18.4838, grad_fn=<SumBackward0>)

In [94]:
GAUSSIAN_SCALER = 1. / np.sqrt(2.0 * np.pi)
def gaussian(x, mu, sigma):
    bell = torch.exp(- (x - mu) ** 2 / (2.0 * sigma ** 2))
    return torch.clamp(GAUSSIAN_SCALER / sigma * bell, 1e-10, 1.)  # clip to avoid numerical issues

torch.log(gaussian(weight, 0, 1.0).sum() + gaussian(bias, 0, 1.0).sum())

tensor(2.2531, grad_fn=<LogBackward>)

In [123]:
torch.distributions.Normal(0,1).sample((10,4))

tensor([[ 0.6915,  0.4607,  0.0377, -1.4025],
        [-0.2379,  0.2781, -1.3480, -0.7980],
        [ 0.0922,  1.1334, -1.2153, -2.1855],
        [ 0.5999,  0.4441,  0.4513,  0.8644],
        [-0.1280, -2.4436, -0.8932, -0.3862],
        [-0.7883,  0.0139, -0.1614,  0.3438],
        [ 0.8921, -0.1609,  1.4146, -1.7340],
        [ 0.0317,  0.5293, -0.6766,  1.0246],
        [-1.3761, -0.3815, -0.3808,  0.6661],
        [-0.8763, -0.9672,  0.6168, -1.2584]])

In [137]:
weight_mu = torch.nn.Parameter(torch.zeros(out_features, in_features).uniform_(-0.0005, 0.0005))

In [185]:
class MultivariateDiagonalGaussian(ParameterDistribution):
    """
    Multivariate diagonal Gaussian distribution,
    i.e., assumes all elements to be independent Gaussians
    but with different means and standard deviations.
    This parameterizes the standard deviation via a parameter rho as
    sigma = softplus(rho).
    """

    def __init__(self, mu: torch.Tensor, rho: torch.Tensor):
        super(
            MultivariateDiagonalGaussian, self
        ).__init__()  # always make sure to include the super-class init call!
        assert mu.size() == rho.size()
        self.mu = mu
        self.rho = rho

    def log_likelihood(self, values: torch.Tensor) -> torch.Tensor:
        return torch.sum(torch.distributions.Normal(self.mu, self.rho).log_prob(values))

    def sample(self) -> torch.Tensor:
        return torch.distributions.Normal(self.mu, self.sigma).sample(mu.size())

## Class Bayes Layer

In [216]:
class BayesianLayerOwn(nn.Module):
    """
    Module implementing a single Bayesian feedforward layer.
    It maintains a prior and variational posterior for the weights (and biases)
    and uses sampling to approximate the gradients via Bayes by backprop.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        """
        Create a BayesianLayer.

        :param in_features: Number of input features
        :param out_features: Number of output features
        :param bias: If true, use a bias term (i.e., affine instead of linear transformation)
        """
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.use_bias = bias

        # TODO: Create a suitable prior for weights and biases as an instance of ParameterDistribution.
        #  You can use the same prior for both weights and biases, but are free to experiment with different priors.
        #  You can create constants using torch.tensor(...).
        #  Do NOT use torch.Parameter(...) here since the prior should not be optimized!
        #  Example: self.prior = MyPrior(torch.tensor(0.0), torch.tensor(1.0))

        self.prior_mu = 0
        self.prior_sigma = 1.0

        self.prior = UnivariateGaussian(
            torch.tensor(self.prior_mu), torch.tensor(self.prior_sigma)
        )
        assert isinstance(self.prior, ParameterDistribution)
        assert not any(
            True for _ in self.prior.parameters()
        ), "Prior cannot have parameters"

        # TODO: Create a suitable variational posterior for weights as an instance of ParameterDistribution.
        #  You need to create separate ParameterDistribution instances for weights and biases,
        #  but can use the same family of distributions if you want.
        #  IMPORTANT: You need to create a nn.Parameter(...) for each parameter
        #  and add those parameters as an attribute in the ParameterDistribution instances.
        #  If you forget to do so, PyTorch will not be able to optimize your variational posterior.
        #  Example: self.weights_var_posterior = MyPosterior(
        #      torch.nn.Parameter(torch.zeros((out_features, in_features))),
        #      torch.nn.Parameter(torch.ones((out_features, in_features)))
        #  )
        self.weight_mu = torch.nn.Parameter(
            torch.zeros(out_features, in_features).uniform_(-0.0005, 0.0005)
        )
        self.weight_logsigma = torch.nn.Parameter(
            torch.zeros(out_features, in_features).uniform_(-2.56, -2.55)
        )
        self.weights_var_posterior = MultivariateDiagonalGaussian(
            self.weight_mu, self.weight_logsigma
        )

        assert isinstance(self.weights_var_posterior, ParameterDistribution)
        assert any(
            True for _ in self.weights_var_posterior.parameters()
        ), "Weight posterior must have parameters"

        if self.use_bias:
            # TODO: As for the weights, create the bias variational posterior instance here.
            #  Make sure to follow the same rules as for the weight variational posterior.
            self.bias_mu = torch.nn.Parameter(
                torch.zeros(out_features).normal_(-0.0005, 0.0005)
            )
            self.bias_logsigma = torch.nn.Parameter(
                torch.zeros(out_features).uniform_(-2.56, -2.55)
            )
            self.bias_var_posterior = MultivariateDiagonalGaussian(
                self.bias_mu, self.bias_logsigma
            )
            assert isinstance(self.bias_var_posterior, ParameterDistribution)
            assert any(
                True for _ in self.bias_var_posterior.parameters()
            ), "Bias posterior must have parameters"
        else:
            self.bias_var_posterior = None

    def forward(self, inputs: torch.Tensor):
        """
        Perform one forward pass through this layer.
        If you need to sample weights from the variational posterior, you can do it here during the forward pass.
        Just make sure that you use the same weights to approximate all quantities
        present in a single Bayes by backprop sampling step.

        :param inputs: Flattened input images as a (batch_size, in_features) float tensor
        :return: 3-tuple containing
            i) transformed features using stochastic weights from the variational posterior,
            ii) sample of the log-prior probability, and
            iii) sample of the log-variational-posterior probability
        """
        # TODO: Perform a forward pass as described in this method's docstring.
        #  Make sure to check whether `self.use_bias` is True,
        #  and if yes, include the bias as well.

        # Sample weights and bias #
        crt_sigma = torch.exp(self.weight_logsigma)
        # Weight sampling
        normal_dist = torch.distributions.Normal(0, 1)
        ## Step 1 from paper
        epsilon_weight = normal_dist.sample(self.weight_mu.shape)
        ## Step 2 of paper
        weights = self.weight_mu + torch.log(1.0 + crt_sigma * epsilon_weight)

        # Bias sampling
        if self.use_bias:
            bias_sigma = torch.log(1.0 + torch.exp(self.bias_logsigma))
            ## Step 1 from paper
            epsilon_bias = normal_dist.sample(self.bias_mu.shape)
            ## Step 2 of paper
            bias = self.bias_mu + bias_sigma * epsilon_bias
            print(bias)
            print(weights)
            # LOG PRIOR (WEIGHTS ONLY)
            log_prior = self.prior.log_likelihood(weights) + self.prior.log_likelihood(
                bias
            )
            log_variational_posterior = (
                MultivariateDiagonalGaussian(self.weight_mu.data, crt_sigma).log_likelihood(
                    weights
                )
                + torch.distributions.Normal(
                    self.bias_mu.data, torch.log(1 + torch.exp(self.bias_logsigma))
                )
                .log_prob(bias)
                .sum()
            )
        else:
            bias = None
            log_prior = self.prior.log_likelihood(weights)
            log_variational_posterior = MultivariateDiagonalGaussian(
                self.weight_mu.data, crt_sigma
            ).log_likelihood(weights)

        return F.linear(inputs, weights, bias), log_prior, log_variational_posterior

In [192]:
class BayesianLayerOld(torch.nn.Module):
    '''
    Module implementing a single Bayesian feedforward layer.
    The module performs Bayes-by-backprop, that is, mean-field
    variational inference. It keeps prior and posterior weights
    (and biases) and uses the reparameterization trick for sampling.
    '''

    def __init__(self, input_dim, output_dim, bias=True):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.use_bias = bias

        # TODO: enter your code here
        self.prior_mu = 0
        self.prior_sigma = 0.1

        self.prior = torch.distributions.Normal(self.prior_mu, self.prior_sigma)


        self.weight_mu = nn.Parameter(torch.zeros(output_dim, input_dim).uniform_(-0.0005, 0.0005))
        self.weight_logsigma = nn.Parameter(torch.zeros(output_dim, input_dim).uniform_(-2.56,-2.55))

        if self.use_bias:
            self.bias_mu = nn.Parameter(torch.zeros(output_dim).uniform_(-0.0005,0.0005))
            self.bias_logsigma =  nn.Parameter(torch.zeros(output_dim).uniform_(-2.56,-2.55))

        else:
            self.register_parameter('bias_mu', None)
            self.register_parameter('bias_logsigma', None)



    def forward(self, inputs):

        self.num_batches = round(60000 / inputs.shape[0])

        # SAMPLE WEIGHTS
        w_epsilon = torch.distributions.Normal(0, 1).sample(self.weight_mu.shape)
        self.w = self.weight_mu + torch.log(1 + torch.exp(self.weight_logsigma)) * w_epsilon
        print(w_epsilon)


        # LOG PRIOR (WEIGHTS ONLY)
        w_log_prior = self.prior.log_prob(self.w)
        self.log_prior = torch.sum(w_log_prior)


        # LOG POSTERIOR (WEIGHTS ONLY)
        self.w_post = torch.distributions.Normal(self.weight_mu.data, torch.log(1 + torch.exp(self.weight_logsigma)))
        self.log_post = self.w_post.log_prob(self.w).sum()


        # ADDING THE BIAS TERM
        if self.use_bias:
            # SAMPLE BIAS
            b_epsilon = torch.distributions.Normal(0, 1).sample(self.bias_mu.shape)
            self.b = self.bias_mu + torch.log(1 + torch.exp(self.bias_logsigma)) * b_epsilon
            print(self.b)
            print(self.w)
            # LOG PRIOR AND POSTERIOR OF THE BIAS
            b_log_prior = self.prior.log_prob(self.b)
            self.log_prior += torch.sum(b_log_prior)

            self.b_post = torch.distributions.Normal(self.bias_mu.data, torch.log(1 + torch.exp(self.bias_logsigma)))
            self.log_post += self.b_post.log_prob(self.b).sum()


        if self.use_bias:
            return F.linear(inputs, self.w, self.b), self.log_prior, self.log_post
        else:
            bias = None
            return torch.relu(torch.mm(inputs, self.w)), self.log_prior, self.log_post

In [203]:
vorlageBL = BayesianLayerOld(5, 5, True)

In [217]:
eigenerBL = BayesianLayerOwn(5, 5, True)

In [208]:
vorlageBL.bias_logsigma

Parameter containing:
tensor([-2.5592, -2.5562, -2.5548, -2.5543, -2.5538], requires_grad=True)

In [209]:
eigenerBL.bias_logsigma

Parameter containing:
tensor([-2.5514, -2.5536, -2.5526, -2.5532, -2.5562], requires_grad=True)

## Forward pass

In [205]:
input_tensor = torch.Tensor([0.4,0.6,0.1,0.23,0.85])

In [206]:
vorlageBL.forward(input_tensor)

tensor([ 0.0746, -0.0119, -0.0190, -0.0810,  0.0007], grad_fn=<AddBackward0>)
tensor([[ 0.0121,  0.0084, -0.0614,  0.0476, -0.1427],
        [-0.0420,  0.0400,  0.0307,  0.0745, -0.0929],
        [ 0.0732, -0.0373, -0.0959,  0.0331, -0.0004],
        [ 0.1010, -0.0209,  0.0210, -0.0024, -0.0794],
        [-0.0090,  0.0150, -0.0546,  0.1267,  0.0260]], grad_fn=<AddBackward0>)


(tensor([-0.0320, -0.0635, -0.0144, -0.1191,  0.0519], grad_fn=<AddBackward0>),
 tensor(35.8993, grad_fn=<AddBackward0>),
 tensor(40.1668, grad_fn=<AddBackward0>))

In [218]:
eigenerBL.forward(input_tensor)

tensor([ 0.0422, -0.0047, -0.0463, -0.0283, -0.0970], grad_fn=<AddBackward0>)
tensor([[ 0.0329, -0.0405, -0.0149, -0.0155,  0.0120],
        [ 0.0651,  0.0181,  0.0183, -0.0197, -0.0201],
        [ 0.0366,  0.0263, -0.1335, -0.0462,  0.0459],
        [ 0.0339, -0.0073, -0.1104, -0.0018, -0.1168],
        [ 0.0450, -0.0943,  0.0025,  0.0164, -0.0746]], grad_fn=<AddBackward0>)


(tensor([ 0.0362,  0.0123, -0.0009, -0.1299, -0.1949], grad_fn=<AddBackward0>),
 tensor(-27.6137, grad_fn=<AddBackward0>),
 tensor(41.6318, grad_fn=<AddBackward0>))