# Probabilistic Bayesian Neural Networks in Tensorflow

## Introduction

Taking a probabilistic approach to deep learning allows to account for *uncertainty*,
so that models can assign less levels of confidence to incorrect predictions.
Sources of uncertainty can be found in the data, due to measurement error or
noise in the labels, or the model, due to insufficient data availability for
the model to learn effectively.

In [1]:
%%capture
!pip -Uqq install tensorflow-probability
!pip -Uqq install tensorflow-datasets

In [92]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_probability as tfp

from dataclasses import dataclass
from tensorflow.keras import layers, Model, Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError

print("numpy                 ", np.__version__)
print("pandas                ", pd.__version__)
print("tensorflow            ", tf.__version__)
print("tensorflow-datasets   ", tfp.__version__)
print("tensorflow-probability", tfds.__version__)

numpy                  1.19.5
pandas                 1.1.5
tensorflow             2.4.0
tensorflow-datasets    0.12.1
tensorflow-probability 4.0.1


## The dataset
We use the Wine Quality dataset, which is available in the TensorFlow Datasets. We use the red wine subset, which contains 4,898 examples. The dataset has 11 numerical physicochemical features of the wine, and the task is to predict the wine quality, which is a score between 0 and 10.

In [62]:
@dataclass
class Config:
    DATASET_SIZE = 4898
    BATCH_SIZE = 256
    TRAIN_SIZE = int(DATASET_SIZE * 0.85)
    BASELINE_HIDDEN_UNITS = [8, 8]
    LEARNING_RATE = 0.001
    EPOCHS = 100

config = Config()

In [21]:
dataset = (
    tfds.load(name="wine_quality", as_supervised=True, split="train")
    .map(lambda x, y: (x, tf.cast(y, tf.float32)))
    .prefetch(buffer_size=config.DATASET_SIZE)
    .cache()
)

train_dataset = (
    dataset.take(config.TRAIN_SIZE).shuffle(buffer_size=config.TRAIN_SIZE).batch(config.BATCH_SIZE)
)
test_dataset = dataset.skip(config.TRAIN_SIZE).batch(config.BATCH_SIZE)

train_dataset.cardinality(), test_dataset.cardinality()

(<tf.Tensor: shape=(), dtype=int64, numpy=17>,
 <tf.Tensor: shape=(), dtype=int64, numpy=3>)

In [31]:
sample_record = next(iter(train_dataset))[0]
list(sample_record.keys())

['alcohol',
 'chlorides',
 'citric acid',
 'density',
 'fixed acidity',
 'free sulfur dioxide',
 'pH',
 'residual sugar',
 'sulphates',
 'total sulfur dioxide',
 'volatile acidity']

In [39]:
df = pd.DataFrame(sample_record)
df

Unnamed: 0,alcohol,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,residual sugar,sulphates,total sulfur dioxide,volatile acidity
0,8.7,0.057,0.49,1.00055,6.8,55.0,3.00,19.299999,0.56,247.0,0.240
1,9.3,0.035,0.37,0.99845,6.6,62.0,3.02,15.400000,0.40,153.0,0.220
2,9.2,0.168,0.40,0.99910,8.0,29.0,3.07,12.400000,0.64,190.0,0.430
3,9.5,0.049,0.26,0.99907,7.4,67.0,3.06,15.600000,0.68,210.0,0.330
4,8.8,0.055,0.24,0.99880,7.1,46.0,3.12,15.400000,0.49,198.0,0.350
...,...,...,...,...,...,...,...,...,...,...,...
251,9.5,0.050,0.49,0.99510,6.3,67.0,3.23,7.100000,0.34,210.0,0.230
252,11.2,0.036,0.32,0.99196,7.4,27.0,3.15,1.900000,0.49,119.0,0.360
253,10.1,0.046,0.18,0.99400,7.0,9.0,3.38,1.300000,0.47,62.0,0.240
254,10.1,0.045,0.30,0.99249,8.6,16.0,2.95,0.900000,0.39,109.0,0.310


In [40]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
alcohol,256.0,10.540394,1.218503,8.6,9.5,10.5,11.3,13.7
chlorides,256.0,0.044117,0.017533,0.014,0.035,0.0415,0.05,0.18
citric acid,256.0,0.320625,0.11098,0.0,0.2675,0.3,0.36,0.8
density,256.0,0.99385,0.002917,0.98742,0.99143,0.9934,0.9958,1.00055
fixed acidity,256.0,6.838671,0.810536,4.9,6.3,6.8,7.3,9.2
free sulfur dioxide,256.0,35.558594,15.76988,6.0,24.75,34.0,47.0,83.0
pH,256.0,3.185156,0.142521,2.87,3.08,3.18,3.29,3.57
residual sugar,256.0,6.037696,4.946777,0.8,1.6,4.55,9.075,19.799999
sulphates,256.0,0.495352,0.116723,0.29,0.41,0.48,0.55,0.94
total sulfur dioxide,256.0,139.695312,40.554577,45.0,112.0,135.0,169.25,255.0


## Experiment 1. Baseline Model

In [97]:
def create_model_inputs(feature_names):
    return {
        feature: Input(name=feature, shape=(1,), dtype=tf.float32)
        for feature in feature_names
    }

In [98]:
inputs = create_model_inputs(feature_names = sample_record.keys())
input_values = [value for _, value in sorted(inputs.items())]
features = layers.concatenate(input_values)
features = layers.BatchNormalization()(features)
for units in config.BASELINE_HIDDEN_UNITS:
    features = layers.Dense(units, activation="sigmoid")(features)
outputs = Dense(units=1)(features)
baseline_model = Model(inputs=inputs, outputs=outputs)


In [99]:
baseline_model.compile(
    optimizer=RMSprop(config.LEARNING_RATE),
    loss=MeanSquaredError(),
    metrics=[RootMeanSquaredError()],
)
baseline_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
alcohol (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
chlorides (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
citric acid (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
density (InputLayer)            [(None, 1)]          0                                            
____________________________________________________________________________________________

In [100]:
history = baseline_model.fit(train_dataset, epochs=config.EPOCHS, validation_data=test_dataset)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [69]:
_, train_rmse = baseline_model.evaluate(train_dataset, verbose=0)
_, test_rmse = baseline_model.evaluate(test_dataset, verbose=0)
print(f"Train RMSE {round(train_rmse, 3)}")
print(f"Test  RMSE {round(test_rmse, 3)}")

Train RMSE 0.76
Test  RMSE 0.752


In [91]:
samples, targets = next(iter(test_dataset))
predicted = baseline_model(samples)
for pred, targ in zip(predicted[:5], targets[:5]):
    print(f"Predicted: {pred[0]:.2} - Actual: {targ:.2}")

Predicted: 6.6 - Actual: 7.0
Predicted: 5.4 - Actual: 5.0
Predicted: 5.6 - Actual: 7.0
Predicted: 5.6 - Actual: 6.0
Predicted: 5.8 - Actual: 6.0


## Experiment 2: Bayesian neural network(BNN)

The object of the Bayesian approach for modeling neural networks is to capture
the *epistemic uncertainty*, which is uncertainty about the model fitness,
due to limited training data.

The idea is that, instead of learning specific weight (and bias) *values* in the
neural network, the Bayesian approach learns weight *distributions*
- from which we can sample to produce an output for a given input -
to encode weight uncertainty.

Thus, we need to define prior and the posterior distributions of these weights,
and the training process is to learn the parameters of these distributions.

Define the prior weight distribution as Normal of mean=0 and stddev=1.Note that, in this example, the we prior distribution is not trainable, as we fix its parameters.

In [104]:
def prior(kernel_size, bias_size, dtype=None):
    n = kernel_size + bias_size
    prior_model = Sequential([
        tfp.layers.DistributionLambda(
            lambda t: tfp.distributions.MultivariateNormalDiag(
                # normal of mean=0 and stddev=1
                loc=tf.zeros(n), scale_diag=tf.ones(n)
            )
        )
    ])
    return prior_model

Define variational posterior weight distribution as multivariate Gaussian. Note that the learnable parameters for this distribution are the means, variances, and covariances.

In [105]:
def posterior(kernel_size, bias_size, dtype=None):
    n = kernel_size + bias_size
    posterior_model = Sequential([
        tfp.layers.VariableLayer(
            tfp.layers.MultivariateNormalTriL.params_size(n), dtype=dtype
        ),
        tfp.layers.MultivariateNormalTriL(n)
    ])
    return posterior_model

We use the `tfp.layers.DenseVariational` layer instead of the standard
`keras.layers.Dense` layer in the neural network model.

In [108]:
inputs = create_model_inputs(feature_names=sample_record.keys())
features = layers.concatenate(list(inputs.values()))
features = layers.BatchNormalization()(features)

features = tfp.layers.DenseVariational(
    units=config.BASELINE_HIDDEN_UNITS[0],
    make_prior_fn=prior,
    make_posterior_fn=posterior,
    kl_weight=1 / config.TRAIN_SIZE,
    activation="sigmoid",
)(features)
outputs = Dense(units=1)(features)
bnn_model_small = Model(inputs=inputs, outputs=outputs)
bnn_model_small.compile(
    optimizer=RMSprop(learning_rate=config.LEARNING_RATE),
    loss=MeanSquaredError(),
    metrics=[RootMeanSquaredError()],
)

In [114]:
num_epochs = 500
train_sample_size = int(config.TRAIN_SIZE * 0.3)
small_train_dataset = (
    train_dataset.unbatch().take(train_sample_size).batch(config.BATCH_SIZE)
)

print("Start training the model...")
bnn_model_small.fit(small_train_dataset, epochs=num_epochs, validation_data=test_dataset, verbose=0)
print("Model training finished.")
_, rmse = bnn_model_small.evaluate(small_train_dataset, verbose=0)
print(f"Train RMSE: {round(rmse, 3)}")

print("Evaluating model performance...")
_, rmse = bnn_model_small.evaluate(test_dataset, verbose=0)
print(f"Test RMSE: {round(rmse, 3)}")



Start training the model...
Model training finished.
Train RMSE: 0.744
Evaluating model performance...
Test RMSE: 0.744
