## Bayesian Neural Network

In this notebook we use Coin SVGD (and SVGD) to perform inference in a Bayesian neural network.

In [None]:
import os
from tqdm import tqdm
import random
import time
import copy

import numpy as np
import theano
import theano.tensor as T
from scipy.spatial.distance import pdist, squareform

import matplotlib.pyplot as plt

from plot_utils import *
from utils import *

Before we define the model, let's set up the directories in which plots and results will be saved.

In [None]:
# set up directories for plots & results
plot_dir = "plots/SVGD/BayesNN"
results_dir = "results/SVGD/BayesNN"

# the timing plots / results share the main output directories
time_plot_dir = plot_dir
time_results_dir = results_dir

# exist_ok=True makes creation idempotent and removes the check-then-create race
# of the `if not os.path.exists(...)` pattern
for out_dir in (plot_dir, results_dir, time_plot_dir, time_results_dir):
    os.makedirs(out_dir, exist_ok=True)

Our model and implementation are based on those of Liu & Wang (2016).

In [None]:
"""
    Bayesian Neural Network, following Liu & Wang (2016).
    Adapted from https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/
    
    The model is defined as follows:
    p(y | W, X, \gamma) = \prod_i^N  N(y_i | f(x_i; W), \gamma^{-1})
    p(W | \lambda) = \prod_i N(w_i | 0, \lambda^{-1})
    p(\gamma) = Gamma(\gamma | a0, b0)
    p(\lambda) = Gamma(\lambda | a0, b0)

    The posterior distribution is then given, up to a normalising constant, by
    p(W, \gamma, \lambda | X, y) \propto p(y | W, X, \gamma) p(W | \lambda) p(\gamma) p(\lambda).
    
    To avoid negative values of \gamma and \lambda, we will work with log_gamma and log_lambda.
"""


class BayesNN:
    """
        Bayesian neural network with one hidden layer, fitted with SVGD and/or Coin SVGD.

        All training happens inside the constructor. The last min(10%, 500) rows of the
        training data are held out as a development set, which is used after training to
        re-tune log_gamma for each particle. After every iteration the test set is
        evaluated and the RMSE / log-likelihood are stored in `svgd_rmses`, `svgd_lls`
        (SVGD) and `coin_rmses`, `coin_lls` (Coin SVGD). The final particles are stored
        in `theta` (SVGD) and `theta_coin` (Coin SVGD), one particle per row.

        Input
            -- X_train: training dataset, features
            -- y_train: training labels
            -- X_test, y_test: test features / labels, evaluated after every iteration
            -- batch_size: sub-sampling batch size
            -- max_iter: number of iterations of the training procedure
            -- M: number of particles used to fit the posterior distribution
            -- n_hidden: number of hidden units
            -- a0, b0: hyper-parameters of Gamma distribution
            -- stepsize, auto_corr: parameters of adagrad (used by SVGD only)
            -- do_svgd, do_coin_svgd: whether to do SVGD or coin SVGD or both
            -- alpha: scaling parameter for neural net coin SVGD, default = 100
            -- L_init: initial value of max. observed scale for Coin SVGD, default = 1e-10
    """

    def __init__(self, X_train, y_train, X_test, y_test, batch_size=100, max_iter=1000, M=20, n_hidden=50, a0=1, b0=0.1,
                 stepsize=1e-3, auto_corr=0.9, do_svgd=True, do_coin_svgd=True, alpha=100, L_init=1e-10):

        self.n_hidden = n_hidden
        self.d = X_train.shape[1]
        self.M = M

        # coin parameters
        self.alpha = alpha
        self.L_init = L_init

        # w1: d*n_hidden; b1: n_hidden; w2 = n_hidden; b2 = 1; 2 variances
        num_vars = self.d * n_hidden + n_hidden * 2 + 3
        self.theta = np.zeros([self.M, num_vars])
        self.theta_coin = np.zeros([self.M, num_vars])

        # which methods to use; coerced to bool so that truthy values such as
        # np.bool_(True) or 1 are treated consistently everywhere in the class
        self.do_svgd = bool(do_svgd)
        self.do_coin_svgd = bool(do_coin_svgd)

        # development set: the last rows of the training data
        size_dev = min(int(np.round(0.1 * X_train.shape[0])), 500)
        X_dev, y_dev = X_train[-size_dev:], y_train[-size_dev:]
        X_train, y_train = X_train[:-size_dev], y_train[:-size_dev]

        # normalisation statistics (computed on the remaining training rows only)
        self.std_X_train = np.std(X_train, 0)
        self.std_X_train[self.std_X_train == 0] = 1  # constant features: avoid dividing by zero
        self.mean_X_train = np.mean(X_train, 0)
        self.mean_y_train = np.mean(y_train)
        self.std_y_train = np.std(y_train)

        # save rmses during training
        self.svgd_rmses = np.zeros(max_iter)
        self.coin_rmses = np.zeros(max_iter)

        # save lls during training
        self.svgd_lls = np.zeros(max_iter)
        self.coin_lls = np.zeros(max_iter)

        # compiled gradient of the log posterior, and the prediction function
        logp_gradient, self.nn_predict = self._build_theano_functions(a0, b0, num_vars)

        # normalise; the test features are normalised once here instead of on every iteration
        X_train, y_train = self.normalization(X_train, y_train)
        X_test_norm = self.normalization(X_test)

        # initialise: log_gamma of every particle is set from the residuals of its
        # own random network on (at most) 1000 random training points
        for i in range(self.M):
            w1, b1, w2, b2, log_gamma, log_lambda = self.init_weights(a0, b0)
            ridx = np.random.choice(range(X_train.shape[0]), np.min([X_train.shape[0], 1000]), replace=False)
            y_hat = self.nn_predict(X_train[ridx, :], w1, b1, w2, b2)
            log_gamma = -np.log(np.mean(np.power(y_hat - y_train[ridx], 2)))
            self.theta[i, :] = self.pack_weights(w1, b1, w2, b2, log_gamma, log_lambda)
        # every coordinate of every initial particle is doubled
        self.theta *= 2

        # initialise coin: same starting particles as SVGD
        self.theta_coin = copy.deepcopy(self.theta)
        self.theta_coin_init = copy.deepcopy(self.theta_coin)

        # run svgd
        if self.do_svgd:

            # adagrad with momentum
            fudge_factor = 1e-6
            historical_grad = 0

            for it in range(max_iter):
                grad_theta = self._minibatch_gradients(logp_gradient, self.theta, X_train, y_train, it, batch_size)

                # calculate svgd gradient
                kxy, dxkxy = self.svgd_kernel(h=-1)
                grad_theta = (np.matmul(kxy, grad_theta) + dxkxy) / self.M  # \Phi(x)

                # adagrad
                if it == 0:
                    historical_grad = historical_grad + np.multiply(grad_theta, grad_theta)
                else:
                    historical_grad = auto_corr * historical_grad + (1 - auto_corr) * np.multiply(grad_theta, grad_theta)
                adj_grad = np.divide(grad_theta, fudge_factor + np.sqrt(historical_grad))
                self.theta = self.theta + stepsize * adj_grad

                # save rmse and log-likelihood
                self.svgd_rmses[it], self.svgd_lls[it] = self._evaluate_particles(self.theta, X_test_norm, y_test)

        # coin svgd
        if self.do_coin_svgd:

            # running statistics of the coin-betting update (all element-wise)
            L = self.L_init
            grad_theta_sum = 0
            abs_grad_theta_sum = 0
            reward = 0

            for it in range(max_iter):
                grad_theta = self._minibatch_gradients(logp_gradient, self.theta_coin, X_train, y_train, it,
                                                       batch_size)

                # compute svgd gradient
                kxy, dxkxy = self.svgd_kernel_coin(h=-1)
                grad_theta = (np.matmul(kxy, grad_theta) + dxkxy) / self.M

                # | gradient |
                abs_grad_theta = abs(grad_theta)

                # max gradient
                L = np.maximum(L, abs_grad_theta)

                # sum of gradients
                grad_theta_sum += grad_theta
                abs_grad_theta_sum += abs_grad_theta

                # 'reward'
                reward = np.maximum(reward + np.multiply(self.theta_coin - self.theta_coin_init, grad_theta), 0)

                self.theta_coin = self.theta_coin_init + grad_theta_sum / (
                        L * np.maximum(abs_grad_theta_sum + L, self.alpha * L)) * (L + reward)

                # record rmse and log-lik
                self.coin_rmses[it], self.coin_lls[it] = self._evaluate_particles(self.theta_coin, X_test_norm,
                                                                                  y_test)

        # model selection on the development set
        if self.do_svgd or self.do_coin_svgd:
            X_dev = self.normalization(X_dev)
        if self.do_svgd:
            self._select_log_gamma(self.theta, X_dev, y_dev)
        if self.do_coin_svgd:
            self._select_log_gamma(self.theta_coin, X_dev, y_dev)

    # build the theano graph; returns (gradient function, prediction function)
    def _build_theano_functions(self, a0, b0, num_vars):
        X = T.matrix('X')  # covariates
        y = T.vector('y')  # labels

        w_1 = T.matrix('w_1')  # weights between input layer and hidden layer
        b_1 = T.vector('b_1')  # bias vector of hidden layer
        w_2 = T.vector('w_2')  # weights between hidden layer and output layer
        b_2 = T.scalar('b_2')  # bias of output

        N = T.scalar('N')  # number of observations

        log_gamma = T.scalar('log_gamma')  # variances related parameters
        log_lambda = T.scalar('log_lambda')

        # prediction
        prediction = T.dot(T.nnet.relu(T.dot(X, w_1) + b_1), w_2) + b_2

        # log posterior
        log_lik_data = -0.5 * X.shape[0] * (T.log(2 * np.pi) - log_gamma) - (T.exp(log_gamma) / 2) * T.sum(
            T.power(prediction - y, 2))
        log_prior_data = (a0 - 1) * log_gamma - b0 * T.exp(log_gamma) + log_gamma
        log_prior_w = -0.5 * (num_vars - 2) * (T.log(2 * np.pi) - log_lambda) - (T.exp(log_lambda) / 2) * (
                    (w_1 ** 2).sum() + (w_2 ** 2).sum() + (b_1 ** 2).sum() + b_2 ** 2) \
                      + (a0 - 1) * log_lambda - b0 * T.exp(log_lambda) + log_lambda

        # sub-sample mini-batches: the batch likelihood is rescaled by N / batch size
        log_posterior = (log_lik_data * N / X.shape[0] + log_prior_data + log_prior_w)
        dw_1, db_1, dw_2, db_2, d_log_gamma, d_log_lambda = T.grad(log_posterior,
                                                                   [w_1, b_1, w_2, b_2, log_gamma, log_lambda])

        # automatic gradient
        logp_gradient = theano.function(
            inputs=[X, y, w_1, b_1, w_2, b_2, log_gamma, log_lambda, N],
            outputs=[dw_1, db_1, dw_2, db_2, d_log_gamma, d_log_lambda]
        )

        # prediction function
        nn_predict = theano.function(inputs=[X, w_1, b_1, w_2, b_2], outputs=prediction)

        return logp_gradient, nn_predict

    # gradient of the log posterior for every particle (row of theta) on minibatch number `it`
    def _minibatch_gradients(self, logp_gradient, theta, X_train, y_train, it, batch_size):
        N0 = X_train.shape[0]

        # sub-sampling: cycle through the training rows in order, wrapping around
        batch = [i % N0 for i in range(it * batch_size, (it + 1) * batch_size)]
        X_batch, y_batch = X_train[batch, :], y_train[batch]  # same batch for all particles

        grads = np.zeros(theta.shape)
        for i in range(self.M):
            w1, b1, w2, b2, log_gamma, log_lambda = self.unpack_weights(theta[i, :])
            dw1, db1, dw2, db2, dlog_gamma, dlog_lambda = logp_gradient(X_batch, y_batch, w1, b1, w2, b2,
                                                                      log_gamma, log_lambda, N0)
            grads[i, :] = self.pack_weights(dw1, db1, dw2, db2, dlog_gamma, dlog_lambda)
        return grads

    # model selection: per particle, replace log_gamma (in place, column -2 of theta) by the
    # residual-based heuristic if that gives a higher development-set log-likelihood
    def _select_log_gamma(self, theta, X_dev, y_dev):
        for i in range(self.M):
            w1, b1, w2, b2, log_gamma, log_lambda = self.unpack_weights(theta[i, :])
            pred_y_dev = self.nn_predict(X_dev, w1, b1, w2, b2) * self.std_y_train + self.mean_y_train
            sq_err = np.power(pred_y_dev - y_dev, 2)

            # likelihood
            # NOTE: takes the log of the density itself, so it can underflow to -inf for large residuals
            def f_log_lik(lg):
                return np.sum(np.log(np.sqrt(np.exp(lg)) / np.sqrt(2 * np.pi) * np.exp(
                    -1 * (sq_err / 2) * np.exp(lg))))

            lik1 = f_log_lik(log_gamma)

            # heuristic
            heuristic_log_gamma = -np.log(np.mean(sq_err))
            lik2 = f_log_lik(heuristic_log_gamma)

            # update log_gamma
            if lik2 > lik1:
                theta[i, -2] = heuristic_log_gamma

    # normalisation
    def normalization(self, X, y=None):
        """
            Standardise X (and y, if given) with the training mean / std.
            Returns X alone when y is None, otherwise the tuple (X, y).
        """
        X = (X - self.mean_X_train) / self.std_X_train

        if y is None:
            return X
        y = (y - self.mean_y_train) / self.std_y_train
        return (X, y)

    # initialisation
    def init_weights(self, a0, b0):
        """
            Draw one random particle: Gaussian weights scaled by 1 / sqrt(fan_in + 1),
            zero biases, and log_gamma / log_lambda as logs of np.random.gamma(a0, b0) draws.
        """
        w1 = 1.0 / np.sqrt(self.d + 1) * np.random.randn(self.d, self.n_hidden)
        b1 = np.zeros((self.n_hidden,))
        w2 = 1.0 / np.sqrt(self.n_hidden + 1) * np.random.randn(self.n_hidden)
        b2 = 0.
        log_gamma = np.log(np.random.gamma(a0, b0))
        log_lambda = np.log(np.random.gamma(a0, b0))
        return w1, b1, w2, b2, log_gamma, log_lambda

    # rbf kernel matrix and its summed gradient for a set of particles (one per row)
    @staticmethod
    def _rbf_kernel(theta, h=-1):
        pairwise_dists = squareform(pdist(theta)) ** 2  # squared Euclidean distances

        # median rule
        if h < 0:
            h = np.median(pairwise_dists)
            h = np.sqrt(0.5 * h / np.log(theta.shape[0] + 1))
            if h == 0:
                # coincident particles (e.g. a single particle) give a zero median, which
                # would fill the kernel with NaN; any positive bandwidth yields kxy = 1
                # and a zero repulsive term in that case
                h = 1.0

        # rbf kernel
        kxy = np.exp(-pairwise_dists / h ** 2 / 2)

        # rbf kernel grad
        dxkxy = -np.matmul(kxy, theta)
        sumkxy = np.sum(kxy, axis=1)
        dxkxy = dxkxy + theta * sumkxy[:, np.newaxis]
        dxkxy = dxkxy / (h ** 2)

        return kxy, dxkxy

    # svgd kernel
    def svgd_kernel(self, h=-1):
        """
            RBF kernel of the SVGD particles; h < 0 selects the bandwidth by the median rule.
            Returns (kxy, dxkxy).
        """
        return self._rbf_kernel(self.theta, h)

    # svgd kernel (for coin update)
    def svgd_kernel_coin(self, h=-1):
        """
            RBF kernel of the Coin SVGD particles; h < 0 selects the bandwidth by the median rule.
            Returns (kxy, dxkxy).
        """
        return self._rbf_kernel(self.theta_coin, h)

    # pack parameters
    def pack_weights(self, w1, b1, w2, b2, log_gamma, log_lambda):
        """
            Flatten one particle into a vector laid out as [w1, b1, w2, b2, log_gamma, log_lambda].
        """
        params = np.concatenate([w1.flatten(), b1, w2, [b2], [log_gamma], [log_lambda]])
        return params

    # unpack parameters
    def unpack_weights(self, z):
        """
            Inverse of pack_weights: split a flat particle vector into
            (w1, b1, w2, b2, log_gamma, log_lambda).
        """
        n_w1 = self.d * self.n_hidden
        n_layer1 = (self.d + 1) * self.n_hidden

        w1 = np.reshape(z[:n_w1], [self.d, self.n_hidden])
        b1 = z[n_w1:n_layer1]

        tail = z[n_layer1:]
        w2, b2 = tail[:self.n_hidden], tail[-3]

        # the last two parameters are log variance
        log_gamma, log_lambda = tail[-2], tail[-1]

        return w1, b1, w2, b2, log_gamma, log_lambda

    # test RMSE and log-likelihood of one particle set, given already-normalised features
    def _evaluate_particles(self, theta, X_test, y_test):
        pred_y_test = np.zeros([self.M, len(y_test)])
        prob = np.zeros([self.M, len(y_test)])

        for i in range(self.M):
            w1, b1, w2, b2, log_gamma, log_lambda = self.unpack_weights(theta[i, :])
            pred_y_test[i, :] = self.nn_predict(X_test, w1, b1, w2, b2) * self.std_y_train + self.mean_y_train
            prob[i, :] = np.sqrt(np.exp(log_gamma)) / np.sqrt(2 * np.pi) * np.exp(
                -1 * (np.power(pred_y_test[i, :] - y_test, 2) / 2) * np.exp(log_gamma))

        # average over the output
        pred = np.mean(pred_y_test, axis=0)
        rmse = np.sqrt(np.mean((pred - y_test) ** 2))
        ll = np.mean(np.log(np.mean(prob, axis=0)))
        return rmse, ll

    # evaluate test RMSE and log-likelihood
    def evaluation(self, X_test, y_test):
        """
            Evaluate the current particles on (un-normalised) test data.

            Returns
                -- (svgd_rmse, svgd_ll) if only SVGD is enabled
                -- (svgd_coin_rmse, svgd_coin_ll) if only Coin SVGD is enabled
                -- (svgd_rmse, svgd_ll, svgd_coin_rmse, svgd_coin_ll) if both are enabled
                -- None if neither is enabled
        """
        # normalise
        X_test = self.normalization(X_test)

        results = ()
        if self.do_svgd:
            results += self._evaluate_particles(self.theta, X_test, y_test)
        if self.do_coin_svgd:
            results += self._evaluate_particles(self.theta_coin, X_test, y_test)

        return results if results else None

Now we're ready to run our method.

In [None]:
# Run SVGD over a grid of learning rates, and Coin SVGD once, on every data set and
# repetition; save the results and plot test RMSE / log-likelihood against learning rate.
np.random.seed(42)

# data sets
data_names = ["concrete", "energy", "protein-structure", "boston", "kin8nm", "wine", "yacht"]
data_names_abrv = ["concrete", "energy", "protein", "boston", "kin8nm", "wine", "yacht"]

# number of repeats
n_reps = 10

# array to store times
svgd_times = np.zeros((n_reps, len(data_names)))
coin_times = np.zeros((n_reps, len(data_names)))

for ii, data_name in enumerate(data_names):
    # passing the path (rather than an open handle) lets np.loadtxt close the file itself
    data = np.loadtxt("data/" + data_name + ".csv", delimiter=",", skiprows=1)

    # last column is target
    X_input = data[:, :-1]
    y_input = data[:, -1]

    step_sizes = np.logspace(-10, -0.5, 20)
    n_sims = len(step_sizes)

    svgd_results = np.zeros([n_reps, 2, n_sims])
    svgd_coin_results = np.zeros([n_reps, 2])
    # NOTE(review): never filled in this cell, so the statistics derived from it below are all-zero inputs
    svgd_coin_results_alt = np.zeros([n_reps, 2])

    for rep in range(n_reps):

        np.random.seed(rep)

        # train and test data
        # The split is drawn from NumPy's generator so that it is controlled by the seed
        # above; the stdlib `random` module is not seeded by np.random.seed.
        train_ratio = 0.9
        permutation = np.random.permutation(X_input.shape[0])

        size_train = int(np.round(X_input.shape[0] * train_ratio))
        index_train = permutation[0: size_train]
        index_test = permutation[size_train:]

        X_train, y_train = X_input[index_train, :], y_input[index_train]
        X_test, y_test = X_input[index_test, :], y_input[index_test]

        # neural network parameters
        batch_size, n_hidden, max_iter = 100, 50, 2000
        alpha = 100

        if data_name == "protein-structure":
            n_hidden = 100

        if data_name == "boston":
            alpha = 1000

        for sim_idx, step_size in enumerate(step_sizes):
            print(f"Dataset: {data_name}, Repetition: {rep}/{n_reps}, Iteration {sim_idx}/{n_sims}")

            # SVGD for this learning rate (alpha is only used by Coin SVGD, so it is inert here)
            svgd_start = time.time()
            svgd = BayesNN(X_train, y_train, X_test, y_test, batch_size=batch_size, n_hidden=n_hidden,
                           max_iter=max_iter, stepsize=step_size, do_svgd=True, do_coin_svgd=False,
                           alpha=alpha)
            svgd_end = time.time()
            svgd_rmse, svgd_ll = svgd.evaluation(X_test, y_test)
            svgd_results[rep, 0, sim_idx], svgd_results[rep, 1, sim_idx] = svgd_rmse, svgd_ll

            if sim_idx == 0:
                # only the run at the first learning rate is timed
                svgd_time = svgd_end - svgd_start
                svgd_times[rep, ii] = svgd_time

                # Coin SVGD has no learning rate, so it is run (and timed) once per repetition
                coin_start = time.time()
                svgd_coin = BayesNN(X_train, y_train, X_test, y_test, batch_size=batch_size, n_hidden=n_hidden,
                                    max_iter=max_iter, stepsize=step_size, do_svgd=False, do_coin_svgd=True,
                                    alpha=alpha)
                coin_end = time.time()
                coin_time = coin_end - coin_start
                coin_times[rep, ii] = coin_time

                svgd_coin_rmse, svgd_coin_ll = svgd_coin.evaluation(X_test, y_test)
                svgd_coin_results[rep, 0], svgd_coin_results[rep, 1] = svgd_coin_rmse, svgd_coin_ll

    np.save(f"{results_dir}/svgd_{data_name}", svgd_results)
    np.save(f"{results_dir}/coin_svgd_{data_name}", svgd_coin_results)

    # plot rmse vs learning rate
    plt.close("all")
    mean_svgd = np.mean(svgd_results, axis=0)
    # NOTE(review): the bounds are unpacked as (upper, lower) here but as (lower, upper) in the
    # timing section further down -- confirm the return order of return_confidence_interval
    upper_svgd, lower_svgd = return_confidence_interval(svgd_results)
    mean_coin = np.mean(svgd_coin_results, axis=0)
    upper_coin, lower_coin = return_confidence_interval(svgd_coin_results)
    mean_coin_alt = np.mean(svgd_coin_results_alt, axis=0)
    upper_coin_alt, lower_coin_alt = return_confidence_interval(svgd_coin_results_alt)

    # the largest learning rate is left out of the RMSE plot
    plt.plot(step_sizes[:-1], mean_svgd[0, :-1], ".-", label="SVGD", color="C0")
    plt.fill_between(step_sizes[:-1], lower_svgd[0, :-1], upper_svgd[0, :-1], color="C0", alpha=0.1)
    plt.grid(visible=True, color="whitesmoke", ls='-')
    plt.axhline(y=mean_coin[0], xmin=0, xmax=1, color="C1", label="Coin SVGD")
    plt.fill_between(step_sizes[:-1], lower_coin[0], upper_coin[0], color="C1", alpha=0.1)

    plt.xscale("log")
    plt.xlabel("Learning Rate", fontsize=18)
    plt.ylabel("Test RMSE", fontsize=18)
    plt.legend(prop={'size': 15})
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
    if data_name == "kin8nm":
        plt.gcf().subplots_adjust(bottom=0.15, left=0.20)
    fname = f"{plot_dir}/rmse_vs_lr_{data_name}.pdf"
    plt.savefig(fname, format="pdf")
    plt.show()

    # plot log-lik vs learning rate
    plt.plot(step_sizes, mean_svgd[1, :], ".-", label="SVGD")
    plt.fill_between(step_sizes, lower_svgd[1, :], upper_svgd[1, :], color="C0", alpha=0.1)
    plt.grid(visible=True, color="whitesmoke", ls='-')
    plt.axhline(y=mean_coin[1], xmin=0, xmax=1, color="C1", label="Coin SVGD")
    plt.fill_between(step_sizes, lower_coin[1], upper_coin[1], color="C1", alpha=0.1)
    plt.xscale("log")
    plt.xlabel("Learning Rate", fontsize=18)
    plt.ylabel("Test Log-Likelihood", fontsize=18)
    plt.legend(prop={'size': 15})
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.gcf().subplots_adjust(bottom=0.15, left=0.18)
    if data_name == "kin8nm":
        plt.gcf().subplots_adjust(bottom=0.15, left=0.20)
    fname = f"{plot_dir}/ll_vs_lr_{data_name}.pdf"
    plt.savefig(fname, format="pdf")
    plt.show()

# NOTE(review): these arrays cover *all* data sets, but the file names pick up the last value of
# the loop variable `data_name`; the names are kept unchanged in case other code loads them.
np.save(f"{results_dir}/all_svgd_times_{data_name}", svgd_times)
np.save(f"{results_dir}/all_coin_times_{data_name}", coin_times)

# times: mean wall-clock time per data set, averaged over repetitions
mean_svgd_time = np.mean(svgd_times, axis=0)
# NOTE(review): unpacked as (lower, upper) here but as (upper, lower) in the per-dataset plots
# above -- confirm the return order of return_confidence_interval
lower_svgd_time, upper_svgd_time = return_confidence_interval(svgd_times)
# the interval of the last data set is overridden with a (near-)degenerate one around the mean
lower_svgd_time[-1], upper_svgd_time[-1] = mean_svgd_time[-1]-1e-10, mean_svgd_time[-1]+1e-10
mean_svgd_time_CIs = [mean_svgd_time - lower_svgd_time, upper_svgd_time - mean_svgd_time]

mean_coin_time = np.mean(coin_times, axis=0)
lower_coin_time, upper_coin_time = return_confidence_interval(coin_times)
lower_coin_time[-1],  upper_coin_time[-1] = mean_coin_time[-1], mean_coin_time[-1]
mean_coin_time_CIs = [mean_coin_time - lower_coin_time, upper_coin_time - mean_coin_time]

# one x position per data set; a hard-coded count breaks scatter as soon as it
# disagrees with the number of data sets (x and y must have the same size)
x_tick_locs = np.arange(len(data_names_abrv))

plt.figure(figsize=(12, 6))
plt.scatter(x_tick_locs, mean_svgd_time, marker="o", color="C0", label="SVGD", s=100, zorder=10)
# Coin SVGD markers are shifted slightly to the right so the two methods do not overlap
plt.scatter(x_tick_locs + 0.2, mean_coin_time, marker="D", color="C1", label="Coin SVGD", s=100, zorder=9)
plt.legend(prop={'size': 18})
plt.grid(color='whitesmoke')
plt.xlabel("Dataset", fontsize=18)
plt.ylabel("Clock Time (s)", fontsize=18)
plt.xticks(x_tick_locs, data_names_abrv, rotation=60)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
fname = plot_dir + "/" + "all_times" + ".pdf"
plt.savefig(fname, bbox_inches="tight", dpi=300)
plt.show()