# Gaussian Processes

## 1. Introduction

### 1.1 Abstract

...

In this article, we'll first review the fundamentals of Gaussian processes.

### 1.2 Contents
1. Introduction
2. Getting Started
3. Multivariate Normal (MVN)
4. Gaussian Processes
5. Gaussian Process Regression
6. Gaussian Process Classification

## 2. Getting Started
The examples in this notebook make use of additional libraries and the plotting functions defined below. Make sure you have installed the requirements, and then run this cell once to import modules and define utility functions before continuing.

In [407]:
import numpy as np
import matplotlib.pyplot as plt
import GPy
GPy.plotting.change_plotting_library('plotly')

# GPy's built-in plotly implementation is broken due to deprecated functions. :(
# So my own plotting function is implemented here for aesthetically pleasing plots.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Named hex colours borrowed from Matplotlib's palette
colorsHex = dict(
    black="#000000",
    darkBlue="#204a87",
    blue="#1f77b4",
    orange="#ff7f0e",
    green="#2ca02c",
    red="#d62728",
    purple="#9467bd",
    brown="#8c564b",
    yellow="#bcbd22",
    pink="#e377c2",
    teal="#17becf",
)

# Sample lines take their colours from this wheel: the first 7 are unique, after that we cycle
colorWheel = "orange green red purple pink yellow teal".split()
# Colour used for the mean line
meanColor = colorsHex["blue"]

def hex2rgb(hexcolor):
    '''
    Convert a '#rrggbb' colour string into an (r, g, b) tuple of integers.

    Args:
        hexcolor -- string whose characters 1-2, 3-4 and 5-6 are the hex digits of the red, green and blue channels
    '''
    channels = [hexcolor[start:start + 2] for start in (1, 3, 5)]
    r, g, b = (int(pair, 16) for pair in channels)
    return (r, g, b)

def plotly_fill_between(ax, X, lower, upper, color=colorsHex['blue'], label=None, hide_legend=False, line_kwargs=None, **kwargs):
    '''
    Build the pair of plotly traces used to draw a shaded band between two curves.

    Args:
        ax -- indexable pair; ax[0] and ax[1] are only used to build the legend group name
        X -- x values shared by both curves
        lower, upper -- y values of the lower and upper boundary
        color -- fill colour; a '#rrggbb' string is converted to an rgba() string with alpha 0.1, anything else is used as is
        label -- legend name of the band; with no label the band never appears in the legend
        hide_legend -- if True, keep the band out of the legend even when a label is given
        line_kwargs -- options for the boundary lines, merged into kwargs['line']
        **kwargs -- further keyword arguments forwarded to both go.Scatter traces

    Returns:
        (lower_trace, upper_trace); callers in this file add the lower trace first, then the upper one.
    '''
    line_options = line_kwargs or {}
    if 'line' in kwargs:
        kwargs['line'].update(line_options)
    else:
        kwargs['line'] = go.scatter.Line(**line_options)

    # Hex colours become a translucent rgba string; other colour specs pass through untouched
    fcolor = color
    if color.startswith('#'):
        fcolor = 'rgba({c[0]}, {c[1]}, {c[2]}, {alpha})'.format(c=hex2rgb(color), alpha=0.1)

    in_legend = label is not None and not hide_legend
    upper_trace = go.Scatter(
        x=X,
        y=upper,
        fill='tonextx',
        fillcolor=fcolor,
        showlegend=in_legend,
        name=label,
        legendgroup='{}_fill_({},{})'.format(label, ax[0], ax[1]),
        **kwargs
    )
    lower_trace = go.Scatter(
        x=X,
        y=lower,
        fillcolor=fcolor,
        showlegend=False,
        name=label,
        legendgroup='{}_fill_({},{})'.format(label, ax[0], ax[1]),
        **kwargs
    )
    return lower_trace, upper_trace

def plotly_gp(mu, cov, X, X_train=None, Y_train=None, samples=None, title=""):
    '''
    Plot a Gaussian process using plotly.

    Draws a shaded band of +/- 1.96 standard deviations around the mean, the mean itself,
    any sampled functions and (when both are given) the training points, then shows the figure.

    Args:
        mu -- the mean vector
        cov -- the covariance matrix (only its diagonal is used)
        X -- the grid of X values
        X_train, Y_train -- the training data i.e. any points with known values, to be plotted as scatter points.
        samples -- samples (functions) drawn from the Gaussian process; defaults to no samples.
        title -- the title of the figure
    '''
    if samples is None:
        samples = []

    fig = go.Figure(layout=go.Layout(hovermode='x'))
    fig.update_layout(title_text=title)

    X = np.squeeze(X)
    mu = np.squeeze(mu)

    # Plot confidence interval as shaded area.
    # Round-off can leave tiny negative variances on the diagonal; clamp them so sqrt never yields NaN.
    uncertainty = 1.96 * np.sqrt(np.maximum(np.diag(cov), 0.))
    confidence_line = {'color': colorsHex['darkBlue'], 'width': 0.5}
    l, u = plotly_fill_between((fig.layout.xaxis, fig.layout.yaxis), X, mu - uncertainty, mu + uncertainty, label="Confidence", line_kwargs=confidence_line)
    fig.add_trace(l)
    fig.add_trace(u)

    # Plot mean
    fig.add_trace(go.Scatter(x=X, y=mu, name='Mean', line=go.scatter.Line(color=meanColor)))

    # Plot samples, cycling through the colour wheel
    for i, sample in enumerate(samples):
        line_color = colorsHex[colorWheel[i % len(colorWheel)]]
        fig.add_trace(go.Scatter(x=X, y=sample, name=f'Sample {i+1}', line=go.scatter.Line(color=line_color)))

    # Plot training data, if any
    if X_train is not None and Y_train is not None:
        fig.add_trace(go.Scatter(x=X_train.ravel(), y=Y_train.ravel(), mode='markers', marker=dict(color=colorsHex['black'], symbol='x'), name='Data'))

    fig.show()
    
def plotly_gps(data=None, nrows=1, ncols=1, title=""):
    '''
    Plot several Gaussian processes together as subplots.
    Plots are similar in style to plotly_gp but share a single legend.

    Subplots are filled row by row. Entries of data beyond nrows * ncols are ignored;
    if data has fewer entries than grid cells, the remaining cells are left empty.

    Args:
        data -- a list of dicts/objects, one per subplot, including the args to plotly i.e. mu, cov, X, and optionally X_train, Y_train, samples, title
        nrows, ncols -- the dimensions of the subplot grid
        title -- the main title of the figure
    '''
    # Keep only as many entries as there are grid cells
    data = [] if data is None else list(data)[:nrows * ncols]

    titles = [subplot_data['title'] if 'title' in subplot_data else None for subplot_data in data]
    fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=titles)
    fig.update_layout(title_text=title, hovermode='x')

    for index, subplot_data in enumerate(data):
        # Row-major position of this entry in the grid (plotly rows/cols are 1-based)
        r, c = divmod(index, ncols)
        row, col = r + 1, c + 1

        mu = np.squeeze(subplot_data['mu'])
        cov = subplot_data['cov']
        X = np.squeeze(subplot_data['X'])
        X_train = subplot_data['X_train'] if 'X_train' in subplot_data else None
        Y_train = subplot_data['Y_train'] if 'Y_train' in subplot_data else None
        samples = subplot_data['samples'] if 'samples' in subplot_data else []

        # Only the first subplot contributes legend entries; the others join its legend groups
        showlegend = r == 0 and c == 0

        # Plot confidence.
        # Round-off can leave tiny negative variances on the diagonal; clamp them so sqrt never yields NaN.
        uncertainty = 1.96 * np.sqrt(np.maximum(np.diag(cov), 0.))
        confidence_line = {'color': colorsHex['darkBlue'], 'width': 0.5}
        # Pass (0,0) as axes for shared legend group across all subplots
        l, u = plotly_fill_between((0, 0), X, mu - uncertainty, mu + uncertainty, label="Confidence", hide_legend=not showlegend, line_kwargs=confidence_line)
        fig.add_trace(l, row=row, col=col)
        fig.add_trace(u, row=row, col=col)

        # Plot mean
        fig.add_trace(go.Scatter(x=X, y=mu, name='Mean', legendgroup='Mean', showlegend=showlegend, line=go.scatter.Line(color=meanColor)), row=row, col=col)

        # Plot samples, cycling through the colour wheel
        for i, sample in enumerate(samples):
            line_color = colorsHex[colorWheel[i % len(colorWheel)]]
            fig.add_trace(go.Scatter(x=X, y=sample, name=f'Sample {i+1}', legendgroup=f'Sample {i+1}', showlegend=showlegend, line=go.scatter.Line(color=line_color)), row=row, col=col)

        # Plot training
        if X_train is not None and Y_train is not None:
            fig.add_trace(go.Scatter(x=X_train.ravel(), y=Y_train.ravel(), legendgroup='Data', showlegend=showlegend, mode='markers', marker=dict(color=colorsHex['black'], symbol='x'), name='Data'), row=row, col=col)
    fig.show()


## 3. Multivariate Normal (MVN)


## 4. Gaussian Processes

### 4.2 Covariance Functions

Polynomial
$$ k({\bf x}_1, {\bf x}_2) = \sigma^2 (x + l \, {\bf x}_1^\top {\bf x}_2)^k $$

Exponential
$$ k({\bf x}_1, {\bf x}_2) = \exp \Big( -\frac{|{\bf x}_1 - {\bf x}_2|}{l}\Big)$$

Squared exponential (RBF)
$$ k({\bf x}_1, {\bf x}_2) = \sigma_f^2 \exp \Big( - \frac{|{\bf x}_1 - {\bf x}_2|^2}{2l^2} \Big)$$

Gamma exponential
$$ k({\bf x}_1, {\bf x}_2) = \exp \Big( - \Big( \frac{|{\bf x}_1 - {\bf x}_2 |}{l} \Big)^\gamma \Big) $$

Rational quadratic
$$ k({\bf x}_1, {\bf x}_2) = \Big( 1 + \frac{|{\bf x}_1 - {\bf x}_2|^2}{2 \alpha l^2} \Big)^{-\alpha} $$

Neural network (arc sine kernel)
$$ k({\bf x}_1, {\bf x}_2) = \sigma^2 \frac{2}{\pi} {\rm asin}\Big( \frac{ \sigma_w^2 {\bf x}_1^\top {\bf x}_2 + \sigma_b^2 }{\sqrt{\sigma_w^2 {\bf x}_1^\top {\bf x}_1 + \sigma_b^2 + 1} \sqrt{\sigma_w^2 {\bf x}_2^\top  {\bf x}_2 + \sigma_b^2 + 1 }} \Big) $$

In [430]:
# HELPER FUNCTIONS
def squared_dist(x1, x2):
    '''
    Pairwise squared Euclidean distances between the rows of x1 and the rows of x2.

    Args:
        x1 -- array of m points (m x d)
        x2 -- array of n points (n x d)

    Returns:
        (m x n) matrix whose (i, j) entry is |x1[i] - x2[j]|^2, never negative
    '''
    # Expansion |a|^2 + |b|^2 - 2 a.b avoids building an (m x n x d) intermediate
    sq = np.sum(x1**2, 1).reshape(-1, 1) + np.sum(x2**2, 1) - 2 * np.dot(x1, x2.T)
    # Cancellation can leave tiny negative values for (near-)identical points; clamp them
    # so callers can safely take square roots or fractional powers.
    return np.maximum(sq, 0)

def comp_prod(X1, weight_variance, bias_variance, X2=None):
    '''
    Weighted inner products plus a bias term, as used by the neural network kernel.

    Args:
        X1 -- array of m points (m x d)
        weight_variance -- factor applied to the products
        bias_variance -- constant added to every result
        X2 -- optional array of n points (n x d)

    Returns:
        With X2: (m x n) matrix of weight_variance * X1[i].X2[j] + bias_variance.
        Without X2: length-m vector of weight_variance * X1[i].X1[i] + bias_variance.
    '''
    if X2 is not None:
        cross_terms = (X1 * weight_variance).dot(X2.T)
        return cross_terms + bias_variance

    weighted_squares = np.square(X1) * weight_variance
    return weighted_squares.sum(axis=1) + bias_variance

# DEFINE KERNELS

def poly_kernel(x1, x2, l=1., x=1., sigma=1., k=3.):
    '''
    Polynomial kernel: sigma**2 * (l * x1.x2 + x)**k.

    Args:
        x1 -- array of m points (m x d)
        x2 -- array of n points (n x d)
        l -- scale applied to the inner product
        x -- bias added to the scaled inner product
        sigma -- defines the variance, sigma**2
        k -- order (exponent) of the polynomial

    Returns:
        (m x n) matrix
    '''
    scaled_inner = l * np.dot(x1, x2.T)
    return (sigma**2) * (scaled_inner + x) ** k
    
def exp_kernel(x1, x2, l=1.):
    '''
    Exponential kernel, as defined above: exp(-|x1 - x2| / l).

    Args:
        x1 -- array of m points (m x d)
        x2 -- array of n points (n x d)
        l -- length or smoothness parameter

    Returns:
        (m x n) matrix
    '''
    # Guard against tiny negative squared distances caused by round-off,
    # which would otherwise turn into NaN under the square root.
    dist = np.sqrt(np.maximum(squared_dist(x1, x2), 0))
    return np.exp(-dist / l)
    
def rbf(x1, x2, l=1.0, sigma_f=1.0):
    '''
    Squared exponential (RBF) kernel: sigma_f**2 * exp(-|x1 - x2|^2 / (2 * l**2)).

    Args:
        x1 -- array of m points (m x d)
        x2 -- array of n points (n x d)
        l -- length or smoothness parameter
        sigma_f -- defines the variance, sigma_f**2

    Returns:
        (m x n) matrix
    '''
    exponent_scale = -0.5 / l**2
    amplitude = sigma_f**2
    return amplitude * np.exp(exponent_scale * squared_dist(x1, x2))

def gamma_exp(x1, x2, l=.5, gamma=.5):
    '''
    Gamma exponential kernel, as defined above: exp(-(|x1 - x2| / l)**gamma).

    Args:
        x1 -- array of m points (m x d)
        x2 -- array of n points (n x d)
        l -- length or smoothness parameter
        gamma -- exponent applied to the scaled distance

    Returns:
        (m x n) matrix
    '''
    # The formula uses the distance, not the squared distance, so take the square root first.
    # Clamp at zero beforehand: round-off can make squared distances slightly negative,
    # and sqrt / fractional powers of negative numbers give NaN.
    dist = np.sqrt(np.maximum(squared_dist(x1, x2), 0))
    return np.exp(-(dist / l)**gamma)

def rational_quadratic(x1, x2, l=3., alpha=3.):
    '''
    Rational quadratic kernel: (1 + |x1 - x2|^2 / (2 * alpha * l**2))**(-alpha).

    Args:
        x1 -- array of m points (m x d)
        x2 -- array of n points (n x d)
        l -- length or smoothness parameter
        alpha -- appears both in the denominator and as the (negated) exponent

    Returns:
        (m x n) matrix
    '''
    denominator = 2 * alpha * l**2
    base = 1 + squared_dist(x1, x2) / denominator
    return base**(-alpha)

def neural_network(x1, x2, sigma=1., sigma_w=1., sigma_b=1.):
    '''
    Neural network kernel (a.k.a. arc sine or MLP kernel), following the formula given above.

    Args:
        x1 -- array of m points (m x d)
        x2 -- array of n points (n x d)
        sigma -- defines the variance sigma**2
        sigma_w -- defines variance sigma_w**2 of prior over input weights
        sigma_b -- define variance sigma_b**2 of prior over bias parameters

    Returns:
        (m x n) matrix
    '''
    weight_var = sigma_w**2
    bias_var = sigma_b**2

    # Numerator: sigma_w^2 * x1.x2 + sigma_b^2 for every pair of points
    cross = comp_prod(x1, weight_var, bias_var, x2)
    # Per-point normalisers: sqrt(sigma_w^2 * x.x + sigma_b^2 + 1)
    norm1 = np.sqrt(comp_prod(x1, weight_var, bias_var) + 1.)
    norm2 = np.sqrt(comp_prod(x2, weight_var, bias_var) + 1.)

    ratio = cross / norm1[:, None] / norm2[None, :]
    return sigma**2 * (2. / np.pi) * np.arcsin(ratio)
    
# GPy RBF kernel object for 1-D inputs with variance 1 and lengthscale 1.
# NOTE(review): presumably the GPy counterpart of rbf(l=1, sigma_f=1) above -- confirm.
gpy_rbf = GPy.kern.RBF(input_dim=1, variance=1., lengthscale=1.)

In [459]:
# Plot priors: for each kernel, build the covariance on a common grid,
# draw NSAMPLES functions from the zero-mean prior and collect everything for a grid of subplots.
NSAMPLES = 3
X = np.linspace(-5., 5., 500)[:, None]
mu = np.zeros(X.shape[0])

# 1. RBF or Squared Exponential
l1 = .5
sigma_f = 1.
rbf_cov = rbf(X, X, l=l1, sigma_f=sigma_f)
# Sample from the RBF covariance computed just above
rbf_samples = np.random.multivariate_normal(mu, rbf_cov, NSAMPLES)
rbf_title = "Squared Exponential (RBF) (l={l}, \u03C3_f={sigma_f})".format(l=l1, sigma_f=sigma_f)

# 2. Gamma Exponential
l2 = .5
gamma = .5
gamma_cov = gamma_exp(X, X, l=l2, gamma=gamma)
gamma_samples = np.random.multivariate_normal(mu, gamma_cov, NSAMPLES)
gamma_title = "Gamma Exponential (l={l}, \u03B3={gamma})".format(l=l2, gamma=gamma)

# 3. Rational Quadratic
l3 = 3.
alpha = 3.
rq_cov = rational_quadratic(X, X, l=l3, alpha=alpha)
rq_samples = np.random.multivariate_normal(mu, rq_cov, NSAMPLES)
rq_title = "Rational Quadratic (l={l}, \u03B1={alpha})".format(l=l3, alpha=alpha)

# 4. Neural Network
sigma_n = 1.
sigma_w = 3.5
sigma_b = 3
nn_cov = neural_network(X, X, sigma=sigma_n, sigma_w=sigma_w, sigma_b=sigma_b)
# Equivalently, in GPy:
# gpy_nn = GPy.kern.MLP(input_dim=1, variance=sigma_n**2, weight_variance=sigma_w**2, bias_variance=sigma_b**2)
# nn_cov = gpy_nn.K(X, X)
nn_samples = np.random.multivariate_normal(mu, nn_cov, NSAMPLES)
nn_title = "Neural Network (\u03C3={sigma}, \u03C3_w={sigma_w}, \u03C3_b={sigma_b})".format(sigma=sigma_n, sigma_w=sigma_w, sigma_b=sigma_b)

# 5. Exponential
l5 = 1.
exp_cov = exp_kernel(X, X, l=l5)
exp_samples = np.random.multivariate_normal(mu, exp_cov, NSAMPLES)
exp_title = "Exponential (l={l})".format(l=l5)

# 6. Polynomial
l6 = 1.
k = 3.
sigma_p = 1.
x_p = 1.
poly_cov = poly_kernel(X, X, l=l6, x=x_p, sigma=sigma_p, k=k)
# Equivalently, in GPy:
# gpy_poly = GPy.kern.Poly(input_dim=1, order=k, bias=x_p, variance=sigma_p**2, scale=l6)
# poly_cov = gpy_poly.K(X, X)
poly_samples = np.random.multivariate_normal(mu, poly_cov, NSAMPLES)
poly_title = "Polynomial (\u03C3={sigma}, l={l}, k={order}, x={x})".format(sigma=sigma_p, l=l6, order=k, x=x_p)

# View the prior plots in a grid for comparison (3 rows x 2 columns, filled row by row)
subplot_data = [
    {'mu': mu, 'cov': rbf_cov, 'X': X, 'samples': rbf_samples, 'title': rbf_title},
    {'mu': mu, 'cov': gamma_cov, 'X': X, 'samples': gamma_samples, 'title': gamma_title},
    {'mu': mu, 'cov': rq_cov, 'X': X, 'samples': rq_samples, 'title': rq_title},
    {'mu': mu, 'cov': nn_cov, 'X': X, 'samples': nn_samples, 'title': nn_title},
    {'mu': mu, 'cov': exp_cov, 'X': X, 'samples': exp_samples, 'title': exp_title},
    {'mu': mu, 'cov': poly_cov, 'X': X, 'samples': poly_samples, 'title': poly_title},
]
plotly_gps(subplot_data, 3, 2, title="Gaussian Process Priors")



In [460]:
# UNCOMMENT ONE OF THESE TO TAKE A CLOSER LOOK AT ONE OF THE PRIOR PLOTS ABOVE.
# Each call must pass the covariance that matches its samples and title.
plotly_gp(mu, rbf_cov, X, samples=rbf_samples, title=rbf_title)
# plotly_gp(mu, gamma_cov, X, samples=gamma_samples, title=gamma_title)
# plotly_gp(mu, rq_cov, X, samples=rq_samples, title=rq_title)
# plotly_gp(mu, nn_cov, X, samples=nn_samples, title=nn_title)
# plotly_gp(mu, exp_cov, X, samples=exp_samples, title=exp_title)
# plotly_gp(mu, poly_cov, X, samples=poly_samples, title=poly_title)

## 5. Gaussian Process Regression

In [485]:
from numpy.linalg import inv

def posterior(X_s, X_train, Y_train, kernel, sigma_y=1e-8, **kernel_kwargs):
    '''
    Compute the statistics (mu, cov) of the posterior distribution,
    from m training data (X_train, Y_train) and n new inputs X_s.

    Args:
        X_s -- new input locations (n x d)
        X_train -- training location (m x d)
        Y_train -- training targets (m x 1)
        kernel -- kernel function, called as kernel(A, B, **kernel_kwargs)
        sigma_y -- noise parameter; sigma_y**2 is added to the diagonal of the training covariance
        **kernel_kwargs -- optional keyword arguments forwarded to every kernel call (e.g. l, sigma_f)

    Returns:
        mu_s -- posterior mean (n x 1)
        cov_s -- posterior covariance (n x n)
    '''
    K = kernel(X_train, X_train, **kernel_kwargs) + sigma_y**2 * np.eye(len(X_train))
    K_s = kernel(X_train, X_s, **kernel_kwargs)
    # A small jitter on the diagonal helps keep the result numerically usable as a covariance
    K_ss = kernel(X_s, X_s, **kernel_kwargs) + 1e-8 * np.eye(len(X_s))

    # Solve the linear systems directly instead of forming inv(K):
    # same result mathematically, but cheaper and more accurate numerically.
    K_inv_y = np.linalg.solve(K, Y_train)
    K_inv_K_s = np.linalg.solve(K, K_s)

    mu_s = K_s.T.dot(K_inv_y)
    cov_s = K_ss - K_s.T.dot(K_inv_K_s)

    return mu_s, cov_s

In [489]:
# Generate noise-free training data
# X is the dense grid on which the posterior is evaluated and plotted
X = np.linspace(-5., 5., 500)[:, None]

TRAIN_SIZE = 5
# Training inputs are drawn uniformly from the plotted range; targets are exact values of sin
X_train = np.random.uniform(-5., 5, (TRAIN_SIZE, 1))
Y_train = np.sin(X_train)

# Compute mean and covariance of posterior
# (sigma_y is left at the default of posterior(), i.e. the data is treated as noise-free)
mup, covp = posterior(X, X_train, Y_train, kernel=rbf)

# Draw samples from posterior
NSAMPLES = 3
# mup is flattened with ravel() before being handed to multivariate_normal as the mean
samples = np.random.multivariate_normal(mup.ravel(), covp, NSAMPLES)
plotly_gp(mup, covp, X, X_train=X_train, Y_train=Y_train, samples=samples, title="Posterior from Noise-Free Training Data")

In [490]:
# Generate noisy training data
# X is the dense grid on which the posterior is evaluated and plotted
X = np.linspace(-5., 5., 500)[:, None]

NOISE = 0.2
TRAIN_SIZE = 7
X_train = np.random.uniform(-5., 5, (TRAIN_SIZE, 1))
# Zero-mean Gaussian noise with standard deviation NOISE, matching the sigma_y passed to posterior().
# (np.random.rand would give uniform noise in [0, 1), which is biased upwards.)
Y_train = np.sin(X_train) + NOISE * np.random.randn(*X_train.shape)

# Compute mean and covariance of posterior
mup, covp = posterior(X, X_train, Y_train, kernel=rbf, sigma_y=NOISE)

# Draw samples from posterior
NSAMPLES = 3
samples = np.random.multivariate_normal(mup.ravel(), covp, NSAMPLES)
plotly_gp(mup, covp, X, X_train=X_train, Y_train=Y_train, samples=samples, title="Posterior from Noisy Training Data")

In [504]:
# Compare with GPy

# Generate noisy training data
X = np.linspace(-5., 5., 500)[:, None]

NOISE = 0.2
TRAIN_SIZE = 7
X_train = np.random.uniform(-5., 5, (TRAIN_SIZE, 1))
# Zero-mean Gaussian noise with standard deviation NOISE, matching the noise level given to both models.
# (np.random.rand would give uniform noise in [0, 1), which is biased upwards.)
Y_train = np.sin(X_train) + NOISE * np.random.randn(*X_train.shape)

# Compute posterior with NumPy implementation
mup1, covp1 = posterior(X, X_train, Y_train, kernel=rbf, sigma_y=NOISE)

# Compute posterior with GPy
kernel = GPy.kern.RBF(input_dim=1, variance=1., lengthscale=1.)
model = GPy.models.GPRegression(X_train, Y_train, kernel)

# GPy tries to estimate noise by default, so to reproduce the above result, we need to fix the variance
model.Gaussian_noise.variance = NOISE**2
model.Gaussian_noise.variance.fix()

# NOTE(review): model.predict presumably adds the likelihood noise variance to covp2, whereas
# posterior() returns the covariance of the latent function -- confirm; model.predict_noiseless
# may be the like-for-like comparison.
mup2, covp2 = model.predict(X, full_cov=True)

# Plot side by side, each implementation with its own mean AND covariance
subplot_data = [
    {'mu': mup1, 'cov': covp1, 'X': X, 'X_train': X_train, 'Y_train': Y_train, 'samples': [], 'title': "NumPy Implementation"},
    {'mu': mup2, 'cov': covp2, 'X': X, 'X_train': X_train, 'Y_train': Y_train, 'samples': [], 'title': "GPy Implementation"},
]
plotly_gps(subplot_data, 1, 2, title="Gaussian Process Posteriors")

In [None]:
# Optimising Hyper-parameters

In [505]:
# GP REGRESSION PLAYGROUND
# Tweak the settings below, then re-run the cell to fit a GPy regression model and plot its posterior.
## Choose number of training points
TRAIN_SIZE = 5 
## Choose noise level (scales the standard-normal noise added to the targets)
NOISE = 0.05 
## Choose number of samples to draw from posterior
NSAMPLES = 3

## Choose how training inputs are chosen
X_train = np.random.uniform(-5., 5, (TRAIN_SIZE, 1))
## Choose the ground truth function (or load in data)
Y_train = np.sin(np.pi*X_train/2) + np.random.randn(TRAIN_SIZE, 1) * NOISE
## Choose prior kernel
kernel = GPy.kern.RBF(input_dim=1, variance=1., lengthscale=1.)

## The rest
model = GPy.models.GPRegression(X_train, Y_train, kernel)
# NOTE(review): presumably constrains every model parameter to be positive ('' matching all names) -- confirm against GPy
model.constrain_positive('')
# Unlike the comparison cell, nothing is fixed here, so the optimiser is free to adjust the model's parameters
model.optimize(messages=True)
# Dense grid on which the fitted model is evaluated and plotted
Xp = np.linspace(-5., 5., 500)[:, None]
mup, covp = model.predict(Xp, full_cov=True)
posteriorYp = model.posterior_samples_f(Xp, full_cov=True, size=NSAMPLES)
samples = np.transpose(np.squeeze(posteriorYp)) # Reshape samples to form expected by plotting function
plotly_gp(mup, covp, Xp, X_train=X_train, Y_train=Y_train, samples=samples, title="Posterior")


HBox(children=(VBox(children=(IntProgress(value=0, max=1000), HTML(value=''))), Box(children=(HTML(value=''),)…

## 6. Gaussian Process Classification