In [None]:
import resources.workspace as ws
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
plt.ion();

$
% START OF MACRO DEF
% DO NOT EDIT IN INDIVIDUAL NOTEBOOKS, BUT IN macros.py
%
\newcommand{\Reals}{\mathbb{R}}
\newcommand{\Expect}[0]{\mathbb{E}}
\newcommand{\NormDist}{\mathcal{N}}
%
\newcommand{\DynMod}[0]{\mathscr{M}}
\newcommand{\ObsMod}[0]{\mathscr{H}}
%
\newcommand{\mat}[1]{{\mathbf{{#1}}}}
%\newcommand{\mat}[1]{{\pmb{\mathsf{#1}}}}
\newcommand{\bvec}[1]{{\mathbf{#1}}}
%
\newcommand{\trsign}{{\mathsf{T}}}
\newcommand{\tr}{^{\trsign}}
\newcommand{\tn}[1]{#1}
\newcommand{\ceq}[0]{\mathrel{≔}}
%
\newcommand{\I}[0]{\mat{I}}
\newcommand{\K}[0]{\mat{K}}
\newcommand{\bP}[0]{\mat{P}}
\newcommand{\bH}[0]{\mat{H}}
\newcommand{\bF}[0]{\mat{F}}
\newcommand{\R}[0]{\mat{R}}
\newcommand{\Q}[0]{\mat{Q}}
\newcommand{\B}[0]{\mat{B}}
\newcommand{\C}[0]{\mat{C}}
\newcommand{\Ri}[0]{\R^{-1}}
\newcommand{\Bi}[0]{\B^{-1}}
\newcommand{\X}[0]{\mat{X}}
\newcommand{\A}[0]{\mat{A}}
\newcommand{\Y}[0]{\mat{Y}}
\newcommand{\E}[0]{\mat{E}}
\newcommand{\U}[0]{\mat{U}}
\newcommand{\V}[0]{\mat{V}}
%
\newcommand{\x}[0]{\bvec{x}}
\newcommand{\y}[0]{\bvec{y}}
\newcommand{\z}[0]{\bvec{z}}
\newcommand{\q}[0]{\bvec{q}}
\newcommand{\br}[0]{\bvec{r}}
\newcommand{\bb}[0]{\bvec{b}}
%
\newcommand{\bx}[0]{\bvec{\bar{x}}}
\newcommand{\by}[0]{\bvec{\bar{y}}}
\newcommand{\barB}[0]{\mat{\bar{B}}}
\newcommand{\barP}[0]{\mat{\bar{P}}}
\newcommand{\barC}[0]{\mat{\bar{C}}}
\newcommand{\barK}[0]{\mat{\bar{K}}}
%
\newcommand{\D}[0]{\mat{D}}
\newcommand{\Dobs}[0]{\mat{D}_{\text{obs}}}
\newcommand{\Dmod}[0]{\mat{D}_{\text{mod}}}
%
\newcommand{\ones}[0]{\bvec{1}}
\newcommand{\AN}[0]{\big( \I_N - \ones \ones\tr / N \big)}
%
% END OF MACRO DEF
$

The previous tutorial studied the Gaussian distribution, with pdf:
$$\begin{align}
\mathcal{N}(x \mid b, B) = (2 \pi B)^{-1/2} e^{-(x-b)^2/2 B} \, , \tag{G1}
\end{align}$$
and implemented it. It's also available from `scipy.stats`:

In [None]:
def pdf_G1(x, b, B):
    """Univariate (scalar) Gaussian pdf, eqn. (G1), taken from `scipy.stats`.

    Parameters
    ----------
    x : point(s) at which the density is evaluated.
    b : mean (location) of the Gaussian.
    B : variance of the Gaussian (i.e. *not* the standard deviation).

    Returns
    -------
    The density value(s) returned by `scipy.stats.norm.pdf`.
    """
    # Import the submodule explicitly: a bare `import scipy as sp` does not
    # guarantee that `sp.stats` has been loaded (it fails on older SciPy).
    from scipy.stats import norm

    # scipy parametrises by the standard deviation; (G1) uses the variance.
    return norm.pdf(x, loc=b, scale=np.sqrt(B))

The Gaussian distribution will help to illustrate:

# Bayes' rule
In the Bayesian approach, knowledge and uncertainty about the unknown ($x$)
is quantified through probability.
And **Bayes' rule** is how we do inference: it says how to condition/merge/assimilate/update this belief based on data/observation ($y$).
For *continuous* "random variables", $x$ and $y$, it reads:

$$\begin{align}
p(x|y) &= \frac{p(x) \, p(y|x)}{p(y)} \, , \tag{BR} \\
\text{i.e.} \qquad \text{"posterior" (pdf of $x$ given $y$)}
\; &= \;
\frac{\text{"prior" (pdf of $x$)}
\; \times \;
\text{"likelihood" (pdf of $y$ given $x$)}}
{\text{"normalization" (pdf of $y$)}} \, ,
\end{align}
$$

Note that, in contrast to (the frequent aim of) classical statistics, Bayes' rule in itself makes no attempt at producing only a single estimate (but the topic is briefly discussed [further below](#Exc-2.28-(optional):)). It merely states how quantitative belief (weighted possibilities) should be updated in view of new data.

**Exc 2.10:** Derive Bayes' rule from the definition of [conditional pdf's](https://en.wikipedia.org/wiki/Conditional_probability_distribution#Conditional_continuous_distributions).

In [None]:
# ws.show_answer('BR derivation')

**Exc 2.11 (optional):** Laplace called "statistical inference" the reasoning of "inverse probability" (1774). You may also have heard of "inverse problems" in reference to similar problems, but without a statistical framing. In view of this, why do you think we use $x$ for the unknown, and $y$ for the known/given data?

In [None]:
# ws.show_answer('inverse')

Bayes' rule, eqn. (BR), involves functions (the densities), but applies for any/all values of $x$ (and $y$).
Thus, upon discretisation, eqn. (BR) becomes the multiplication of two arrays of values,
followed by a normalisation (explained [below](#Exc-2.14-(optional):)). It is hard to overstate how simple this principle is.

In [None]:
def Bayes_rule(prior_values, lklhd_values, dx):
    """Discretised Bayes' rule: pointwise product followed by normalisation.

    Parameters
    ----------
    prior_values : prior pdf evaluated on the grid.
    lklhd_values : likelihood evaluated on the same grid.
    dx : size of one grid cell, used to approximate the integral by a sum.

    Returns
    -------
    The posterior pdf on the grid, scaled such that `sum(posterior) * dx == 1`.
    """
    unnormalised = prior_values * lklhd_values
    # Riemann-sum approximation of the integral of the product over the grid.
    total_mass = np.sum(unnormalised) * dx
    return unnormalised / total_mass

# One-dimensional grid on which the pdfs are evaluated.
bounds = (-15, 15)  # (lower, upper) limits of the grid
grid1d = np.linspace(bounds[0], bounds[1], num=201)
dx = grid1d[1] - grid1d[0]  # spacing between neighbouring grid points

The code below shows Bayes' rule in action. Move the sliders (use arrow keys?) to animate it.

In [None]:
@ws.interact(y=(*bounds, 1),
             R=(0.01, 20, 0.2))
def Bayes1(y=4.0, R=1.0):
    """Plot prior, likelihood and posterior of a scalar Bayes' rule update.

    The prior is fixed to mean `b = 0` and variance `B = 1`.

    Parameters
    ----------
    y : the observation (set by slider).
    R : the observation error variance (set by slider).
    """
    b = 0
    B = 1
    # Compute
    x = grid1d
    prior_vals = pdf_G1(x, b, B)
    lklhd_vals = pdf_G1(y, x, R)  # pdf of y given x, as a function of x
    postr_vals = Bayes_rule(prior_vals, lklhd_vals, dx)
    # Plot
    plt.figure(figsize=(10, 4))
    plt.plot(x, prior_vals, label=f'Prior, {b=:.4g}, {B=:.4g}')
    plt.plot(x, lklhd_vals, label=f'Lklhd {y=}, {R=:.4g}')
    plt.plot(x, postr_vals, label='Postr, pointwise')
    try:
        # See exercise below
        xhat, P = Bayes_rule_G1(b, B, y, R)
    except NameError:
        # `Bayes_rule_G1` is not defined yet, or is still the unfilled
        # exercise stub (which returns undefined names): skip this curve.
        pass
    else:
        # Kept outside the `try` so that a NameError arising from the
        # plotting code itself is not silently swallowed.
        label = f'Postr, parametric\n{xhat=:.4g},{P=:.4g}'
        postr_vals_G1 = pdf_G1(x, xhat, P)
        plt.plot(x, postr_vals_G1, '--', label=label)

    plt.ylim(0, 0.6)
    plt.legend(loc="upper right")
    plt.show()

**Exc 2.12:** This exercise serves to make you acquainted with how Bayes' rule blends information.  
Move the sliders to see what happens, and answer the following:
 * What happens to the posterior when $R \rightarrow \infty$ ?
 * What happens to the posterior when $R \rightarrow 0$ ?
 * Move around $y$. What is the posterior's location (mean/mode) when $R = B$ ?
 * Can you say something universally valid (for any $y$ and $R$) about the height of the posterior pdf?
 * Does the posterior scale (width) depend on $y$?  
   *Optional*: What does this mean [information-wise](https://en.wikipedia.org/wiki/Differential_entropy#Differential_entropies_for_various_distributions)?
 * Consider the shape (ignoring location & scale) of the posterior. Does it depend on $R$ or $y$?
 * Can you see a shortcut to computing this posterior rather than having to do the pointwise multiplication?

In [None]:
# ws.show_answer('Posterior behaviour')

#### Exc 2.14 (optional):
Show that the normalization in `Bayes_rule()` amounts to (approximately) the same as dividing by $p(y)$.

In [None]:
# ws.show_answer('BR normalization')

In fact, since $p(y)$ is thusly implicitly known,
we often don't bother to write it down, simplifying Bayes' rule (eqn. BR) to
$$\begin{align}
p(x|y) \propto p(x) \, p(y|x) \, .  \tag{BR2}
\end{align}$$
Actually, do we even need to care about $p(y)$ at all? All we really need to know is how much more likely some value of $x$ (or an interval around it) is compared to any other $x$.
The normalisation is only necessary because of the *convention* that all densities integrate to $1$.
However, for large models, we usually can only afford to evaluate $p(y|x)$ at a few points (of $x$), so that the integral for $p(y)$ can only be roughly approximated. In such settings, estimation of the normalisation factor becomes an important question too.

**Exc 2.15:** The following implements a [uniform](https://en.wikipedia.org/wiki/Uniform_distribution_(continuous)#Moments)
(or "flat" or "box") pdf.
In the above animations, replace `pdf_G1` with your new `pdf_U1` (both for the prior and likelihood).

In [None]:
def pdf_U1(x, b, B):
    """Univariate (scalar), Uniform pdf.

    The support is `[b - sqrt(3*B), b + sqrt(3*B)]`, which makes `b` the mean
    and `B` the variance of the distribution.

    Parameters
    ----------
    x : point(s) at which the density is evaluated.
    b : mean (centre of the box).
    B : variance.

    Returns
    -------
    Array of density values, with `x`, `b` and `B` broadcast against each
    other (a 0-dimensional array if all of them are scalars).
    """
    half_width = np.sqrt(3*B)
    lower = b - half_width
    upper = b + half_width
    height = 1/(upper - lower)
    # Using `np.where` (rather than item assignment into an array shaped like
    # `x`) also supports the case where both `x` and `b` are scalars.
    outside = (x < lower) | (x > upper)
    return np.where(outside, 0.0, height)

(a) Why (in the figure) are the walls of the pdf (ever so slightly) inclined?  
What happens when you move the prior and likelihood too far apart? Is it the fault of the implementation, the math, or the problem statement?

In [None]:
# ws.show_answer('BR U1')

(b): Re-do Exc 2.12, now with `pdf_U1`.  
(c): Now test a Gaussian prior with a uniform likelihood.

**Exc 2.15 part 2**
Restore the use of `pdf_G1` everywhere.
- (a) Suppose the "observation model" consists in squaring, i.e.
      $y = x^2/4 + \varepsilon$, i.e. $p(y|x) = \NormDist(y|x^2/4, R)$, where $R$ is the variance of $\varepsilon$. Code this into the animation code.
- (b) Try $y = |x|$. Compare with (a).
- (c) Try $y = 2 x$. Can you reproduce a posterior obtained with $y = x$ ?

Restore $y = x$.

## Gaussian-Gaussian Bayes

The above animation shows Bayes' rule in 1 dimension. Previously, we saw how a Gaussian looks in 2 dimensions. Can you imagine how Bayes' rule looks in 2 dimensions? In higher dimensions, these things get difficult to imagine, let alone visualize.

Similarly, the size of the calculations required for Bayes' rule poses a difficulty. Indeed, the following exercise shows that (pointwise) multiplication for all grid points becomes preposterous in high dimensions.

**Exc 2.16 (optional):**
 * (a) How many point-multiplications are needed on a grid with $N$ points in $M$ dimensions? (Imagine an $M$-dimensional cube where each side has a grid with $N$ points on it)
 * (b) Suppose we model 15 physical quantities, on each grid point, on a discretized surface model of Earth. Assume the resolution is $1^\circ$ for latitude (110km), $1^\circ$ for longitude. How many variables are there in total? This is the dimensionality ($M$) of the problem.
 * (c) Suppose each variable has a pdf represented with a grid using only $N=20$ points. How many multiplications are necessary to calculate Bayes' rule (jointly) for all variables on our Earth model?

In [None]:
# ws.show_answer('Dimensionality a')
# ws.show_answer('Dimensionality b')
# ws.show_answer('Dimensionality c')

In response to this computational difficulty, we try to be smart and do something more analytical ("pen-and-paper"): we only compute the parameters (mean and (co)variance) of the posterior pdf.

This is doable and quite simple in the Gaussian-Gaussian case:  
With a prior $p(x) = \mathcal{N}(x \mid b,B)$ and  
a likelihood $p(y|x) = \mathcal{N}(y \mid x,R)$,
the posterior is
$$\begin{align}
p(x|y)
&= \mathcal{N}(x \mid \hat{x},P) \tag{4} \, ,
\end{align}$$
where, in the univariate (1-dimensional) case:
$$\begin{align}
    P &= 1/(1/B + 1/R) \, , \tag{5} \\\
  \hat{x} &= P(b/B + y/R) \, .  \tag{6}
\end{align}$$

The multivariate case is discussed in a later tutorial.

**Exc 2.17:**
The statement $x = \mu \pm \sigma$ is *sometimes* used
as a shorthand for $p(x) = \mathcal{N}(x \mid \mu, \sigma^2)$. Suppose
- you think the temperature $x = 20°C \pm 2°C$,
- a thermometer yields the observation $y = 18°C \pm 2°C$.
- Show that your posterior is $p(x|y) = \mathcal{N}(x \mid 19, 2)$

In [None]:
# ws.show_answer('GG BR example')

#### Exc  2.18 'Gaussian Bayes':
Derive the above expressions for $P$ and $\hat{x}$
from Bayes' rule (BR2) and the expression for a Gaussian pdf (G1).

In [None]:
# ws.show_answer('BR Gauss')

**Exc 2.20:** Algebra exercise: Show that eqn. (5) can be written as
$$P = K R \,,    \tag{8}$$
where
$$K = B/(B+R) \,,    \tag{9}$$
is called the "Kalman gain".  
Then show that eqns (5) and (6) can be written as
$$\begin{align}
    P &= (1-K)B \, ,  \tag{10} \\\
  \hat{x} &= b + K (y-b) \tag{11} \, ,
\end{align}$$

In [None]:
# ws.show_answer('BR Kalman1')

**Exc 2.22 (optional):** Consider the formula for $K$ and its role in the previous couple of equations. Why do you think $K$ is called a "gain"?

In [None]:
# ws.show_answer('KG intuition')

**Exc 2.24:** Implement a Gaussian-Gaussian Bayes' rule (eqns 5 and 6, or 9-11) by completing the code below.

In [None]:
def Bayes_rule_G1(b, B, y, R):
    """Gaussian-Gaussian Bayes' rule (exercise stub, to be completed).

    Should implement eqns (5) and (6), or equivalently (9)-(11).

    Parameters
    ----------
    b : prior mean.
    B : prior variance.
    y : observation.
    R : observation error variance.

    Returns
    -------
    xhat, P : the posterior mean and variance.

    Until the answer has been inserted, `xhat` and `P` are undefined, so
    calling this raises `NameError` (which `Bayes1` catches and ignores).
    """
    ### INSERT ANSWER HERE ###
    return xhat, P

In [None]:
# ws.show_answer('BR Gauss code')

**Exc 2.26:** Go back to the above animation code cell. Restore `pdf_G1` (both for prior and likelihood). Run/execute.
- What is the relationship between the two posterior curves?  
  *Hint: This is the main secret of the "Kalman filter".*
- Now use `_U1` instead of `_G1` to compute `prior_vals` or `lklhd_vals` or both.  
  Does `Bayes_rule_G1()` provide a good approximation to `Bayes_rule()`?

#### Exc 2.28 (optional):
If you need to pick a single point value for your estimate (for example, an action to be taken), you can **decide** on it by optimising (with respect to the estimate) the expected value of some utility/loss function [[ref](https://en.wikipedia.org/wiki/Bayes_estimator)]. For example, if the density of $X$ is symmetric,
   and $\text{Loss}$ is convex and symmetric,
   then $\Expect[\text{Loss}(X - \theta)]$ is minimized
   by the mean, $\Expect[X]$, which also coincides with the median.
   <!-- See Corollary 7.19 of Lehmann, Casella -->
For the expected *squared* loss, $\Expect[(X - \theta)^2]$,
the minimum is the mean for *any distribution*.
Show the latter result.  
*Hint: insert $0 = \,?\, - \,?$.*

In summary, the intuitive idea of **considering the mean of $p(x)$ as the point estimate** has good theoretical foundations.

## Multivariate illustration

Unlike the previous tutorial, which implemented the Gaussian pdf,
we here take it from `scipy.stats`.

In [None]:
from scipy.stats import multivariate_normal
def pdf_GM(points, mu, Sigma):
    """Multivariate Gaussian pdf, taken from `scipy.stats`.

    Parameters
    ----------
    points : point(s) at which to evaluate; the last axis indexes the
        components of each point.
    mu : mean; may itself be an array of means that broadcasts against
        `points`.
    Sigma : covariance matrix.

    Returns
    -------
    The density value(s) of `points - mu` under a zero-mean Gaussian with
    covariance `Sigma`.
    """
    # Subtracting the mean up front (and handing scipy a zero mean) is what
    # permits `mu` to be an array of means, broadcast against `points`.
    centred = points - mu
    zero_mean_gaussian = multivariate_normal(mean=np.zeros(len(Sigma)), cov=Sigma)
    return zero_mean_gaussian.pdf(centred)

Notice that we're re-using the very same `Bayes_rule` as in the 1D case.

In [None]:
# 2D grid of points: the last axis holds the two coordinates of each point,
# i.e. grid2d[i, j] == (grid1d[j], grid1d[i]).
grid2d = np.stack(np.meshgrid(grid1d, grid1d), axis=-1)

@ws.interact(corr=(-0.999, 0.999, .01),
             y1=bounds,
             y2=bounds,
             R1=(0.01, 36, 0.2),
             R2=(0.01, 36, 0.2),
             )
def Bayes2(corr=.3, y1=3, y2=-12, R1=4**2, R2=1, y1_only=False):
    """Plot prior, likelihood and posterior of a 2D Bayes' rule update.

    Parameters
    ----------
    corr : correlation between the two components of the prior.
    y1, y2 : the two components of the observation.
    R1, R2 : the observation error variances (diagonal of `R`).
    y1_only : if True, only the first observation component is used.
    """
    x = grid2d

    # Prior: zero mean, variance 25 for both components, correlation `corr`.
    prior_mean = np.zeros(2)
    prior_cov = 25 * np.array([[1, corr],
                               [corr, 1]])
    prior = pdf_GM(x, prior_mean, prior_cov)

    # Observation, with uncorrelated observation errors.
    y = np.array([y1, y2])
    R = np.diag([R1, R2])

    # Observation model, evaluated at every grid point.
    # Uncomment one of the alternatives to experiment.
    Hx = x
    #Hx = x**2/4
    #Hx = x**3/36

    #Hx = x[..., :1] * x[..., 1:2]
    #y1_only = True

    if y1_only:
        # Retain only the first component of the observation quantities.
        y, R, Hx = y[:1], R[:1, :1], Hx[..., :1]

    lklhd = pdf_GM(y, Hx, R)
    # The cell size of the 2D grid is dx*dx.
    postr = Bayes_rule(prior, lklhd, dx**2)

    ax, plot = ws.get_jointplotter(grid1d)
    fields = [prior, lklhd, postr]
    colours = ['royalblue', 'orange', 'green']
    contours = [plot(field, colour) for field, colour in zip(fields, colours)]
    ax.legend(contours, ['prior', 'lklhd', 'postr'], loc="upper left")
    plt.show()

**Exc:** Try different `H`.

It will not surprise you to learn that the shape of the posterior is again Gaussian,
essentially for the same reason as in 1D.

### Next: [Univariate (scalar) Kalman filtering](T3%20-%20Univariate%20Kalman%20filtering.ipynb)