In [1]:
import numpy as np
import scipy as sp
import scipy.stats as stats
from scipy.stats import gaussian_kde as gkde
from scipy.special import gamma
from bokeh.plotting import figure, show, output_notebook, gridplot
from bokeh.models import Span

output_notebook()

In [2]:
# Utility functions to work with the normal distribution in terms of mean and
# variance.
def normPDF(x,mu=0.0, sig2=1.0):
    return stats.norm.pdf(x,mu,np.sqrt(sig2))

def normRVS(samples=1, mu=0.0, sig2=1.0):
#     return stats.norm.rvs(mu, np.sqrt(sig2), size=samples)
    return np.random.normal(mu,np.sqrt(sig2),size=samples)

# Utility functions to work with the gamma distribution
def gammaPDF(x, alpha, beta):
    return stats.gamma.pdf(x, alpha, beta)

def gammaRVS(alpha, beta, samples=1):
#     return stats.gamma.rvs(alpha,scale=beta,size=samples)
    return np.random.gamma(alpha, beta, size=samples)

# Utility functions to work with the uniform distribution
def uniformPDF(x,lower=0.0, upper=1.0):
    return stats.uniform.pdf(x, lower, upper)

def uniformRVS(samples=1, lower=0.0, upper=1.0):
#     return stats.uniform.rvs(lower, upper, size=samples)
    return np.random.uniform(lower,upper, size=samples)

# 2. Unknown mean ($\mu$) and unknown variance ($\sigma^2$)

In [3]:
mu = 0.8
sig2 = 0.1
prec = 1.0 / sig2

Find the mean $\mu$ of $p(x) \sim \mathcal{N}(x \vert \mu, \sigma^2)$. $\mu$ and $\sigma^2$ unknown. The **real** distribution is presented below.

In [4]:
def p(x):
    return normPDF(x, mu, sig2)

x = np.linspace(-3.0,5.0,1000)
px = [p(i) for i in x]
f = figure(title="Real distribution", x_axis_label='x', y_axis_label='p(x)',
           plot_width=300, plot_height=300)
f.line(x, px, line_width=2, color="firebrick", legend="Real dist.")
show(f)

In [5]:
N = 63
D = normRVS(N, mu, sig2)

## 2.1 Rejection sampling

- Likelihood:

\begin{align*}
p(D \vert \mu,\lambda) = \frac{1}{(2\pi)^{n/2}} \lambda^{n/2} exp \left\{- \frac{\lambda}{2} \sum_{i=1}^N (x_i - \mu)^2\right\}
\end{align*}

In [6]:
def likelihood_(mu,prec):
    a = 1.0 / ((2.0 * np.pi) ** (N / 2.0))
    b = prec ** (N / 2.0)
    c = - (prec / 2.0) * np.sum((D - mu) ** 2)
    return a * b * np.exp(c)
likelihood = np.vectorize(likelihood_)

- Prior:

\begin{align*}
\mathcal{NG}(\mu,\lambda \vert \mu_0, \alpha_0, \beta_0) = \mathcal{N}(\mu \vert \mu_0, \lambda^{-1}) \mathcal{G}(\lambda \vert \alpha_0, \beta_0)
\end{align*}

Initial values for the hyper parameters

In [7]:
mu0 = 0.5
alpha0 = 5.0
beta0 = 1.0

- Posterior of the conjugate prior.

In [8]:
def posterior_mean_prec(beta, mu0, a0, b0, X, n):
    muml = np.sum(X[0:n]) / n
    mun = (beta * mu0 + n * muml) / (beta + n)
    betan = beta + n
    an = a0 + n * 0.5
    a = X - muml
    a = a ** 2
    a = np.sum(a)
    bn = b0 + 0.5  * (a) + beta * n * (muml - mu0) * (muml - mu0) / (2 * beta + n)
    return betan, mun, an, bn

x1, x2, x3, x4 = posterior_mean_prec(1, mu0, alpha0, beta0, D, N)
curbest = x3 / x4

def Zng(mu0, k0, a0, b0):
    #Computes the normalization constant of the normal-gamma dist'
    ft = sp.special.gamma(a0) / (b0 **  a0)
    return ft * ((2.0 * np.pi) / k0) ** (0.5)

def norm_gamma(x, la, mu0, a0, b0):
    k0 = 1.0
    zng = Zng(mu0, k0, a0, b0)
    a = la ** (a0 - 0.5)
    return (1.0 / zng) * a * np.exp(- (la / 2.0) * (k0 * ((x - mu0) ** 2) + 2.0 * b0))


kn, mun, an, bn = posterior_mean_prec(1, 1, alpha0, beta0, D, N)
def pz_(x, la): 
        return norm_gamma(x, la, mun, an, bn)
pz = np.vectorize(pz_)

In [9]:
def sb_plot():
    plots = list()
    
    f = figure(title="Posterior")
    x = np.linspace(-0.5, 1.5, 300)
    f.line(x, pz(x,10.0), legend="Mean",
           color="darkgreen", line_width=2)
    realMean = Span(location=0.8, dimension='height',
                    line_color='firebrick', line_dash='dashed', line_width=2)
    f.add_layout(realMean)
    f.legend.location = "top_left"
    plots.append(f)

    f = figure(title=None)
    x = np.linspace(0, 20, 300)
    f.line(x, pz(0.8,x), legend="Precision",
           color="darkgreen", line_width=2)
    realPrecision = Span(location=10.0, dimension='height',
                         line_color='firebrick', line_dash='dashed', line_width=2)
    f.add_layout(realPrecision)
    f.legend.location = "top_right"
    plots.append(f)
    
    grid = gridplot(plots, ncols=2, plot_width=370, plot_height=350)
    show(grid)
    
sb_plot()

In [11]:
def q_(x, la):
    return norm_gamma(x, la, mu0, alpha0, beta0)

q = np.vectorize(q_)

def draw_q(samples):
    # Take the samples of lambda from the gamma distribution 
    lmbdas = gammaRVS(alpha0, beta0, samples)
    # Use each sample from the gamma distribution as the precision for a
    # sample from the normal distribution
    mus = np.zeros(samples)
    for i in range(samples):
        mus[i] = normRVS(mu=mu0, sig2=(1.0 / lmbdas[i]))
    return mus, lmbdas

In [12]:
def rejectionSampling(N):
    # Draw N samples from the proposal distribution
    muSamples, lmbdaSamples = draw_q(N)
    # Compute the maximum lakelihood for the mean
    muML = np.mean(D)
    sig2ML = (1.0 / len(D)) * np.sum((D - muML)**2)
    lmbdaML = 1.0 / sig2ML
    # Compute the proportion between the probability of the data given the
    # mean and the probability of the data given muML.
    proportion = likelihood(muSamples, lmbdaSamples) / likelihood(muML, lmbdaML)
    u = uniformRVS(N)
    # Accept the samples for which a random sample is less than or equal to 
    # the proportion.
    # print (p(muSamples, lmbdaSamples))
    # print (proportion)
    return muSamples[u <= proportion], lmbdaSamples[u <= proportion]

In [45]:
def plotMeanResult(n, muSamples, l1='Samples=', l2=' accepted=', xmin=-0.5, xmax=1.5):
    x = np.linspace(xmin, xmax, 300)
    muKernel = gkde(muSamples)
    accepted = len(muSamples)
    ttl = "Mean (" + l1 + str(n) + l2 + str(accepted) + ")"
    f = figure(title=ttl, x_axis_label='x', y_axis_label='p(x)')
    f.line(x, muKernel(x), legend="Mean est.",
               color="darkgreen", line_width=2)
    realMean = Span(location=mu, dimension='height',
                    line_color='firebrick', line_dash='dashed',
                    line_width=2)
    startMean = Span(location=mu0, dimension='height',
                    line_color='grey', line_dash='dashed',
                    line_width=2)
    f.add_layout(realMean)
    f.add_layout(startMean)
    f.legend.location = "top_left"
    return f

def plotPrecisionResult(n, lmbdaSamples, l1='Sampl=', l2=' acc.=',
                        xmin=0.0, xmax=20.0):
    x = np.linspace(xmin, xmax, 300)
    lmbdaKernel = gkde(lmbdaSamples)
    accepted = len(lmbdaSamples)
    
    y = lmbdaKernel(x)
    samplingBest = x[np.argmax(y)]
    delta = np.abs(curbest - samplingBest)
#     print ("Sampling prec. " + str(samplingBest) + " Bayes best " + str(curbest))
    ttl = "Precision (" + l1 + str(n) + l2 + str(accepted) + ")" + " delta = %.2f" % delta
    f = figure(title=ttl, x_axis_label='x', y_axis_label='p(x)')
    f.line(x, lmbdaKernel(x), legend="Prec est.",
           color="darkgreen", line_width=2)
    realPrecision = Span(location=prec, dimension='height',
                         line_color='firebrick', line_dash='dashed',
                         line_width=2)
    startPrecision = Span(location=alpha0/beta0, dimension='height',
                         line_color='grey', line_dash='dashed',
                         line_width=2)
    bestPrecision = Span(location=curbest, dimension='height',
                         line_color='blue', line_dash='dashed',
                         line_width=2)
    f.add_layout(realPrecision)
    f.add_layout(startPrecision)
    f.add_layout(bestPrecision)
    f.legend.location = "top_right"
    return f

### Experiment

To evaluate the effectiveness of the rejection sampling approach finding both the mean and the precision of a Gaussian we take $4$ experiments satrting with $1\times 10^3$ and going one order of magnitude higher every time. The plots present the results of each experiment. Every row of plots shows the approximated mean and precision for each experiment. In the precision plot the dotted red line presents the real value of the precision while the dotted blue signals the best value obtained by Bayesian inference.
The plot title presents the total number of samples, the number of accepted samples and the distance between the Gaussian precision and the precision obtained by sampling.

In [46]:
Ns = [1000, 10000, 100000, 1000000]

plots = list()
for n in Ns:
    muSamples, lmbdaSamples = rejectionSampling(n)
    if len(muSamples) > 1:
        mplot = plotMeanResult(n, muSamples)
        plots.append(mplot)
        pplot = plotPrecisionResult(n, lmbdaSamples)
        plots.append(pplot)
        
grid = gridplot(plots, ncols=2, plot_width=370, plot_height=350)
show(grid)

### Conclusions

Approximating $\mu$ seems quite straightforward. For all the experiments, the mean of the distribution is very close to the real value of the parameter. For the precision is quite different. For the first two experiments, the mean of the distribution is not as close neither to the real value nor to the Bayesian estimate. However, as we increase the number of samples and, indirectly, the number of accepted samples, the distance between the precision obtained by sampling and the one from Bayesian inference is reduced.

The number of samples required to approximate both parameters is bigger than the one used when only the mean was unknown. This can be explained from the fact that we are now sampling from a bivariate distribution (normal-gamma) and the number of samples that we require for accuracy is probably way larger than for the univariate case (i.e. when only the mean was unknown).

Another interesting aspect is the distance between the real value for the precision (red line) and the other two. We present our conclusions comparing against Bayesian inference because the posterior is the distribution that we are reconstructing with sampling. That posterior might get closer to the real value on the presence of more data. However, with more data the likelihood gets smaller and prone to precision error.

## 2.2. Sampling importance resampling (SIR)

In [18]:
def pt(mus, las):
    return likelihood(mus, las) * q(mus, las)

In [19]:
def computeWeights(N):
    mus, las = draw_q(N)
    w = pt(mus, las) / q(mus, las)
    den = np.sum(w)
    w = w / den
    assert (abs(np.sum(w) - 1.0) < 1e-10)
    return mus, las, w

In [20]:
def SIR(N,R):
    mus, las, weights = computeWeights(N)
    accumulate = np.copy(weights)
    for i in range(1,N):
        accumulate[i] += accumulate[i-1]
    muSamples = np.zeros(R)
    laSamples = np.zeros(R)
    for i in range(R):
        index = np.searchsorted(accumulate, uniformRVS())
        muSamples[i] = mus[index]
        laSamples[i] = las[index]
    return muSamples, laSamples

### 2.2.1 Results

The following plots present the mean and the variance of the normal distribution approximated using Bayesian inference. These are used as reference to compare the two sampling approaches.

In [21]:
def sb_plot():
    # Plots the exact posterior
    kn, mun, an, bn = posterior_mean_prec(1, 1, alpha0, beta0, D, N)
    print (an / bn)

    def pz(x, la): 
        return norm_gamma(x, la, mun, an, bn)

    pz = np.vectorize(pz)
    plots = list()
    
    f = figure(title="Posterior")
    x = np.linspace(-0.5, 1.5, 300)
    f.line(x, pz(x,10.0), legend="Mean",
           color="darkgreen", line_width=2)
    realMean = Span(location=0.8, dimension='height',
                    line_color='firebrick', line_dash='dashed', line_width=2)
    f.add_layout(realMean)
    f.legend.location = "top_left"
    plots.append(f)

    f = figure(title=None)
    x = np.linspace(0, 20, 300)
    f.line(x, pz(0.8,x), legend="Precision",
           color="darkgreen", line_width=2)
    realPrecision = Span(location=10.0, dimension='height',
                         line_color='firebrick', line_dash='dashed', line_width=2)
    f.add_layout(realPrecision)
    f.legend.location = "top_right"
    plots.append(f)
    
    grid = gridplot(plots, ncols=2, plot_width=370, plot_height=350)
    show(grid)
    
sb_plot()

10.9427831242


In [51]:
Ns = [100, 1000, 10000]
Rs = [100, 1000,  1000]

seed = 95704815 #np.random.randint(100000000)
print ("seed = ", seed)
np.random.seed(seed)

plots = list()
for i in range(len(Ns)):
    n, r = Ns[i], Rs[i]
    muSamples, lmbdaSamples = SIR(n,r)
    if len(muSamples) > 1:
        mplot = plotMeanResult(n, muSamples, 'SI=', ' R=')
        plots.append(mplot)
        pplot = plotPrecisionResult(n, lmbdaSamples, 'SI=', ' R=')
        plots.append(pplot)
        
grid = gridplot(plots, ncols=2, plot_width=370, plot_height=350)
show(grid)

seed =  95704815


### 2.2.2 Conclusion

In addition to the conclusions presented above regarding the samplig space, we can add that sampling with SIR is way faster than with rejection sampling. This difference is because SIR does not discards any sample. However, sometimes there are samples that get very high weights and this leads to distributions in which several modes can be apreciated.