# Exercises Chapter 02

1. Change the parameters of the prior Beta distribution
in `our_first_model` to match those of the previous chapter.
Compare the results to the previous chapter.

In [None]:
# Import functional tools
from cytoolz.curried import *

# Import PyMC
import arviz as az
import pymc as pm
import preliz as pz

# Import data science packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Seeded generator so that random draws can be reproduced between runs
rng = np.random.default_rng(123)

In [None]:
# Simulate a handful of Bernoulli trials with a known success probability.
trials = 4
theta_real = 0.35  # Unknown in a real experiment
# Draw from the notebook's seeded generator; without `random_state` the
# synthetic data (and everything inferred from it) changes on every run.
data = pz.Binomial(n=1, p=theta_real).rvs(size=trials, random_state=rng)

In [None]:
# One slot per prior experiment: the model and its sampling result
our_first_model = [None, None, None]
idata = [None, None, None]
idata

In [None]:
with pm.Model() as our_first_model[0]:
    theta = pm.Beta('theta', alpha = 1, beta = 1)
    y = pm.Bernoulli('y', p = theta, observed = data)
    idata[0] = pm.sample(1000)

In [None]:
az.plot_trace(idata[0])
plt.show()

In [None]:
az.summary(idata[0], kind='stats').round(2)

In [None]:
az.plot_posterior(idata[0], ref_val=theta_real)
plt.show()

In [None]:
with pm.Model() as our_first_model[1]:
    theta = pm.Beta('theta', alpha=20, beta=20)
    y = pm.Bernoulli('y', p=theta, observed= data)
    idata[1] = pm.sample(1000)

In [None]:
az.summary(idata[1], kind='stats').round(2)

In [None]:
az.plot_posterior(idata[1], ref_val=theta_real)
plt.show()

In [None]:
with pm.Model() as our_first_model[2]:
    theta = pm.Beta('theta', alpha=1, beta=4)
    y = pm.Bernoulli('y', p=theta, observed= data)
    idata[2] = pm.sample(1000)

In [None]:
az.plot_trace(idata[2])
plt.show()

In [None]:
az.summary(idata[2], kind='stats').round(2)

In [None]:
az.plot_posterior(idata[2], ref_val=theta_real)
plt.show()

The first experiment, $\alpha = 1$ and $\beta = 1$, appears to be the
best. However, experiment 3, $\alpha = 1$ and $\beta = 4$, also seems
pretty reasonable. The second experiment, $\alpha = 20$ and
$\beta = 20$, seems to be the worst of the three models; however, all
models contain the actual value within the 94% highest density
interval (HDI).

2. Compare `our_first_model` with a prior $\theta \sim Beta(1, 1)$ with
a model with prior $\theta \sim Uniform(0, 1)$.

In [None]:
with pm.Model() as our_first_beta_model:
    theta = pm.Beta('theta', alpha=1, beta=1)
    y = pm.Bernoulli('y', p=theta, observed= data)
    idata_beta = pm.sample(1000)

In [None]:
az.plot_trace(idata_beta)
plt.show()

In [None]:
az.summary(idata_beta, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_beta, ref_val=theta_real)
plt.show()

In [None]:
with pm.Model() as our_first_uniform:
    theta = pm.Uniform('theta', lower=0, upper=1)
    y = pm.Bernoulli('y', p=theta, observed= data)
    idata_uniform = pm.sample(1000)

In [None]:
az.plot_trace(idata_uniform)
plt.show()

In [None]:
az.summary(idata_uniform, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_uniform, ref_val=theta_real)
plt.show()

Interesting... It is difficult to distinguish the Beta prior results
from the Uniform prior results. The means are very close, the 94% HDI
are very close, and both models have a similar number of values around
the actual mean.

3. PyMC has a function named `pm.model_to_graphviz` that can be used
to visualize the model. Use it to visualize `our_first_model`.
Compare the results with the Kruschke diagram. Use
`pm.model_to_graphviz` to visualize model `comparing_groups`.

In [None]:
with pm.Model() as our_first_model:
    theta=pm.Beta('theta', alpha=1, beta=1)
    y = pm.Bernoulli('y', p=theta, observed=data)

In [None]:
pm.model_to_graphviz(our_first_model)

In [None]:
# Tips data, grouped by day of the week
tips = pd.read_csv('data/tips.csv')

categories = np.array(['Thu', 'Fri', 'Sat', 'Sun'])
tip = tips['tip'].values
# Integer code of each row's day, following the order of `categories`
idx = pd.Categorical(tips['day'], categories=categories).codes

comparing_groups = pm.Model()
with comparing_groups:
    # One mean and one standard deviation per day
    mu = pm.Normal('mu', mu=0, sigma=10, shape=4)
    sigma = pm.HalfNormal('sigma', sigma=10, shape=4)
    y = pm.Normal('y', mu=mu[idx], sigma=sigma[idx], observed=tip)

# Render the model as a graph
pm.model_to_graphviz(comparing_groups)

4. Read about the coal mining disaster model that is part of the
PyMC documentation
(https://www.pymc.io/projects/docs/en/latest/learn/core_notebooks/pymc_overview.html#case-study-2-coal-mining-disasters).
Try to implement and run this model yourself.

In [None]:
# Yearly disaster counts; the two np.nan entries are years with no recorded
# value (presumably handled by PyMC as missing observations -- verify against
# the PyMC overview notebook this exercise is based on).
disaster_data = pd.Series([
    4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,
    3, 3, 5, 4, 5, 3, 1, 4, 4, 1, 5, 5, 3, 4, 2, 5,
    2, 2, 3, 4, 2, 1, 3, np.nan, 2, 1, 1, 1, 1, 3, 0, 0,
    1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,
    0, 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,
    3, 3, 1, np.nan, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,
    0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1
])
# One entry per year, 1851 through 1961 inclusive
years = np.arange(1851, 1962)

with pm.Model() as disaster_model:
    # The year in which the disaster rate changes; any year is equally likely
    switchpoint = pm.DiscreteUniform('switchpoint',
                                     lower=years.min(),
                                     upper=years.max())

    # Priors for the pre- and post-switch rates of disasters
    early_rate = pm.Exponential('early_rate', 1.0)
    late_rate = pm.Exponential('late_rate', 1.0)

    # Allocate the appropriate Poisson rate to the years before and after
    # the current switchpoint
    rate = pm.math.switch(switchpoint >= years, early_rate, late_rate)

    disasters = pm.Poisson('disasters', rate, observed=disaster_data)

    idata_disasters = pm.sample(1000)

In [None]:
# Trace plot of the sampled chains
az.plot_trace(idata_disasters)
plt.show()

In [None]:
# Posterior summary statistics, rounded to two decimals
az.summary(idata_disasters, kind='stats').round(2)

In [None]:
# Posterior distributions of the sampled variables
az.plot_posterior(idata_disasters)
plt.show()

5. Modify `model_g`. Change the prior for the mean to a Gaussian
distribution centered at the empirical mean, and play with a
couple of reasonable values for the standard deviation of this
prior.

In [None]:
# Read the chemical shift measurements and their empirical moments
data = np.loadtxt('./data/chemical_shifts.csv')
empirical_mean, empirical_std = data.mean(), data.std()
len(data), empirical_mean, empirical_std

In [None]:
# Candidate standard deviations for the Gaussian prior on the mean,
# with one model slot and one result slot per candidate
sigmas = list(range(2, 6))
model_g = [None for _ in sigmas]
idata_g = [None for _ in sigmas]

In [None]:
with pm.Model() as model_g[0]:
    mu= pm.Normal('mu', mu=empirical_mean, sigma=sigmas[0])
    sigma = pm.HalfNormal('sigma', sigma=5)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=data)
    idata_g[0] = pm.sample()

In [None]:
az.plot_trace(idata_g[0])
plt.show()

In [None]:
with pm.Model() as model_g[1]:
    mu= pm.Normal('mu', mu=empirical_mean, sigma=sigmas[1])
    sigma = pm.HalfNormal('sigma', sigma=5)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=data)
    idata_g[1] = pm.sample()

In [None]:
az.plot_trace(idata_g[1])
plt.show()

In [None]:
with pm.Model() as model_g[2]:
    mu= pm.Normal('mu', mu=empirical_mean, sigma=sigmas[2])
    sigma = pm.HalfNormal('sigma', sigma=5)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=data)
    idata_g[2] = pm.sample()

In [None]:
az.plot_trace(idata_g[2])
plt.show()

In [None]:
with pm.Model() as model_g[3]:
    mu=pm.Normal('mu', mu=empirical_mean, sigma=sigmas[3])
    sigma = pm.HalfNormal('sigma', sigma=5)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=data)
    idata_g[3] = pm.sample()

In [None]:
az.plot_trace(idata_g[3])
plt.show()

Modelling the mean using a Gaussian distribution seems to make little
difference even for a relatively small amount of data (40 samples).
In my four experiments, I see that the mean seems to lie at about 53
and that sigma appears to be about 3.5.

Although using a Gaussian, which has no bounds, seems to have no
effect on the results, I would generally prefer to model my prior
accurately instead of saying, "Oh. I've got enough data. I'll just
use a Gaussian." The behavior I see might be specific to the problem
that I'm modelling here.

Even though it is physically not possible to have values outside
the range [0, 100], a Gaussian with a narrow enough standard
deviation may be similar enough. And I do know that enough data
"swamps the prior."

However, I generally think it is better to use priors that reflect
my actual beliefs about the problem. I believe that this choice is
much more consistent with a "Bayesian approach" and is less likely
to mislead me.

6. Using data from the "chemical_shifts.csv" file, compute the empirical
mean and the standard deviation with and without outliers.

In [None]:
# Load the data
data = np.loadtxt('./data/chemical_shifts.csv')
data

In [None]:
# Plot the data using a boxplot
_, ax = plt.subplots(figsize=(12, 3))
ax.boxplot(data, vert=False)
plt.show()

In [None]:
np.mean(data), np.std(data)

In [None]:
data_series = pd.Series(data)
data_noo= data_series[data_series < 60]
data_noo

In [None]:
# Plot the data without "outliers"
_, ax = plt.subplots(figsize=(12, 3))
ax.boxplot(data_noo, vert=False)
plt.show()

In [None]:
np.mean(data_noo), np.std(data_noo)

In [None]:
with pm.Model() as model_g:
    mu = pm.Uniform('mu', lower=40, upper=70)
    sigma = pm.HalfNormal('sigma', sigma=5)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=data)
    idata_g = pm.sample()

In [None]:
az.plot_trace(idata_g)
plt.show()

In [None]:
az.summary(idata_g, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_g)
plt.show()

In [None]:
with pm.Model() as model_g_noo:
    mu = pm.Uniform('mu', lower=40, upper=70)
    sigma = pm.HalfNormal('sigma', sigma=5)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=data_noo)
    idata_g_noo = pm.sample()

In [None]:
az.plot_trace(idata_g_noo)
plt.show()

In [None]:
az.summary(idata_g_noo, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_g_noo)
plt.show()

In [None]:
with pm.Model() as model_t:
    mu = pm.Uniform('mu', lower=40, upper=70)
    sigma = pm.HalfNormal('sigma', sigma=5)
    nu = pm.Exponential('nu', 1/30)
    y = pm.StudentT('y', mu=mu, nu=nu, observed=data)
    idata_t = pm.sample()

In [None]:
az.plot_trace(idata_t)
plt.show()

In [None]:
az.summary(idata_t, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_t)
plt.show()

In [None]:
with pm.Model() as model_t_noo:
    mu = pm.Uniform('mu', lower=40, upper=70)
    sigma = pm.HalfNormal('sigma', sigma=5)
    nu = pm.Exponential('nu', 1/30)
    y = pm.StudentT('y', mu=mu, nu=nu, observed=data_noo)
    idata_t_noo = pm.sample()

In [None]:
az.plot_trace(idata_t_noo)
plt.show()

In [None]:
az.summary(idata_t_noo, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_t_noo)
plt.show()

Again, interesting... I see fewer differences between the models
using Student's T distribution than I do between the models using
the Normal distribution. This difference seems to indicate that a
better model is more robust overall. Very interesting.

7. Repeat the previous exercise by adding more outliers to
`chemical_shifts.csv` and compute new posteriors for `model_g` and
`model_t` using this new data. What do you observe?

In [None]:
# Load the data
data = np.loadtxt('./data/chemical_shifts.csv')
data

In [None]:
# Add "additional outliers."
np.mean(data_noo), np.std(data_noo)

In [None]:
data_outliers = np.array([42.24, 37.85])
more_data = np.concatenate((data, data_outliers))
more_data

In [None]:
# Plot all the data
_, ax = plt.subplots(figsize=(12, 3))
ax.boxplot(more_data, vert=False)
plt.show()

In [None]:
with pm.Model() as model_g_more_data:
    mu = pm.Uniform('mu', lower=40, upper=70)
    sigma = pm.HalfNormal('sigma', sigma=5)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=more_data)
    idata_g_more_data = pm.sample()

In [None]:
az.plot_trace(idata_g_more_data)
plt.show()

In [None]:
az.summary(idata_g_more_data, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_g_more_data)
plt.show()

In [None]:
with pm.Model() as model_g_noo:
    mu = pm.Uniform('mu', lower=40, upper=70)
    sigma = pm.HalfNormal('sigma', sigma=5)
    Y = pm.Normal('Y', mu=mu, sigma=sigma, observed=data)
    idata_g_noo = pm.sample()

In [None]:
az.plot_trace(idata_g_noo)
plt.show()

In [None]:
az.summary(idata_g_noo, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_g_noo)
plt.show()

In [None]:
with pm.Model() as model_t_more_data:
    mu = pm.Uniform('mu', lower=40, upper=70)
    sigma = pm.HalfNormal('sigma', sigma=5)
    nu = pm.Exponential('nu', 1/30)
    y = pm.StudentT('y', mu=mu, nu=nu, observed=data)
    idata_t_more_data = pm.sample()

In [None]:
az.plot_trace(idata_t_more_data)
plt.show()

In [None]:
az.summary(idata_t_more_data, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_t_more_data)
plt.show()

In [None]:
with pm.Model() as model_t_noo:
    mu = pm.Uniform('mu', lower=40, upper=70)
    sigma = pm.HalfNormal('sigma', sigma=5)
    nu = pm.Exponential('nu', 1/30)
    y = pm.StudentT('y', mu=mu, nu=nu, observed=data_noo)
    idata_t_noo = pm.sample()

In [None]:
az.plot_trace(idata_t_noo)
plt.show()

In [None]:
az.summary(idata_t_noo, kind='stats').round(2)

In [None]:
az.plot_posterior(idata_t_noo)
plt.show()

Again, with additional "outliers", the original data shows the larger
standard deviation. But, again, the posterior seems a bit more robust
against these outliers. And, again, the Normal is affected more by
the outliers than the Student's T distribution.

8. Explore the InferenceData object, `idata_cg`.

In [None]:
# Read the tips data set
tips = pd.read_csv('./data/tips.csv')

# Days to compare, the observed tips, and each row's day as an integer code
categories = np.array(['Thu', 'Fri', 'Sat', 'Sun'])
tip = tips['tip'].values
idx = pd.Categorical(tips['day'], categories=categories).codes

# Named coordinates: one entry per day, and one day label per observation
coords = dict(days=categories, days_flat=categories[idx])
comparing_groups = pm.Model(coords=coords)
with comparing_groups:
    mu = pm.HalfNormal('mu', sigma=5, dims='days')
    sigma = pm.HalfNormal('sigma', sigma=1, dims='days')
    y = pm.Gamma(
        'y',
        mu=mu[idx],
        sigma=sigma[idx],
        observed=tip,
        dims='days_flat',
    )

    # Sample the posterior, then add posterior predictive draws to the result
    idata_cg = pm.sample(random_seed=4591)
    idata_cg.extend(
        pm.sample_posterior_predictive(idata_cg, random_seed=4591)
    )

In [None]:
# Display the InferenceData object itself
idata_cg

In [None]:
# List the attributes and methods available on the object
dir(idata_cg)

In [None]:
# Names of the groups stored in the InferenceData
idata_cg.groups()

How many groups does it contain?

The `InferenceData` contains four (4) groups.

Inspect the posterior distribution of the parameter $\mu$ for a specific
day using the `sel()` method.

In [None]:
# Check what kind of object the posterior group is
type(idata_cg.posterior)

In [None]:
# Select draw 0 of chains 0 and 2 (assumes at least three chains were sampled)
idata_cg.sel(draw=0, chain=[0, 2])

In [None]:
# Posterior samples of mu for all days
idata_cg.posterior['mu']

In [None]:
# Posterior samples of mu for Thursday only
idata_cg.posterior['mu'].sel(days='Thu')

Compute the distributions of mean differences between Thursday and
Sunday. What are the coordinates and dimensions of the resulting
`DataArray`?

In [None]:
# Mean of every posterior variable over all dimensions
idata_cg.posterior.mean()

In [None]:
# Mean over the draws only
idata_cg.posterior.mean('draw')

In [None]:
# Extract the posterior samples with ArviZ (default arguments)
stacked = az.extract(idata_cg)
stacked

In [None]:
# Extracted samples of mu for all days
stacked['mu']

In [None]:
# Sample-by-sample difference between the Thursday and Sunday means
stacked['mu'].sel(days='Thu') - stacked['mu'].sel(days='Sun')

In [None]:
# Difference between the averaged Thursday and Sunday means
stacked['mu'].sel(days='Thu').mean() - stacked['mu'].sel(days='Sun').mean()

In [None]:
# Same sample-by-sample difference as above, shown again to inspect its
# coordinates and dimensions
stacked['mu'].sel(days='Thu') - stacked['mu'].sel(days='Sun')

In [None]:
# The same difference computed on the unextracted posterior group
idata_cg.posterior['mu'].sel(days='Thu') - idata_cg.posterior['mu'].sel(days='Sun')