# Chapter 03 exercises

## Exercise 01

Using your own words, explain the following concepts in two or three sentences.

- Complete pooling - Pool all data together
- No pooling - Only determine answers for each group
- Partial pooling - Use all data to improve group data answers

## Exercise 02

Repeat the exercise we did with `model_h` **without** a hierarchical
structure using a flat prior like $Beta(\alpha=1, \beta=1)$. Compare
the results of both models.

In [None]:
# Perform our typical data science and PyMC imports

# Import cytoolz for data manipulation
import cytoolz.curried as ctc

# Import PyMC and supporting packages
import arviz as az
import pymc as pm
import preliz as pz

# Import other "data science" packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Synthetic data: three groups with the same number of trials each
N_samples = [30, 30, 30]  # trials per group
G_samples = [18, 3, 3]  # "good" (successful) trials per group
group_idx = np.repeat(np.arange(len(N_samples)), N_samples)

# Flatten the counts into one list of 0/1 outcomes: for every group, its
# successes (1) come first, followed by its failures (0).
data = []
for n_total, n_good in zip(N_samples, G_samples):
    data.extend(np.repeat([1, 0], [n_good, n_total - n_good]))

In [None]:
# Hierarchical model: the per-group success rates `theta` share a Beta
# prior whose parameters (`mu`, `nu`) are themselves given priors.
with pm.Model() as model_h:
    # Hyperpriors on the shared Beta prior's `mu` and `nu` parameters
    mu = pm.Beta('mu', 1, 1)
    nu = pm.HalfNormal('nu', 10)

    # Prior: one success rate per group, all drawn from Beta(mu, nu)
    theta = pm.Beta('theta', mu=mu, nu=nu, shape=len(N_samples))

    # Likelihood: `group_idx` selects, for each observation, the rate of
    # the group it belongs to
    y = pm.Bernoulli('y', p = theta[group_idx], observed=data)

    # Draw posterior samples with a fixed seed
    idata_h = pm.sample(random_seed=4591)

In [None]:
# The same model **without** a hierarchical structure: every group gets
# its own fixed Beta(1, 1) prior, so no parameter is shared between groups.
with pm.Model() as model_nh:
    # Prior: one independent success rate per group
    theta = pm.Beta('theta', alpha=1, beta=1, shape=len(N_samples))

    # Likelihood: `group_idx` selects, for each observation, the rate of
    # the group it belongs to
    y = pm.Bernoulli('y', p=theta[group_idx], observed=data)

    # Same seed as used for `model_h`
    idata_nh = pm.sample(random_seed=4591)

In [None]:
# Trace plots for the hierarchical and the non-hierarchical model
for trace_data in (idata_h, idata_nh):
    az.plot_trace(trace_data)
plt.show()

In [None]:
# Summarize the posterior of the hierarchical model (`model_h`)
az.summary(idata_h)

In [None]:
# Summarize the posterior of the non-hierarchical model (`model_nh`)
az.summary(idata_nh)

In [None]:
# Posterior plots for the hierarchical and the non-hierarchical model
for posterior_data in (idata_h, idata_nh):
    az.plot_posterior(posterior_data)
plt.show()

In [None]:
# Forest plot of `theta` for both models on a single set of axes, so the
# per-group estimates can be compared directly. The returned axes are not
# needed here, so they are not kept.
az.plot_forest(
    [idata_h, idata_nh],
    model_names=['hierarchical', 'non-hierarchical'],
    var_names='theta',
    combined=True,
    r_hat=False,
    ess=False,
    figsize=(12, 3),
    colors='cycle',
)
plt.show()

I do not notice any significant difference between the two models.
Both models capture the difference between the first group and the
last two groups. Additionally, both models capture the similarity
between the last two groups.

However, as the author points out, the two models do not agree
exactly on the first group: using the hierarchical model, the mean
of the first group is "pulled toward" the mean of the other two
groups. Without a hierarchical model, the mean is unaffected by the
other two groups.

## Exercise 03

Create a hierarchical version of the tips example from chapter 2 by
partially pooling across the days of the week. Compare the results
to the results obtained without the hierarchical structure.

In [None]:
# Load the tips data set; the path is relative to the working directory.
# NOTE(review): this cell reads from '../data/' while the football cell
# further down reads from './data/' - confirm which location is correct.
tips = pd.read_csv('../data/tips.csv')

In [None]:
# Prepare the inputs shared by the tips models
categories = np.array(['Thur', 'Fri', 'Sat', 'Sun'])

# Observed tip amounts
tip = tips['tip'].values

# Integer code of each row's day, following the order of `categories`
day_as_category = pd.Categorical(tips['day'], categories=categories)
idx = day_as_category.codes

# `days` labels the per-day parameters, `days_flat` labels every observation
coords = dict(days=categories, days_flat=categories[idx])

In [None]:
# Non-hierarchical baseline (the chapter 2 model): each day has its own
# independent priors for the mean and standard deviation of the tips.
with pm.Model(coords=coords) as comparing_groups_nh:
    # Priors: one `mu` and one `sigma` per day
    mu = pm.HalfNormal('mu', sigma=5, dims='days')
    sigma = pm.HalfNormal('sigma', sigma=1, dims='days')
    # Likelihood: `idx` selects the parameters of each observation's day
    y = pm.Gamma('y', mu=mu[idx], sigma=sigma[idx],
                 observed=tip, dims='days_flat')

    # Draw posterior samples with a fixed seed
    idata_cg_nh = pm.sample(random_seed=4591)

In [None]:
# A similar model with partial pooling across the days of the week:
# every day's `mu` is drawn from a prior centred on ONE shared
# hyper-parameter, `hp_mu`.
with pm.Model(coords=coords) as comparing_groups_h:
    # Hyper-prior: a single scalar shared by all days. Declaring it with
    # `dims='days'` would create one independent hyper-parameter per day,
    # so no information would flow between days (i.e. no pooling at all).
    hp_mu = pm.HalfNormal('hp_mu', sigma=5)

    # Priors
    # NOTE(review): a Normal prior lets `mu` become negative, which is not
    # a valid mean for the Gamma likelihood below - consider a prior with
    # positive support.
    mu = pm.Normal('mu', mu=hp_mu, sigma=5, dims='days')
    sigma = pm.HalfNormal('sigma', sigma=1, dims='days')

    # Likelihood: `idx` selects the parameters of each observation's day
    y = pm.Gamma('y', mu=mu[idx], sigma=sigma[idx],
                 observed=tip, dims='days_flat')

    # Draw posterior samples with a fixed seed
    idata_cg_h = pm.sample(random_seed=4591)

In [None]:
# Trace plots for the non-hierarchical and the hierarchical tips model
for trace_data in (idata_cg_nh, idata_cg_h):
    az.plot_trace(trace_data)
plt.show()

In [None]:
# Summarize the posterior of the non-hierarchical tips model
az.summary(idata_cg_nh)

In [None]:
# Summarize the posterior of the hierarchical tips model
az.summary(idata_cg_h)

In [None]:
# Posterior plots for the non-hierarchical and the hierarchical tips model
for posterior_data in (idata_cg_nh, idata_cg_h):
    az.plot_posterior(posterior_data)
plt.show()

In [None]:
# Forest plot of the per-day `mu` for both tips models on a single set of
# axes. The returned axes are not needed here, so they are not kept.
az.plot_forest(
    [idata_cg_nh, idata_cg_h],
    model_names=['non-hierarchical', 'hierarchical'],
    var_names='mu',
    combined=True,
    r_hat=False,
    ess=False,
    figsize=(12, 3),
    colors='cycle',
)
# Render the figure explicitly, as the other plotting cells do
plt.show()

The author chose a different set of models. I think I understand
the reason (but I'm not fully certain).

In [None]:
# The first model places a single hyper-prior, `mu_g`, on the scale of
# the prior for the per-day means `mu`.
with pm.Model(coords=coords) as comparing_groups_h_00:
    # Hyper-prior
    # `mu_g` is shared by all days and is used below as the `sigma` of
    # the HalfNormal prior on `mu`.
    # Why a Gamma distribution? The Gamma distribution only generates
    # **positive** values, which suits both tip amounts and a scale
    # parameter. (I'm uncertain whether the author had further reasons.)
    mu_g = pm.Gamma('mu_g', mu=5, sigma=2)

    # Prior: one mean and one standard deviation per day
    mu = pm.HalfNormal('mu', sigma=mu_g, dims='days')
    sigma = pm.HalfNormal('sigma', sigma=1, dims='days')

    # Likelihood: `idx` selects the parameters of each observation's day
    y = pm.Gamma('y', mu=mu[idx], sigma=sigma[idx],
                 observed=tip, dims='days_flat')

    # Magic inference button (fixed seed)
    idata_cg_h_00 = pm.sample(random_seed=4591)

In [None]:
# Here's the second (alternative) model: the per-day means `mu` follow a
# Gamma prior whose mean and standard deviation are both hyper-parameters.
with pm.Model(coords=coords) as comparing_groups_h_01:
    # Hyper-priors: shared by all days
    mu_g = pm.Gamma('mu_g', mu=5, sigma=2)
    sigma_g = pm.Gamma('sigma_g', mu=2, sigma=1.5)

    # Priors: one mean and one standard deviation per day
    mu = pm.Gamma('mu', mu=mu_g, sigma=sigma_g, dims='days')
    sigma = pm.HalfNormal('sigma', sigma=1, dims='days')

    # Likelihood: `idx` selects the parameters of each observation's day
    y = pm.Gamma('y', mu=mu[idx], sigma=sigma[idx],
                 observed=tip, dims='days_flat')

    # The magic inference button (fixed seed)
    idata_cg_h_01 = pm.sample(random_seed=4591)

In [None]:
# The author also suggests coding up a non-centered version. These
# kinds of models are discussed in chapter 4.
# Instead of sampling each day's `mu` directly, we sample a per-day
# offset and build `mu` as `mu_g + mu_g_offset * sigma_g`.
with pm.Model(coords=coords) as comparing_groups_h_02:
    # Hyper-priors: global mean and between-day scale, shared by all days
    mu_g = pm.Gamma('mu_g', mu=5, sigma=2)
    sigma_g = pm.HalfNormal('sigma_g', sigma=2)

    # Priors
    mu_g_offset = pm.Normal('mu_g_offset', mu=0, sigma=2, dims='days')
    # The offset must be scaled by `sigma_g`; otherwise `sigma_g` is
    # sampled but has no influence on `mu` or on the likelihood.
    mu = pm.Deterministic('mu', mu_g + mu_g_offset * sigma_g, dims='days')
    sigma = pm.HalfNormal('sigma', sigma=1, dims='days')

    # Likelihood: `idx` selects the parameters of each observation's day
    y = pm.Gamma('y', mu=mu[idx], sigma=sigma[idx],
                 observed=tip, dims='days_flat')

    # Again, the magical inference button. The sampler option is spelled
    # `target_accept` (not `target_accepted`).
    idata_cg_h_02 = pm.sample(random_seed=4591, target_accept=0.99)

In [None]:
# Forest plot of the per-day `mu` for the non-hierarchical model and the
# three hierarchical variants.
compared_idatas = [idata_cg_nh, idata_cg_h_00, idata_cg_h_01, idata_cg_h_02]
compared_names = ['non-hierarchical', 'hierarchical_00',
                  'hierarchical_01', 'hierarchical_02']
axes = az.plot_forest(
    compared_idatas,
    model_names=compared_names,
    var_names='mu',
    combined=True,
    r_hat=False,
    figsize=(12, 3),
    colors='cycle',
)

# Dotted vertical reference line at the posterior mean of `mu_g` from
# `hierarchical_00`, spanning the full height of the plot.
y_lims = axes[0].get_ylim()
mu_g_mean = idata_cg_h_00.posterior['mu_g'].mean()
axes[0].vlines(mu_g_mean, *y_lims, colors='k', ls=":")
plt.show()

As the author points out, "We can see little difference between
the hierarchical and non-hierarchical models when we use a
HalfNormal distribution for the mean, `mu`, (`hierarchical_00`
and `non-hierarchical`, respectively). The difference is
larger when we use a Gamma distribution (`hierarchical_01`)."

Interestingly, using the non-centered version (`hierarchical_02`)
again seems to reduce the difference.

## Exercise 04

For each sub-panel in figure 3.7, add a reference line representing
the empirical mean value at each level; that is, the global mean,
the forward mean, and Messi's mean. Compare the empirical values
to the posterior mean values. What do you observe?

In [None]:
# Load the football data, reading `position` as a categorical column.
# NOTE(review): this cell reads from './data/' while the tips cell above
# reads from '../data/' - confirm which location is correct.
football = pd.read_csv('./data/football_players.csv', dtype={'position': 'category'})
# Empirical goals-per-shot ratio of every player.
# NOTE(review): assumes every player has at least one shot - TODO confirm.
football['gps'] = football['goals'] / football['shots']
# Display the resulting table
football

In [None]:
# Integer position code of every player, in the row order of `football`
pos_idx = football['position'].cat.codes.values

# Position labels; a code in `pos_idx` indexes into this sequence
pos_codes = football['position'].cat.categories

# Number of distinct positions and number of players
n_pos = len(pos_codes)
n_players = len(football.index)

In [None]:
# Three-level model: global -> position -> player.
coords = {'pos': pos_codes}
with pm.Model(coords=coords) as model_football:
    # Hyperparameters (global level)
    mu = pm.Beta('mu', 1.7, 5.8) # values determined empirically
    nu = pm.Gamma('nu', mu=125, sigma=50)

    # Parameters for (four) positions
    mu_p = pm.Beta('mu_p', mu=mu, nu=nu, dims='pos')
    nu_p = pm.Gamma('nu_p', mu=125, sigma=50, dims='pos')

    # Parameter for players (goal scoring rate)
    # Remember that `theta` represents the **success rate**;
    # `pos_idx` selects the position-level parameters of each player.
    theta = pm.Beta('theta', mu=mu_p[pos_idx], nu=nu_p[pos_idx])

    # Likelihood: goals scored out of shots taken, per player
    gs = pm.Binomial('gs',
                     n=football.shots.values,
                     p=theta,
                     observed=football.goals.values)
    # The sampler option is spelled `target_accept` (not `target_accepted`).
    idata_football = pm.sample(draws=3000, target_accept=0.95, random_seed=4591)

In [None]:
# Trace plots for the global and position-level parameters only; the
# per-player `theta` is left out of `var_names`.
az.plot_trace(idata_football, var_names=['mu', 'nu', 'mu_p', 'nu_p'])
plt.show()

In [None]:
# Three stacked panels sharing one x-axis: the global mean, the forward
# position mean and Messi's rate. Each panel shows the posterior plus a
# dashed reference line at the matching empirical goals-per-shot value.
_, ax = plt.subplots(3, 1, figsize=(12, 6), sharex=True)

# Panel 1: global level
az.plot_posterior(idata_football,
                  var_names='mu',
                  ax=ax[0])
ax[0].axvline(football['gps'].mean(), color='0.25', ls='--')
ax[0].set_title(r'Global mean')

# Panel 2: forward ('FW') position level
az.plot_posterior(
    idata_football.posterior.sel(pos='FW'),
    var_names='mu_p',
    ax=ax[1])
ax[1].axvline(
    football['gps'][football['position'] == 'FW'].mean(),
    color='0.25',
    ls='--'
)
ax[1].set_title(r'Forward position mean')

# Panel 3: player level. Messi's row position is looked up by name, using
# the same condition as the reference line, instead of being hard-coded.
# NOTE(review): assumes `theta_dim_0` follows the row order of `football`
# (`theta` is built from `pos_idx`) - confirm.
is_messi = (football['name'] == 'Lionel Messi').to_numpy()
messi_row = int(np.flatnonzero(is_messi)[0])
az.plot_posterior(
    idata_football.posterior.sel(theta_dim_0=messi_row),
    var_names='theta',
    ax=ax[2]
)
ax[2].axvline(
    football['gps'][is_messi].item(),
    color='0.25',
    ls='--'
)
ax[2].set_title(r'Messi mean')

plt.show()