# Sum to zero comparison, multi-level binomial model, 4 categorical predictors, predictors have per-category intercept.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotnine as p9
from random import randint

from cmdstanpy import CmdStanModel, write_stan_json, install_cmdstan, rebuild_cmdstan

In [None]:
# Notebook display options: show numpy and pandas floats with two decimals
# and without scientific notation.
np.set_printoptions(precision=2, suppress=True)
pd.set_option('display.precision', 2)
pd.set_option('display.float_format', '{:.2f}'.format)

# Silence warnings for the rest of the session.
# NOTE(review): this filter is global, so it hides warnings raised by every
# library in the kernel, not only plotnine.
import warnings
warnings.filterwarnings('ignore')

# Default plotnine theme for all figures: grey theme with explicit font sizes
# for body text, plot title, axis titles and tick labels.
notebook_font_sizes = p9.theme(
    text=p9.element_text(size=10),
    plot_title=p9.element_text(size=14),
    axis_title_x=p9.element_text(size=12),
    axis_title_y=p9.element_text(size=12),
    axis_text_x=p9.element_text(size=8),
    axis_text_y=p9.element_text(size=8),
)
p9.theme_set(p9.theme_grey() + notebook_font_sizes)

# Reusable theme tweak: rotate x tick labels by 90 degrees.
xlabels_90 = p9.theme(axis_text_x=p9.element_text(angle=90, hjust=1))

## The data generating model

Instantiate the data generating model.

In [None]:
# Instantiate the Stan program that simulates the dataset.
gen_stan_file = os.path.join('stan', 'gen_binomial_4_preds.stan')
gen_mod = CmdStanModel(stan_file=gen_stan_file)

Choose total number of observations and number of categories for age, eth, and edu.

In [None]:
# Inputs to the data-generating model: total observation count, number of
# categories per predictor, baseline value, and test sensitivity/specificity.
gen_data_dict = dict(
    N=100_000,
    N_age=9,
    N_eth=3,
    N_edu=7,
    baseline=-4.9,
    sens=0.75,
    spec=0.9995,
)

# The leading factor of 2 is the number of sex categories (two levels).
n_strata = 2 * gen_data_dict['N_age'] * gen_data_dict['N_eth'] * gen_data_dict['N_edu']
print("total strata", n_strata)

Only need to run for 1 sampling iteration to get a complete dataset.

In [None]:
# A single sampling iteration from one chain yields one complete simulated
# dataset. The RNG seed is fixed so that Restart & Run All regenerates the
# same dataset and the same "true" parameters every time.
sim_data = gen_mod.sample(data=gen_data_dict,
                          iter_warmup=1, iter_sampling=1, chains=1,
                          seed=45678)

Examine the set of generated data-generating params and resulting dataset.

In [None]:
# Print every Stan variable from the simulation; index 0 selects the single
# draw held in each draws array.
sim_variables = sim_data.stan_variables()
for name in sim_variables:
    print(name, sim_variables[name][0])

In [None]:
# Per-stratum simulated counts plus the raw (observed) positive rate.
sim_df = pd.DataFrame({
    'tests': sim_data.tests[0],
    'pos_tests': sim_data.pos_tests[0],
    'p_sample': sim_data.p_sample[0],
}).assign(raw_prev=lambda df: df['pos_tests'] / df['tests'])

# Density of the per-stratum p_sample values; the axis label spelling
# ("prevalence") is corrected here.
(
    p9.ggplot(sim_df, p9.aes(x='p_sample'))
    + p9.geom_density(color='black', fill='lightblue')
    + p9.labs(
        x='true prevalence',
        y='',
        title='true prevalence given test sensitivity, specificity across demographics'
    )
    + p9.theme_minimal()
)

Capture the data-generating params

In [None]:
# Collect the single simulated draw of each data-generating parameter,
# keyed by its Stan variable name.
true_param_names = (
    'beta_0',
    'pct_sex', 'beta_sex',
    'pct_age', 'beta_age',
    'pct_eth', 'beta_eth',
    'pct_edu', 'beta_edu',
)
true_params = {name: getattr(sim_data, name)[0] for name in true_param_names}
true_params

Combine the per-strata arrays into a single dataframe to check our work.

In [None]:
# One row per stratum: outcome counts, p_sample, and the four categorical
# predictor codes. All columns except p_sample are cast to int.
strata_int_vars = ("pos_tests", "tests", "sex", "age", "eth", "edu")
strata_columns = {name: getattr(sim_data, name)[0].astype(int)
                  for name in strata_int_vars}
strata_columns["p_sample"] = sim_data.p_sample[0]

strata_column_order = ["pos_tests", "tests", "p_sample", "sex", "age", "eth", "edu"]
y_X_matrix = pd.DataFrame(data={col: strata_columns[col]
                                for col in strata_column_order})
y_X_matrix.head(20)

In [None]:
# Show the last 30 rows of the per-stratum frame.
y_X_matrix.tail(30)

Assemble the data dictionary of all input data for the model which solves the inverse problem - i.e., estimates regression coefficients given the observed data.  We use the generated data as the inputs.  Because the sampler's output files store every value as a real number, regardless of the variable's declared element type, values passed to model data variables of type int must be cast back to int.  Here all the observed data is count and categorical data.

In [None]:
# Input data for the fitting models: stratum count, category counts, the
# simulated observations cast to int, the known test sensitivity and
# specificity, and the prior location/scale for the intercept.
category_count_keys = ('N_age', 'N_eth', 'N_edu')
observed_int_vars = ('pos_tests', 'tests', 'sex', 'age', 'eth', 'edu')

data_4_preds = {
    # number of strata = length of the simulated pos_tests vector
    'N': sim_data.pos_tests.shape[1],
    **{key: gen_data_dict[key] for key in category_count_keys},
    **{name: getattr(sim_data, name)[0].astype(int) for name in observed_int_vars},
    'sens': gen_data_dict['sens'],
    'spec': gen_data_dict['spec'],
    'intercept_prior_mean': gen_data_dict['baseline'],
    'intercept_prior_scale': 2.5,
}

## Fit Model 1:  Hard sum-to-zero constraint

Run the sampler to get posterior estimates of the model conditioned on the data. 

In [None]:
# Instantiate the model that uses the hard sum-to-zero constraint.
hard_stan_file = os.path.join('stan', 'binomial_4preds_hard.stan')
binomial_hard_mod = CmdStanModel(stan_file=hard_stan_file)

In [None]:
# Fit the hard sum-to-zero model to the simulated data, running up to four
# chains in parallel. The seed is fixed so the posterior draws (and any
# draw indices selected later) are reproducible across runs.
binomial_hard_fit = binomial_hard_mod.sample(data=data_4_preds,
                                             parallel_chains=4,
                                             seed=45678)

Get the summary statistics for all model parameters, transformed parameters, and generated quantities.

In [None]:
# Summary table for the hard sum-to-zero fit, requested with two significant
# figures; the comparison cells below index it by variable name.
hard_fit_summary = binomial_hard_fit.summary(sig_figs=2)

Compare data generating value, posterior estimate of the predictor "sex".

In [None]:
# Data-generating values for the intercept and the sex effect ...
true_intercept = sim_data.beta_0[0]
true_sex_offset = sim_data.beta_sex[0]
print("global intercept", true_intercept, "offset sex", true_sex_offset)
print("per-category observation pcts hardcoded:  0.4, 0.6")

# ... next to the corresponding posterior summary rows.
hard_sex_rows = ['beta_0', 'beta_sex_raw', 'beta_sex[1]', 'beta_sex[2]']
hard_fit_summary.loc[hard_sex_rows]

Compare data generating value, posterior estimate of age

In [None]:
# Data-generating age coefficients and category proportions, followed by the
# posterior summary rows named beta_age[k]. Spelling of "coefficients" in the
# printed label is corrected.
print("true coefficients age", sim_data.beta_age[0])
print("per-category observation pcts", sim_data.pct_age[0])
hard_age_summary = hard_fit_summary.filter(regex=r"beta_age\[\d+\]", axis=0)
hard_age_summary

In [None]:
# Data-generating eth coefficients and category proportions, followed by the
# posterior summary rows named beta_eth[k]. Spelling of "coefficients" in the
# printed label is corrected.
print("true coefficients eth", sim_data.beta_eth[0])
print("per-category observation pcts", sim_data.pct_eth[0])
hard_eth_summary = hard_fit_summary.filter(regex=r"beta_eth\[\d+\]", axis=0)
hard_eth_summary

In [None]:
# Data-generating edu coefficients and category proportions, followed by the
# posterior summary rows named beta_edu[k]. Spelling of "coefficients" in the
# printed label is corrected.
print("true coefficients edu", sim_data.beta_edu[0])
print("per-category observation pcts", sim_data.pct_edu[0])
hard_edu_summary = hard_fit_summary.filter(regex=r"beta_edu\[\d+\]", axis=0)
hard_edu_summary

In the generated quantities block of the `binomial_4preds_hard.stan` model, 
the variable `y_rep` contains the per-strata estimates of the number of pos_tests.
Rather than examine the average across all draws, we examine the estimates
from a random subset of the sampler's draws.

In [None]:
# Select posterior predictive draws for the PPC plot.
#
# Draw indices are sampled WITHOUT replacement over every available draw:
# sampling with replacement would repeat indices, and a repeated index reuses
# the same 'iter <n>' column name, so fewer distinct draws than requested
# would end up in the frame. A seeded generator keeps the selection
# reproducible.
n_ppc_draws = 200
pos_tests_fitted = binomial_hard_fit.y_rep.astype(int)   # rows are draws
total_draws = pos_tests_fitted.shape[0]
ppc_rng = np.random.default_rng(45678)
ppc_draw_ids = ppc_rng.choice(total_draws,
                              size=min(n_ppc_draws, total_draws),
                              replace=False)

# Build all columns first and construct the frame once, instead of inserting
# columns one at a time.
rep_columns = {'sim_data pos_tests': sim_data.pos_tests[0].astype(int)}
rep_columns.update({'iter ' + str(draw): pos_tests_fitted[draw]
                    for draw in ppc_draw_ids})
df_rep_pos_tests = pd.DataFrame(data=rep_columns)

Plot the generated number of pos_tests against the estimated number of pos_tests for each poststratification cell.

In [None]:
# Long format: one row per (stratum, selected draw) pair.
df_long = pd.melt(df_rep_pos_tests, id_vars=['sim_data pos_tests'],
                  var_name='variable', value_name='value')

# Number of draws actually present (every column except the observed one),
# so the axis label cannot disagree with the data being plotted.
n_draws_plotted = df_rep_pos_tests.shape[1] - 1

# Observed vs. replicated pos_tests with the identity line as reference.
# NOTE(review): axis breaks stop at 50 -- confirm counts stay in that range.
hard_ppc = (
    p9.ggplot(df_long, p9.aes(x='sim_data pos_tests', y='value'))
    + p9.geom_jitter(alpha=0.5, width=0.2)
    + p9.geom_abline(intercept=0, slope=1, color='lightblue', linetype='--')
    + p9.labs(x='pos_tests (observed)',
              y=f'Posterior estimates ({n_draws_plotted} draws)',
              title='Hard sum-to-zero constraint\nPosterior Predictive Check')
    + p9.theme(figure_size=(8,8))
    + p9.scale_x_continuous(breaks=range(0, 51, 5))
    + p9.scale_y_continuous(breaks=range(0, 51, 5))
)
hard_ppc

***Takeaway: the model recovers the true parameters closely enough that the posterior predictive check looks reasonable.***

## Fit model 2:  `sum_to_zero_vector`

In [None]:
# Instantiate the model that uses Stan's sum_to_zero_vector type.
frazier_stan_file = os.path.join('stan', 'binomial_4preds_frazier.stan')
binomial_frazier_mod = CmdStanModel(stan_file=frazier_stan_file)

In [None]:
# Fit the sum_to_zero_vector model to the same simulated data, running up to
# four chains in parallel. The seed is fixed so the posterior draws are
# reproducible across runs.
binomial_frazier_fit = binomial_frazier_mod.sample(data=data_4_preds,
                                                   parallel_chains=4,
                                                   seed=45678)

In [None]:
# Optional: uncomment to print the diagnose() report for the
# sum_to_zero_vector fit.
# print(binomial_frazier_fit.diagnose())

In [None]:
# Summary table for the sum_to_zero_vector fit, requested with two
# significant figures; the comparison cells below index it by variable name.
frazier_fit_summary = binomial_frazier_fit.summary(sig_figs=2)

In [None]:
# Data-generating values for the intercept and the sex effect ...
true_intercept = sim_data.beta_0[0]
true_sex_offset = sim_data.beta_sex[0]
print("global intercept", true_intercept, "offset sex", true_sex_offset)
print("per-category observation pcts hardcoded:  0.4, 0.6")

# ... next to the corresponding posterior summary rows of this fit.
frazier_sex_rows = ['beta_intercept', 'beta_0', 'beta_sex']
frazier_fit_summary.loc[frazier_sex_rows]

In [None]:
# Same quantities from the hard sum-to-zero fit, shown for comparison.
hard_fit_summary.loc[['beta_0', 'beta_sex[1]', 'beta_sex[2]']]

In [None]:
# Data-generating age coefficients and category proportions, followed by the
# posterior summary rows named beta_age[k] from the sum_to_zero_vector fit.
# Spelling of "coefficients" in the printed label is corrected.
print("true coefficients age", sim_data.beta_age[0])
print("per-category observation pcts", sim_data.pct_age[0])
frazier_age_summary = frazier_fit_summary.filter(regex=r"beta_age\[\d+\]", axis=0)
frazier_age_summary

In [None]:
# Age rows from the hard sum-to-zero fit, shown again for comparison.
hard_age_summary

In [None]:
# Data-generating eth coefficients and category proportions, followed by the
# posterior summary rows named beta_eth[k] from the sum_to_zero_vector fit.
# Spelling of "coefficients" in the printed label is corrected.
print("true coefficients eth", sim_data.beta_eth[0])
print("per-category observation pcts", sim_data.pct_eth[0])
frazier_eth_summary = frazier_fit_summary.filter(regex=r"beta_eth\[\d+\]", axis=0)
frazier_eth_summary

In [None]:
# Eth rows from the hard sum-to-zero fit, shown again for comparison.
hard_eth_summary

In [None]:
# Data-generating edu coefficients and category proportions, followed by the
# posterior summary rows named beta_edu[k] from the sum_to_zero_vector fit.
# Spelling of "coefficients" in the printed label is corrected.
print("true coefficients edu", sim_data.beta_edu[0])
print("per-category observation pcts", sim_data.pct_edu[0])
frazier_edu_summary = frazier_fit_summary.filter(regex=r"beta_edu\[\d+\]", axis=0)
frazier_edu_summary

In [None]:
# Edu rows from the hard sum-to-zero fit, shown again for comparison.
hard_edu_summary

In [None]:
# Select a handful of posterior predictive draws for the faceted PPC plot.
#
# Indices are sampled WITHOUT replacement over every available draw, so the
# frame always holds the requested number of distinct 'iter <n>' columns
# (a repeated index would reuse a column name and drop a panel). A seeded
# generator keeps the selection reproducible.
n_ppc_draws = 4
pos_tests_fitted = binomial_frazier_fit.y_rep.astype(int)   # rows are draws
total_draws = pos_tests_fitted.shape[0]
ppc_rng = np.random.default_rng(45678)
ppc_draw_ids = ppc_rng.choice(total_draws,
                              size=min(n_ppc_draws, total_draws),
                              replace=False)

rep_columns = {'sim_data pos_tests': sim_data.pos_tests[0].astype(int)}
rep_columns.update({'iter ' + str(draw): pos_tests_fitted[draw]
                    for draw in ppc_draw_ids})
df_rep_pos_tests = pd.DataFrame(data=rep_columns)
df_rep_pos_tests

In [None]:
# Long format: one row per (stratum, selected draw) pair.
df_long = pd.melt(df_rep_pos_tests, id_vars=['sim_data pos_tests'],
                  var_name='variable', value_name='value')

# One panel per selected draw: simulated vs. replicated pos_tests.
# The y-axis label spelling ("estimates") is corrected.
frazier_ppc = (
    p9.ggplot(df_long, p9.aes(x='sim_data pos_tests', y='value'))
    + p9.geom_point()
    + p9.facet_wrap('~ variable', scales='free_y', ncol=2)
    + p9.labs(x='pos_tests (simulated)',
              y='Posterior estimates',
              title='Sum-to-zero vector\nPosterior Predictive Check')
    + p9.theme(figure_size=(10,8))
)
frazier_ppc

In [None]:
# Re-display the hard sum-to-zero PPC plot for comparison with the plot above.
hard_ppc