# Sum to zero comparison, multi-level binomial model, 4 categorical predictors, predictors have per-category intercept.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotnine as p9
from random import randint

from cmdstanpy import CmdStanModel, write_stan_json, install_cmdstan, rebuild_cmdstan
from utils_dataviz import ppc_density_plot

In [None]:
# Notebook display options: two-decimal, non-scientific output for numpy and pandas.
np.set_printoptions(precision=2, suppress=True)
pd.set_option('display.precision', 2)
pd.options.display.float_format = '{:.2f}'.format

# Silence every warning for the rest of the session (added to quiet plotnine).
import warnings
warnings.filterwarnings('ignore')

# Default plotnine theme: grey base plus explicit font sizes per text element.
notebook_font_sizes = p9.theme(
    text=p9.element_text(size=10),
    plot_title=p9.element_text(size=14),
    axis_title_x=p9.element_text(size=12),
    axis_title_y=p9.element_text(size=12),
    axis_text_x=p9.element_text(size=8),
    axis_text_y=p9.element_text(size=8),
)
p9.theme_set(p9.theme_grey() + notebook_font_sizes)

# Reusable theme tweak: x-axis tick labels rotated 90 degrees.
xlabels_90 = p9.theme(axis_text_x=p9.element_text(angle=90, hjust=1))

## Overview

In this notebook we develop a model which estimates per-demographic rates of disease prevalence for a population given the binary outcome of a diagnostic test procedure with a specified sensitivity and specificity, and a set of categorical predictors.   Here these are given the names sex, age, eth, and edu for verisimilitude, where sex is a binary-encoded variable and the other categories have > 2 possible values.

In order to fit this model, we need to put a sum-to-zero constraint on the categorical variables.   In the following sections we first generate a dataset, then compare the fits obtained via different implementations of the sum-to-zero constraint.

## The data generating model

In order to easily generate multiple datasets with different characteristics, the data generating model requires the user to specify the total number of tests performed, and the number of categories for age, ethnicity, and education.  The total number of sub-populations is the product of the number of categories per predictor.

The `data` and `transformed data` blocks of the data-generating binomial model are:

```
data {
  int<lower=1> N;  // total number of tests
  int<lower=1> N_age;
  int<lower=1> N_eth;
  int<lower=1> N_edu;
  real baseline;
  real<lower=0, upper=1> sens;
  real<lower=0, upper=1> spec;
}
transformed data {
  int strata = 2 * N_age * N_eth * N_edu;

}
```

Instantiate the data generating model.


In [None]:
# Compile (or reuse) the Stan program that simulates the dataset.
gen_stan_file = os.path.join('stan', 'gen_binomial_4_preds.stan')
gen_mod = CmdStanModel(stan_file=gen_stan_file)

Choose total number of observations and number of categories for age, eth, and edu.


In [None]:
# Inputs for the data-generating model: total number of tests, number of
# categories per predictor, baseline, and test sensitivity / specificity.
gen_data_dict = dict(
    N=90_000,
    N_age=7,
    N_eth=3,
    N_edu=5,
    baseline=-4.9,
    sens=0.75,
    spec=0.9995,
)
# Same product as `strata` in the Stan excerpt: 2 * N_age * N_eth * N_edu.
n_strata = 2 * gen_data_dict['N_age'] * gen_data_dict['N_eth'] * gen_data_dict['N_edu']
print("total strata", n_strata)

We only need to run a single sampling iteration to get a complete dataset.


In [None]:
# Simulate one dataset: a single chain with a single draw.
# The seed is fixed so that Restart & Run All regenerates the same dataset and the
# same data-generating parameters that later cells compare the fits against.
sim_data = gen_mod.sample(data=gen_data_dict,
                          iter_warmup=1, iter_sampling=1, chains=1,
                          seed=45678)

Examine the data-generating parameters and the resulting dataset.


In [None]:
# Print every variable produced by the generating model.
# Each entry is indexed by draw and there is only one draw, hence [0].
for var_name, draws in sim_data.stan_variables().items():
    print(var_name, draws[0])

In [None]:
# Per-stratum simulated data: tests performed, positive tests, and p_sample.
sim_df = pd.DataFrame({
    'tests': sim_data.tests[0],
    'pos_tests': sim_data.pos_tests[0],
    'p_sample': sim_data.p_sample[0],
})
# Observed positive fraction per stratum.
sim_df['raw_prev'] = sim_df['pos_tests'] / sim_df['tests']

# Density of p_sample across strata (x-axis label spelling corrected).
(
    p9.ggplot(sim_df, p9.aes(x='p_sample'))
    + p9.geom_density(color='black', fill='lightblue')
    + p9.labs(
        x='true prevalence',
        y='',
        title='raw prevalence given test sensitivity, specificity across demographics'
    )
    + p9.theme_minimal()
)

Capture the data-generating params


In [None]:
# Collect the single generated draw of every data-generating parameter, keyed by name.
true_param_names = (
    'beta_0',
    'pct_sex', 'beta_sex',
    'pct_age', 'beta_age',
    'pct_eth', 'beta_eth',
    'pct_edu', 'beta_edu',
)
true_params = {name: getattr(sim_data, name)[0] for name in true_param_names}
true_params

Combine the per-strata arrays into a single dataframe to check our work.


In [None]:
# One row per stratum: outcome counts, p_sample, and the four categorical predictors.
# Every column except p_sample is cast to int.
strata_columns = {}
for col in ("pos_tests", "tests", "p_sample", "sex", "age", "eth", "edu"):
    first_draw = getattr(sim_data, col)[0]
    strata_columns[col] = first_draw if col == "p_sample" else first_draw.astype(int)
y_X_matrix = pd.DataFrame(data=strata_columns)
y_X_matrix.head(20)

In [None]:
# Show the last 30 rows of the combined per-stratum frame.
y_X_matrix.tail(30)

Assemble the data dictionary of all input data for the model which solves the inverse problem - i.e., estimates regression coefficients given the observed data.  We use the generated data as the inputs.  Because the sampler's output files store every value as a real number, regardless of the variable's declared element type, values passed to model data variables of type int need to be cast to int.  Here all the observed data is count and categorical data.


In [None]:
# Input data shared by all three fitted models.
# N is the number of strata (second dimension of pos_tests); all simulated arrays
# are cast to int; the intercept prior is centered on the generating baseline.
data_4_preds = {
    'N': sim_data.pos_tests.shape[1],
    **{key: gen_data_dict[key] for key in ('N_age', 'N_eth', 'N_edu')},
    **{name: getattr(sim_data, name)[0].astype(int)
       for name in ('pos_tests', 'tests', 'sex', 'age', 'eth', 'edu')},
    **{key: gen_data_dict[key] for key in ('sens', 'spec')},
    'intercept_prior_mean': gen_data_dict['baseline'],
    'intercept_prior_scale': 2.5,
}

# print(data_4_preds)

## Fit Model 1:  Hard sum-to-zero constraint

Run the sampler to get posterior estimates of the model conditioned on the data. 


In [None]:
# Compile (or reuse) the hard sum-to-zero model.
hard_stan_file = os.path.join('stan', 'binomial_4preds_hard.stan')
binomial_hard_mod = CmdStanModel(stan_file=hard_stan_file)

In [None]:
# Fit the hard sum-to-zero model, running up to 4 chains in parallel.
# The seed is fixed so the posterior summaries are reproducible across re-runs.
binomial_hard_fit = binomial_hard_mod.sample(data=data_4_preds, parallel_chains=4,
                                             seed=45678)

In [None]:
# Print the sampler diagnostics report for the hard sum-to-zero fit.
print(binomial_hard_fit.diagnose())

Get the summary statistics for all model parameters, transformed parameters, and generated quantities.


In [None]:
# Summary table for the hard sum-to-zero fit, reported to 2 significant figures.
# Later cells select its rows by variable name.
hard_fit_summary = binomial_hard_fit.summary(sig_figs=2)

Compare data generating value, posterior estimate of the predictor "sex".


In [None]:
# Print the data-generating intercept and sex offset, then show the matching
# rows of the hard sum-to-zero summary.
true_intercept, true_sex_offset = sim_data.beta_0[0], sim_data.beta_sex[0]
print("global intercept", true_intercept, "offset sex", true_sex_offset)
print("per-category observation pcts hardcoded:  0.4, 0.6")
hard_sex_rows = ['beta_0', 'beta_sex_raw', 'beta_sex[1]', 'beta_sex[2]']
hard_fit_summary.loc[hard_sex_rows]

In [None]:
# Split the hard-fit summary into per-predictor tables by row label.
# `\.*` matches zero or more literal dots, so each pattern effectively selects
# every row whose label contains "_age", "_eth", or "_edu".
hard_age_summary, hard_eth_summary, hard_edu_summary = (
    hard_fit_summary.filter(regex=pattern, axis=0)
    for pattern in (r"\.*_age", r"\.*_eth", r"\.*_edu")
)

## Fit model 2:  soft sum-to-zero constraint

In [None]:
# Compile (or reuse) the soft sum-to-zero model.
soft_stan_file = os.path.join('stan', 'binomial_4preds_soft.stan')
binomial_soft_mod = CmdStanModel(stan_file=soft_stan_file)

In [None]:
# Fit the soft sum-to-zero model, running up to 4 chains in parallel.
# The seed is fixed so the posterior summaries are reproducible across re-runs.
binomial_soft_fit = binomial_soft_mod.sample(data=data_4_preds, parallel_chains=4,
                                             seed=45678)

In [None]:
# Print the sampler diagnostics report for the soft sum-to-zero fit.
print(binomial_soft_fit.diagnose())

In [None]:
# Summary table for the soft sum-to-zero fit, reported to 2 significant figures.
# Later cells select its rows by variable name.
soft_fit_summary = binomial_soft_fit.summary(sig_figs=2)

In [None]:
# Print the data-generating intercept and sex offset, then show the matching
# rows of the soft sum-to-zero summary.
true_intercept, true_sex_offset = sim_data.beta_0[0], sim_data.beta_sex[0]
print("global intercept", true_intercept, "offset sex", true_sex_offset)
print("per-category observation pcts hardcoded:  0.4, 0.6")
soft_sex_rows = ['beta_0', 'beta_intercept', 'beta_sex']
soft_fit_summary.loc[soft_sex_rows]

In [None]:
# Split the soft-fit summary into per-predictor tables by row label.
# `\.*` matches zero or more literal dots, so each pattern effectively selects
# every row whose label contains "_age", "_eth", or "_edu".
soft_age_summary, soft_eth_summary, soft_edu_summary = (
    soft_fit_summary.filter(regex=pattern, axis=0)
    for pattern in (r"\.*_age", r"\.*_eth", r"\.*_edu")
)

In [None]:
# Posterior predictive check for the soft sum-to-zero model: plot the distribution
# of the simulated data against a random sample of y_rep replicates.
# NOTE(review): the third argument (1000) is presumably the number of replicates
# that ppc_density_plot samples - confirm against utils_dataviz. The
# sum_to_zero_vector PPC later in this notebook passes 160.
yrep_soft_pd = binomial_soft_fit.draws_pd(vars='y_rep')
ppc_plot_soft = ppc_density_plot(
    sim_df,
    yrep_soft_pd,
    1000,
    'PPC soft Sum-to-zero',
    'sim data dark blue, y_rep sample light blue',
)
ppc_plot_soft

## Fit model 3:  `sum_to_zero_vector`


In [None]:
# Compile (or reuse) the sum_to_zero_vector model.
ilr_stan_file = os.path.join('stan', 'binomial_4preds_ilr.stan')
binomial_ilr_mod = CmdStanModel(stan_file=ilr_stan_file)

In [None]:
# Fit the sum_to_zero_vector model, running up to 4 chains in parallel.
# The seed is fixed so the posterior summaries are reproducible across re-runs.
binomial_ilr_fit = binomial_ilr_mod.sample(data=data_4_preds, parallel_chains=4,
                                           seed=45678)

In [None]:
# Print the sampler diagnostics report for the sum_to_zero_vector fit.
print(binomial_ilr_fit.diagnose())

In [None]:
# Summary table for the sum_to_zero_vector fit, reported to 2 significant figures.
# Later cells select its rows by variable name.
ilr_fit_summary = binomial_ilr_fit.summary(sig_figs=2)

## Compare fits

**Global intercept**

* the hard-sum-to-zero model codes the global intercept as `beta_0`.
* the soft-sum-to-zero model 0-centers the binary predictor `sex`; `beta_intercept` accounts for this centering.


In [None]:
# Data-generating intercept, then the sum_to_zero_vector model's intercept rows.
true_intercept = sim_data.beta_0[0]
print("global intercept", true_intercept)
intercept_rows = ['beta_intercept', 'beta_0']
ilr_fit_summary.loc[intercept_rows]

In [None]:
# Hard sum-to-zero model: global intercept row.
hard_fit_summary.loc[['beta_0']]

In [None]:
# Soft sum-to-zero model: intercept rows.
soft_fit_summary.loc[['beta_intercept', 'beta_0']]

**Sex**

* the ilr model recodes the X matrix column `sex` as a zero-centered vector which is used to estimate `beta_sex`.
* the hard-sum-to-zero model codes `sex` as parameter `beta_sex_raw`, and in the transformed parameters block defines `beta_sex[1]`, `beta_sex[2]`:

```stan
vector[2] beta_sex = [beta_sex_raw, -beta_sex_raw]';
```


In [None]:
# Data-generating sex coefficient, then the sum_to_zero_vector model's estimate.
true_sex_coef = sim_data.beta_sex[0]
print("coefficient sex", true_sex_coef)
print("per-category observation pcts hardcoded:  0.4, 0.6")
sex_rows = ['beta_sex']
ilr_fit_summary.loc[sex_rows]

In [None]:
# Hard sum-to-zero model: raw sex parameter and the derived two-element beta_sex.
hard_fit_summary.loc[['beta_sex_raw', 'beta_sex[1]', 'beta_sex[2]']]

In [None]:
# Soft sum-to-zero model: sex coefficient row.
soft_fit_summary.loc[['beta_sex']]

**Age**


In [None]:
# Data-generating age coefficients and per-category observation percentages,
# followed by the sum_to_zero_vector summary rows whose labels contain "_age".
# (Printed label spelling corrected: "coefficients".)
print("true coefficients age", sim_data.beta_age[0])
print("per-category observation pcts", sim_data.pct_age[0])
ilr_age_summary = ilr_fit_summary.filter(regex=r"\.*_age", axis=0)
ilr_age_summary

In [None]:
# Hard sum-to-zero model: age rows.
hard_age_summary

In [None]:
# Soft sum-to-zero model: age rows.
soft_age_summary

**Eth**


In [None]:
# Data-generating eth coefficients and per-category observation percentages,
# followed by the sum_to_zero_vector summary rows whose labels contain "_eth".
# (Printed label spelling corrected: "coefficients".)
print("true coefficients eth", sim_data.beta_eth[0])
print("per-category observation pcts", sim_data.pct_eth[0])
ilr_eth_summary = ilr_fit_summary.filter(regex=r"\.*_eth", axis=0)
ilr_eth_summary

In [None]:
# Hard sum-to-zero model: eth rows.
hard_eth_summary

In [None]:
# Soft sum-to-zero model: eth rows.
soft_eth_summary

**Edu**


In [None]:
# Data-generating edu coefficients and per-category observation percentages,
# followed by the sum_to_zero_vector summary rows whose labels contain "_edu".
# (Printed label spelling corrected: "coefficients".)
print("true coefficients edu", sim_data.beta_edu[0])
print("per-category observation pcts", sim_data.pct_edu[0])
ilr_edu_summary = ilr_fit_summary.filter(regex=r"\.*_edu", axis=0)
ilr_edu_summary

In [None]:
# Hard sum-to-zero model: edu rows.
hard_edu_summary

In [None]:
# Soft sum-to-zero model: edu rows.
soft_edu_summary

In [None]:
# Posterior predictive scatter for the sum_to_zero_vector model: observed positive
# counts per stratum (x) against the replicated counts from a random subset of
# posterior draws (y). Points near the orange identity line indicate agreement.
y_rep_ilr = binomial_ilr_fit.y_rep.astype(int)  # indexed by draw along the first axis

# Pick up to 100 distinct draws from ALL available draws with a seeded generator.
# The previous randint(0, 1000) only reached indices 0..1000, and a repeated index
# silently overwrote its column, so fewer than 100 replicates could be plotted.
n_draws_ilr = y_rep_ilr.shape[0]
ppc_rng = np.random.default_rng(45678)
ppc_draw_ids = ppc_rng.choice(n_draws_ilr, size=min(100, n_draws_ilr), replace=False)

# Build the wide frame in one step: observed counts plus one column per selected draw.
obs_vs_rep_ilr_df = pd.DataFrame(data={
    'sim_data pos_tests': sim_data.pos_tests[0].astype(int),
    **{'iter ' + str(draw): y_rep_ilr[draw] for draw in ppc_draw_ids},
})

# Long format: one row per (stratum, draw) pair for plotting.
obs_vs_rep_ilr_long = pd.melt(obs_vs_rep_ilr_df, id_vars=['sim_data pos_tests'], var_name='variable', value_name='value')

ilr_ppc = (p9.ggplot(obs_vs_rep_ilr_long, p9.aes(x='sim_data pos_tests', y='value')) +
    p9.geom_jitter(alpha=0.3, color='darkblue') +
    p9.geom_abline(color='orange') +
    p9.labs(x='pos_tests (simulated)', y='Posterior estimates', title='Sum-to-zero vector\nPosterior Predictive Check') +
    p9.theme(figure_size=(10,10))
    )
ilr_ppc

In [None]:
# Posterior predictive check for the sum_to_zero_vector model: plot the distribution
# of the simulated data against a random sample of y_rep replicates.
# NOTE(review): the third argument (160) is presumably the number of replicates
# that ppc_density_plot samples - confirm against utils_dataviz.
yrep_ilr_pd = binomial_ilr_fit.draws_pd(vars='y_rep')
ppc_plot_ilr = ppc_density_plot(
    sim_df,
    yrep_ilr_pd,
    160,
    'PPC sum_to_zero_vector',
    'sim data dark blue, y_rep sample light blue',
)
ppc_plot_ilr