## Setup and installs

In [None]:
# Setup and installs
%pip install --quiet --upgrade pip
%pip install --quiet pandas numpy patsy statsmodels cmdstanpy arviz xarray netcdf4

import os, sys
import numpy as np
import pandas as pd
from patsy import dmatrix
import arviz as az
from cmdstanpy import CmdStanModel
import cmdstanpy as csp

# Optional: set project root so relative paths work
os.chdir('/Users/ogeohia/PYTHON/eo-colon-cancer-trends-ci5plus')

print("Python:", sys.executable)
print("CmdStan installed:", csp.cmdstan_path() if csp.cmdstan_path() else "No")

# One-time (first run) install of CmdStan toolchain; skip if already installed
if not csp.cmdstan_path():
    csp.install_cmdstan()

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Python: /Users/ogeohia/anaconda3/envs/colon-cancer-data/bin/python
CmdStan installed: /Users/ogeohia/anaconda3/envs/colon-cancer-data/bin/cmdstan


  from .autonotebook import tqdm as notebook_tqdm


## Load data and build spline

In [2]:
# Load data and build spline basis
df = pd.read_csv("data/colon_cancer_full.csv")

# Basic checks
assert (df['py'] > 0).all(), "Found zero/negative py values."

# Age spline (df=4 like your GLM)
age_spline = dmatrix("bs(age_cont, df=4, include_intercept=False)",
                     data=df, return_type='dataframe')
B_age = np.asarray(age_spline)
K_age = B_age.shape[1]

# Male indicator
df['male_ind'] = (df['sex_label'] == 'Male').astype(int)

# Integer IDs for country and region
country_codes = {c:i+1 for i,c in enumerate(sorted(df['country'].unique()))}
region_codes  = {r:i+1 for i,r in enumerate(sorted(df['region'].unique()))}

country_id = df['country'].map(country_codes).astype(int).to_numpy()
J_country = len(country_codes)

# Country -> region mapping
country_to_region = df.groupby('country')['region'].first().to_dict()
region_id_country = np.array([region_codes[country_to_region[c]]
                              for c,_idx in sorted(country_codes.items(), key=lambda x: x[1])], dtype=int)
R_region = len(region_codes)

# Center year (optional)
year_c = (df['year'] - df['year'].mean()).to_numpy()

stan_data = {
    'N': len(df),
    'y': df['cases'].astype(int).to_numpy(),
    'py': df['py'].to_numpy(),
    'J_country': J_country,
    'R_region': R_region,
    'country_id': country_id,
    'region_id_country': region_id_country,
    'K_age': K_age,
    'B_age': B_age,
    'male': df['male_ind'].astype(int).to_numpy(),
    'year_c': year_c
}

len(df), K_age, J_country, R_region

(92326, 5, 48, 12)

## Write Stan model file

In [5]:
# Write Stan model to disk
stan_src = r"""
data {
  int<lower=1> N;
  array[N] int<lower=0> y;                 // was: int<lower=0> y[N];
  vector<lower=0>[N] py;
  int<lower=1> J_country;
  int<lower=1> R_region;
  array[N] int<lower=1> country_id;        // was: int<lower=1> country_id[N];
  array[J_country] int<lower=1> region_id_country; // was: int<lower=1> region_id_country[J_country];
  int<lower=1> K_age;
  matrix[N, K_age] B_age;
  array[N] int<lower=0, upper=1> male;     // was: int<lower=0,upper=1> male[N];
  vector[N] year_c;
}
parameters {
  real alpha;
  vector[R_region] z_region;
  real<lower=0> sigma_region;
  vector[J_country] z_country;
  real<lower=0> sigma_country;
  vector[K_age] beta_age;
  real beta_male;
  real beta_year;
  real<lower=0> phi;
}
transformed parameters {
  vector[R_region] region_eff = z_region * sigma_region;
  vector[J_country] country_eff;
  for (j in 1:J_country)
    country_eff[j] = region_eff[ region_id_country[j] ] + z_country[j] * sigma_country;
}
model {
  // Priors
  alpha        ~ normal(0, 2);
  z_region     ~ normal(0, 1);
  sigma_region ~ exponential(1);
  z_country    ~ normal(0, 1);
  sigma_country~ exponential(1);
  beta_age     ~ normal(0, 1);
  beta_male    ~ normal(0, 1);
  beta_year    ~ normal(0, 0.5);
  phi          ~ exponential(1);

  // Likelihood
  for (n in 1:N) {
    real eta = alpha
               + country_eff[country_id[n]]
               + B_age[n] * beta_age
               + beta_male * male[n]
               + beta_year * year_c[n]
               + log(py[n]);
    y[n] ~ neg_binomial_2_log(eta, phi);
  }
}
generated quantities {
  vector[N] log_lambda;
  int y_rep[N];
  for (n in 1:N) {
    real eta = alpha
               + country_eff[country_id[n]]
               + B_age[n] * beta_age
               + beta_male * male[n]
               + beta_year * year_c[n]
               + log(py[n]);
    log_lambda[n] = eta;
    y_rep[n] = neg_binomial_2_log_rng(eta, phi);
  }
}
"""
os.makedirs("models", exist_ok=True)
stan_path = "models/hierarchical_colon_nb.stan"
with open(stan_path, "w", encoding="utf-8") as f:
    f.write(stan_src)
print("Wrote:", stan_path)

Wrote: models/hierarchical_colon_nb.stan


## Compile and sample

In [6]:
# Compile and sample
model = CmdStanModel(stan_file="models/hierarchical_colon_nb.stan")

fit = model.sample(
    data=stan_data,
    chains=4, parallel_chains=4,
    iter_warmup=1000, iter_sampling=1000,
    adapt_delta=0.95, max_treedepth=12
)

print(fit)
summ = fit.summary()
summ.loc[['alpha','sigma_region','sigma_country','beta_male','beta_year','phi']]

17:42:41 - cmdstanpy - INFO - compiling stan file /Users/ogeohia/PYTHON/eo-colon-cancer-trends-ci5plus/models/hierarchical_colon_nb.stan to exe file /Users/ogeohia/PYTHON/eo-colon-cancer-trends-ci5plus/models/hierarchical_colon_nb


ValueError: Failed to compile Stan model '/Users/ogeohia/PYTHON/eo-colon-cancer-trends-ci5plus/models/hierarchical_colon_nb.stan'. Console:

--- Translating Stan model to C++ code ---
bin/stanc --filename-in-msg=hierarchical_colon_nb.stan --o=/Users/ogeohia/PYTHON/eo-colon-cancer-trends-ci5plus/models/hierarchical_colon_nb.hpp /Users/ogeohia/PYTHON/eo-colon-cancer-trends-ci5plus/models/hierarchical_colon_nb.stan
Syntax error in 'hierarchical_colon_nb.stan', line 57, column 12 to column 13, parsing error:
   -------------------------------------------------
    55:  generated quantities {
    56:    vector[N] log_lambda;
    57:    int y_rep[N];
                     ^
    58:    for (n in 1:N) {
    59:      real eta = alpha
   -------------------------------------------------

";" expected after variable declaration.
It looks like you are trying to use the old array syntax.
Please use the new syntax:
array[N] int y_rep;
make: *** [make/program:66: /Users/ogeohia/PYTHON/eo-colon-cancer-trends-ci5plus/models/hierarchical_colon_nb.hpp] Error 1

Command ['make', 'STANCFLAGS+=--filename-in-msg=hierarchical_colon_nb.stan', '/Users/ogeohia/PYTHON/eo-colon-cancer-trends-ci5plus/models/hierarchical_colon_nb']
	error during processing No such file or directory
