In [None]:
"""
// OUTPUTS - SPECIALLY IMPORTANT FOR PLOTS --------------------------------------------------------------------------------
generated quantities{
    vector[N2] prob2;
    real<lower=0, upper=1.0> pnew[N2];
    
    for (i in 1:N2) {
        if (W[i] == 0) 
            prob2[i] = dot_probability(col(beta,1), X2[i]);
        else if (W[i] == 1)
            prob2[i] = dot_probability(col(beta,2), X2[i]);
        else if (W[i] == 2)
            prob2[i] = dot_probability(col(beta,3), X2[i]);
        else if (W[i] == 3)
            prob2[i] = dot_probability(col(beta,4), X2[i]);
        else
            prob2[i] = dot_probability(col(beta,5), X2[i]);
    }
    
    for (j in 1:N2){
        pnew[j] = inv_logit(prob2[j]);
        }
    }
"""

# Libraries

In [1]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import statsmodels.api   as sm
import seaborn           as sns
import pystan

# Configuring the regression parameters

In [2]:
# Load the galaxy catalogue used for the binomial regression.
my_data = pd.read_csv(filepath_or_buffer='../../Catalogue/binom_reg_dataset.csv')

In [3]:
# Show the column labels of the loaded catalogue.
my_data.columns

Index([u'CATAID', u'BPT_CLASS', u'LOGIT_CLASS(1-UVUP;0-UVWEAK)',
       u'STELLAR_MASS', u'UV_CLASS', u'WHAN(0-NA;1-RP;2-wA;3-sA;4-SF)',
       u'WHAN_CLASS', u'Z'],
      dtype='object')

In [4]:
# Keep only the rows whose redshift Z is at most 0.4.
redshifts = my_data['Z']
index     = np.where(redshifts.values <= 0.4)


def _selected(column):
    """Return the values of `column` restricted to the rows kept in `index`."""
    return my_data[column].values[index]


# Quantities entering the regression, all restricted to the selected rows.
logit_class = _selected('LOGIT_CLASS(1-UVUP;0-UVWEAK)')    # response: UV upturn (1) or UV weak (0)
whan_class  = _selected('WHAN(0-NA;1-RP;2-wA;3-sA;4-SF)')  # galaxy type, coded 0..4
mass        = _selected('STELLAR_MASS')                    # 1st parameter
redshift    = _selected('Z')                               # 2nd parameter

In [5]:
# original parameters
x1      = redshift
x2      = mass
y       = logit_class
classes = whan_class
n_obs   = x1.size
n_class = np.unique(whan_class).size

# new parameters - important for plotting!
# n_obs2 points per axis, spanning the observed range of each predictor.
n_obs2 = 50
x1_sim = np.linspace(x1.min(), x1.max(), n_obs2)
x2_sim = np.linspace(x2.min(), x2.max(), n_obs2)

plot_x1, plot_x2 = np.meshgrid(x1_sim, x2_sim)  # THIS IS WHERE THE GRID IS DONE

# Flatten each grid into a column vector with n_obs2**2 rows. Using -1 lets
# numpy infer the row count, so changing n_obs2 no longer breaks the reshape.
plot_x1 = plot_x1.reshape(-1, 1)
plot_x2 = plot_x2.reshape(-1, 1)

In [6]:
# dataset to be used in the regression
# Design matrix columns: constant, x1, x1**2, x2, x2**2.
design_matrix = sm.add_constant(np.column_stack((x1, x1**2, x2, x2**2)))

regression_data      = {}                                                 # Dictionary, as stated in the pystan manual
regression_data['Y'] = y
regression_data['X'] = design_matrix
regression_data['K'] = design_matrix.shape[1]                             # Number of betas -- b0, b1, b2, b3, b4
regression_data['W'] = whan_class
regression_data['N'] = n_obs
regression_data['C'] = n_class                                            # Number of different classes (partial pooling)

# dataset to be used in the plot -- after meshgrid
# The grid matrix must have the same columns as X, so the last term is
# plot_x2**2 (it was previously a second copy of plot_x2).
regression_data['X2'] = sm.add_constant(np.column_stack((plot_x1, plot_x1**2, plot_x2, plot_x2**2)))
regression_data['N2'] = n_obs2**2

In [7]:
# Sanity check: number of class labels passed to Stan.
# The parenthesised form prints the same on Python 2 and Python 3.
print(regression_data['W'].size)

504


In [None]:
# Fit: STAN code ----------------------------------------------------------------------------------------------------------
# Hierarchical logistic regression: every class in W has its own column of K
# coefficients in `beta`, and all coefficients share the normal(mu, sigma) prior.
stan_code = """
// DECLARATION OF VARIABLES -----------------------------------------------------------------------------------------------
data{
    int<lower=0> N;
    int<lower=0> N2;
    int<lower=0> K;
    int<lower=0> C;
    int<lower=0, upper=1> Y[N];
    int<lower=0, upper=4> W[N];
    matrix[N, K] X;         // redshift and stellar mass 
    matrix[N2,K] X2;        // redshift and stellar mass -- after grid
    }

// DEFINING THE PRIOR(S) --------------------------------------------------------------------------------------------------
parameters{
    matrix[K,C] beta;       // K x C coefficients: one column of K betas per class
    real<lower=0> sigma;    // Shared hyperpriors
    real mu;                // Shared hyperpriors
    }
    
// HYPERPRIORS, PRIORS, LIKELIHOOD -- REGRESSION --------------------------------------------------------------------------
model{
    vector[N] prob;
    
    // Linear predictor: observation i uses the coefficient column of its own class.
    // W is coded 0..4 while Stan columns are 1-based, hence the +1.
    // col() takes the whole matrix, not a single element of it.
    for (i in 1:N) {
        prob[i] = dot_product(col(beta, W[i] + 1), X[i]);
        } 
    
    sigma ~ gamma(0.001, 0.001);   // Shared hyperpriors
    mu ~ normal(0, 100);           // Shared hyperpriors
    
    for (i in 1:K) {
        for (j in 1:C){
            beta[i,j] ~ normal(mu, sigma);  // Priors given the hyperpriors
            }
        }
       

    Y ~ bernoulli_logit(prob);                       // Regression  
    }
"""

### Settings for running STAN

In [None]:
# Sampler configuration, passed to pystan.stan in the fit cell.
chains     = 3      # number of Markov chains
iterations = 5000   # passed as `iter`
warmup     = 2000   # how many of the first iterations are discarded (burn-in)
jobs       = 1      # passed as `n_jobs`

In [None]:
# Compile the Stan model and draw the posterior samples.
# A fixed seed makes the chains reproducible across kernel restarts.
seed = 42
fit = pystan.stan(model_code=stan_code, data=regression_data,
                  iter=iterations, chains=chains, warmup=warmup,
                  n_jobs=jobs, seed=seed)