# Libraries

In [1]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import statsmodels.api   as sm
import seaborn           as sns
import pystan

# Configuring the regression parameters

In [2]:
# Load the catalogue that feeds the binomial (logit) regression.
catalogue_path = '../../Catalogue/binom_reg_dataset.csv'
my_data = pd.read_csv(catalogue_path)

In [3]:
# Restrict the sample to objects with redshift z <= 0.4.
# `index` is the tuple of row positions that satisfy the cut.
redshifts = my_data['Z']
index     = np.nonzero(redshifts.values <= 0.4)

# Predictors, taken only at the selected rows
redshift = redshifts.values[index]
mass     = my_data['STELLAR_MASS'].values[index]

# Response and grouping variable, taken only at the selected rows
logit_class = my_data['LOGIT_CLASS(1-UVUP;0-UVWEAK)'].values[index]    # 1 = UV upturn, 0 = UV weak
whan_class  = my_data['WHAN(0-NA;1-RP;2-wA;3-sA;4-SF)'].values[index]  # galaxy class used for pooling

In [6]:
# Observed predictors and response
x1    = redshift
x2    = mass
y     = logit_class
n_obs = x1.size

# Prediction grid -- important for plotting!
# n_obs2 evenly spaced values per predictor, spanning the observed range.
n_obs2 = 50
x1_sim = np.linspace(x1.min(), x1.max(), n_obs2)
x2_sim = np.linspace(x2.min(), x2.max(), n_obs2)

plot_x1, plot_x2 = np.meshgrid(x1_sim, x2_sim)  # THIS IS WHERE THE GRID IS DONE

# Flatten both grids into column vectors. Using -1 lets numpy infer the
# length (n_obs2**2), so changing n_obs2 no longer requires editing a
# hard-coded size here.
plot_x1 = plot_x1.reshape(-1, 1)
plot_x2 = plot_x2.reshape(-1, 1)

# Design matrices: intercept, x1, x1**2, x2 (add_constant supplies the intercept column)
design_obs  = sm.add_constant(np.column_stack((x1, x1**2, x2)))
design_grid = sm.add_constant(np.column_stack((plot_x1, plot_x1**2, plot_x2)))

# dataset to be used in the regression
regression_data         = {}                            # Dictionary, as stated in the pystan manual
regression_data['K']    = design_obs.shape[1]           # Number of betas -- b0, b1, b2, b3 (taken from the design matrix)
regression_data['C']    = np.unique(whan_class).size    # Number of different classes - for partial pooling
regression_data['X']    = design_obs
regression_data['N']    = n_obs
regression_data['Y']    = y
regression_data['WHAN'] = whan_class

# dataset to be used in the plot -- after meshgrid
regression_data['X2']   = design_grid
regression_data['N2']   = plot_x1.shape[0]              # number of grid points, i.e. n_obs2**2

In [7]:
# Sanity check: the grid design matrix should be (n_obs2**2, K).
# Parenthesised form works on both Python 2 and Python 3.
print(regression_data['X2'].shape)

(2500, 4)


In [None]:
# Fit: STAN code ----------------------------------------------------------------------------------------------------------
# Logistic regression with one coefficient vector per WHAN class; all coefficients share the
# hyperparameters mu and sigma. WHAN labels are expected to be coded 0..C-1 and are shifted by
# one inside the model to select a column of beta. Predictions on the grid (X2) are produced
# for every class, so product2 and pnew have one column per class.
stan_code = """
// DECLARATION OF VARIABLES -----------------------------------------------------------------------------------------------
data{
    int<lower=0> N;                      // number of observations
    int<lower=0> N2;                     // number of grid points used for prediction
    int<lower=0> K;                      // number of regression coefficients per class
    int<lower=1> C;                      // number of classes
    int<lower=0, upper=1> Y[N];          // binary response
    int<lower=0, upper=C-1> WHAN[N];     // class label of each observation, coded 0..C-1
    matrix[N,K] X;                       // redshift and stellar mass
    matrix[N2,K] X2;                     // redshift and stellar mass -- after grid
    }

// DEFINING THE PRIOR(S) --------------------------------------------------------------------------------------------------
parameters{
    matrix[K,C] beta;      // one column of coefficients per class
    real<lower=0> sigma;   // Shared hyperpriors
    real mu;               // Shared hyperpriors
    }

// HYPERPRIORS, PRIORS, LIKELIHOOD -- REGRESSION --------------------------------------------------------------------------
model{
    vector[N] product;

    sigma ~ gamma(0.001, 0.001);   // Shared hyperpriors
    mu ~ normal(0, 100);           // Shared hyperpriors

    to_vector(beta) ~ normal(mu, sigma);   // Priors given the hyperpriors, applied to every beta[i,j]

    // Linear predictor: each observation uses the coefficient column of its own class.
    // Labels start at 0 while Stan indexing starts at 1, hence the + 1.
    for (i in 1:N) {
        product[i] = dot_product(col(beta, WHAN[i] + 1), X[i]);
        }

    Y ~ bernoulli_logit(product);                       // Regression
    }


// OUTPUTS - SPECIALLY IMPORTANT FOR PLOTS --------------------------------------------------------------------------------
generated quantities{
    matrix[N2,C] product2;                    // linear predictor on the grid, one column per class
    matrix<lower=0, upper=1>[N2,C] pnew;      // predicted probability on the grid, one column per class

    // Grid points carry no class label, so every class is evaluated on the full grid.
    product2 = X2 * beta;

    for (i in 1:N2) {
        for (j in 1:C) {
            pnew[i,j] = inv_logit(product2[i,j]);
            }
        }
    }
"""

### Settings for running STAN

In [None]:
# Sampler configuration
chains     = 3      # number of Markov chains
iterations = 5000   # iterations requested for each chain
warmup     = 2000   # leading iterations discarded as burn-in
jobs       = 1      # number of parallel jobs

In [None]:
# Compile the Stan model and draw posterior samples with the settings chosen above.
# `iter` must receive `iterations`; the misspelled `iteractions` is not defined anywhere.
fit = pystan.stan(
    model_code=stan_code,
    data=regression_data,
    iter=iterations,
    chains=chains,
    warmup=warmup,
    n_jobs=jobs,
)