# Personal functions to be called throughout the code

In [1]:
def standardize(parameter):
    """
    :param parameter: the array with the parameters you wish to standardize
    :return param_mean: mean of the input array
    :return param_std : standard deviation of the input array
    :return stdized_param: final standardized array
    """
    import numpy as np
    
    param_mean    = np.mean(parameter)
    param_std     = np.std(parameter)
    stdized_param = []                  # standardized parameter -- output
    for i in range(parameter.size):
        param_temp = (parameter[i] - param_mean)/param_std
        stdized_param.append(param_temp)
    stdized_param = np.array(stdized_param)
    return (param_mean, param_std, stdized_param)

# Libraries

In [2]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import statsmodels.api   as sm
import seaborn           as sns
import pystan
import time

# Configuring the regression parameters

In [3]:
my_data = pd.read_csv('../../Catalogue/binom_reg_dataset.csv')

In [4]:
my_data.columns

Index([u'CATAID', u'BPT_CLASS', u'LOGIT_CLASS(1-UVUP;0-UVWEAK)',
       u'STELLAR_MASS', u'UV_CLASS', u'WHAN(0-NA;1-RP;2-wA;3-sA;4-SF)',
       u'WHAN_CLASS', u'Z'],
      dtype='object')

In [5]:
# filtering the dataset
redshifts = my_data['Z']
index     = np.where(redshifts.values<=0.4)

# parameters of interest
logit_class = my_data['LOGIT_CLASS(1-UVUP;0-UVWEAK)'].values[index]    # y axis: logit class -- uv upturn yes or no
whan_class  = my_data['WHAN(0-NA;1-RP;2-wA;3-sA;4-SF)'].values[index]  # My types of galaxies
mass        = my_data['STELLAR_MASS'].values[index]                    # 1st parameter
redshift    = redshifts.values[index]                                  # 2nd parameter

In [6]:
# original parameters
x1      = redshift
x2      = standardize(mass)[2]
y       = logit_class
classes = (whan_class+1).astype(int)
n_obs   = x1.size
n_class = np.unique(classes).size

# new parameters - important for plotting!
n_obs2 = 30
x1_sim = np.linspace(x1.min(), x1.max(), n_obs2)
x2_sim = np.linspace(x2.min(), x2.max(), n_obs2)

plot_x1, plot_x2 = np.meshgrid(x1_sim, x2_sim)  # THIS IS WHERE THE GRID IS DONE
   
plot_x1 = plot_x1.reshape((n_obs2**2), 1)
plot_x2 = plot_x2.reshape((n_obs2**2), 1)

In [7]:
print x2.min(), x2.max() # sanity check
print np.median(x2)
print plot_x1.shape

-3.0048533107143856 2.6904846038511474
0.018866401595684694
(900, 1)


In [8]:
# dataset to be used in the regression
regression_data      = {}                                                 # Dictionary, as stated in the pystan manual
regression_data['Y'] = y
regression_data['X'] = sm.add_constant(np.column_stack((x1, x1**2, x2, x2**2)))
regression_data['K'] = regression_data['X'][0,:].size                     # Number of betas -- b0, b1, b2, b3, b4
regression_data['W'] = classes
regression_data['N'] = n_obs
regression_data['C'] = n_class                                      # Number of different classes (partial pooling)

# dataset to be used in the plot -- after meshgrid
regression_data['X2'] = sm.add_constant(np.column_stack((plot_x1, plot_x1**2, plot_x2, plot_x2)))
regression_data['N2'] = n_obs2**2

In [9]:
print regression_data['X2'].shape

(900, 5)


In [10]:
# Fit: STAN code ----------------------------------------------------------------------------------------------------------
stan_code = """
// DECLARATION OF VARIABLES -----------------------------------------------------------------------------------------------
data{
    int<lower=1> N;
    int<lower=1> N2;
    int<lower=1> K;
    int<lower=1> C;
    int W[N];
    int<lower=0, upper=1> Y[N];
    matrix[N, K] X;         // redshift and stellar mass 
    matrix[N2,K] X2;        // redshift and stellar mass -- after grid
    }

// DEFINING THE PRIOR(S) --------------------------------------------------------------------------------------------------
parameters{
    matrix[K,C] beta;       // 25 betas!
    real<lower=0> sigma;    // Shared hyperpriors
    real mu;                // Shared hyperpriors
    }

// MODEL: PROBABILITY, HYPERPRIORS, PRIORS, AND REGRESSION ----------------------------------------------------------------
model {
   vector[N] prob;
    for (i in 1:N) {
      prob[i] = beta[1,W[i]]*X[i,1] + beta[2,W[i]]*X[i,2] + beta[3,W[i]]*X[i,3] + beta[4,W[i]]*X[i,4] + 
      beta[5,W[i]]*X[i,5];
      }

    sigma ~ gamma(0.001, 0.001);                           // shared hyperpriors
    mu ~ normal(0, 100);                                   // shared hyperpriors
     
    for (i in 1:K) {
       for (j in 1:C) beta[i,j] ~ normal(mu, sigma);       // priors
        }

    Y ~ bernoulli_logit(prob);                             // regression
    }

// DATA TO BE PLOTTED -----------------------------------------------------------------------------------------------------
generated quantities{
    vector[N2] prob01;
    vector[N2] eta01;
    vector[N2] prob02;
    vector[N2] eta02;
    vector[N2] prob03;
    vector[N2] eta03;
    vector[N2] prob04;
    vector[N2] eta04;
    vector[N2] prob05;
    vector[N2] eta05;
    
    for(j in 1:N2){
        eta01[j] = beta[1,1]*X2[j,1] + beta[2,1]*X2[j,2] + beta[3,1]*X2[j,3] + beta[4,1]*X2[j,4] + beta[5,1]*X2[j,5];
        eta02[j] = beta[1,2]*X2[j,1] + beta[2,2]*X2[j,2] + beta[3,2]*X2[j,3] + beta[4,2]*X2[j,4] + beta[5,2]*X2[j,5];
        eta03[j] = beta[1,3]*X2[j,1] + beta[2,3]*X2[j,2] + beta[3,3]*X2[j,3] + beta[4,3]*X2[j,4] + beta[5,3]*X2[j,5];
        eta04[j] = beta[1,4]*X2[j,1] + beta[2,4]*X2[j,2] + beta[3,4]*X2[j,3] + beta[4,4]*X2[j,4] + beta[5,4]*X2[j,5];
        eta05[j] = beta[1,5]*X2[j,1] + beta[2,5]*X2[j,2] + beta[3,5]*X2[j,3] + beta[4,5]*X2[j,4] + beta[5,5]*X2[j,5];
        prob01[j] = inv_logit(eta01[j]);
        prob02[j] = inv_logit(eta02[j]);
        prob03[j] = inv_logit(eta03[j]);
        prob04[j] = inv_logit(eta04[j]);
        prob05[j] = inv_logit(eta05[j]);
        }

    }

"""

### Settings for running STAN

In [11]:
iterations    = 8000
chains        = 2
warmup        = 2000    # How many of the first iterations we'll ignore - burnin
jobs          = -1
seed          = 1

In [12]:
# control = {}
# control['max_treedepth'] = 20
# control['adapt_delta'] = 0.99

### The fit:

In [13]:
start = time.time()

fit = pystan.stan(model_code=stan_code, data=regression_data, seed=seed, iter=iterations, chains=chains, 
                  warmup=warmup, n_jobs=jobs)

end = time.time()

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_902d543734551fafeca59854d8a03fd0 NOW.
  tree = Parsing.p_module(s, pxd, full_module_name)


In [14]:
print ("--------------------------------------------------")
print ("CPU process time: %.2f [min]" % float((end-start)/60))
print ("--------------------------------------------------")

--------------------------------------------------
CPU process time: 9.85 [min]
--------------------------------------------------


In [37]:
print pystan.stansummary(fit=fit, digits_summary=3)

Inference for Stan model: anon_model_902d543734551fafeca59854d8a03fd0.
2 chains, each with iter=8000; warmup=2000; thin=1; 
post-warmup draws per chain=6000, total post-warmup draws=12000.

              mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
beta[1,1]   -0.319   0.004  0.215 -0.796 -0.454 -0.298 -0.165  0.037   3650    1.0
beta[2,1]   -0.029   0.002  0.289 -0.597 -0.196 -0.039   0.13  0.592  13900    1.0
beta[3,1]   -0.029   0.002  0.289  -0.61 -0.194 -0.038  0.133  0.559  13920    1.0
beta[4,1]    0.342   0.005   0.23 -0.052  0.176  0.325   0.49   0.84   2178    1.0
beta[5,1]   -0.197   0.001  0.141 -0.502 -0.284 -0.189   -0.1  0.052   9628    1.0
beta[1,2]   -0.214   0.002  0.149 -0.525  -0.31 -0.204  -0.11  0.053   5439    1.0
beta[2,2]   -0.055   0.003  0.289 -0.637 -0.214 -0.057  0.104  0.541  12090    1.0
beta[3,2]   -0.046   0.002  0.289 -0.644 -0.205 -0.048  0.117  0.546  14731    1.0
beta[4,2]    0.231   0.003  0.153 -0.057  0.124  0.229  0.334  

In [99]:
summary = pystan.stansummary(fit=fit, digits_summary=2)

In [100]:
summary_arr = np.array(summary.split('\n'))

In [101]:
new_output = summary_arr[5:-6,]

In [102]:
count = 0
for i in range(new_output.size):
    row = np.array(new_output[i].split())
    if row.size!=11:
        print new_output[i]
        count+=1
print count

beta[1,4]    -0.15  2.2e-3   0.24  -0.67  -0.29  -0.14-4.2e-3   0.31  11486    1.0
beta[4,5]      0.1  1.9e-3   0.15  -0.18-8.9e-3   0.09   0.19   0.41 6372.0    1.0
eta01[861]    0.03  7.9e-3   0.56  -1.01  -0.35-6.0e-4   0.41   1.17 5128.0    1.0
eta01[862]    0.03  7.9e-3   0.56  -1.01  -0.36-1.1e-4   0.41   1.17 5115.0    1.0
eta01[863]    0.03  7.9e-3   0.56  -1.01  -0.36-6.7e-4   0.41   1.17 5103.0    1.0
eta01[864]    0.03  7.9e-3   0.56  -1.01  -0.36-9.5e-4   0.41   1.18 5090.0    1.0
eta01[865]    0.03  7.9e-3   0.56  -1.01  -0.36-8.8e-4   0.41   1.18 5078.0    1.0
eta01[866]    0.03  7.9e-3   0.56  -1.01  -0.36-1.6e-3   0.41   1.18 5067.0    1.0
eta01[867]    0.03  7.9e-3   0.56  -1.01  -0.36-1.8e-3   0.41   1.18 5055.0    1.0
eta01[868]    0.03  8.0e-3   0.57  -1.01  -0.36-2.8e-3   0.41   1.19 5044.0    1.0
eta01[869]    0.03  8.0e-3   0.57  -1.01  -0.36-2.9e-3   0.41   1.19 5033.0    1.0
eta01[870]    0.03  8.0e-3   0.57  -1.02  -0.36-3.4e-3   0.41   1.19 5022.0    1.0
eta0

In [73]:
print summary_arr[20].split()

[u'beta[1,4]', u'-0.152', u'0.002', u'0.241', u'-0.673', u'-0.29', u'-0.138', u'-0.004', u'0.311', u'11486', u'1.0']


In [84]:
sampler = fit.get_sampler_params()

### Extracting fit properties

In [16]:
output = str(fit).split('\n')

In [17]:
pystan.__version__

'2.18.0.0'

In [30]:
new_output = np.array(output[5:-6])

In [33]:
posteriors = list(fit.extract(u'beta').items()[0])

In [24]:
betas = posteriors[1]

In [25]:
print betas.shape

(12000, 5, 5)


In [36]:
diagnostics = []
count = 0
for i in range(new_output.size):
    if len(new_output[i].split())<11:
        print i, len(new_output[i].split()),'\n'
        print new_output[i], '\n', new_output[i].split(), len(new_output[i].split())
        diagnostics.append(len(new_output[i].split()))
        count+=1
    else:
        continue
print np.unique(diagnostics)
print count

15 10 

beta[1,4]    -0.15  2.2e-3   0.24  -0.67  -0.29  -0.14-4.2e-3   0.31  11486    1.0 
['beta[1,4]', '-0.15', '2.2e-3', '0.24', '-0.67', '-0.29', '-0.14-4.2e-3', '0.31', '11486', '1.0'] 10
23 10 

beta[4,5]      0.1  1.9e-3   0.15  -0.18-8.9e-3   0.09   0.19   0.41 6372.0    1.0 
['beta[4,5]', '0.1', '1.9e-3', '0.15', '-0.18-8.9e-3', '0.09', '0.19', '0.41', '6372.0', '1.0'] 10
1787 10 

eta01[861]    0.03  7.9e-3   0.56  -1.01  -0.35-6.0e-4   0.41   1.17 5128.0    1.0 
['eta01[861]', '0.03', '7.9e-3', '0.56', '-1.01', '-0.35-6.0e-4', '0.41', '1.17', '5128.0', '1.0'] 10
1788 10 

eta01[862]    0.03  7.9e-3   0.56  -1.01  -0.36-1.1e-4   0.41   1.17 5115.0    1.0 
['eta01[862]', '0.03', '7.9e-3', '0.56', '-1.01', '-0.36-1.1e-4', '0.41', '1.17', '5115.0', '1.0'] 10
1789 10 

eta01[863]    0.03  7.9e-3   0.56  -1.01  -0.36-6.7e-4   0.41   1.17 5103.0    1.0 
['eta01[863]', '0.03', '7.9e-3', '0.56', '-1.01', '-0.36-6.7e-4', '0.41', '1.17', '5103.0', '1.0'] 10
1790 10 

eta01[864]    0.0