# Personal functions to be called throughout the code

In [1]:
def standardize(parameter):
    """
    :param parameter: the array with the parameters you wish to standardize
    :return param_mean: mean of the input array
    :return param_std : standard deviation of the input array
    :return stdized_param: final standardized array
    """
    import numpy as np
    
    param_mean    = np.mean(parameter)
    param_std     = np.std(parameter)
    stdized_param = []                  # standardized parameter -- output
    for i in range(parameter.size):
        param_temp = (parameter[i] - param_mean)/param_std
        stdized_param.append(param_temp)
    stdized_param = np.array(stdized_param)
    return (param_mean, param_std, stdized_param)

In [2]:
def un_standardize(stdized_par, mean, std):
    """
    This function undoes what the former does!
    """
    
    import numpy as np
    
    recovered_par = []
    for i in range(stdized_par.size):
        param_temp = stdized_par[i]*std + mean
        recovered_par.append(param_temp)
    recovered_par = np.array(recovered_par)
    return (recovered_par)

# Libraries

In [3]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import statsmodels.api   as sm
import seaborn           as sns
import pystan
import time

# Configuring the regression parameters

In [4]:
my_data = pd.read_csv('../../Catalogue/binom_reg_dataset.csv')

In [5]:
# filtering the dataset
redshifts = my_data['Z']
index     = np.where(redshifts.values<=0.4)

# parameters of interest
logit_class = my_data['LOGIT_CLASS(1-UVUP;0-UVWEAK)'].values[index]    # y axis: logit class -- uv upturn yes or no
whan_class  = my_data['WHAN(0-NA;1-RP;2-wA;3-sA;4-SF)'].values[index]  # My types of galaxies
mass        = my_data['STELLAR_MASS'].values[index]                    # 1st parameter
redshift    = redshifts.values[index]                                  # 2nd parameter

In [6]:
# original parameters
mean_mass, std_mass, x2 = standardize(mass)
x1      = redshift
y       = logit_class
classes = (whan_class+1).astype(int)        # Must sum +1 to avoid index issues with STAN
n_obs   = x1.size
n_class = np.unique(classes).size

# new parameters - important for plotting!
n_obs2 = 30
x1_sim = np.linspace(x1.min(), x1.max(), n_obs2)
x2_sim = np.linspace(x2.min(), x2.max(), n_obs2)

# grid
plot_x1, plot_x2 = np.meshgrid(x1_sim, x2_sim)  # THIS IS WHERE THE GRID IS DONE
plot_x1 = plot_x1.reshape((n_obs2**2), 1)
plot_x2 = plot_x2.reshape((n_obs2**2), 1)

In [7]:
print x2.min(), x2.max() # sanity check
print np.median(x2)
print plot_x1.shape

-3.0048533107143856 2.6904846038511474
0.018866401595684694
(900, 1)


In [8]:
# dataset to be used in the regression
regression_data      = {}                                                 # Dictionary, as stated in the pystan manual
regression_data['Y'] = y
regression_data['X'] = sm.add_constant(np.column_stack((x1, x1**2, x2, x2**2)))
regression_data['K'] = regression_data['X'][0,:].size                     # Number of betas -- b0, b1, b2, b3, b4
regression_data['W'] = classes
regression_data['N'] = n_obs
regression_data['C'] = n_class                                      # Number of different classes (partial pooling)

# dataset to be used in the plot -- after meshgrid
regression_data['X2'] = sm.add_constant(np.column_stack((plot_x1, plot_x1**2, plot_x2, plot_x2)))
regression_data['N2'] = n_obs2**2

In [9]:
# Fit: STAN code ----------------------------------------------------------------------------------------------------------
stan_code = """
// DECLARATION OF VARIABLES -----------------------------------------------------------------------------------------------
data{
    int<lower=1> N;
    int<lower=1> N2;
    int<lower=1> K;
    int<lower=1> C;
    int W[N];
    int<lower=0, upper=1> Y[N];
    matrix[N, K] X;         // redshift and stellar mass 
    matrix[N2,K] X2;        // redshift and stellar mass -- after grid
    }

// DEFINING THE PRIOR(S) --------------------------------------------------------------------------------------------------
parameters{
    matrix[K,C] beta;       // 25 betas!
    real<lower=0> sigma;    // Shared hyperpriors
    real mu;                // Shared hyperpriors
    }

// MODEL: PROBABILITY, HYPERPRIORS, PRIORS, AND REGRESSION ----------------------------------------------------------------
model {
   vector[N] prob;
    for (i in 1:N) {
      prob[i] = beta[1,W[i]]*X[i,1] + beta[2,W[i]]*X[i,2] + beta[3,W[i]]*X[i,3] + beta[4,W[i]]*X[i,4] + 
      beta[5,W[i]]*X[i,5];
      }

    sigma ~ gamma(0.001, 0.001);                           // shared hyperpriors
    mu ~ normal(0, 100);                                   // shared hyperpriors
     
    for (i in 1:K) {
       for (j in 1:C) beta[i,j] ~ normal(mu, sigma);       // priors
        }

    Y ~ bernoulli_logit(prob);                             // regression
    }

// DATA TO BE PLOTTED -----------------------------------------------------------------------------------------------------
generated quantities{
    vector[N2] prob01;
    vector[N2] eta01;
    vector[N2] prob02;
    vector[N2] eta02;
    vector[N2] prob03;
    vector[N2] eta03;
    vector[N2] prob04;
    vector[N2] eta04;
    vector[N2] prob05;
    vector[N2] eta05;
    
    for(j in 1:N2){
        eta01[j] = beta[1,1]*X2[j,1] + beta[2,1]*X2[j,2] + beta[3,1]*X2[j,3] + beta[4,1]*X2[j,4] + beta[5,1]*X2[j,5];
        eta02[j] = beta[1,2]*X2[j,1] + beta[2,2]*X2[j,2] + beta[3,2]*X2[j,3] + beta[4,2]*X2[j,4] + beta[5,2]*X2[j,5];
        eta03[j] = beta[1,3]*X2[j,1] + beta[2,3]*X2[j,2] + beta[3,3]*X2[j,3] + beta[4,3]*X2[j,4] + beta[5,3]*X2[j,5];
        eta04[j] = beta[1,4]*X2[j,1] + beta[2,4]*X2[j,2] + beta[3,4]*X2[j,3] + beta[4,4]*X2[j,4] + beta[5,4]*X2[j,5];
        eta05[j] = beta[1,5]*X2[j,1] + beta[2,5]*X2[j,2] + beta[3,5]*X2[j,3] + beta[4,5]*X2[j,4] + beta[5,5]*X2[j,5];
        prob01[j] = inv_logit(eta01[j]);
        prob02[j] = inv_logit(eta02[j]);
        prob03[j] = inv_logit(eta03[j]);
        prob04[j] = inv_logit(eta04[j]);
        prob05[j] = inv_logit(eta05[j]);
        }

    }

"""

### Settings for running STAN

In [10]:
iterations    = 8000
chains        = 2                           # HMC chains
warmup        = 2000                        # How many of the first iterations we'll ignore - burnin
jobs          = -1                          # Run code in parallel -- see pystan documentation
seed          = 1

In [11]:
control = {}
# control['max_treedepth'] = 20
control['adapt_delta'] = 0.9

### The fit:

In [12]:
start = time.time()

fit = pystan.stan(model_code=stan_code, data=regression_data, seed=seed, iter=iterations, chains=chains, 
                  warmup=warmup, n_jobs=jobs, control=control)

end = time.time()

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_902d543734551fafeca59854d8a03fd0 NOW.
  tree = Parsing.p_module(s, pxd, full_module_name)


In [13]:
print ("--------------------------------------------------")
print ("CPU process time: %.2f [min]" % float((end-start)/60))
print ("--------------------------------------------------")

--------------------------------------------------
CPU process time: 9.00 [min]
--------------------------------------------------


### Extracting fit properties

In [14]:
summary = pystan.stansummary(fit=fit, digits_summary=3)

In [15]:
summary_arr = np.array(summary.split('\n'))

In [16]:
new_output = summary_arr[5:-6,]

In [17]:
count = 0
for i in range(new_output.size):
    row = np.array(new_output[i].split())
    if row.size ==11:
        continue
    else:
        print "new_output[%d] = '%s' " % (i, str(new_output[i]))
        count+=1
print count

new_output[1565] = 'eta01[639]  -0.165   0.002   0.26 -0.673 -0.334 -0.1694.58e-4  0.363  15609    1.0' 
new_output[1566] = 'eta01[640]  -0.165   0.002   0.26 -0.674 -0.334 -0.1691.89e-4  0.364  15494    1.0' 
new_output[1568] = 'eta01[642]  -0.166   0.002   0.26 -0.675 -0.335  -0.17-2.55e-4  0.363  15260    1.0' 
new_output[1569] = 'eta01[643]  -0.167   0.002  0.261 -0.676 -0.336 -0.171-6.79e-4  0.362  15142    1.0' 
new_output[1571] = 'eta01[645]  -0.168   0.002  0.261 -0.678 -0.336 -0.172-9.26e-4  0.361  14912    1.0' 
new_output[1572] = 'eta01[646]  -0.168   0.002  0.262 -0.679 -0.337 -0.173-5.53e-4   0.36  14787    1.0' 
new_output[1573] = 'eta01[647]  -0.168   0.002  0.262 -0.679 -0.338 -0.173-6.78e-4  0.361  14661    1.0' 
new_output[1574] = 'eta01[648]  -0.169   0.002  0.263  -0.68 -0.339 -0.174-2.26e-4   0.36  14535    1.0' 
new_output[1575] = 'eta01[649]  -0.169   0.002  0.263 -0.682 -0.339 -0.175-1.95e-4   0.36  14408    1.0' 
new_output[1576] = 'eta01[650]   -0.17   0.002  

In [18]:
new_output[1565] = 'eta01[639]  -0.165   0.002   0.26 -0.673 -0.334 -0.169 4.58e-4  0.363  15609    1.0' 
new_output[1566] = 'eta01[640]  -0.165   0.002   0.26 -0.674 -0.334 -0.169 1.89e-4  0.364  15494    1.0' 
new_output[1568] = 'eta01[642]  -0.166   0.002   0.26 -0.675 -0.335  -0.17 -2.55e-4  0.363  15260    1.0' 
new_output[1569] = 'eta01[643]  -0.167   0.002  0.261 -0.676 -0.336 -0.171 -6.79e-4  0.362  15142    1.0' 
new_output[1571] = 'eta01[645]  -0.168   0.002  0.261 -0.678 -0.336 -0.172 -9.26e-4  0.361  14912    1.0' 
new_output[1572] = 'eta01[646]  -0.168   0.002  0.262 -0.679 -0.337 -0.173 -5.53e-4   0.36  14787    1.0' 
new_output[1573] = 'eta01[647]  -0.168   0.002  0.262 -0.679 -0.338 -0.173 -6.78e-4  0.361  14661    1.0' 
new_output[1574] = 'eta01[648]  -0.169   0.002  0.263  -0.68 -0.339 -0.174 -2.26e-4   0.36  14535    1.0' 
new_output[1575] = 'eta01[649]  -0.169   0.002  0.263 -0.682 -0.339 -0.175 -1.95e-4   0.36  14408    1.0' 
new_output[1576] = 'eta01[650]   -0.17   0.002  0.264 -0.681  -0.34 -0.176 -1.77e-4  0.361  14282    1.0' 
new_output[1577] = 'eta01[651]   -0.17   0.002  0.264 -0.684  -0.34 -0.176 1.49e-5  0.362  14155    1.0' 
new_output[1578] = 'eta01[652]  -0.171   0.002  0.265 -0.686 -0.341 -0.177 1.61e-5  0.363  14029    1.0' 
new_output[1579] = 'eta01[653]  -0.171   0.002  0.266 -0.688 -0.343 -0.177 -2.45e-4  0.364  13903    1.0' 
new_output[1580] = 'eta01[654]  -0.172   0.002  0.266  -0.69 -0.344 -0.177 -5.83e-5  0.365  13777    1.0' 
new_output[1581] = 'eta01[655]  -0.172   0.002  0.267 -0.694 -0.345 -0.178 1.18e-4  0.364  13652    1.0' 
new_output[1582] = 'eta01[656]  -0.173   0.002  0.268 -0.695 -0.345 -0.178 -1.13e-4  0.367  13528    1.0' 
new_output[1583] = 'eta01[657]  -0.174   0.002  0.269 -0.697 -0.346 -0.179 -2.98e-4  0.368  13405    1.0' 
new_output[1584] = 'eta01[658]  -0.174   0.002   0.27 -0.699 -0.347  -0.18 -3.59e-4  0.367  13282    1.0' 
new_output[1585] = 'eta01[659]  -0.175   0.002  0.271   -0.7 -0.349  -0.18 -1.12e-4  0.369  13161    1.0' 
new_output[1586] = 'eta01[660]  -0.175   0.002  0.272 -0.703  -0.35  -0.18 1.99e-4  0.371  13041    1.0' 
new_output[1785] = 'eta01[859]   0.023   0.008  0.553 -1.008 -0.353 3.45e-4  0.379  1.176   5281    1.0' 
new_output[1786] = 'eta01[860]   0.022   0.008  0.553 -1.007 -0.353 -5.11e-4   0.38  1.178   5268    1.0' 
new_output[1787] = 'eta01[861]   0.022   0.008  0.553 -1.007 -0.353 -8.52e-4   0.38  1.179   5255    1.0' 
new_output[4545] = 'eta03[19]   -0.032    0.01  1.021 -2.156 -0.646 -2.78e-4  0.609  1.929   9733    1.0' 
new_output[4566] = 'eta03[40]   -0.031    0.01   0.96 -2.022 -0.607 -6.91e-4  0.572  1.802   9650    1.0' 
new_output[6644] = 'eta04[318]  -0.008   0.004  0.445 -0.924 -0.266 2.65e-4  0.255  0.903  11827    1.0' 
new_output[6781] = 'eta04[455]  -0.152   0.002  0.242 -0.669 -0.293 -0.138 8.21e-4  0.301  10610    1.0' 
new_output[6782] = 'eta04[456]  -0.153   0.002  0.242 -0.671 -0.293 -0.138 2.46e-4    0.3  10598    1.0' 
new_output[6783] = 'eta04[457]  -0.153   0.002  0.242 -0.673 -0.295  -0.14 2.69e-5  0.299  10587    1.0' 
new_output[6784] = 'eta04[458]  -0.154   0.002  0.243 -0.674 -0.296  -0.14 -1.76e-4  0.298  10577    1.0' 
new_output[6785] = 'eta04[459]  -0.155   0.002  0.243 -0.676 -0.298 -0.141 -7.26e-4  0.297  10570    1.0' 
new_output[6786] = 'eta04[460]  -0.156   0.002  0.244 -0.678 -0.299 -0.141 -8.41e-4  0.298  10584    1.0' 
new_output[7141] = 'eta04[815]  -0.522   0.009  0.832 -2.266  -1.02 -0.485 2.71e-4  1.106   9185    1.0' 
new_output[7142] = 'eta04[816]  -0.522   0.009  0.833 -2.265 -1.021 -0.486 2.31e-4  1.107   9176    1.0' 
new_output[7143] = 'eta04[817]  -0.523   0.009  0.833 -2.265 -1.022 -0.486 -6.73e-4  1.105   9166    1.0' 
new_output[7144] = 'eta04[818]  -0.524   0.009  0.833 -2.265 -1.024 -0.486 -4.87e-4  1.105   9156    1.0' 
new_output[7145] = 'eta04[819]  -0.525   0.009  0.833 -2.266 -1.025 -0.487 -2.87e-4  1.103   9146    1.0' 
new_output[7146] = 'eta04[820]  -0.525   0.009  0.834 -2.271 -1.026 -0.488 -8.54e-4  1.104   9135    1.0' 
new_output[7147] = 'eta04[821]  -0.526   0.009  0.834 -2.271 -1.026 -0.489 -8.83e-4  1.103   9125    1.0' 
new_output[7192] = 'eta04[866]  -0.569    0.01  0.906 -2.458 -1.118 -0.532 9.08e-4  1.208   9000    1.0' 
new_output[7193] = 'eta04[867]   -0.57    0.01  0.907 -2.459 -1.118 -0.534 1.32e-4  1.209   8989    1.0' 
new_output[7194] = 'eta04[868]  -0.571    0.01  0.908 -2.464 -1.121 -0.535 -8.07e-4   1.21   8978    1.0' 
new_output[7195] = 'eta04[869]  -0.572    0.01  0.908 -2.468 -1.121 -0.535 -8.01e-4  1.212   8967    1.0' 
new_output[7196] = 'eta04[870]  -0.573    0.01  0.909 -2.471 -1.122 -0.536 -5.54e-4  1.214   8955    1.0' 
new_output[8640] = 'eta05[514]    -0.1   0.001  0.151 -0.408 -0.198 -0.096 7.03e-4  0.191  13318    1.0' 
new_output[8641] = 'eta05[515]  -0.101   0.001  0.151  -0.41 -0.198 -0.096 1.28e-5  0.191  13326    1.0' 
new_output[8642] = 'eta05[516]  -0.101   0.001  0.151  -0.41 -0.199 -0.096 -4.48e-4  0.191  13332    1.0' 
new_output[8666] = 'eta05[540]  -0.114   0.001  0.168 -0.456 -0.223 -0.111 -4.57e-4   0.21  12825    1.0' 

In [19]:
count = 0
for i in range(new_output.size):
    row = np.array(new_output[i].split())
    if row.size ==11:
        continue
    else:
        print "new_output[%d] = '%s' " % (i, str(new_output[i]))
        count+=1
print count

0


In [20]:
header_fit = summary_arr[4].split()
print header_fit

[u'mean', u'se_mean', u'sd', u'2.5%', u'25%', u'50%', u'75%', u'97.5%', u'n_eff', u'Rhat']


In [21]:
header_addendum = 'parameter'
header_fit = [header_addendum] + header_fit
print header_fit

['parameter', u'mean', u'se_mean', u'sd', u'2.5%', u'25%', u'50%', u'75%', u'97.5%', u'n_eff', u'Rhat']


In [22]:
cute_output = list(np.zeros(len(header_fit)))
for i in range(new_output.size):
    if len(new_output[i].split())!=11: # the length of the list must be 11, in which case we connect them directly
        print "there is a problem!"
    else:
        new_output_temp = np.array(new_output[i].split()).reshape(1,11)
        cute_output     = np.vstack((cute_output, new_output_temp))
cute_output = cute_output[1:,:]               # removing the zeroes in the beggining 

# Extracting and saving *ONLY* what really matters for the analysis

In [23]:
recovered_mass  = un_standardize(stdized_par=plot_x2, mean=mean_mass, std=std_mass)
rp      = np.column_stack((plot_x1, recovered_mass))
rp_cols = np.vstack((rp, rp, rp, rp, rp))       # for 5 whan classes we must stack these 5x

In [24]:
parameters = cute_output[:,0].astype(str)
pnew_idxs  = []
for i in range(parameters.size):
    if parameters[i][0:4]=='prob':
        pnew_idxs.append(i)
    else:
        continue

model_results    = np.column_stack((cute_output[pnew_idxs,:], rp_cols))
model_results_df = pd.DataFrame(model_results)
model_results_df.columns = header_fit + ['Z'] + ['LOG_STELLAR_MASS']
model_results_df.to_csv('./Results/fit_results_sharedprior.csv', sep=',', header=True, index=False)

# Let's save the posteriors

In [25]:
posteriors_temp = list(fit.extract(u'beta').items()[0])

In [26]:
posteriors = np.array(posteriors_temp[1])
print posteriors.shape

(12000, 5, 5)


In [27]:
print posteriors[0,:,:]
print posteriors[1,:,:]

[[-0.30165661 -0.29540003  0.0960025  -0.76417379 -0.18347578]
 [-0.24992791 -0.20028471 -0.09260164 -0.10102874 -0.46149182]
 [-0.5947016  -0.25406585 -0.48844322 -0.05928562 -0.57884588]
 [ 0.45533695  0.33577698 -0.23868459 -0.05020825  0.3082239 ]
 [-0.30940595  0.03607526  0.16624798 -0.10046431  0.14329696]]
[[-0.21951483 -0.16346624 -0.04879949 -0.55180359 -0.38855166]
 [ 0.01748974 -0.25022917  0.11296123  0.12274219  0.19270401]
 [ 0.17723978 -0.34640191 -0.08662252 -0.1274178  -0.6413368 ]
 [-0.06536607  0.40127187 -0.00187916 -0.33419414  0.09818001]
 [-0.16797875 -0.12065742  0.10769998 -0.22347425  0.29971567]]


In [28]:
matrix_row = posteriors[0,:,0].size  # rows are b0, b1, b2, b3, b4 -- posteriors given the regression
matrix_col = posteriors[0,0,:].size  # cols are w0, w1, w2, w3, w4 -- WHAN classification

In [29]:
betas_dict = {}
for line in range(matrix_row):
    for col in range(matrix_col):
        betas_dict["beta%d%d" % (line, col)] = posteriors[:, line, col]
betas_df = pd.DataFrame(betas_dict)

In [30]:
betas_df

Unnamed: 0,beta00,beta01,beta02,beta03,beta04,beta10,beta11,beta12,beta13,beta14,...,beta30,beta31,beta32,beta33,beta34,beta40,beta41,beta42,beta43,beta44
0,-0.301657,-0.295400,0.096002,-0.764174,-0.183476,-0.249928,-0.200285,-0.092602,-0.101029,-0.461492,...,0.455337,0.335777,-0.238685,-0.050208,0.308224,-0.309406,0.036075,0.166248,-0.100464,0.143297
1,-0.219515,-0.163466,-0.048799,-0.551804,-0.388552,0.017490,-0.250229,0.112961,0.122742,0.192704,...,-0.065366,0.401272,-0.001879,-0.334194,0.098180,-0.167979,-0.120657,0.107700,-0.223474,0.299716
2,-0.371925,-0.121915,-0.094154,0.356385,-0.035589,-0.081557,-0.099337,-0.013523,0.140552,-0.171299,...,0.229314,0.118218,-0.070238,-0.055439,-0.098184,-0.261564,0.009248,-0.332703,-0.409254,-0.040659
3,0.095406,-0.471567,-0.101094,-0.224647,-0.227385,-0.180233,0.199807,0.241809,-0.379376,-0.205797,...,0.190404,0.109739,0.165317,-0.032640,0.312596,-0.221700,0.035731,0.060294,-0.029861,0.067475
4,-0.433174,-0.086097,0.217448,0.257485,-0.271992,0.123567,0.282753,-0.063750,-0.249446,-0.104581,...,0.203829,0.286823,0.454102,-0.033417,0.575432,-0.302492,-0.166206,0.192325,-0.499765,0.389785
5,-0.270680,-0.238674,-0.500848,0.081827,-0.423054,-0.153198,-0.097769,-0.376700,-0.044973,-0.002222,...,0.448690,0.443629,0.316143,0.566350,0.076505,-0.239620,-0.109076,0.389196,-0.168890,0.077626
6,-0.410127,-0.084564,-0.151059,-0.141748,-0.167378,-0.497218,0.280883,-0.382996,-0.009784,-0.273861,...,0.463230,0.255799,-0.112948,-0.096170,0.067598,0.028207,-0.106845,0.107939,-0.187527,0.239372
7,-0.687948,-0.083078,0.019585,-0.309409,-0.345817,0.277109,-0.441830,0.310527,-0.087189,0.648840,...,0.568843,0.384307,0.096619,0.174253,-0.164896,0.183690,0.079338,0.110974,-0.248624,0.228055
8,-0.051021,-0.168771,0.152471,-0.032811,-0.204176,0.078861,-0.028072,-0.083295,-0.238265,0.112802,...,0.217386,0.049967,-0.525499,0.234798,0.194635,-0.062287,0.037138,-0.108333,-0.184358,0.038611
9,-0.181603,-0.462068,-0.434162,0.075175,0.093745,-0.089348,0.072606,-0.127640,0.178721,-0.033349,...,0.410953,0.046064,-0.010415,0.040323,0.004601,-0.142884,0.149074,0.097601,-0.100766,-0.117723


In [31]:
betas_df.to_csv('./Results/betas_sharedprior.csv', sep=',', header=True, index=False)