# Personal functions to be called throughout the code

In [1]:
def standardize(parameter):
    """
    Standardize an array to zero mean and unit standard deviation (z-scores).

    The mean and the standard deviation (numpy default, ddof=0) are computed
    over all the elements of the input and the transformation is applied to
    every element at once, so inputs of any shape (and plain lists/tuples) are
    accepted. If the standard deviation is zero the division yields nan/inf,
    exactly as a plain numpy division would.

    :param parameter: the array (or array-like) with the parameters you wish to standardize
    :return param_mean: mean of the input array
    :return param_std : standard deviation of the input array
    :return stdized_param: final standardized array, with the same shape as the input
    """
    import numpy as np

    # asarray is a no-op for ndarrays and lets lists/tuples through
    values        = np.asarray(parameter)
    param_mean    = np.mean(values)
    param_std     = np.std(values)
    stdized_param = (values - param_mean)/param_std   # standardized parameter -- output
    return (param_mean, param_std, stdized_param)

In [54]:
def un_standardize(stdized_par, mean, std):
    """
    Invert ``standardize``: map standardized values back to their original scale.

    The operation is applied to every element at once, so the input may have
    any shape (e.g. a flattened (n, 1) grid column) and may be a plain list.

    :param stdized_par: array (or array-like) with the standardized values
    :param mean: mean that was subtracted when standardizing
    :param std: standard deviation that was divided out when standardizing
    :return recovered_par: array equal to stdized_par*std + mean, same shape as the input
    """

    import numpy as np

    recovered_par = np.asarray(stdized_par)*std + mean
    return (recovered_par)

In [None]:
# def my_plots(x, row, col, position):
#     import numpy             as np
#     import matplotlib.pyplot as plt
#     import seaborn           as sns
    
# #     plot_dict = {}
# #     plot_dict["plot{0}{1}".format(row, col)] 
#     fig = plt.subplot(row+1, col+1, position)
#     sns.kdeplot(x, shade=True, c='#e6550d')
#     plt.xlabel(r"$\beta_{%d%d}$" % (row, col), fontsize=12)
#     plt.tick_params('both', labelsize='12')
#     plt.tight_layout()
    
#     return (fig)    

# Libraries

In [2]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import statsmodels.api   as sm
import seaborn           as sns
import pystan
import time

# Configuring the regression parameters

In [3]:
# Load the catalogue used for the regression into a DataFrame. The following cell
# reads its 'Z', 'STELLAR_MASS', 'LOGIT_CLASS(...)' and 'WHAN(...)' columns.
my_data = pd.read_csv('../../Catalogue/binom_reg_dataset.csv')

In [4]:
# Row selection: keep only the objects with redshift Z <= 0.4
redshifts = my_data['Z']
index     = np.nonzero(redshifts.values <= 0.4)

# Columns used by the regression, restricted to the selected rows
redshift    = redshifts.values[index]                                  # 2nd parameter
mass        = my_data['STELLAR_MASS'].values[index]                    # 1st parameter
whan_class  = my_data['WHAN(0-NA;1-RP;2-wA;3-sA;4-SF)'].values[index]  # galaxy type (WHAN code)
logit_class = my_data['LOGIT_CLASS(1-UVUP;0-UVWEAK)'].values[index]    # response: UV upturn yes (1) or no (0)

In [51]:
# Regression inputs: redshift is used as-is, stellar mass is standardized
x1 = redshift
y  = logit_class
mean_mass, std_mass, x2 = standardize(mass)

# WHAN codes are shifted by +1 to avoid index issues with STAN
classes = (whan_class + 1).astype(int)
n_class = len(np.unique(classes))
n_obs   = len(x1)

# Evenly spaced values spanning the observed range of each predictor -- used for plotting
n_obs2 = 30
x1_sim = np.linspace(x1.min(), x1.max(), num=n_obs2)
x2_sim = np.linspace(x2.min(), x2.max(), num=n_obs2)

# Build the n_obs2 x n_obs2 grid and flatten each axis into a single column
plot_x1, plot_x2 = (axis.reshape(-1, 1) for axis in np.meshgrid(x1_sim, x2_sim))

In [15]:
# Sanity checks: range and median of the standardized mass, shape of the flattened grid
print("%s %s" % (x2.min(), x2.max()))
print(np.median(x2))
print(plot_x1.shape)

-3.0048533107143856 2.6904846038511474
0.018866401595684694
(900, 1)


In [16]:
# Data passed to Stan: a dictionary (as stated in the pystan manual) whose keys
# match the names declared in the `data` block of the Stan program.
regression_data      = {}
regression_data['Y'] = y                                                  # binary response
# Design matrix columns: constant, x1, x1^2, x2, x2^2
regression_data['X'] = sm.add_constant(np.column_stack((x1, x1**2, x2, x2**2)))
regression_data['K'] = regression_data['X'][0,:].size                     # Number of betas -- b0, b1, b2, b3, b4
regression_data['W'] = classes                                            # class index of every observation
regression_data['N'] = n_obs
regression_data['C'] = n_class                                            # Number of different classes (partial pooling)

# Design matrix for the plotting grid -- after meshgrid. Its columns must be built
# exactly like those of X (constant, x1, x1^2, x2, x2^2), because the same betas are
# applied to both. The last column used to be plot_x2 instead of plot_x2**2.
regression_data['X2'] = sm.add_constant(np.column_stack((plot_x1, plot_x1**2, plot_x2, plot_x2**2)))
regression_data['N2'] = n_obs2**2

# Both design matrices must expose the same number of coefficients to Stan
assert regression_data['X2'].shape[1] == regression_data['K']

In [17]:
# Sanity check: the grid design matrix should have N2 rows and K columns
print regression_data['X2'].shape

(900, 5)


In [18]:
# Fit: STAN code ----------------------------------------------------------------------------------------------------------
# The Stan program is kept as a plain string and handed to pystan as `model_code`.
#   data                 : N observations, N2 grid points, K coefficients, C classes, the class
#                          index W and binary response Y of every observation, and the two
#                          design matrices X (N x K) and X2 (N2 x K).
#   parameters           : beta (K x C) plus the hyperparameters mu and sigma shared by every beta.
#   model                : Y ~ bernoulli_logit(linear predictor), where observation i uses the
#                          column of beta selected by W[i]; every beta[i,j] ~ normal(mu, sigma).
#   generated quantities : for each class c = 1..5, eta0c is the linear predictor on the grid X2
#                          and prob0c = inv_logit(eta0c).
# NOTE: the model and generated quantities blocks spell out coefficients 1..5 and classes 1..5
# explicitly, so the program only works when K = 5 and C = 5.
stan_code = """
// DECLARATION OF VARIABLES -----------------------------------------------------------------------------------------------
data{
    int<lower=1> N;
    int<lower=1> N2;
    int<lower=1> K;
    int<lower=1> C;
    int W[N];
    int<lower=0, upper=1> Y[N];
    matrix[N, K] X;         // redshift and stellar mass 
    matrix[N2,K] X2;        // redshift and stellar mass -- after grid
    }

// DEFINING THE PRIOR(S) --------------------------------------------------------------------------------------------------
parameters{
    matrix[K,C] beta;       // 25 betas!
    real<lower=0> sigma;    // Shared hyperpriors
    real mu;                // Shared hyperpriors
    }

// MODEL: PROBABILITY, HYPERPRIORS, PRIORS, AND REGRESSION ----------------------------------------------------------------
model {
   vector[N] prob;
    for (i in 1:N) {
      prob[i] = beta[1,W[i]]*X[i,1] + beta[2,W[i]]*X[i,2] + beta[3,W[i]]*X[i,3] + beta[4,W[i]]*X[i,4] + 
      beta[5,W[i]]*X[i,5];
      }

    sigma ~ gamma(0.001, 0.001);                           // shared hyperpriors
    mu ~ normal(0, 100);                                   // shared hyperpriors
     
    for (i in 1:K) {
       for (j in 1:C) beta[i,j] ~ normal(mu, sigma);       // priors
        }

    Y ~ bernoulli_logit(prob);                             // regression
    }

// DATA TO BE PLOTTED -----------------------------------------------------------------------------------------------------
generated quantities{
    vector[N2] prob01;
    vector[N2] eta01;
    vector[N2] prob02;
    vector[N2] eta02;
    vector[N2] prob03;
    vector[N2] eta03;
    vector[N2] prob04;
    vector[N2] eta04;
    vector[N2] prob05;
    vector[N2] eta05;
    
    for(j in 1:N2){
        eta01[j] = beta[1,1]*X2[j,1] + beta[2,1]*X2[j,2] + beta[3,1]*X2[j,3] + beta[4,1]*X2[j,4] + beta[5,1]*X2[j,5];
        eta02[j] = beta[1,2]*X2[j,1] + beta[2,2]*X2[j,2] + beta[3,2]*X2[j,3] + beta[4,2]*X2[j,4] + beta[5,2]*X2[j,5];
        eta03[j] = beta[1,3]*X2[j,1] + beta[2,3]*X2[j,2] + beta[3,3]*X2[j,3] + beta[4,3]*X2[j,4] + beta[5,3]*X2[j,5];
        eta04[j] = beta[1,4]*X2[j,1] + beta[2,4]*X2[j,2] + beta[3,4]*X2[j,3] + beta[4,4]*X2[j,4] + beta[5,4]*X2[j,5];
        eta05[j] = beta[1,5]*X2[j,1] + beta[2,5]*X2[j,2] + beta[3,5]*X2[j,3] + beta[4,5]*X2[j,4] + beta[5,5]*X2[j,5];
        prob01[j] = inv_logit(eta01[j]);
        prob02[j] = inv_logit(eta02[j]);
        prob03[j] = inv_logit(eta03[j]);
        prob04[j] = inv_logit(eta04[j]);
        prob05[j] = inv_logit(eta05[j]);
        }

    }

"""

### Settings for running STAN

In [19]:
# Sampler configuration, consumed by the pystan call in the fit cell
seed       = 1
chains     = 2        # number of HMC chains
iterations = 8000     # iterations per chain, warmup included
warmup     = 2000     # leading iterations of each chain that are ignored (burn-in)
jobs       = -1       # run the chains in parallel -- see pystan documentation

In [20]:
# Extra sampler options, forwarded to pystan through its `control` argument
control = {'adapt_delta': 0.85}
# control['max_treedepth'] = 20

### The fit:

In [22]:
# Compile the Stan program and sample from it; the timestamps taken right
# before and after are used by the next cell to report the duration.
start = time.time()

fit = pystan.stan(
    model_code=stan_code,
    data=regression_data,
    iter=iterations,
    warmup=warmup,
    chains=chains,
    seed=seed,
    n_jobs=jobs,
    control=control,
)

end = time.time()

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_902d543734551fafeca59854d8a03fd0 NOW.
  tree = Parsing.p_module(s, pxd, full_module_name)


In [23]:
# Report how long the fit took, in minutes. `start` and `end` come from time.time(),
# so this is elapsed wall-clock time (not CPU time, despite the label).
print ("--------------------------------------------------")
print ("CPU process time: %.2f [min]" % float((end-start)/60))
print ("--------------------------------------------------")

--------------------------------------------------
CPU process time: 9.41 [min]
--------------------------------------------------


### Extracting fit properties

In [24]:
# Text summary of the fit (one line per parameter), printed with 3 digits
summary = pystan.stansummary(fit=fit, digits_summary=3)

In [25]:
# One array element per line of the summary text
summary_arr = np.array(summary.split('\n'))

In [26]:
# Keep only the per-parameter rows: skip the first 5 lines (the column header is
# line 4) and the last 6 lines of the summary text.
# NOTE(review): this is a slice of a numpy string array, so it shares memory with
# summary_arr and has a fixed item width -- assigning a longer string to one of its
# elements would be truncated. Confirm before patching rows in place.
new_output = summary_arr[5:-6,]

In [27]:
# count = 0
# for i in range(new_output.size):
#     row = np.array(new_output[i].split())
#     if row.size ==11:
#         continue
#     else:
#         print "new_output[%d] = '%s' " % (i, str(new_output[i]))
#         count+=1
# print count

new_output[15] = 'beta[1,4]    -0.15   0.002   0.24  -0.65 -0.294 -0.137-4.17e-4  0.312  13106    1.0' 
new_output[4549] = 'eta03[23]   -0.042    0.01  1.041 -2.187 -0.6532.45e-4  0.602  1.946   9954    1.0' 
new_output[4550] = 'eta03[24]   -0.043    0.01  1.041  -2.19 -0.6543.95e-4  0.602  1.945   9962    1.0' 
new_output[4551] = 'eta03[25]   -0.044    0.01  1.041 -2.191 -0.653-5.31e-4    0.6  1.943   9970    1.0' 
new_output[4554] = 'eta03[28]   -0.046    0.01  1.041 -2.193 -0.656-3.58e-5  0.598   1.94   9996    1.0' 
new_output[4566] = 'eta03[40]   -0.038    0.01  0.979 -2.067 -0.6118.07e-4  0.572  1.826   9852    1.0' 
new_output[4568] = 'eta03[42]   -0.039    0.01  0.979 -2.068 -0.6127.95e-4   0.57  1.824   9867    1.0' 
new_output[4569] = 'eta03[43]    -0.04    0.01  0.979 -2.067 -0.6122.41e-4  0.569  1.823   9874    1.0' 
new_output[4570] = 'eta03[44]    -0.04    0.01  0.979 -2.067 -0.612-1.0e-4  0.568  1.821   9882    1.0' 
new_output[4576] = 'eta03[50]   -0.044    0.01  0.979 

In [28]:
# new_output[15] = 'beta[1,4]    -0.15   0.002   0.24  -0.65 -0.294 -0.137 -4.17e-4  0.312  13106    1.0' 
# new_output[4549] = 'eta03[23]   -0.042    0.01  1.041 -2.187 -0.653 2.45e-4  0.602  1.946   9954    1.0' 
# new_output[4550] = 'eta03[24]   -0.043    0.01  1.041  -2.19 -0.654 3.95e-4  0.602  1.945   9962    1.0' 
# new_output[4551] = 'eta03[25]   -0.044    0.01  1.041 -2.191 -0.653 -5.31e-4    0.6  1.943   9970    1.0' 
# new_output[4554] = 'eta03[28]   -0.046    0.01  1.041 -2.193 -0.656 -3.58e-5  0.598   1.94   9996    1.0' 
# new_output[4566] = 'eta03[40]   -0.038    0.01  0.979 -2.067 -0.611 8.07e-4  0.572  1.826   9852    1.0' 
# new_output[4568] = 'eta03[42]   -0.039    0.01  0.979 -2.068 -0.612 7.95e-4   0.57  1.824   9867    1.0' 
# new_output[4569] = 'eta03[43]    -0.04    0.01  0.979 -2.067 -0.612 2.41e-4  0.569  1.823   9874    1.0' 
# new_output[4570] = 'eta03[44]    -0.04    0.01  0.979 -2.067 -0.612 -1.0e-4  0.568  1.821   9882    1.0' 
# new_output[4576] = 'eta03[50]   -0.044    0.01  0.979 -2.062 -0.619 -9.06e-4  0.563  1.825   9930    1.0' 
# new_output[4577] = 'eta03[51]   -0.045    0.01  0.979 -2.063 -0.619 -8.97e-4  0.562  1.824   9939    1.0' 
# new_output[4588] = 'eta03[62]   -0.038   0.009  0.918 -1.942 -0.574 -1.1e-4  0.535  1.707   9789    1.0' 
# new_output[4589] = 'eta03[63]   -0.038   0.009  0.918 -1.941 -0.574 -8.22e-5  0.534  1.704   9797    1.0' 
# new_output[4590] = 'eta03[64]   -0.039   0.009  0.918  -1.94 -0.574 -8.14e-4  0.534  1.702   9804    1.0' 
# new_output[6654] = 'eta04[328]  -0.011   0.004  0.447  -0.95 -0.271 -1.5e-4  0.255   0.89  12833    1.0' 
# new_output[6655] = 'eta04[329]  -0.011   0.004  0.448 -0.952 -0.272 -8.37e-4  0.253   0.89  12850    1.0' 
# new_output[6797] = 'eta04[471]   -0.16   0.002  0.255 -0.705 -0.314 -0.147 5.21e-4  0.331  12442    1.0' 
# new_output[6799] = 'eta04[473]  -0.162   0.002  0.257 -0.706 -0.316 -0.149 2.68e-4  0.332  12381    1.0' 
# new_output[6800] = 'eta04[474]  -0.162   0.002  0.258  -0.71 -0.318 -0.149 1.43e-4  0.332  12351    1.0' 
# new_output[6801] = 'eta04[475]  -0.163   0.002  0.259 -0.715 -0.319  -0.15 -8.54e-6  0.333  12320    1.0' 
# new_output[6802] = 'eta04[476]  -0.164   0.002   0.26 -0.719  -0.32 -0.151 2.92e-4  0.336  12291    1.0' 
# new_output[6803] = 'eta04[477]  -0.165   0.002  0.261 -0.723 -0.322 -0.152 4.04e-4  0.336  12261    1.0' 
# new_output[6804] = 'eta04[478]  -0.166   0.002  0.263 -0.726 -0.323 -0.152 2.75e-4  0.338  12231    1.0' 
# new_output[6805] = 'eta04[479]  -0.167   0.002  0.264 -0.729 -0.325 -0.154 -4.16e-4  0.339  12202    1.0' 
# new_output[6806] = 'eta04[480]  -0.168   0.002  0.265 -0.735 -0.327 -0.154 -8.02e-4   0.34  12173    1.0' 
# new_output[8638] = 'eta05[512]    -0.1   0.001  0.153  -0.41 -0.199 -0.097 8.04e-4  0.194  13136    1.0' 
# new_output[8639] = 'eta05[513]  -0.101   0.001  0.153  -0.41   -0.2 -0.098 7.67e-4  0.194  13131    1.0' 
# new_output[8640] = 'eta05[514]  -0.101   0.001  0.152 -0.411 -0.201 -0.098 1.66e-4  0.193  13125    1.0' 
# new_output[8641] = 'eta05[515]  -0.102   0.001  0.152 -0.411 -0.201 -0.099 -6.62e-4  0.193  13120    1.0'

In [29]:
# Count (and print) the summary rows that do not split into exactly 11 fields
count = 0
for i, line in enumerate(new_output):
    if len(line.split()) != 11:
        print("new_output[%d] = '%s' " % (i, str(line)))
        count += 1
print(count)

0


In [30]:
# Line 4 of the summary text holds the names of the statistics columns
header_fit = summary_arr[4].split()
print header_fit

[u'mean', u'se_mean', u'sd', u'2.5%', u'25%', u'50%', u'75%', u'97.5%', u'n_eff', u'Rhat']


In [31]:
# Prepend the name of the first column (the parameter label) to the header.
# The header is rebuilt from summary_arr instead of from the current value of
# header_fit, so re-running this cell cannot prepend 'parameter' twice.
header_addendum = 'parameter'
header_fit = [header_addendum] + summary_arr[4].split()
print(header_fit)

['parameter', u'mean', u'se_mean', u'sd', u'2.5%', u'25%', u'50%', u'75%', u'97.5%', u'n_eff', u'Rhat']


In [32]:
# Split every summary row into its whitespace-separated fields (the parameter name
# plus one value per statistic in the header) and gather them in a 2-D string array.
# The rows are collected in a list and converted once: growing the array with
# np.vstack inside the loop copies it on every iteration, and needed a dummy row
# of zeros that had to be removed afterwards.
n_fields    = len(header_fit)                 # 11: 'parameter' + 10 statistics
parsed_rows = []
for i in range(new_output.size):
    fields = new_output[i].split()
    if len(fields) != n_fields:               # malformed row (e.g. two values fused together): skipped
        print("there is a problem!")
    else:
        parsed_rows.append(fields)
# reshape keeps the result 2-D (0 x n_fields) even when no row was valid
cute_output = np.array(parsed_rows).reshape(-1, n_fields)

# Extracting and saving *ONLY* what really matters for the analysis

In [83]:
# Put the grid's mass axis back on its original scale and pair it with the redshift axis
recovered_mass = un_standardize(plot_x2, mean_mass, std_mass)
rp             = np.hstack((plot_x1, recovered_mass))
# The same grid is repeated once per WHAN class (5 of them)
rp_cols        = np.tile(rp, (5, 1))

In [84]:
# Positions of the summary rows whose parameter name starts with 'prob'
parameters = cute_output[:,0].astype(str)
pnew_idxs  = [i for i, name in enumerate(parameters) if name.startswith('prob')]

# Attach the grid coordinates to those rows and write the table to disk
model_results    = np.column_stack((cute_output[pnew_idxs,:], rp_cols))
model_results_df = pd.DataFrame(model_results, columns=header_fit + ['Z', 'LOG_STELLAR_MASS'])
model_results_df.to_csv('./Results/fit_results_sharedprior.csv', index=False)

# Let's save the posteriors

In [86]:
# First (name, draws) pair of the extracted 'beta' samples, as a two-element list
posteriors_temp = list(next(iter(fit.extract(u'beta').items())))

In [88]:
# Element 1 of the pair holds the draws; element 0 is the parameter name
posteriors = np.array(posteriors_temp[1])
print posteriors.shape

(12000, 5, 5)


In [92]:
# Show the beta matrix of the first two draws
for draw in posteriors[:2]:
    print(draw)

[[-0.05481096  0.06934133 -0.01269459  0.18884744  0.00615592]
 [ 0.1168303   0.11144854  0.02717101 -0.13853936 -0.06490383]
 [ 0.22946814 -0.04814046  0.10276213 -0.05482054  0.01693678]
 [ 0.07854158  0.1202476   0.08203595  0.03259462  0.0289487 ]
 [-0.05476614 -0.03108706  0.22645775  0.12625964 -0.0505289 ]]
[[-0.33419297 -0.07984472  0.22456488 -0.12074692 -0.09734898]
 [ 0.06647065 -0.24869516  0.3221178  -0.24672123  0.2933803 ]
 [ 0.07069463 -0.14832442  0.09795786  0.02236787 -0.03996388]
 [ 0.24215385  0.13330738 -0.03675612  0.07362988  0.22449536]
 [-0.13851452  0.0489519  -0.33325752 -0.2449806   0.09357994]]


In [185]:
# Size of the beta matrix held by every draw:
#   rows are b0, b1, b2, b3, b4 -- posteriors given the regression
#   cols are w0, w1, w2, w3, w4 -- WHAN classification
matrix_row, matrix_col = posteriors.shape[1:]

In [199]:
# One column of draws per (coefficient, class) pair, named beta<row><col>
betas_dict = dict(
    ("beta%d%d" % (line, col), posteriors[:, line, col])
    for line in range(matrix_row)
    for col in range(matrix_col)
)
betas_df = pd.DataFrame(betas_dict)

In [201]:
# Display the table of posterior draws: one row per draw, one column per betaXY
# (X = coefficient index, Y = class index, both counted from 0)
betas_df

Unnamed: 0,beta00,beta01,beta02,beta03,beta04,beta10,beta11,beta12,beta13,beta14,...,beta30,beta31,beta32,beta33,beta34,beta40,beta41,beta42,beta43,beta44
0,-0.054811,0.069341,-0.012695,0.188847,0.006156,0.116830,0.111449,0.027171,-0.138539,-0.064904,...,0.078542,0.120248,0.082036,0.032595,0.028949,-0.054766,-0.031087,0.226458,0.126260,-0.050529
1,-0.334193,-0.079845,0.224565,-0.120747,-0.097349,0.066471,-0.248695,0.322118,-0.246721,0.293380,...,0.242154,0.133307,-0.036756,0.073630,0.224495,-0.138515,0.048952,-0.333258,-0.244981,0.093580
2,-0.627271,-0.016402,-0.084822,-0.135967,-0.184286,-0.367505,-0.597546,0.000327,-0.379523,0.786571,...,0.392950,0.120252,0.249590,0.359749,0.239442,-0.120336,0.021219,0.038342,0.028989,0.157452
3,-0.122798,-0.222061,-0.008378,0.087676,0.028522,0.373839,0.152509,0.174008,-0.010217,0.731260,...,0.825668,0.533129,0.035977,0.125154,0.058298,-0.280147,-0.102288,-0.110212,-0.126500,0.176211
4,-0.292344,-0.359573,-0.144083,-0.482288,-0.343319,0.298121,-0.070986,-0.277663,0.125471,0.134388,...,0.126866,0.139296,-0.149057,-0.281220,0.036859,-0.138218,-0.082808,0.244130,-0.264756,0.192176
5,-0.602720,-0.279856,-0.694209,-0.128601,-0.251755,0.363550,-0.840801,-0.334926,-0.123797,0.145200,...,0.580600,0.620868,-0.236688,0.396982,0.089314,-0.408159,0.108950,0.134173,-0.160129,0.135434
6,-0.048830,-0.266487,-0.016779,-0.425782,-0.000012,-0.191858,0.082378,-0.002924,0.140816,0.122866,...,0.140358,0.069813,0.064125,-0.108287,0.238370,-0.146642,0.221087,0.242173,-0.125194,0.015365
7,-0.870105,-0.301175,0.015561,-0.205705,-0.215955,0.462568,0.001303,-0.337696,0.433630,-0.066560,...,0.103697,0.535075,-0.259986,0.076989,0.161415,0.004295,0.002317,0.079772,-0.164417,-0.080046
8,-0.313565,-0.024894,0.046437,0.395518,0.001946,-0.055237,-0.166162,-0.290311,-0.172157,-0.362498,...,0.378562,0.181808,-0.040195,0.231883,0.238791,-0.118656,-0.151854,0.010120,-0.422416,0.029954
9,-0.502028,-0.144222,-0.252655,0.190907,-0.135790,0.433482,-0.179603,-0.342171,-0.011122,-0.122940,...,0.350474,0.186810,0.180000,0.214694,0.121711,-0.182559,0.007770,0.310522,-0.070609,0.086340


In [202]:
# Write the posterior draws of every beta to disk (comma-separated, header row, no index column)
betas_df.to_csv('./Results/betas_sharedprior.csv', sep=',', header=True, index=False)