# Personal functions to be called throughout the code

In [1]:
def standardize(parameter):
    """
    :param parameter: the array with the parameters you wish to standardize
    :return param_mean: mean of the input array
    :return param_std : standard deviation of the input array
    :return stdized_param: final standardized array
    """
    import numpy as np
    
    param_mean    = np.mean(parameter)
    param_std     = np.std(parameter)
    stdized_param = []                  # standardized parameter -- output
    for i in range(parameter.size):
        param_temp = (parameter[i] - param_mean)/param_std
        stdized_param.append(param_temp)
    stdized_param = np.array(stdized_param)
    return (param_mean, param_std, stdized_param)

In [218]:
def my_plots(x, row, col, position):
    import numpy             as np
    import matplotlib.pyplot as plt
    import seaborn           as sns
    
#     plot_dict = {}
#     plot_dict["plot{0}{1}".format(row, col)] 
    fig = plt.subplot(row+1, col+1, position)
    sns.kdeplot(x, shade=True, c='#e6550d')
    plt.xlabel(r"$\beta_{%d%d}$" % (row, col), fontsize=12)
    plt.tick_params('both', labelsize='12')
    plt.tight_layout()
    
    return (fig)    

# Libraries

In [2]:
import numpy             as np
import pandas            as pd
import matplotlib.pyplot as plt
import statsmodels.api   as sm
import seaborn           as sns
import pystan
import time

# Configuring the regression parameters

In [3]:
my_data = pd.read_csv('../../Catalogue/binom_reg_dataset.csv')

In [4]:
my_data.columns

Index([u'CATAID', u'BPT_CLASS', u'LOGIT_CLASS(1-UVUP;0-UVWEAK)',
       u'STELLAR_MASS', u'UV_CLASS', u'WHAN(0-NA;1-RP;2-wA;3-sA;4-SF)',
       u'WHAN_CLASS', u'Z'],
      dtype='object')

In [5]:
# filtering the dataset
redshifts = my_data['Z']
index     = np.where(redshifts.values<=0.4)

# parameters of interest
logit_class = my_data['LOGIT_CLASS(1-UVUP;0-UVWEAK)'].values[index]    # y axis: logit class -- uv upturn yes or no
whan_class  = my_data['WHAN(0-NA;1-RP;2-wA;3-sA;4-SF)'].values[index]  # My types of galaxies
mass        = my_data['STELLAR_MASS'].values[index]                    # 1st parameter
redshift    = redshifts.values[index]                                  # 2nd parameter

In [6]:
# original parameters
x1      = redshift
x2      = standardize(mass)[2]
y       = logit_class
classes = (whan_class+1).astype(int)
n_obs   = x1.size
n_class = np.unique(classes).size

# new parameters - important for plotting!
n_obs2 = 30
x1_sim = np.linspace(x1.min(), x1.max(), n_obs2)
x2_sim = np.linspace(x2.min(), x2.max(), n_obs2)

plot_x1, plot_x2 = np.meshgrid(x1_sim, x2_sim)  # THIS IS WHERE THE GRID IS DONE
   
plot_x1 = plot_x1.reshape((n_obs2**2), 1)
plot_x2 = plot_x2.reshape((n_obs2**2), 1)

In [7]:
print x2.min(), x2.max() # sanity check
print np.median(x2)
print plot_x1.shape

-3.0048533107143856 2.6904846038511474
0.018866401595684694
(900, 1)


In [8]:
# dataset to be used in the regression
regression_data      = {}                                                 # Dictionary, as stated in the pystan manual
regression_data['Y'] = y
regression_data['X'] = sm.add_constant(np.column_stack((x1, x1**2, x2, x2**2)))
regression_data['K'] = regression_data['X'][0,:].size                     # Number of betas -- b0, b1, b2, b3, b4
regression_data['W'] = classes
regression_data['N'] = n_obs
regression_data['C'] = n_class                                      # Number of different classes (partial pooling)

# dataset to be used in the plot -- after meshgrid
regression_data['X2'] = sm.add_constant(np.column_stack((plot_x1, plot_x1**2, plot_x2, plot_x2)))
regression_data['N2'] = n_obs2**2

In [9]:
print regression_data['X2'].shape

(900, 5)


In [10]:
# Fit: STAN code ----------------------------------------------------------------------------------------------------------
stan_code = """
// DECLARATION OF VARIABLES -----------------------------------------------------------------------------------------------
data{
    int<lower=1> N;
    int<lower=1> N2;
    int<lower=1> K;
    int<lower=1> C;
    int W[N];
    int<lower=0, upper=1> Y[N];
    matrix[N, K] X;         // redshift and stellar mass 
    matrix[N2,K] X2;        // redshift and stellar mass -- after grid
    }

// DEFINING THE PRIOR(S) --------------------------------------------------------------------------------------------------
parameters{
    matrix[K,C] beta;       // 25 betas!
    real<lower=0> sigma;    // Shared hyperpriors
    real mu;                // Shared hyperpriors
    }

// MODEL: PROBABILITY, HYPERPRIORS, PRIORS, AND REGRESSION ----------------------------------------------------------------
model {
   vector[N] prob;
    for (i in 1:N) {
      prob[i] = beta[1,W[i]]*X[i,1] + beta[2,W[i]]*X[i,2] + beta[3,W[i]]*X[i,3] + beta[4,W[i]]*X[i,4] + 
      beta[5,W[i]]*X[i,5];
      }

    sigma ~ gamma(0.001, 0.001);                           // shared hyperpriors
    mu ~ normal(0, 100);                                   // shared hyperpriors
     
    for (i in 1:K) {
       for (j in 1:C) beta[i,j] ~ normal(mu, sigma);       // priors
        }

    Y ~ bernoulli_logit(prob);                             // regression
    }

// DATA TO BE PLOTTED -----------------------------------------------------------------------------------------------------
generated quantities{
    vector[N2] prob01;
    vector[N2] eta01;
    vector[N2] prob02;
    vector[N2] eta02;
    vector[N2] prob03;
    vector[N2] eta03;
    vector[N2] prob04;
    vector[N2] eta04;
    vector[N2] prob05;
    vector[N2] eta05;
    
    for(j in 1:N2){
        eta01[j] = beta[1,1]*X2[j,1] + beta[2,1]*X2[j,2] + beta[3,1]*X2[j,3] + beta[4,1]*X2[j,4] + beta[5,1]*X2[j,5];
        eta02[j] = beta[1,2]*X2[j,1] + beta[2,2]*X2[j,2] + beta[3,2]*X2[j,3] + beta[4,2]*X2[j,4] + beta[5,2]*X2[j,5];
        eta03[j] = beta[1,3]*X2[j,1] + beta[2,3]*X2[j,2] + beta[3,3]*X2[j,3] + beta[4,3]*X2[j,4] + beta[5,3]*X2[j,5];
        eta04[j] = beta[1,4]*X2[j,1] + beta[2,4]*X2[j,2] + beta[3,4]*X2[j,3] + beta[4,4]*X2[j,4] + beta[5,4]*X2[j,5];
        eta05[j] = beta[1,5]*X2[j,1] + beta[2,5]*X2[j,2] + beta[3,5]*X2[j,3] + beta[4,5]*X2[j,4] + beta[5,5]*X2[j,5];
        prob01[j] = inv_logit(eta01[j]);
        prob02[j] = inv_logit(eta02[j]);
        prob03[j] = inv_logit(eta03[j]);
        prob04[j] = inv_logit(eta04[j]);
        prob05[j] = inv_logit(eta05[j]);
        }

    }

"""

### Settings for running STAN

In [11]:
iterations    = 8000
chains        = 2
warmup        = 2000    # How many of the first iterations we'll ignore - burnin
jobs          = -1
seed          = 1

In [12]:
# control = {}
# control['max_treedepth'] = 20
# control['adapt_delta'] = 0.99

### The fit:

In [13]:
start = time.time()

fit = pystan.stan(model_code=stan_code, data=regression_data, seed=seed, iter=iterations, chains=chains, 
                  warmup=warmup, n_jobs=jobs)

end = time.time()

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_902d543734551fafeca59854d8a03fd0 NOW.
  tree = Parsing.p_module(s, pxd, full_module_name)


In [14]:
print ("--------------------------------------------------")
print ("CPU process time: %.2f [min]" % float((end-start)/60))
print ("--------------------------------------------------")

--------------------------------------------------
CPU process time: 8.77 [min]
--------------------------------------------------


### Extracting fit properties

In [15]:
# print pystan.stansummary(fit=fit, digits_summary=3)

In [28]:
summary = pystan.stansummary(fit=fit, digits_summary=3)

In [29]:
summary_arr = np.array(summary.split('\n'))

In [30]:
new_output = summary_arr[5:-6,]

In [44]:
new_output[1787]

u'eta01[861]   0.031   0.008  0.562 -1.009 -0.355-6.01e-4  0.405  1.173   5128    1.0'

In [46]:
# count = 0
# for i in range(new_output.size):
#     row = np.array(new_output[i].split())
#     if row.size ==11:
#         continue
#     else:
#         print "new_output[%d] = '%s' " % (i, str(new_output[i]))
#         count+=1
# print count

new_output[1787] = 'eta01[861]   0.031   0.008  0.562 -1.009 -0.355-6.01e-4  0.405  1.173   5128    1.0' 
new_output[1788] = 'eta01[862]   0.031   0.008  0.563 -1.009 -0.356-1.12e-4  0.406  1.174   5115    1.0' 
new_output[1789] = 'eta01[863]    0.03   0.008  0.563  -1.01 -0.357-6.7e-4  0.406  1.174   5103    1.0' 
new_output[1790] = 'eta01[864]    0.03   0.008  0.563 -1.011 -0.358-9.47e-4  0.405  1.178   5090    1.0' 
new_output[1791] = 'eta01[865]   0.029   0.008  0.564 -1.011 -0.359-8.82e-4  0.405   1.18   5078    1.0' 
new_output[4634] = 'eta03[108]  -0.035   0.008  0.858 -1.835 -0.5369.64e-4  0.482  1.631  10455    1.0' 
new_output[4635] = 'eta03[109]  -0.035   0.008  0.858 -1.834 -0.5378.15e-4  0.482  1.633  10475    1.0' 
new_output[4636] = 'eta03[110]  -0.036   0.008  0.858 -1.835 -0.5389.73e-4  0.481  1.635  10495    1.0' 
new_output[4640] = 'eta03[114]  -0.038   0.008  0.858  -1.84 -0.5381.67e-4  0.479  1.634  10580    1.0' 
new_output[4641] = 'eta03[115]  -0.039   0.008  0.8

In [53]:
# new_output[1787] = 'eta01[861]   0.031   0.008  0.562 -1.009 -0.355 -6.01e-4  0.405  1.173   5128    1.0' 
# new_output[1788] = 'eta01[862]   0.031   0.008  0.563 -1.009 -0.356 -1.12e-4  0.406  1.174   5115    1.0' 
# new_output[1789] = 'eta01[863]    0.03   0.008  0.563  -1.01 -0.357 -6.7e-4  0.406  1.174   5103    1.0' 
# new_output[1790] = 'eta01[864]    0.03   0.008  0.563 -1.011 -0.358 -9.47e-4  0.405  1.178   5090    1.0' 
# new_output[1791] = 'eta01[865]   0.029   0.008  0.564 -1.011 -0.359 -8.82e-4  0.405   1.18   5078    1.0' 
# new_output[4634] = 'eta03[108]  -0.035   0.008  0.858 -1.835 -0.536 9.64e-4  0.482  1.631  10455    1.0' 
# new_output[4635] = 'eta03[109]  -0.035   0.008  0.858 -1.834 -0.537 8.15e-4  0.482  1.633  10475    1.0' 
# new_output[4636] = 'eta03[110]  -0.036   0.008  0.858 -1.835 -0.538 9.73e-4  0.481  1.635  10495    1.0' 
# new_output[4640] = 'eta03[114]  -0.038   0.008  0.858  -1.84 -0.538 1.67e-4  0.479  1.634  10580    1.0' 
# new_output[4641] = 'eta03[115]  -0.039   0.008  0.858 -1.845  -0.54 -3.72e-4  0.478  1.631  10603    1.0' 
# new_output[4656] = 'eta03[130]  -0.035   0.008  0.796  -1.71 -0.499 6.57e-4  0.445   1.52  10281    1.0' 
# new_output[4657] = 'eta03[131]  -0.036   0.008  0.796 -1.711   -0.5 2.42e-4  0.445  1.518  10298    1.0' 
# new_output[4658] = 'eta03[132]  -0.036   0.008  0.796 -1.709   -0.5 -6.13e-4  0.445  1.518  10315    1.0' 
# new_output[6781] = 'eta04[455]  -0.149   0.002  0.245 -0.675 -0.288 -0.134 4.77e-4  0.333  11886    1.0' 
# new_output[6782] = 'eta04[456]   -0.15   0.002  0.246 -0.676 -0.289 -0.135 -1.62e-5  0.334  11903    1.0' 
# new_output[7208] = 'eta04[882]  -0.601   0.009  0.963 -2.613 -1.176 -0.567 -1.87e-4  1.277  11285    1.0' 
# new_output[7209] = 'eta04[883]  -0.602   0.009  0.964 -2.612 -1.177 -0.568 -7.86e-4  1.275  11276    1.0' 
# new_output[7210] = 'eta04[884]  -0.603   0.009  0.964 -2.611 -1.179 -0.568 -8.13e-4  1.272  11266    1.0' 
# new_output[7212] = 'eta04[886]  -0.604   0.009  0.965 -2.613 -1.181  -0.57 -8.18e-4  1.271  11246    1.0' 
# new_output[8640] = 'eta05[514]    -0.1   0.001  0.149 -0.411 -0.195 -0.097 6.96e-4  0.189  12770    1.0' 
# new_output[8641] = 'eta05[515]    -0.1   0.001  0.149 -0.412 -0.195 -0.097 4.55e-4  0.188  12843    1.0' 
# new_output[8643] = 'eta05[517]  -0.101   0.001  0.149 -0.413 -0.196 -0.098 1.09e-4  0.187  12992    1.0' 
# new_output[8644] = 'eta05[518]  -0.101   0.001  0.149 -0.412 -0.197 -0.099 -1.48e-4  0.186  13068    1.0' 
# new_output[8645] = 'eta05[519]  -0.102   0.001  0.149 -0.414 -0.197 -0.099 -3.23e-4  0.186  13144    1.0' 
# new_output[8646] = 'eta05[520]  -0.102   0.001   0.15 -0.414 -0.198   -0.1 -7.72e-4  0.186  13222    1.0' 
# new_output[8666] = 'eta05[540]  -0.112   0.001  0.167 -0.457 -0.217 -0.109 -4.97e-4  0.212  14645    1.0'

In [54]:
count = 0
for i in range(new_output.size):
    row = np.array(new_output[i].split())
    if row.size ==11:
        continue
    else:
        print "new_output[%d] = '%s' " % (i, str(new_output[i]))
        count+=1
print count

0


In [73]:
header_fit = summary_arr[4].split()
print header_fit

[u'mean', u'se_mean', u'sd', u'2.5%', u'25%', u'50%', u'75%', u'97.5%', u'n_eff', u'Rhat']


In [74]:
header_addendum = 'parameter'
header_fit = [header_addendum] + header_fit
print header_fit

['parameter', u'mean', u'se_mean', u'sd', u'2.5%', u'25%', u'50%', u'75%', u'97.5%', u'n_eff', u'Rhat']


In [76]:
cute_output = list(np.zeros(len(header_fit)))
for i in range(new_output.size):
    if len(new_output[i].split())!=11: # the length of the list must be 11, in which case we connect them directly
        print "there is a problem!"
    else:
        new_output_temp = np.array(new_output[i].split()).reshape(1,11)
        cute_output     = np.vstack((cute_output, new_output_temp))
cute_output = cute_output[1:,:]               # removing the zeroes in the beggining 

# Extracting and saving *ONLY* what really matters for the analysis

In [97]:
parameters = cute_output[:,0].astype(str)
pnew_idxs  = []
for i in range(parameters.size):
    if parameters[i][0:4]=='prob':
        pnew_idxs.append(i)
    else:
        continue
        
print cute_output[pnew_idxs,:].shape
print plot_x1.shape

# model_results    = np.column_stack((cute_output[pnew_idxs,:], x1_sim))
# model_results_df.columns = header_fit + ['Z']
model_results =
model_results_df = pd.DataFrame(model_results)
model_results_df.to_csv('./Results/fit_results_sharedprior.csv', sep=',', header=True, index=False)

(4500, 11)
(900, 1)


In [109]:
print cute_output[pnew_idxs,0]

[u'prob01[1]' u'prob01[2]' u'prob01[3]' ... u'prob05[898]' u'prob05[899]'
 u'prob05[900]']


In [124]:
posteriors_temp = list(fit.extract(u'beta').items()[0])

In [125]:
posteriors = np.array(posteriors_temp[1])

In [145]:
print posteriors.shape
print posteriors[0,:,0] # row -- b0,b1,b2,b3,b4  --- betas
print posteriors[0,:,:] # col -- w0,w1,w2,w3,w4  --- emission-line classification (WHAN)

(12000, 5, 5)
[-0.33444698  0.02672725 -0.01634778  0.07920367 -0.17144219]
[[-3.34446985e-01 -2.26543698e-01 -1.89228489e-01 -1.63251421e-01
   3.76569607e-04]
 [ 2.67272508e-02  3.58492315e-02  1.68493309e-02 -5.13688809e-01
   2.32375981e-01]
 [-1.63477759e-02  1.85788680e-01 -3.02080930e-01 -1.27353458e-01
   1.76678414e-01]
 [ 7.92036677e-02  6.57353343e-02 -6.23788267e-02  8.68208981e-02
   6.05670704e-02]
 [-1.71442189e-01 -8.80067036e-03  7.00294745e-02 -1.99786371e-01
  -1.20829257e-01]]


In [148]:
posteriors[11999]

array([[-0.25537303, -0.16851336,  0.20990152,  0.16468466, -0.01015606],
       [ 0.3766161 , -0.08229268,  0.53142079,  0.14922763,  0.15031719],
       [-0.02091751,  0.15819203,  0.04783963,  0.10869276, -0.06914305],
       [ 0.30031139,  0.47312253,  0.13617728,  0.19983463,  0.08732963],
       [-0.09861625, -0.15494211, -0.07553024, -0.06665214,  0.02829974]])

In [154]:
blocks            = posteriors[:,0,0].size  # number of blocks of betas we have
number_of_classes = n_class
betas_size        = regression_data['K']

# Mega-loop to extract and plot the posteriors

In [263]:
betas_dictionary = {}
plots = {}
colour='black'

In [264]:
for each_class in range(number_of_classes):
    for each_beta in range(betas_size):
        print each_class, each_beta
        betas_dictionary["beta{0}{1}".format(each_class, each_beta)] = posteriors[:, each_beta, each_class]
        print np.array(betas_dictionary.values()).size
#         print np.array(betas_dictionary.values()).shape
#         plot_position = len(betas_dictionary.keys())
#         print each_beta, each_class, betas_dictionary.keys()[0]
# #         for p in range(plot_position):
#             plt.subplot(each_beta, each_class, p)
#             sns.kdeplot(posteriors[:, each_beta, each_class], shade=True, c=colour)
#             plt.xlabel(r"$\beta_{{0}{1}}$".format(each_beta, each_class), fontsize=10)
#             plt.tick_params('both', labelsize='10')
           
            
#             plot = my_plots(x=posteriors[:, each_beta, each_class], row=each_beta, col=each_class, position=p)
#             plt.show()   

0 0
12000
0 1
24000
0 2
36000
0 3
48000
0 4
60000
1 0
72000
1 1
84000
1 2
96000
1 3
108000
1 4
120000
2 0
132000
2 1
144000
2 2
156000
2 3
168000
2 4
180000
3 0
192000
3 1
204000
3 2
216000
3 3
228000
3 4
240000
4 0
252000
4 1
264000
4 2
276000
4 3
288000
4 4
300000
