In [1]:
import numpy as np
import pystan
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import time

In [2]:
# Main thread: load the catalogue, build the inputs for Stan and define the model text.

if __name__ == '__main__':

    # Configuring paths and inputs -------------------------------------------------------------------------------------
    my_data = np.loadtxt('./../../../Catalogue/binom_reg_dataset.csv', delimiter=',', dtype=str)

    # Row 0 of the table holds the column names; every other row is data (kept as strings until cast below).
    my_dictionary = {my_data[0, i]: np.array(my_data[1:, i], dtype=str) for i in range(my_data.shape[1])}

    logit_class   = my_dictionary['LOGIT_CLASS(1-UVUP;0-UVWEAK)'].astype(int)
    redshift      = my_dictionary['Z'].astype(float)
    stellar_mass  = my_dictionary['STELLAR_MASS'].astype(float)

    index = np.where(redshift<=0.4)         # keep only objects with redshift <= 0.4

    x1    = redshift[index]
    x2    = stellar_mass[index]
    y     = logit_class[index]              # whether this is a galaxy with uv upturn or not
    n_obs = x1.size

    # Design matrix: constant, x1, x1^2, x2, x2^2.
    design_matrix = sm.add_constant(np.column_stack((x1, x1**2, x2, x2**2)))

    regression_data = {}
    regression_data['X'] = design_matrix
    regression_data['K'] = design_matrix.shape[1]   # number of betas, taken from X so the two cannot drift apart
    regression_data['N'] = n_obs
    regression_data['Y'] = y
    regression_data['LogN'] = np.log(n_obs)         # the matching declaration in the Stan data block is commented out

    # Data to be plotted -------------------------------------------------------------------------------------------
    n_obs2 = 50                                     # grid points per axis
    x1_sim = np.linspace(x1.min(), x1.max(), n_obs2)
    x2_sim = np.linspace(x2.min(), x2.max(), n_obs2)

    plot_x1, plot_x2 = np.meshgrid(x1_sim, x2_sim)  # THIS IS WHERE THE GRID IS DONE

    # Flatten the grid into column vectors; -1 lets NumPy infer n_obs2**2 rows instead of a hard-coded 2500.
    plot_x1 = plot_x1.reshape(-1, 1)
    plot_x2 = plot_x2.reshape(-1, 1)

    regression_data['X2'] = sm.add_constant(np.column_stack((plot_x1, plot_x1**2, plot_x2, plot_x2**2)))
    regression_data['N2'] = n_obs2**2

    print(regression_data['X2'].shape)

    # Fit: STAN code ---------------------------------------------------------------------------------------------------
    # NOTE(review): the closing line of the string below starts with '#', so that '#' is part of the text handed
    # to Stan (an old-style comment line) -- confirm the installed Stan version still accepts it.
    stan_code = """
    data{
        int<lower=0> N;
        int<lower=0> N2;
        int<lower=0> K;
        int Y[N];
        matrix[N,K] X;
        matrix[N2,K] X2;
//        real LogN;
    }

    parameters{
        vector[K] beta;
    }

    transformed parameters{
        vector[N] eta;
        eta = X * beta;
    }

    model{
        Y ~ bernoulli_logit(eta);
    }

    generated quantities{
        /* real LLi[N2]; */
        /* real AIC; */ 
        /* real BIC; */
        /* real LogL; */
        vector[N2] etanew;
        real<lower=0, upper=1.0> pnew[N2];
        etanew = X2 * beta;
        for (j in 1:N2){
            pnew[j] = inv_logit(etanew[j]);
            /* LLi[j] = bernoulli_lpmf(1|pnew[j]); */
        }
        /* LogL = sum(LLi); */
        /* AIC = -2 * LogL + 2 * K; */
        /* BIC = -2 * LogL + LogN * K; */
    }
#     """

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_04e2ff1a0d956cbb6eafaa1ed143cfa0 NOW.


(2500, 5)


  tree = Parsing.p_module(s, pxd, full_module_name)




In [None]:
# Sampler settings handed to the pystan.stan(...) call further down.
seed       = 1       # seed passed to the sampler
jobs       = -1      # Run code in parallel -- see pystan documentation
chains     = 3       # HMC chains
iterations = 7000    # value passed as `iter`
warmup     = 3000    # How many of the first iterations we'll ignore - burnin

In [3]:
# Tuning options forwarded to the sampler through the `control` argument.
control = {
    'max_treedepth': 30,
    'adapt_delta': 0.99,
}

Inference for Stan model: anon_model_04e2ff1a0d956cbb6eafaa1ed143cfa0.
3 chains, each with iter=5000; warmup=2000; thin=1; 
post-warmup draws per chain=3000, total post-warmup draws=9000.

               mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
beta[1]       80.82   19.54  83.16 -78.79  22.69  78.25  137.4 247.39   18.0   1.15
beta[2]       25.84    2.07  12.36   3.16  17.28  25.23  33.77  51.73   36.0   1.08
beta[3]      -54.38    5.14  29.91 -116.9 -73.71 -53.06  -33.7   0.35   34.0   1.09
eta[99]        0.04    0.02    0.3  -0.54  -0.15   0.04   0.25   0.61  302.0   1.01
eta[100]      -0.28  4.5e-3   0.13  -0.52  -0.36  -0.28  -0.19  -0.03  782.0    1.0
eta[101]      -0.51  5.0e-3   0.12  -0.75  -0.59  -0.51  -0.43  -0.27  617.0   1.01


In [None]:
# Compile the Stan program, run the sampler and report how long it took.
start = time.time()

fit = pystan.stan(model_code=stan_code, data=regression_data, seed=seed, iter=iterations, chains=chains, warmup=warmup,
                  n_jobs=jobs, control=control)

end = time.time()

# `print_time` is neither defined nor imported in the visible cells, so calling it
# raised NameError once sampling had finished; the elapsed time is reported inline instead.
elapsed = end - start
print('Sampling took %d min %.1f s' % (elapsed // 60, elapsed % 60))

In [4]:
# NOTE(review): `lines` is not assigned in any visible cell -- confirm where it comes from,
# otherwise this cell fails on a fresh kernel.
print(lines)

[0, 1, 2, 3, 4, 5, 6, 7, 108, 109, 110]


In [5]:
# Show the textual summary of the fit object.
print(fit)

Inference for Stan model: anon_model_04e2ff1a0d956cbb6eafaa1ed143cfa0.
3 chains, each with iter=5000; warmup=2000; thin=1; 
post-warmup draws per chain=3000, total post-warmup draws=9000.

               mean se_mean     sd   2.5%    25%    50%    75%  97.5%  n_eff   Rhat
beta[1]       80.82   19.54  83.16 -78.79  22.69  78.25  137.4 247.39   18.0   1.15
beta[2]       25.84    2.07  12.36   3.16  17.28  25.23  33.77  51.73   36.0   1.08
beta[3]      -54.38    5.14  29.91 -116.9 -73.71 -53.06  -33.7   0.35   34.0   1.09
beta[4]      -15.83    3.66  15.57 -47.01  -26.4 -15.37  -4.85  13.92   18.0   1.15
beta[5]        0.75    0.17   0.72  -0.64   0.24   0.72   1.24    2.2   18.0   1.15
eta[1]        -0.52    0.04   0.27  -1.05  -0.71  -0.53  -0.34  -0.01   46.0   1.06
eta[2]        -0.56    0.02   0.16  -0.88  -0.67  -0.56  -0.45  -0.25   66.0   1.04
eta[3]        -0.24  4.8e-3   0.12  -0.48  -0.32  -0.24  -0.16 4.8e-3  674.0    1.0
eta[4]        -0.14  8.4e-3   0.14  -0.42  -0.24  -0.14

In [None]:
# Render the fit summary as text with 3 digits, then break it into one string per line.
# Relies on pystan.misc._print_stanfit, which is a private helper (leading underscore).
summary_text = str(pystan.misc._print_stanfit(fit, digits_summary=3))
output = summary_text.split('\n')

In [None]:
# Extract `beta` from the fit and keep the first (name, draws) pair as a two-element list.
beta_items = fit.extract(u'beta').items()
posteriors = list(beta_items[0])

In [None]:
# posteriors is [name, draws]; the draws are its second (and last) entry.
betas = posteriors[-1]

In [None]:
# Dimensions of the extracted draws array.
print(betas.shape)

In [None]:
# One 1-D array per regression coefficient, taken column by column from `betas`.
# The model is set up with K = 5 coefficients, so the fifth column (beta4) is
# extracted as well; previously only the first four were.
beta0 = betas[:,0]
beta1 = betas[:,1]
beta2 = betas[:,2]
beta3 = betas[:,3]
beta4 = betas[:,4]

In [None]:
# One kernel-density panel per regression coefficient.
#
# Fixes relative to the previous version of this cell:
#   * the panel count is the number of columns of `betas` (betas.shape[1]); betas.size is
#     the total number of elements in the array, not the number of coefficients;
#   * each panel is addressed through its own Axes object instead of passing a row of
#     `betas` to plt.subplot as the panel index;
#   * panel i shows column i of `betas` instead of always showing beta0.
n_betas = betas.shape[1]
fig, axes = plt.subplots(1, n_betas, figsize=((5*n_betas),7), sharey=True, squeeze=False)

for i, ax in enumerate(axes[0]):            # squeeze=False -> axes is 2-D even for a single panel
    sns.kdeplot(betas[:, i], shade=True, c='#e6550d', ax=ax)
    ax.set_xlabel(r"$\beta_{%d}$" % i, fontsize=25)
    ax.set_ylabel(r"Kernel Density", fontsize=25)
    ax.tick_params(axis='both', labelsize=20)

plt.tight_layout()
plt.savefig('../../../LargeFilesResults/Model/posteriors_noemlines.pdf', dpi=100)
plt.show()

In [None]:
# Four side-by-side kernel-density panels, one each for beta0..beta3, saved as a PDF.
plt.subplots(1,1, figsize=(25,10), sharey=True)

for panel, draws in enumerate((beta0, beta1, beta2, beta3)):
    current_axes = plt.subplot(1, 4, panel + 1)
    if panel == 0:
        plot01 = current_axes                           # handle to the first panel
    sns.kdeplot(draws, shade=True, c='#e6550d')
    plt.xlabel(r"$\beta_{%d}$" % panel, fontsize=25)
    if panel == 0:
        plt.ylabel(r"Kernel Density", fontsize=25)      # y label only on the first panel
    plt.tick_params('both', labelsize='20')

plt.tight_layout()
plt.savefig('./../Model/Results/posterios_3d_grid.pdf', dpi=100)
plt.show()

In [None]:
# Rebind `output` (the list of summary lines) to a NumPy array so it can be sliced and indexed below.
output = np.array(output, copy=True)

In [None]:
# Drop the first 5 and last 6 lines of the printed summary (header and footer),
# then show the size of what is left and the first field of its first row.
new_output = output[5:-6]
print(new_output.shape)
print(new_output.size)
first_row_fields = new_output[0].split()
print(first_row_fields[0])

In [None]:
# Spot checks: the last two rows, then the second field of row 4725 cut at character 6.
print(new_output[-1])
print(new_output[-2])
probe_field = new_output[4725].split()[1]
print(probe_field[0:6])
print(probe_field[6:])

In [None]:
# Report every summary row that splits into fewer than 11 whitespace-separated fields,
# and collect those field counts in `diagnostics`.
diagnostics = []
for i in range(new_output.size):
    fields = new_output[i].split()
    n_fields = len(fields)
    if n_fields >= 11:
        continue
    print('%d %d \n' % (i, n_fields))
    print('%s %d' % (fields, n_fields))
    diagnostics.append(n_fields)
print(np.unique(diagnostics))

In [None]:
# Hand-written replacements for two rows of the summary text.
# NOTE(review): row numbers and values are hard-coded -- presumably they correspond to rows
# flagged by the field-count check for one particular run; confirm before re-using.
manual_rows = (
    (1029, 'etanew[521]  -0.249 0.007  0.3603 -0.968  0.490 -0.253 -6.293e-5 0.4649 2627 1.0024'),
    (1880, 'etanew[1372] -0.090 0.0021 0.1322  0.346 -0.179 -0.090 -1.611e-5 0.1682 3938 1.001'),
)
for row_index, row_text in manual_rows:
    new_output[row_index] = row_text

In [None]:
# Column names of the summary table: line 4 of the printed fit, split on whitespace.
header_line = output[4]
header_fit = header_line.split()
print(header_fit)

In [None]:
# Prepend a name for the leading column of the table.
# Note: header_fit is rebuilt from itself, so re-running this cell prepends the name again.
header_addendum = 'parameter'
header_fit = [header_addendum] + list(header_fit)
print(header_fit)

In [None]:
# Assemble the summary table: the header row first, then one 11-field row per summary line.
#
# Rows are collected in a list and stacked once at the end. Calling np.vstack inside the
# loop copied the whole table on every iteration (quadratic in the number of rows), and
# left `new_data` as a plain list when no row qualified.
table_rows = [header_fit]
for i in range(new_output.size):
    fields = new_output[i].split()
    if len(fields) != 11:       # the length of the list must be 11; anything else is reported and skipped
        print("there is a problem!")
    else:
        table_rows.append(fields)
new_data = np.vstack(table_rows)

In [None]:
import pandas as pd

In [None]:
# Save the assembled summary table as CSV. header=False / index=False: no pandas-generated
# labels are written; the first row of new_data already carries the column names.
new_dataframe = pd.DataFrame(new_data)
new_dataframe.to_csv(
    '../../../LargeFilesResults/Model/fit_results_3d_grid.csv',
    sep=',',
    header=False,
    index=False,
)

In [None]:
# Size of the assembled table and its last row.
print(new_data.shape)
print(new_data[-1])

In [None]:
# Collect the posterior draws of every coefficient under the keys 'beta0', 'beta1', ...
#
# The draws are read from posteriors[1] (the array obtained from fit.extract) and every
# column is included, so the fifth coefficient of the K = 5 model is no longer left out.
# Note that `betas` is rebound here from the draws array to a dict.
beta_draws = posteriors[1]
betas = {'beta%d' % k: beta_draws[:, k] for k in range(beta_draws.shape[1])}

In [None]:
# Write the per-coefficient draws to CSV, one column per key of `betas`.
betas_dataframe = pd.DataFrame(betas)
betas_dataframe.to_csv(
    '../../../LargeFilesResults/Model/betas_3d_grid.csv',
    sep=',',
    header=True,
    index=False,
)

In [None]:
# Extract `pnew` from the fit and keep the first (name, draws) pair as a two-element list.
pnew_items = fit.extract(u'pnew').items()
pnew = list(pnew_items[0])

In [None]:
# Gather the grid coordinates and the predicted probabilities into one dict of 1-D arrays.
model_results = {}
# NOTE(review): presumably pnew[1] is (draws, grid points), in which case [0] keeps only the
# first extracted draw rather than a posterior summary (mean/median) -- confirm this is intended.
model_results['pnew'] = pnew[1][0]
# reshape(-1) flattens whatever grid size is in use instead of assuming exactly 2500 points.
model_results['redshift'] = plot_x1.reshape(-1)
model_results['stellar_mass'] = plot_x2.reshape(-1)

In [None]:
# Compare the length of the stored probabilities with the shapes of the grid columns.
print(model_results['pnew'].shape)
print(plot_x1.shape)
print(plot_x2.shape)

In [None]:
# Write the grid coordinates and predicted probabilities to CSV with a header row.
model_dataframe = pd.DataFrame(model_results)
model_dataframe.to_csv(
    '../../../LargeFilesResults/Model/model_noemlines.csv',
    sep=',',
    header=True,
    index=False,
)