# DARPA-ARC Notebook 1: Behavior

## Demographics

In [1]:
from my_settings import df

## Summarize participant age (mean, standard deviation).
ages = df.Age
print(ages.mean(), ages.std())

## Tabulate genders twice: first for the full sample, then for the rows
## that remain after dropping those flagged in the `Exlude` column.
retained = df[~df.Exlude]
for sample in (df, retained):
    print(sample.Gender.value_counts())

33.94444444444444 8.799170235050394
M    23
F    13
Name: Gender, dtype: int64
M    18
F    10
Name: Gender, dtype: int64


## Assemble Behavior Data

In [3]:
from my_settings import op, version, root_dir, mri_dir, task, subjects, read_csv, np, modality, paradigm, session
from pandas import concat

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
### Define parameters.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

## Response times above this threshold (seconds) are set to missing below.
## A falsy value (0, None, False) switches the correction off.
scanner_fix = 2.95

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
### Load and assemble data.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

## One behavior file per subject.
behavior_files = [op.join(root_dir, 'behavior', '%s_%s_%s_%s-%i' % (subject, task, modality, paradigm, session))
                  for subject in subjects]

## Read each file and tag every row with the subject it came from.
## (No trials are removed at this stage.)
tables = [read_csv(path).assign(Subject=subject)
          for subject, path in zip(subjects, behavior_files)]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
## Merge and preprocess.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

## Stack all subjects, keep only the columns of interest, renumber the rows.
columns = ['Subject', 'Trial', 'RiskType', 'RewardType', 'Reward', 'ResponseType', 
           'ResponseTime','RiskOnset','ShockOnset']
stacked = concat(tables, sort=False)
df = stacked[columns].reset_index(drop=True)

## Responses coded 99 are recoded as missing.
no_response = df.ResponseType == 99
df.loc[no_response, 'ResponseType'] = np.nan

## Due to code-scanner issues, reaction times greater than `scanner_fix`
## are blanked out here so that they can be imputed later.
if scanner_fix:
    too_slow = df.ResponseTime > scanner_fix
    df.loc[too_slow, 'ResponseTime'] = np.nan

## Save.
out_file = op.join(root_dir, 'stan_results/%s_%s_%s.csv' % (task, modality, version))
df.to_csv(out_file, index=False)
print('Done.')

Done.


## Bayesian Modeling

### Run Models with Stan
See [here](https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations) for a discussion of how to choose priors for logistic regression coefficients. Namely:
>Assuming that nonbinary variables have been scaled to have mean 0 and standard deviation 0.5, [Gelman et al (2008)](https://arxiv.org/pdf/0901.4011.pdf) recommended *student_t(1,0,2.5)*, i.e. Cauchy distribution. Later it has been observed that this has too thick tails, so that in cases where data is not informative (e.g. in case of separation) the sampling from the posterior is challenging (see e.g. [Ghosh et al, 2015](http://arxiv.org/abs/1507.07170)). Thus Student's t distribution with higher degrees of freedom is recommended. There is not yet conclusive results what specific value should be recommended, and thus the current recommendation is to choose 3<nu<7. 

>Normal distribution is not recommended as a weakly informative prior, because it is not robust (see [O'Hagan (1979)](https://www.jstor.org/stable/2985064)). Normal distribution would be fine as an informative prior.



In [2]:
from my_settings import os, op, np, read_csv, version, root_dir, mri_dir, task, subjects, stan_models, DataFrame, pickle, modality
import pystan
from pandas import get_dummies

def zscore(arr):
    """Standardize `arr` to zero mean and unit standard deviation, ignoring NaNs.

    The mean and standard deviation (NumPy's default, ddof=0) are computed
    over the non-NaN entries of the whole array; NaN entries stay NaN.
    """
    centered = arr - np.nanmean(arr)
    return centered / np.nanstd(arr)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
## Specify parameters.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

## Model parameters
interactions = False    # include risk x reward interaction columns in X
nu = 5                  # NOTE(review): not passed to Stan by this cell; presumably mirrors the Student-t df in the model files

## Sampling parameters.
chains = 4
samples = 2000
thin = 4
seed = 47404
n_jobs = 2

## Number of retained draws. The division by 2 presumably reflects PyStan's
## default of discarding the first half of each chain as warmup -- TODO confirm.
print('n_samples: %s' %(chains * samples / 2 / thin))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
## Assemble data for model.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

## Load data. Trials without a response are dropped.
df = read_csv(op.join(root_dir, 'stan_results/%s_%s_%s.csv' % (task, modality, version)))
df = df[~np.isnan(df.ResponseType)].reset_index(drop=True)

## Make subject index (shifted to start at 1).
subjects, ix = np.unique(df.Subject, return_inverse=True)        
ix += 1

## Make missing data index: 0 where the response time is observed,
## and a running count 1, 2, ... over the trials where it is missing.
mi = df.ResponseTime.isnull().astype(int)
mi *= np.cumsum(mi)

## Assemble independent variables. RiskType is assumed to have exactly three
## levels; the first dummy column is discarded (reference level).
_, med_risk, high_risk = get_dummies(df.RiskType).values.T 
risk = np.vstack([med_risk, high_risk])
intercept = np.ones_like(med_risk)
reward = df.Reward.values                    

## Columns of X: intercept, medium risk, high risk, reward
## (plus medium x reward and high x reward when `interactions` is set).
if interactions: X = np.vstack([intercept, risk, reward, risk*reward]).T 
else: X = np.vstack([intercept, risk, reward]).T  

## Assemble dependent variables.
N = df.ResponseType.values.astype(int)
## Missing response times are flagged with the sentinel 99. np.where builds a
## new array, so `df` is not modified through the `.values` view (and this
## also works when that view is read-only).
observed_rt = df.ResponseTime.values
Z = np.where(np.isnan(observed_rt), 99, observed_rt)

## Z-score variables.
## Cast to float first: if X has an integer dtype (e.g. integer-valued
## rewards), assigning z-scores into a plain copy would silently truncate them.
zX = X.astype(float)
if not interactions: 
    zX[:,-1] = zscore(X[:,-1])
else:
    zX[:,3] = zscore(zX[:,3])
    zX[:,4] = zX[:,1] * zX[:,3]
    zX[:,5] = zX[:,2] * zX[:,3]

## Assemble metadata.
n_obs, n_pred = X.shape
n_subj = subjects.shape[0]
## Count the missing response times (Series.nonzero is gone from modern pandas).
n_miss = int(np.count_nonzero(mi))

## Assemble data.
data = dict(n_obs=n_obs, n_pred=n_pred, n_subj=n_subj, n_miss=n_miss, ix=ix, mi=mi, X=zX, N=N, Z=Z)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
## Perform Bayesian modeling.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

## Fit every model in `stan_models` to the same data dictionary and store
## a summary table, the posterior draws, and the pointwise log-likelihood.
for model_name in stan_models:
    print('Running Stan.')
    model_file = op.join(root_dir, 'stan_models/%s_%s.txt' % (task, model_name))
    fit = pystan.stan(file=model_file, data=data, chains=chains, iter=samples,
                      thin=thin, seed=seed, n_jobs=n_jobs)
    print('Finished.')
    #
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    ## Save summary file.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    #
    ## Wrap the summary matrix in a labelled table before writing it out.
    fit_summary = fit.summary()
    summary = DataFrame(fit_summary['summary'],
                        index=fit_summary['summary_rownames'],
                        columns=fit_summary['summary_colnames'])
    summary_file = op.join(root_dir, 'stan_results/%s_%s_%s.csv' % (version, task, model_name))
    summary.to_csv(summary_file)
    #
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    ## Extract Results.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    #
    results = fit.extract()
    #
    ## Store the model inputs next to the posterior draws so that later
    ## cells can work from the pickle alone.
    results.update(Subjects=subjects, X=X, zX=zX, N=N, Z=Z, ix=ix,
                   RiskOnset=df.RiskOnset.values)
    #
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    ## Save results.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    print('Saving data.')
    #
    ## Save all data.
    pickle_file = op.join(root_dir, 'stan_results/%s_%s_%s.pickle' % (version, task, model_name))
    with open(pickle_file, 'wb') as handle:
        pickle.dump(results, handle)
    #
    ## Save log-likelihood for R (log of the 'PointPosteriors' draws).
    loglik_file = op.join(root_dir, 'stan_results/%s_%s_%s_loglik.txt' % (version, task, model_name))
    loglik = np.log(results[u'PointPosteriors'])
    np.savetxt(loglik_file, loglik)

print('Done.')

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_5c4f3351ab7915a2bb1ad9dfc1b9ea82 NOW.


n_samples: 1000.0
Running Stan.
Finished.
Saving data.


INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_7eb53f59ac79bdaef2b52b436f2d5e6e NOW.


Running Stan.
Finished.
Saving data.
Done.


### Posterior Predictive Checks

In [5]:
from my_settings import os, op, np, read_csv, version, root_dir, mri_dir, task, subjects, stan_models, plt, DataFrame, pickle

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
### Define parameters.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

ncol = 4     # panels per row in the per-subject figure grids
decim = 4    # only every `decim`-th simulated reaction-time draw is plotted

def _density_curve(values, bins=5):
    """Return (bin centres, density) for a normalized histogram of `values`.

    np.histogram returns bins + 1 edges; the density of each bin is reported
    at the midpoint of its two edges.
    """
    density, edges = np.histogram(values, bins=bins, density=True)
    centres = 0.5 * (edges[:-1] + edges[1:])
    return centres, density

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
### Load data.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

## Open results file.
for model_name in stan_models:
    print('Loading data for %s.' % model_name)
    ## NOTE: pickle.load executes arbitrary code; only open files written by this project.
    results_file = op.join(root_dir, 'stan_results/%s_%s_%s.pickle' % (version, task, model_name))
    with open(results_file, 'rb') as handle: results = pickle.load(handle)
    #
    ## Extract variables. The stored subject index starts at 1; shift it to 0.
    subjects = results['Subjects']
    ix = results['ix'] - 1
    n_subjects = ix.max() + 1
    nrow = int(np.ceil(n_subjects / ncol))
    #
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    ### Evaluate fit to choice data.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#   
    #
    print('Evaluating choice data.')
    #
    ## Compute true ratio of takes.
    ratio_obs = np.array([results['N'][ix==i].mean() for i in range(n_subjects)])
    #
    ## Generate simulations of take/no-take: one Bernoulli draw per entry of
    ## theta. np.random.binomial broadcasts over the probability array, so no
    ## element-wise Python wrapper is needed.
    np.random.seed(47404)
    sN = np.random.binomial(1, results['theta'])
    #
    ## Compute ratio take from simulations (one value per subject and draw).
    ratio_sim = np.array([sN[:,ix==i].mean(axis=1) for i in range(n_subjects)])
    #
    ## Compute difference between observed and median of distribution.
    ppc = ratio_obs - np.median(ratio_sim, axis=1)
    #
    ## Compute group statistic (root mean square of the differences).
    rms_ppc = np.sqrt(np.power(ppc,2).mean())
    #
    ## Initialize plots. squeeze=False keeps `axes` two-dimensional even when
    ## there is a single row of panels.
    ## NOTE(review): matplotlib's figsize is (width, height); here the width
    ## scales with nrow and the height with ncol -- confirm this is intended.
    fig, axes = plt.subplots(nrow,ncol,figsize=(nrow*3,ncol*4),sharey=True,squeeze=False)
    #
    for n, subject in enumerate(subjects):
        #
        ax = axes[n // ncol, n % ncol]
        ax.hist(ratio_sim[n], bins=8, color='#7ec0ee')
        ax.vlines(ratio_obs[n], 0, 500, linewidth=2, linestyle='--')
        ax.set_title(subject.upper())
        x1, x2 = ax.get_xlim()
        xticks = np.linspace(x1,x2,3).round(2)
        ax.set_xticks(xticks)
    #
    plt.suptitle('PPC = %0.3f' %rms_ppc, y=0.99)
    plt.tight_layout()
    plt.savefig(op.join(root_dir, 'plots/ppc/%s_%s_%s_ppc_choice.png' % (version, task, model_name)))
    plt.close('all')
    #
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    ### Evaluate fit to reaction time data.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#  
    #
    print('Evaluating reaction time data.')
    #
    ## Extract and provide imputations data: entries equal to the sentinel 99
    ## are replaced by the mean over draws of the imputed values 'Zm'.
    Z = results['Z'].copy()
    Z[np.where(Z==99)] = results['Zm'].mean(axis=0)
    #
    ## Initialize plots.
    fig, axes = plt.subplots(nrow,ncol,figsize=(nrow*3,ncol*4),squeeze=False)
    #
    for n, subject in enumerate(subjects):
        #
        ax = axes[n // ncol, n % ncol]
        #
        ## Extract info. Models whose name contains 'non-' use the single
        ## 'a1_mu' parameter instead of a per-subject 'a1' column.
        a0 = results['a0'][:,n]
        if 'non-' in model_name: a1 = results['a1_mu']
        else: a1 = results['a1'][:,n]
        shape = results['shape'][:,n]
        ddb = results['ddb'][:,ix==n]
        #
        ## Calculate gamma parameters (mean = shape * scale).
        mu = a0 + (a1 * ddb.T) 
        scale = mu / shape
        #
        ## Simulate reaction times: one gamma draw per (trial, posterior draw).
        ## The seed is reset for every subject, as before.
        np.random.seed(47404)
        rt_sim = np.random.gamma(shape, scale)
        #
        ## Extract observed reaction times.
        rt_obs = Z[ix==n]
        #
        ## Plot observed.
        x, density = _density_curve(rt_obs)
        ax.plot(x,density,linewidth=3,color='#7ec0ee')
        ax.set_title(subject.upper())
        ax.set_xlim(0,5)
        #
        ## Plot simulated (a faint line for every `decim`-th posterior draw).
        for arr in rt_sim.T[::decim]:
            x, density = _density_curve(arr)
            ax.plot(x,density,color='k',alpha=0.01)
    #
    plt.tight_layout()
    ## NOTE(review): unlike the choice figure, this file name omits `task` -- confirm.
    plt.savefig(op.join(root_dir, 'plots/ppc/%s_%s_ppc_rt.png' % (version, model_name)))
    plt.close('all')
    print('Done.')

Loading data for arc_hierarchical.
Evaluating choice data.
Evaluating reaction time data.
Done.
Loading data for arc_non-hierarchical.
Evaluating choice data.
Evaluating reaction time data.
Done.


### Model Comparison: Compute WAIC / CV-LOO
Performed in R. See other script.

### Assemble model outputs for fMRI Analysis

In [4]:
from my_settings import os, op, np, read_csv, version, root_dir, mri_dir, task, subjects, stan_models, categories, merge_on, DataFrame, pickle, modality

for model_name in stan_models:
    #
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    ### Load data.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    #
    ## Read back the pickled results of this model.
    print('Loading data.')
    pickle_file = op.join(root_dir, 'stan_results/%s_%s_%s.pickle' % (version, task, model_name))
    with open(pickle_file, 'rb') as handle:
        results = pickle.load(handle)
    #
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    ### Build dataframe.
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    #
    print('Assembling dataframe.')
    #
    ## Subject label for every trial (the stored index starts at 1).
    ix = results['ix'] - 1
    trial_subjects = results['Subjects'][ix]
    #
    ## Recover the risk code from the dummy columns of X:
    ## column 1 set -> 2, column 2 set -> 3, neither -> 1.
    design = results['X']
    is_second_level = design[:,1] == 1
    is_third_level = design[:,2] == 1
    risk_type = np.where(is_second_level, 2, np.where(is_third_level, 3, 1))
    #
    ## Posterior median of theta per trial, and the derived 'ddb' regressor,
    ## which is largest (0.25) when theta equals 0.5.
    theta = np.median(results['theta'], axis=0)
    ddb = 0.25 - np.power(theta - 0.5, 2)
    #
    ## Reaction times, with the sentinel 99 replaced by the mean over draws
    ## of the imputed values 'Zm'.
    Z = results['Z'].copy()
    Z[Z == 99] = results['Zm'].mean(axis=0)
    #
    ## Assemble dataframe.
    regressors = DataFrame({
        'Subject': trial_subjects,
        'RiskType': risk_type,
        'Reward': design[:,3],
        'theta': theta,
        'ddb': ddb,
        'RT': Z,
        'RiskOnset': results['RiskOnset'],
    })
    #
    ## Merge with original file. Its ResponseTime column is dropped first;
    ## the outer join keeps rows that appear in only one of the two tables.
    csv = read_csv(op.join(root_dir, 'stan_results/%s_%s_%s.csv' % (task, modality, version)))
    csv = csv.drop('ResponseTime', axis=1)
    #
    df = (regressors.merge(csv, how='outer', on=merge_on)
                    .sort_values(['Subject','Trial'])
                    .reset_index(drop=True))
    df = df[categories]
    #
    ## Save.
    print('Saving dataframe.')
    out_file = op.join(root_dir, 'stan_results/%s_%s_%s_regressors.csv' % (task, model_name, version))
    df.to_csv(out_file, index=False)

print('Done.')

Loading data.
Assembling dataframe.
Saving dataframe.
Loading data.
Assembling dataframe.
Saving dataframe.
Done.
