In [9]:
#import packages
import pandas as pd
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [10]:
# Extend the import path so the project's analysis code can be imported.
# The entry is relative, so it resolves against the kernel's current working directory.
import sys
sys.path.append('../our_analysis')

# Import the model-fitting entry points from our own module (hybrid_mle).
from hybrid_mle import fit_hybrid_mixed_model, fit_hybrid_mixed_dynamic_model, fit_hybrid_mixed_fixedAlpha_model, fit_hybrid_mixed_fixedBeta_model


In [11]:
# Record the kernel's current working directory.
# Note: this only reads the directory; it does not change it.
notebook_dir = os.getcwd()

In [12]:
# Load the trial-level dataset (path is relative to the working directory).
df = pd.read_csv("../dataset/all_subjects.csv")  # our dataset
# df = pd.read_csv("../dataset/beh_noslow.csv")  # alternative: the paper's dataset, used to replicate the paper

# Notes about using all_subjects:
# - If you get errors you may need to rename the columns of all_subjects, because the
#   original paper dataset (beh_noslow.csv) and our dataset (all_subjects) capitalize
#   their headers slightly differently.
# - If you get additional errors, filter the NaNs out of the frame before sending it
#   to the processing function.
# - MLE takes time: the first simple run was quite fast (~10 min), but expect longer
#   depending on your CPU and the complexity of the model you're running.

# Keep only the trials that are not flagged as slow.
df = df.loc[df['slow'] == 0]

# The choice and state columns need to be integers.
df = df.astype({'choice1': int, 'choice2': int, 'final_state': int})

# One frame per experimental condition.
story_trials = df.loc[df["condition"] == "story"]
abstract_trials = df.loc[df["condition"] == "abstract"]

# Fit Hybrid Model
We're currently fitting with the provided Stan model. I made modifications so that we can pass DataFrames directly instead of raw CSV files.

We use a mixed-effects approach to estimate the weight `w`, allowing it to vary across participants. This is not strictly necessary, as we could treat all effects as fixed (the pset is unclear about what is really being asked for here), but it adds nuance to the analysis.

Note that we likely won't be very close to the table reported in the paper, as the parameters there were estimated individually and then pooled across all participants.

We can rerun this to get results closer to the paper by adjusting the priors in the Stan model. Look at the upper/lower parameter bounds in the Stan file (sdn_hw2/our_analysis/hybrid_mixed.stan). We want a reasonably wide range for the parameter search, but we can bring the bounds a bit closer to the values in the paper. **TODO (anyone):** check the paper's supplemental material to see whether it notes what the priors were.

In [15]:
# Fit the basic hybrid model to each condition separately, using the same Stan file.
# Each call is unpacked into a results frame, a `params` value and a third value that is
# stored as the fit's log-likelihood (presumably what the function returns -- verify in hybrid_mle).
# Note: `params` from the story fit is overwritten by the abstract fit below.
story_results_df, params, logli_story_basic = fit_hybrid_mixed_model(
    data_df=story_trials,
    stan_file="../our_analysis/hybrid_mixed.stan",
    output_file="story_params.csv"
)

abstract_results_df, params, logli_abstract_basic = fit_hybrid_mixed_model(
    data_df=abstract_trials,
    stan_file="../our_analysis/hybrid_mixed.stan",
    output_file="abstract_params.csv"
)

12:49:54 - cmdstanpy - INFO - compiling stan file C:\Users\Sarah\Documents\GitHub\sdn_hw2\our_analysis\hybrid_mixed.stan to exe file C:\Users\Sarah\Documents\GitHub\sdn_hw2\our_analysis\hybrid_mixed.exe


ValueError: Failed to compile Stan model 'C:\Users\Sarah\Documents\GitHub\sdn_hw2\our_analysis\hybrid_mixed.stan'. Console:
make: *** No rule to make target 'C:/Users/Sarah/Documents/GitHub/sdn_hw2/our_analysis/hybrid_mixed.exe'.  Stop.

Command ['make', 'STANCFLAGS+=--filename-in-msg=hybrid_mixed.stan', 'C:/Users/Sarah/Documents/GitHub/sdn_hw2/our_analysis/hybrid_mixed.exe']
	error during processing No such file or directory


In [16]:
# Store the results in a clean table we can use for the assignment:
# one row per condition, holding the first row's non-`w` columns plus the mean and
# standard deviation of `w` across rows, and the fit's log-likelihood.
#
# The loop variables are deliberately NOT called `df` / `params`: reusing those names
# would overwrite the trial dataset `df` and the `params` returned by the fit cell.
results = []

for fit_df, fit_logli in (
    (story_results_df, logli_story_basic),
    (abstract_results_df, logli_abstract_basic),
):
    # Every column except `w` is read from the first row only.
    summary_row = fit_df.drop(columns='w').iloc[0].to_dict()
    summary_row['w_mean'] = fit_df['w'].mean()
    summary_row['w_std'] = fit_df['w'].std()
    summary_row['logli'] = fit_logli
    results.append(summary_row)

results_df = pd.DataFrame(results)

# Save the table as a CSV. It keeps the participant field, which we can drop in later formatting.
# NOTE(review): the file name says "abstract" but the table holds both the story and
# the abstract row -- consider renaming once nothing depends on this path.
results_df.to_csv("hybrid_model_params_abstract.csv", index=False)



NameError: name 'story_results_df' is not defined

In [None]:
# Preview the first rows of the summary table.
results_df.head()


Unnamed: 0,participant,condition,alpha1,alpha2,lmbd,beta1,beta2,p,w_mean,w_std,logli
0,1,story,0.010083,0.859748,0.862889,7.93245,2.00154,0.042903,0.472224,0.348867,-7483.01
1,5,abstract,0.160292,0.497685,0.449943,6.07055,2.99636,0.151726,0.571508,0.315155,-7160.51


# Let's do some ablation:

Here we start fitting with some parameters held fixed. To do this I added a modified fit function that lets us fix chosen parameters to preset values. This should be enough to finish the next few parts.

In [36]:
# Parameter names, for reference:
# PARAM_NAMES = ('alpha1', 'alpha2', 'lmbd', 'beta1', 'beta2', 'p', 'w')


# Use the parameter names above. Only list the parameters you are FIXING,
# each mapped to the value you are fixing it to.
fixed_param = {
    'lmbd': 1
}

# Fit both conditions with `lmbd` fixed to 1.
# Note: `params` from the story fit is overwritten by the abstract fit.
lfix_story_results_df, params, logli_story = fit_hybrid_mixed_dynamic_model(
    data_df=story_trials, # note the trial type
    stan_file="../our_analysis/dynamic_hybrid_mixed.stan",
    output_file="story_params_l1.csv",   # change the output csv name so you don't overwrite your work
    fixed_params = fixed_param,
    return_logli = True
)

lfix_abstract_results_df, params, logli_abstract = fit_hybrid_mixed_dynamic_model(
    data_df=abstract_trials, # note the trial type
    stan_file="../our_analysis/dynamic_hybrid_mixed.stan",
    output_file="abstract_params_l1.csv",   # change the output csv name so you don't overwrite your work
    fixed_params = fixed_param,
    return_logli = True
)


23:24:55 - cmdstanpy - INFO - Chain [1] start processing
23:24:59 - cmdstanpy - INFO - Chain [1] done processing
23:24:59 - cmdstanpy - INFO - Chain [1] start processing
23:25:02 - cmdstanpy - INFO - Chain [1] done processing
23:25:02 - cmdstanpy - INFO - Chain [1] start processing
23:25:05 - cmdstanpy - INFO - Chain [1] done processing
23:25:05 - cmdstanpy - INFO - Chain [1] start processing
23:25:08 - cmdstanpy - INFO - Chain [1] done processing
23:25:08 - cmdstanpy - INFO - Chain [1] start processing
23:25:11 - cmdstanpy - INFO - Chain [1] done processing
23:25:11 - cmdstanpy - INFO - Chain [1] start processing
23:25:15 - cmdstanpy - INFO - Chain [1] done processing
23:25:15 - cmdstanpy - INFO - Chain [1] start processing
23:25:18 - cmdstanpy - INFO - Chain [1] done processing
23:25:18 - cmdstanpy - INFO - Chain [1] start processing
23:25:23 - cmdstanpy - INFO - Chain [1] done processing
23:25:23 - cmdstanpy - INFO - Chain [1] start processing
23:25:26 - cmdstanpy - INFO - Chain [1]

Optimized parameters:
OrderedDict([('lp__', -7481.09), ('alpha1_free[1]', 0.00753296), ('alpha2_free[1]', 0.859634), ('beta1_free[1]', 8.78983), ('beta2_free[1]', 2.00173), ('p_free[1]', 0.03901), ('w_free[1]', 0.999863), ('w_free[2]', 0.326662), ('w_free[3]', 0.959254), ('w_free[4]', 0.166404), ('w_free[5]', 0.44218), ('w_free[6]', 0.999857), ('w_free[7]', 0.271942), ('w_free[8]', 0.684072), ('w_free[9]', 0.432946), ('w_free[10]', 0.943569), ('w_free[11]', 0.575632), ('w_free[12]', 0.185833), ('w_free[13]', 0.236582), ('w_free[14]', 0.439239), ('w_free[15]', 0.088396), ('w_free[16]', 0.645006), ('w_free[17]', 0.452008), ('w_free[18]', 0.307097), ('w_free[19]', 0.000291329), ('w_free[20]', 0.00814565), ('w_free[21]', 0.8659), ('w_free[22]', 0.437793), ('w_free[23]', 0.325304), ('w_free[24]', 9.0745e-06), ('w_free[25]', 0.354187), ('w_free[26]', 0.55265), ('w_free[27]', 0.755806), ('w_free[28]', 7.82674e-05), ('w_free[29]', 0.200375), ('w_free[30]', 0.999267), ('w_free[31]', 3.24397e-05

00:48:07 - cmdstanpy - INFO - Chain [1] start processing
00:48:09 - cmdstanpy - INFO - Chain [1] done processing
00:48:09 - cmdstanpy - INFO - Chain [1] start processing
00:48:11 - cmdstanpy - INFO - Chain [1] done processing
00:48:11 - cmdstanpy - INFO - Chain [1] start processing
00:48:13 - cmdstanpy - INFO - Chain [1] done processing
00:48:13 - cmdstanpy - INFO - Chain [1] start processing
00:48:17 - cmdstanpy - INFO - Chain [1] done processing
00:48:17 - cmdstanpy - INFO - Chain [1] start processing
00:48:19 - cmdstanpy - INFO - Chain [1] done processing
00:48:19 - cmdstanpy - INFO - Chain [1] start processing
00:48:22 - cmdstanpy - INFO - Chain [1] done processing
00:48:22 - cmdstanpy - INFO - Chain [1] start processing
00:48:25 - cmdstanpy - INFO - Chain [1] done processing
00:48:25 - cmdstanpy - INFO - Chain [1] start processing
00:48:28 - cmdstanpy - INFO - Chain [1] done processing
00:48:28 - cmdstanpy - INFO - Chain [1] start processing
00:48:30 - cmdstanpy - INFO - Chain [1]

Optimized parameters:
OrderedDict([('lp__', -7176.87), ('alpha1_free[1]', 0.0790916), ('alpha2_free[1]', 0.501271), ('beta1_free[1]', 5.86149), ('beta2_free[1]', 2.98553), ('p_free[1]', 0.16042), ('w_free[1]', 0.999944), ('w_free[2]', 0.0035422), ('w_free[3]', 0.999956), ('w_free[4]', 0.525829), ('w_free[5]', 0.488169), ('w_free[6]', 0.523792), ('w_free[7]', 0.567041), ('w_free[8]', 0.999941), ('w_free[9]', 0.502274), ('w_free[10]', 0.27372), ('w_free[11]', 0.807923), ('w_free[12]', 0.675336), ('w_free[13]', 0.179527), ('w_free[14]', 0.143424), ('w_free[15]', 0.851555), ('w_free[16]', 0.301968), ('w_free[17]', 0.436049), ('w_free[18]', 0.62676), ('w_free[19]', 0.451639), ('w_free[20]', 0.999894), ('w_free[21]', 0.875674), ('w_free[22]', 0.391449), ('w_free[23]', 0.502863), ('w_free[24]', 0.00154378), ('w_free[25]', 0.470933), ('w_free[26]', 0.670427), ('w_free[27]', 0.26635), ('w_free[28]', 0.92394), ('w_free[29]', 0.82599), ('w_free[30]', 0.755405), ('w_free[31]', 0.999735), ('w_free[

In [37]:
# Preview the first rows of the story-condition fit with `lmbd` fixed.
lfix_story_results_df.head()
# TODO: save these results as a CSV, and record the log-likelihood from the fit above as well.

Unnamed: 0,participant,condition,alpha1,alpha2,lmbd,beta1,beta2,p,w
0,1,story,0.007533,0.859634,1,8.78983,2.00173,0.03901,0.999863
1,2,story,0.007533,0.859634,1,8.78983,2.00173,0.03901,0.326662
2,3,story,0.007533,0.859634,1,8.78983,2.00173,0.03901,0.959254
3,7,story,0.007533,0.859634,1,8.78983,2.00173,0.03901,0.166404
4,10,story,0.007533,0.859634,1,8.78983,2.00173,0.03901,0.44218


In [38]:
# Preview the first rows of the abstract-condition fit with `lmbd` fixed.
lfix_abstract_results_df.head()

Unnamed: 0,participant,condition,alpha1,alpha2,lmbd,beta1,beta2,p,w
0,5,abstract,0.079092,0.501271,1,5.86149,2.98553,0.16042,0.999944
1,6,abstract,0.079092,0.501271,1,5.86149,2.98553,0.16042,0.003542
2,8,abstract,0.079092,0.501271,1,5.86149,2.98553,0.16042,0.999956
3,9,abstract,0.079092,0.501271,1,5.86149,2.98553,0.16042,0.525829
4,11,abstract,0.079092,0.501271,1,5.86149,2.98553,0.16042,0.488169


In [39]:
# Store the lambda-fixed results in a clean table we can use for the assignment:
# one row per condition, holding the first row's non-`w` columns plus the mean and
# standard deviation of `w` across rows, and the fit's log-likelihood (so the saved
# table has the same columns as the basic-model summary).
#
# The loop variables are deliberately NOT called `df` / `params`: reusing those names
# would overwrite the trial dataset `df` and the `params` returned by the fit cell.
results = []

for fit_df, fit_logli in (
    (lfix_story_results_df, logli_story),
    (lfix_abstract_results_df, logli_abstract),
):
    # Every column except `w` is read from the first row only.
    summary_row = fit_df.drop(columns='w').iloc[0].to_dict()
    summary_row['w_mean'] = fit_df['w'].mean()
    summary_row['w_std'] = fit_df['w'].std()
    summary_row['logli'] = fit_logli
    results.append(summary_row)

results_df = pd.DataFrame(results)

# Save the table as a CSV. It keeps the participant field, which we can drop in later formatting.
results_df.to_csv("hybrid_model_params_fix_l1.csv", index=False)

print(logli_story, logli_abstract)

-7481.09 -7176.87


# Fixing P

In [None]:
# Parameter names, for reference:
# PARAM_NAMES = ('alpha1', 'alpha2', 'lmbd', 'beta1', 'beta2', 'p', 'w')


# Use the parameter names above. Only list the parameters you are FIXING,
# each mapped to the value you are fixing it to.
fixed_param = {
    'p':0
}

# Fit both conditions with `p` fixed to 0.
# Note: this rebinds `logli_story` / `logli_abstract`, and `params` from the story fit
# is overwritten by the abstract fit.
fixp_story_results_df, params, logli_story = fit_hybrid_mixed_dynamic_model(
    data_df=story_trials, # note the trial type
    stan_file="../our_analysis/dynamic_hybrid_mixed.stan",
    output_file="story_params_p0.csv",   # change the output csv name so you don't overwrite your work
    fixed_params = fixed_param,
    return_logli = True
)

fixp_abstract_results_df, params, logli_abstract = fit_hybrid_mixed_dynamic_model(
    data_df=abstract_trials, # note the trial type
    stan_file="../our_analysis/dynamic_hybrid_mixed.stan",
    output_file="abstract_params_p0.csv",   # change the output csv name so you don't overwrite your work
    fixed_params = fixed_param,
    return_logli = True
)


In [None]:
# Point the generic result names at the p-fixed fits so the cells below (which read
# `story_results_df` / `abstract_results_df`) show and summarise the p = 0 model.
# The assignment previously ran the other way round, which on a top-to-bottom run
# replaced the freshly fitted p-fixed frames with the basic-model frames.
# NOTE(review): after this cell the generic names no longer refer to the basic-model
# fits -- confirm nothing further down still expects them to.
story_results_df = fixp_story_results_df
abstract_results_df = fixp_abstract_results_df

In [None]:
# Preview the first rows of story_results_df.
story_results_df.head()
# TODO: save these results as a CSV, and record the log-likelihood from the fit above as well.

Unnamed: 0,participant,condition,alpha1,alpha2,lmbd,beta1,beta2,p,w
0,1,story,0.011795,0.841689,0.79269,8.72079,2.03577,0,0.999942
1,2,story,0.011795,0.841689,0.79269,8.72079,2.03577,0,0.357561
2,3,story,0.011795,0.841689,0.79269,8.72079,2.03577,0,0.999364
3,7,story,0.011795,0.841689,0.79269,8.72079,2.03577,0,0.194911
4,10,story,0.011795,0.841689,0.79269,8.72079,2.03577,0,0.510715


In [None]:
# Preview the first rows of abstract_results_df.
abstract_results_df.head()

Unnamed: 0,participant,condition,alpha1,alpha2,lmbd,beta1,beta2,p,w
0,5,abstract,0.165608,0.45782,0.449007,8.32262,3.12165,0,0.999995
1,6,abstract,0.165608,0.45782,0.449007,8.32262,3.12165,0,0.000426
2,8,abstract,0.165608,0.45782,0.449007,8.32262,3.12165,0,0.999987
3,9,abstract,0.165608,0.45782,0.449007,8.32262,3.12165,0,0.375395
4,11,abstract,0.165608,0.45782,0.449007,8.32262,3.12165,0,0.403804


In [None]:
# Store the p-fixed results in a clean table we can use for the assignment:
# one row per condition, holding the first row's non-`w` columns plus the mean and
# standard deviation of `w` across rows, and the fit's log-likelihood (so the saved
# table has the same columns as the basic-model summary).
#
# The frames are read through their `fixp_*` names so this cell does not depend on
# what the generic `story_results_df` / `abstract_results_df` names currently hold.
# NOTE(review): assumes logli_story / logli_abstract still come from the p-fixed fit cell.
#
# The loop variables are deliberately NOT called `df` / `params`: reusing those names
# would overwrite the trial dataset `df` and the `params` returned by the fit cell.
results = []

for fit_df, fit_logli in (
    (fixp_story_results_df, logli_story),
    (fixp_abstract_results_df, logli_abstract),
):
    # Every column except `w` is read from the first row only.
    summary_row = fit_df.drop(columns='w').iloc[0].to_dict()
    summary_row['w_mean'] = fit_df['w'].mean()
    summary_row['w_std'] = fit_df['w'].std()
    summary_row['logli'] = fit_logli
    results.append(summary_row)

results_df = pd.DataFrame(results)

# Save the table as a CSV. It keeps the participant field, which we can drop in later formatting.
results_df.to_csv("hybrid_model_params_fix_p0.csv", index=False)

print(logli_story, logli_abstract)

-7548.79 -7617.97


# Setting alpha1 = alpha2, and beta1 = beta2

In [None]:
# Fit the fixed-alpha variant to the story trials using the equalLearning_mle Stan file.
# The call is unpacked into a results frame, `params` and `logli`
# (presumably the log-likelihood -- verify against hybrid_mle).
_alphafix_story_results_df, params, logli = fit_hybrid_mixed_fixedAlpha_model(
    data_df=story_trials,
    stan_file="../our_analysis/equalLearning_mle.stan",
    output_file="story_params_fix_alpha.csv"
)

# need to return the logli as well