In [1]:
import os, sys, glob, random, math, patsy 
import pandas as pd
import numpy as np
from sklearn import metrics
from scipy.spatial import distance
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

import warnings
# Silences every warning for the rest of the session (no category or module
# filter is given), so deprecation and pandas notices will not be shown.
warnings.filterwarnings("ignore")

#---------------------------------------------------------------------------

# Home directory of the current user; later cells build their data paths from it.
user = os.path.expanduser('~')
# Put the local toolbox folder first on the import path. The location is
# hard-coded relative to the home directory, so the import below only resolves
# on machines that have this exact Dropbox layout.
sys.path.insert(0, f'{user}/Dropbox/Projects/toolbox/toolbox')
# `utils` is resolved through the path inserted above; it must come after it.
from utils import merge_dfs

# Column-name prefixes of the questionnaires (columns are named '<prefix>_...').
ques_prefixes   = ['oci', 'zbpd', 'sds', 'aes', 'sss', 'lsas_av', 'apdis', 'bapq']
# Full list: the prefixes above followed by the additional scales. Built from
# ques_prefixes so the two lists cannot drift apart. 'ucls' used to be listed
# twice, which made every per-prefix column selection return its columns twice.
# NOTE(review): if the second 'ucls' was meant to be a different scale, add it here.
all_prefixes    = ques_prefixes + ['audit', 'aq', 'eat', 'pid5', 'pdi', 'stai_t', 'stai_s', 'pq16', 'pss', 'ucls', 'dtm', 'dtn', 'dtp', 'sh']

#---------------------------------------------------------------------------

# convenience functions
def subset_df(df, ques_prefixes):
    """Return the item-level questionnaire columns of ``df`` plus ``sub_id``.

    A column is selected when its name starts with ``'<prefix>_'`` for one of
    the given prefixes and contains neither ``'_att'`` nor ``'score'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``sub_id`` column and the questionnaire columns.
    ques_prefixes : iterable of str
        Questionnaire prefixes. Columns are returned grouped in this order,
        keeping ``df``'s own column order within each prefix.

    Returns
    -------
    pandas.DataFrame
        New frame with ``sub_id`` first, then each selected column once.
        With no prefixes the result holds ``sub_id`` only.
    """
    selected = []
    # dict.fromkeys drops repeated prefixes while keeping their first position.
    for ques in dict.fromkeys(ques_prefixes):
        stem = f'{ques}_'
        for col in df.columns:
            name = str(col)
            # Match a true prefix only: an unanchored regex search for 'aq_'
            # would also pick up every 'bapq_' column.
            if name.startswith(stem) and '_att' not in name and 'score' not in name:
                selected.append(col)

    # Overlapping prefixes can still name the same column twice; keep each one
    # once, because selecting a repeated label would duplicate the column.
    selected = list(dict.fromkeys(selected))

    ques_df = df[selected].copy()
    ques_df.insert(0, 'sub_id', df['sub_id'])
    return ques_df

## load & clean

In [2]:
# Root folder of the behavioral data; the per-sample summary folders read below
# are sub-folders of it. Hard-coded relative to the user's home directory.
main_dir = f'{user}/Desktop/SNT_data/SNT-online_behavioral/Data'

In [3]:
# initial sample
init_dir = f'{main_dir}/Initial_2021/Summary/Individual_summaries'

#------------------------------------------------------------------------
# locate & merge the summary files
#------------------------------------------------------------------------
# glob gives no ordering guarantee, so sort the matches to make the choice
# reproducible when several files fit a pattern, and name the pattern when
# nothing matches instead of failing with a bare IndexError.
df_list = []
for stem in ['SNT-behavior', 'SNT-posttask', 'Questionnaire_summary']:
    pattern = f'{init_dir}/{stem}_n*'
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f'No summary file matches {pattern}')
    df_list.append(matches[0])
beh_fname, post_fname, ques_fname = df_list

# merge_dfs comes from the local toolbox; presumably it joins the three frames
# on a shared subject key - TODO confirm.
init_df = merge_dfs([pd.read_excel(x) for x in df_list])
init_df.insert(1, 'sample', 0)  # sample label for the initial sample
print(f'Initial raw n={len(init_df)}')

#------------------------------------------------------------------------
# rt normal
#------------------------------------------------------------------------
# keep mean reaction times strictly between 2 and 15 (presumably seconds - TODO confirm)
rt_mask_low = init_df['reaction_time_mean'] > 2
rt_mask_hi  = init_df['reaction_time_mean'] < 15
rt_mask     = rt_mask_low & rt_mask_hi
init_df     = init_df[rt_mask]
print(f'RT normal: n={len(init_df)}')

#------------------------------------------------------------------------
# memory above threshold
#------------------------------------------------------------------------
init_df = init_df[init_df['memory_mean'] > .20]
print(f'memory thresh: n={len(init_df)}')

#------------------------------------------------------------------------
# dots working
#------------------------------------------------------------------------
# drop rows where all five dots columns equal -.92 or where the first one is
# missing/non-finite.
# NOTE(review): -.92 looks like the value recorded when a dot was never moved - confirm.
dots = init_df[['first_dots_affil','second_dots_affil','assistant_dots_affil','powerful_dots_affil','boss_dots_affil']]
dots_mask = ((dots == -.92).sum(axis=1) != 5) & np.isfinite(init_df['first_dots_affil'])
init_df = init_df[dots_mask]
print(f'dots worked: n={len(init_df)}')

#------------------------------------------------------------------------
# attention checks - TODO
#------------------------------------------------------------------------
# [c for c in init_df.columns if 'att' in c]

#------------------------------------------------------------------------
# questionnaires basically complete
#------------------------------------------------------------------------
# a finite lsas_av_score is the only completeness check applied here
ques_mask = np.isfinite(init_df['lsas_av_score'])
init_df   = init_df[ques_mask]
print(f'questinnaires: n={len(init_df)}')

# reassign rather than reset in place: init_df is a filtered selection at this point
init_df = init_df.reset_index(drop=True)
init_df.to_excel(f'{init_dir}/../All-data_summary_n{len(init_df)}_clean.xlsx', index=False)

# # to run hetcor
# init_ques_df = subset_df(init_df, all_prefixes)
# init_ques_df.to_csv(f'{init_dir}/Questionnaire_items_n{len(init_df)}.csv', index=False)

Initial raw n=706
RT normal: n=670
memory thresh: n=665
dots worked: n=613
questinnaires: n=613


In [4]:
# replication sample
repl_dir   = f'{main_dir}/Replication_2022/Summary/Individual_summaries'

#------------------------------------------------------------------------
# locate & merge the summary files
#------------------------------------------------------------------------
# glob gives no ordering guarantee, so sort the matches to make the choice
# reproducible when several files fit a pattern, and name the pattern when
# nothing matches instead of failing with a bare IndexError.
df_list = []
for stem in ['SNT-behavior', 'SNT-posttask', 'Questionnaire_summary']:
    pattern = f'{repl_dir}/{stem}_n*'
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f'No summary file matches {pattern}')
    df_list.append(matches[0])
beh_fname, post_fname, ques_fname = df_list

# merge_dfs comes from the local toolbox; presumably it joins the three frames
# on a shared subject key - TODO confirm.
repl_df = merge_dfs([pd.read_excel(x) for x in df_list])
repl_df.insert(1, 'sample', 1)  # sample label for the replication sample
print(f'Initial raw n={len(repl_df)}')


#------------------------------------------------------------------------
# rt normal
#------------------------------------------------------------------------
# keep mean reaction times strictly between 2 and 15 (presumably seconds - TODO confirm)
rt_mask_low = repl_df['reaction_time_mean'] > 2
rt_mask_hi  = repl_df['reaction_time_mean'] < 15
rt_mask     = rt_mask_low & rt_mask_hi
repl_df     = repl_df[rt_mask]
print(f'RT normal: n={len(repl_df)}')

#------------------------------------------------------------------------
# memory above threshold
#------------------------------------------------------------------------
repl_df = repl_df[repl_df['memory_mean'] > .20]
print(f'memory thresh: n={len(repl_df)}')

#------------------------------------------------------------------------
# dots working
#------------------------------------------------------------------------
# drop rows where all five dots columns equal -.92 or where the first one is
# missing/non-finite.
# NOTE(review): -.92 looks like the value recorded when a dot was never moved - confirm.
dots = repl_df[['first_dots_affil','second_dots_affil','assistant_dots_affil','powerful_dots_affil','boss_dots_affil']]
dots_mask = ((dots == -.92).sum(axis=1) != 5) & np.isfinite(repl_df['first_dots_affil'])
repl_df = repl_df[dots_mask]
print(f'dots worked: n={len(repl_df)}')

#------------------------------------------------------------------------
# questionnaires
#------------------------------------------------------------------------
# a finite lsas_av_score is the only completeness check applied here
ques_mask = np.isfinite(repl_df['lsas_av_score'])
repl_df = repl_df[ques_mask]
print(f'questinnaires: n={len(repl_df)}')

# reassign rather than reset in place: repl_df is a filtered selection at this point
repl_df = repl_df.reset_index(drop=True)
repl_df.to_excel(f'{repl_dir}/../All-data_summary_n{len(repl_df)}_clean.xlsx', index=False)

# # to run hetcor
# repl_ques_df = subset_df(repl_df, all_prefixes)
# repl_ques_df.to_csv(f'{repl_dir}/Questionnaire_items_n{len(repl_ques_df)}.csv', index=False)

Initial raw n=286
RT normal: n=270
memory thresh: n=270
dots worked: n=266
questinnaires: n=264
