In [1]:
import os, sys, glob, datetime, warnings
import pandas as pd
import numpy as np
from pathlib import Path
import pycircstat
warnings.filterwarnings('ignore') # ignore all!

# Make the social_navigation_analysis modules importable.
# Both entries are inserted at position 0, so the Dropbox checkout (inserted last)
# ends up ahead of '..' on sys.path and is searched first.
user = os.path.expanduser('~')
sys.path.insert(0, '..')
sys.path.insert(0, str(Path(f'{user}/Dropbox/Projects/social_navigation_analysis/social_navigation_analysis'))) 
# Project-local modules resolved through the sys.path entries added above.
import preprocess as snt_preprc
from preprocess import load_data
from info import decision_trials
import utils

# Dataset registry spreadsheet; its `sample` column lists the available samples.
main_dir = f'{user}/Desktop/SNT_data'
datasets = pd.read_excel(f'{main_dir}/SNT-datasets.xlsx') 
print(f"All datasets: {datasets['sample'].values}")

# Run the test script before processing anything.
# `-i` executes it in this notebook's namespace rather than an empty one.
%run -i '../tests/test_ComputeBehavior2.py'

  return np.array(np.nancumsum(values, axis=0) /
  return np.array(np.nancumsum(values, axis=0) /
  return np.array(np.nancumsum(values, axis=0) /
  return np.array(np.nancumsum(values, axis=0) /
  return np.array(np.nancumsum(values, axis=0) /
  return np.array(np.nancumsum(values, axis=0) /
..
----------------------------------------------------------------------
Ran 11 tests in 0.089s

OK


All datasets: ['CUD' 'HC' 'PD' 'PTSD' 'prolific_initial_vt' 'prolific_initial'
 'prolific_replication' 'schema_day01' 'schema_day03' 'adolescent_pilot'
 'schema_day01_older' 'schema_day03_older']


In [2]:
# Re-read the dataset registry and show it as the cell output.
# Needs `user` (home directory) to be defined already in the kernel.
main_dir = f'{user}/Desktop/SNT_data'
datasets = pd.read_excel(f'{main_dir}/SNT-datasets.xlsx') 
datasets

Unnamed: 0,sample,raw_directory,raw_format,experimenter,options_version,last_processed,n_raw_files,n_organized_files,n_timing_files,n_behavior_files
0,CUD,SNT-fmri_CUD/Data/SNT/Logs,log,kb,,2022-11-11 18:24:42.497,,,,
1,HC,SNT-fmri_original/Data/SNT/Logs,log,rt,,2022-11-11 18:26:04.754,,,,
2,PD,SNT-behavior_PD/Data/SNT/Logs,log,nr,,2022-11-11 18:26:27.132,,,,
3,PTSD,SNT-behavior_PTSD/Data/SNT/Logs,log,af,,2022-11-11 18:27:24.168,,,,
4,prolific_initial_vt,SNT-online_behavioral/Data/Initial_2021/SNT/Ra...,csv,ms,standard,2022-11-16 14:28:11.782,,,,
5,prolific_initial,SNT-online_behavioral/Data/Initial_2021/SNT/Ra...,csv,ms,standard,2022-11-11 17:32:46.319,,,,
6,prolific_replication,SNT-online_behavioral/Data/Replication_2022/SN...,csv,ms,standard,2022-11-11 17:33:22.139,,,,
7,schema_day01,SNT-online_schema/Data/SNT/day01/Raw,csv,ms,schema,2022-11-11 17:45:58.004,,,,
8,schema_day03,SNT-online_schema/Data/SNT/day03/Raw,csv,ms,schema,2022-11-11 17:46:14.593,,,,
9,adolescent_pilot,SNT-online_adolescent_pilot/Data/SNT/CSVs,csv,ms,adolescent_pilot,2022-11-11 18:16:55.603,,,,


In [3]:
# Usage notes:
# expects & creates some folder structure like:
    # SNT/Raw_files
    # SNT/Organized
    # SNT/Behavior
    # SNT/Posttask

#-----------------------------------------------------
# convenience functions

def find_files(directory):
    """List the entries directly inside `directory`, dropping any path that contains '~$'.

    The result is in whatever order `glob.glob` yields; callers sort it themselves.
    """
    candidates = glob.glob(f"{directory}/*")
    return list(filter(lambda path: '~$' not in path, candidates))

#-----------------------------------------------------

# dataset info
# synapse_dir = '/Volumes/synapse/projects/SocialSpace/Projects/'
# datasets    = pd.read_excel(f'{synapse_dir}/SNT-datasets.xlsx') 

main_dir = f'{user}/Desktop/SNT_data'
datasets = pd.read_excel(f'{main_dir}/SNT-datasets.xlsx')
samples  = ['PD'] # datasets['sample'].values #

# print(f"All datasets: {datasets['sample'].values}")
# print(f'Selected datasets: {samples}')

# what to do
overwrite      = True
parse_data     = False
compute_beh    = True
summarize_beh  = True
summarize_post = False

# One list for the whole run: failures from every stage and every sample accumulate here.
# Creating it up front also keeps the final report from hitting an undefined name when
# no parsing/behavior stage runs.
errors = []

for sample in samples:

    #---------------------------
    # define the project details
    #---------------------------

    # row of this sample in the registry (IndexError if the sample is not listed)
    proj_ix     = np.where(datasets['sample'].values==sample)[0][0]
    project_dir = f"{datasets.loc[proj_ix,'raw_directory']}"
    file_format = datasets.loc[proj_ix,'raw_format']
    datasets.loc[proj_ix, 'last_processed'] = datetime.datetime.now()
    raw_dir     = f"{main_dir}/{datasets.loc[proj_ix, 'raw_directory']}"
    sample_dir  = f'{raw_dir}/..' # parent of the raw folder: Organized/, Behavior/, Posttask/ are looked up here

    #------------------------------------------------
    # parse the raw file into an organized file
    #------------------------------------------------
    if parse_data:

        # find files
        raw_files = sorted(find_files(raw_dir))
        print(f'{sample}: found {len(raw_files)} files')

        for raw_fname in raw_files:

            fname = os.path.basename(raw_fname)

            try:

                print(f'{fname}: parsing raw', end='\r')

                if file_format == 'log':
                    xlsx_fname = snt_preprc.parse_log(raw_fname, experimenter=datasets.loc[proj_ix,'experimenter'],
                                                      output_timing=True, out_dir=sample_dir)

                elif file_format == 'csv':
                    xlsx_fname = snt_preprc.parse_csv(raw_fname, snt_version=datasets.loc[proj_ix,'options_version'],
                                                      out_dir=sample_dir)

                elif file_format == 'txt': # txt -> csv -> xlsx
                    csv_raw_dir = raw_dir.replace('-txt', '-csv')
                    raw_fname   = snt_preprc.format_txt_as_csv(raw_fname, out_dir=csv_raw_dir)
                    xlsx_fname  = snt_preprc.parse_csv(raw_fname, snt_version=datasets.loc[proj_ix,'options_version'],
                                                       out_dir=f'{csv_raw_dir}/..')

                else:
                    # previously an unknown format was skipped without any trace
                    raise ValueError(f'unsupported raw_format {file_format!r}')

            except Exception as err:
                # `Exception` (not a bare except) so Ctrl-C still interrupts the run;
                # keep the cause so the failure can be diagnosed afterwards
                errors.append(f'Parsing raw: {raw_fname} ({err!r})')

    #------------------------------------------------
    # compute behavior from organized file
    #------------------------------------------------
    if compute_beh:
        print('COMPUTING BEHAVIOR')

        # NOTE(review): this stage skips only the exact name 'schema_day03', while the summary
        # stage below skips every sample containing 'schema_day03' (e.g. 'schema_day03_older').
        # Confirm which rule is intended.
        if sample != 'schema_day03': # doesnt have snt

            xlsx_files = sorted(find_files(f'{sample_dir}/Organized')) # where organized directory should have been made
            print(f'{sample}: found {len(xlsx_files)} files')

            for xi, xlsx_fname in enumerate(xlsx_files):

                try:

                    fname = os.path.basename(xlsx_fname)
                    # NOTE(review): `fname` still carries its extension, so this looks for
                    # '<name>.xlsx_behavior.xlsx'. Confirm against the names compute_behavior writes.
                    if (overwrite) or (not os.path.isfile(f'{sample_dir}/Behavior/{fname}_behavior.xlsx')):
                        print(f'{xi+1}/{len(xlsx_files)} {fname}: computing behavior', end='\r')
                        snt_preprc.compute_behavior(xlsx_fname,
                                                    weight_types=False,
                                                    decision_types=False,
                                                    coord_types=False,
                                                    overwrite=overwrite,
                                                    out_dir=sample_dir)

                except Exception as err:

                    errors.append(f'Computing behavior: {xlsx_fname} ({err!r})')

        #------------------------------------------
        # compute rdvs for mvpa analyses
        #------------------------------------------
        #  print(f'Finished parsing & computing behavior with {len(errors)} errors')
        # # check number of files:
        #  n_xlsx = find_files(f"{raw_dir}/../Organized")
        #  if len(raw_files) != n_xlsx: print('There are missing organized excel files')

    #------------------------------------------------
    # summarize behavior across subjects
    #------------------------------------------------
    if summarize_beh:

        if 'schema_day03' not in sample: # doesnt have snt

            #------------------------------------------------
            print('Summarizing behavior')
            #------------------------------------------------

            # n_timing    = len([f for f in glob.glob(f"{project_dir}/{datasets.loc[proj_ix, 'timing_directory']}/*") if '~$' not in f])
            # datasets.loc[proj_ix, 'n_raw_files']       = n_raw
            # datasets.loc[proj_ix, 'n_organized_files'] = n_xlsx
            # datasets.loc[proj_ix, 'n_timing_files']    = n_timing
            # datasets.loc[proj_ix, 'n_behavior_files']  = n_behavior

            # check number of files:
            xlsx_files  = find_files(f"{sample_dir}/Organized")
            behav_files = find_files(f"{sample_dir}/Behavior")
            if len(xlsx_files) != len(behav_files):
                print(f'There are missing behavioral files: {len(behav_files)}!={len(xlsx_files)}')
            snt_preprc.summarize_behavior(behav_files, out_dir=sample_dir)

    #------------------------------------------------
    # summarize posttask across subjects
    #------------------------------------------------
    if summarize_post:

        # if there is a posttask folder concatenate automatically and output:
        if os.path.exists(f'{sample_dir}/Posttask'):

            #------------------------------------------------
            print('Summarizing posttask')
            #------------------------------------------------

            post_files = find_files(f'{sample_dir}/Posttask')
            post_df = pd.concat([pd.read_excel(f) for f in post_files], axis=0)
            post_df.rename(columns={'Unnamed: 0':'sub_id'}, inplace=True)
            post_df.to_excel(f'{sample_dir}/SNT-posttask_n{len(post_df)}.xlsx', index=False)

            # TODO: add in dots, self-reports, posttask etc automatically... eg, check if there is another summary sheet available, add in as an argument to merge on sub_id column?
# datasets.to_excel(f'{main_dir}/SNT-datasets.xlsx', index=False)

print(f'Found {len(errors)} errors')
for error in errors:
    print(f'  {error}')

COMPUTING BEHAVIOR
PD: found 61 files
Summarizing behavior computing behavior
Found 0 errors of 61


# Merge

In [26]:
import functools

# Summary folder of the HC sample. Built from `main_dir` (defined in the setup cells)
# instead of a hardcoded home directory so the cell also runs on other machines.
summary_dir = f'{main_dir}/SNT-fmri_original/Data/SNT'

# Load the behavior summary and the self-report sheet, keyed by integer `sub_id`.
dfs = []
for fname in ['SNT-behavior_n*', 'Self*']:
    matches = sorted(glob.glob(f'{summary_dir}/{fname}'))
    if not matches:
        # a bare IndexError from `[0]` would not say which file is missing
        raise FileNotFoundError(f'No file matching {summary_dir}/{fname}')
    df = pd.read_excel(matches[0]) # sorted, so the pick is deterministic if several match
    df['sub_id'] = df['sub_id'].astype(int) # cast first so the sort below is numeric
    dfs.append(df.sort_values(by='sub_id'))

# Outer-merge on sub_id: subjects missing from one sheet are kept, with NaNs for that sheet.
summary_df = functools.reduce(lambda x, y: pd.merge(x, y, on = 'sub_id', how = 'outer'), dfs)
summary_df.insert(1, 'dx', 'HC') # diagnosis label for this sample, placed right after sub_id
summary_df.to_excel(f'{summary_dir}/All-data_summary_n{len(summary_df)}.xlsx', index=False)

In [24]:
# Stack the CUD summary on top of `summary_df` (the HC summary built in the merge cell).
summary_dir = '/Volumes/synapse/projects/SocialSpace/Projects/SNT-fmri_CUD/Data/Summary'
cud_df = pd.read_excel(f'{summary_dir}/All-data_summary_n84.xlsx')
combined_df = pd.concat([cud_df, summary_df])

# The filename must be an f-string: without the prefix the file was literally named
# 'All-data_summary_n{len(cud_df)+len(summary_df)}.xlsx'.
# NOTE(review): the path is relative, so this lands in the notebook's working directory,
# not in `summary_dir`. Confirm that is intended.
combined_df.to_excel(f'All-data_summary_n{len(combined_df)}.xlsx', index=False)

# Others

### RDVs

In [None]:
# base_dir  = '/Volumes/synapse/projects/SocialSpace/Projects/SNT-fmri_original/Data/SNT'
# beh_dir   = f'{base_dir}/Behavior'
# beh_files = glob.glob(beh_dir + '/*.xlsx')

# if compute_rdvs:
# snt_preprc.compute_rdvs(beh_file, metric='euclidean', output_all=False, out_dir=base_dir)

### Memory + dots

In [18]:
import pandas as pd
import numpy as np

#-----------------------
# version details
#-----------------------

synapse_dir = '/Volumes/synapse/projects/SocialSpace/Projects/'
task_versions = pd.read_excel(f'{synapse_dir}/SNT-datasets_task-versions.xlsx', sheet_name='CUD')
# Sorted by role number so that row i can be paired with roles[i] below.
# assumes character_role_num follows the order of `roles` — TODO confirm
task_versions = task_versions.sort_values(by='character_role_num')

sample   = 'CUD'
data_dir = '/Volumes/synapse/projects/SocialSpace/Projects/SNT-fmri_CUD/Data'
info     = pd.read_excel(f'{data_dir}/Summary/SNT-task_versions_n84.xlsx')

#-----------------------
# memory
#-----------------------

roles = ['first', 'second', 'assistant', 'powerful', 'boss', 'neutral']
# correct role for each of the 30 memory questions, in question order
ans = ['powerful','powerful','boss','neutral','first','assistant','first','second','neutral','powerful',
       'boss','first','neutral','assistant','assistant','neutral','second','powerful','assistant','second',
       'neutral','second','powerful','first','boss','first','boss','assistant','second','boss'] # this will probably be the same for the ptsd sample too?

mem_df = pd.read_excel(glob.glob(f'{data_dir}/Summary/SNT-memory*_raw.xlsx')[0])
n_mem  = len(mem_df)

# Skip the scoring when its output already exists. The guard has to test the file this
# cell writes ('..._processed.xlsx'); it used to test a name that is never written.
processed_fname = f'{data_dir}/Summary/SNT-memory_n{n_mem}_processed.xlsx'
if not os.path.exists(processed_fname):

    item_cols = [f'memory_{q+1}_{a}' for q, a in enumerate(ans)]

    rows = []
    for _, sub in mem_df.iterrows():

        # get correct version: the columns of this subject's task version
        task_ver = task_versions[[c for c in task_versions.columns if f'v{sub.Task_ver}' in c]]
        opts     = [x.lower() for x in task_ver[f'v{sub.Task_ver}_name']]

        # score the answers
        # assumes the first two columns are Sub_id and Task_ver and the rest are the responses — TODO confirm
        resps      = [x.lower() for x in sub.values[2:]]
        resp_roles = [roles[opts.index(r)] for r in resps] # get character role for each response
        correct    = (np.array(resp_roles) == np.array(ans)) * 1

        rows.append([sub.Sub_id, sub.Task_ver] + list(correct))

    # Build the table once from plain rows: the score columns get a numeric dtype and the
    # index is unique (transposed one-row frames gave object dtype and a repeated index).
    memory_df = pd.DataFrame(rows, columns=['sub_id', 'task_version'] + item_cols)

    # add means
    for role in roles:
        role_items = [c for c in item_cols if c.endswith(f'_{role}')]
        memory_df[f'memory_{role}'] = memory_df[role_items].mean(axis=1)
    # Average over the question columns only. Selecting every column containing 'memory'
    # would also pull in the per-role means added just above.
    memory_df['memory_mean'] = memory_df[item_cols].mean(axis=1)
    # mean of the first five role means (leaves out the last role, 'neutral')
    memory_df['memory_mean_main'] = memory_df[[f'memory_{r}' for r in roles[:5]]].mean(axis=1)
    memory_df.to_excel(processed_fname, index=False)

print('Memory processing completed')

Memory processing completed


In [None]:
#-----------------------
# dots
#-----------------------
# Relies on `data_dir`, `info`, `task_versions` and `roles` from the memory cell.

dots_dir  = f'{data_dir}/SNT/Dots'
dots_jpgs = glob.glob(f'{dots_dir}/Dots*jpg')
print(f'Found {len(dots_jpgs)} dots jpgs')
recon_dir = f'{dots_dir}/recon'
os.makedirs(recon_dir, exist_ok=True)

# append to an existing dots summary if it exists
initial_fname = glob.glob(f'{data_dir}/Summary/SNT-dots_n*.xlsx')
if initial_fname:
    previous_df = pd.read_excel(initial_fname[0]).sort_values(by='sub_id').reset_index(drop=True)
    sub_list    = previous_df['sub_id'].values
    dots_frames = [previous_df]
else:
    sub_list    = []
    dots_frames = []

print(f'{len(sub_list)} subjects dots already processed')

n_new = 0
for j, jpg in enumerate(dots_jpgs):

    sub_id = int(jpg.split('Dots_')[1].split('.jpg')[0])

    # Only subjects that are NOT in the existing summary need processing. The test used to
    # be inverted: it redid the summarised subjects and skipped every new one.
    if sub_id in sub_list:
        continue

    try:

        print(f'{j+1} {jpg}', end="\r")
        recon_img, df = snt_preprc.process_dots(jpg)
        recon_img.save(f'{recon_dir}/Dots-recon_{sub_id}.jpg')

        # get the correct character roles for each dot
        sub_ix   = np.where(info['sub_id'].values == sub_id)[0][0]
        task_v   = info.loc[sub_ix, 'snt_ver']
        task_ver = task_versions[[c for c in task_versions.columns if f'v{task_v}' in c]]
        chars    = {name: roles[i] for i, name in enumerate(task_ver[f'v{task_v}_dots-name'])}

        # Rename column by column so every label stays on its own data. Iterating over the
        # names first emitted labels in name order, not column order, and dropped any
        # column that contained no character name.
        renamed_cols = []
        for col in df.columns:
            for name, role in chars.items():
                if name in col:
                    col = col.replace(name, role)
                    break
            renamed_cols.append(col)
        df.columns = renamed_cols

        df.insert(0, 'sub_id', sub_id)
        dots_frames.append(df)
        n_new += 1

    except Exception as err:
        # best effort per image, but report why it failed; Ctrl-C is no longer swallowed
        print(f'ERROR: {jpg} ({err!r})')

# Write the updated summary before touching the old one, and only when something was added.
# Previously the old file was removed while the write was commented out, and os.remove
# was handed the glob list instead of a path.
if n_new:
    dots_df   = pd.concat(dots_frames, ignore_index=True)
    new_fname = f'{data_dir}/Summary/SNT-dots_n{len(dots_df)}.xlsx'
    dots_df.to_excel(new_fname, index=False)
    if initial_fname and os.path.abspath(initial_fname[0]) != os.path.abspath(new_fname):
        os.remove(initial_fname[0]) # replace old w/ new
print('Dots processing completed')


#-----------------------
# merge all
#-----------------------

# summary_files = [f for f in glob.glob(f'{data_dir}/Summary/*') if ('~$' not in f) and ('All-summary') not in f]
# merged_df     = utils.merge_dfs(summary_files)
# merged_df     = utils.move_cols_to_front(merged_df, ['sub_id', 'dx'])
# merged_df.to_excel(f'{data_dir}/Summary/All-summary_n{len(merged_df)}.xlsx', index=False)