In [6]:
import os, sys, glob, datetime, warnings
import pandas as pd
import numpy as np
from pathlib import Path
import pycircstat

warnings.filterwarnings('ignore') # ignore all!
# or: catch specific warnings
# with warnings.catch_warnings():
#     warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')

# directory to social_navigation_analysis:
user = os.path.expanduser('~')
sys.path.insert(0, '..')
sys.path.insert(0, str(Path(f'{user}/Dropbox/Projects/social_navigation_analysis/social_navigation_analysis'))) 
import preprocess as snt_preprc
from preprocess import load_data

In [7]:
# Root folder of the SNT data; `user` is the home directory resolved in the import cell.
main_dir = f'{user}/Desktop/SNT_data'
# Dataset spreadsheet; its 'sample' column holds the dataset names.
datasets = pd.read_excel(f'{main_dir}/SNT-datasets.xlsx') 
# Show every dataset name listed in the spreadsheet.
print(f"All datasets: {datasets['sample'].values}")

All datasets: ['CUD' 'HC' 'PD' 'PTSD' 'prolific_initial_vt' 'prolific_initial'
 'prolific_replication' 'schema_day01' 'schema_day03' 'adolescent_pilot']


In [10]:
# Usage notes:
# expects & creates some folder structure like:
    # SNT/Raw_files
    # SNT/Organized
    # SNT/Behavior
    # SNT/Posttask

#-----------------------------------------------------
# convenience functions

def find_files(directory):
    """List the entries directly inside `directory`, skipping temporary lock files.

    Args:
        directory: path of the folder to list (not searched recursively).

    Returns:
        list of the paths matched by `<directory>/*` that do not contain '~$'
        (e.g. the '~$...' files Excel creates while a workbook is open).
    """
    kept = []
    for path in glob.glob(f"{directory}/*"):
        if '~$' in path:
            continue
        kept.append(path)
    return kept

#-----------------------------------------------------

# dataset info
# synapse_dir = '/Volumes/synapse/projects/SocialSpace/Projects/'
# datasets    = pd.read_excel(f'{synapse_dir}/SNT-datasets.xlsx') 

# Root data folder and the dataset spreadsheet; the loop below reads its
# 'sample', 'raw_directory', 'raw_format', 'experimenter' and 'options_version' columns.
main_dir = f'{user}/Desktop/SNT_data'
datasets = pd.read_excel(f'{main_dir}/SNT-datasets.xlsx') 
# Samples to process in this run; each is looked up in datasets['sample'].
samples  = ['CUD','HC','PD','PTSD']

# print(f"All datasets: {datasets['sample'].values}")
# print(f'Selected datasets: {samples}')

# what to do
# NOTE(review): `overwrite` is not read anywhere in the visible notebook - confirm it is still needed.
overwrite   = False
# parse the raw files and compute behavior for each file
compute_beh = True # right now: both parsing & behavior; TODO: split up
# write the across-subject behavior and posttask summaries
summarize   = True

# Run the pipeline for each selected sample: parse raw files into organized
# workbooks, compute behavior per file, then summarize across subjects.
for sample in samples:

    #---------------------------
    # define the project details
    #---------------------------

    # Row of this sample in the datasets sheet; skip with a message instead of
    # failing on an opaque IndexError when the name is not listed.
    sample_rows = np.where(datasets['sample'].values==sample)[0]
    if len(sample_rows) == 0:
        print(f'{sample}: not found in SNT-datasets.xlsx, skipping')
        continue
    proj_ix     = sample_rows[0]
    project_dir = f"{datasets.loc[proj_ix,'raw_directory']}"  # currently unused
    file_format = datasets.loc[proj_ix,'raw_format']
    datasets.loc[proj_ix, 'last_processed'] = datetime.datetime.now()
    raw_dir     = f"{main_dir}/{datasets.loc[proj_ix, 'raw_directory']}"


    if compute_beh:

        # find files
        raw_files = find_files(raw_dir)
        print(f'{sample}: found {len(raw_files)} files')
        errors = []
        for raw_fname in raw_files:

            #------------------------------------------------
            # parse the raw file into an organized file
            #------------------------------------------------

            fname = os.path.basename(raw_fname)

            # Reset for every file: otherwise a failed parse would leave the previous
            # file's workbook path in place (or no name at all on the first file)
            # and the behavior step below would run on the wrong input.
            xlsx_fname = None

            try:

                print(f'{fname}: parsing raw', end='\r')

                # TODO check sub_id - does it exist already?

                if file_format == 'log':
                    xlsx_fname = snt_preprc.parse_log(raw_fname, experimenter=datasets.loc[proj_ix,'experimenter'], output_timing=False, out_dir=f'{raw_dir}/..')
                elif file_format == 'csv':
                    xlsx_fname = snt_preprc.parse_csv(raw_fname, snt_version=datasets.loc[proj_ix,'options_version'], out_dir=f'{raw_dir}/..')
                elif file_format == 'txt': # txt -> csv -> xlsx
                    csv_raw_dir = raw_dir.replace('-txt', '-csv')
                    raw_fname   = snt_preprc.format_txt_as_csv(raw_fname, out_dir=csv_raw_dir)
                    xlsx_fname  = snt_preprc.parse_csv(raw_fname, snt_version=datasets.loc[proj_ix,'options_version'], out_dir=f'{csv_raw_dir}/..')
                else:
                    # an unknown format used to be skipped silently
                    raise ValueError(f'unsupported raw_format: {file_format!r}')

            except Exception as err:

                # keep going with the next file, but record why this one failed
                errors.append(f'Parsing raw: {raw_fname} ({type(err).__name__}: {err})')

            #------------------------------------------
            # compute behavior from organized file
            #------------------------------------------

            # schema_day03 doesnt have snt; xlsx_fname is None when parsing failed
            if sample != 'schema_day03' and xlsx_fname is not None:

                try:

                    fname = os.path.basename(xlsx_fname)
                    print(f'{fname}: computing behavior', end='\r')
                    snt_preprc.compute_behavior(xlsx_fname, weight_types=False, decision_types=False, coord_types=False, out_dir=f'{raw_dir}/..')

                except Exception as err:

                    errors.append(f'Computing behavior: {raw_fname} ({type(err).__name__}: {err})')


            #------------------------------------------
            # compute rdvs for mvpa analyses
            #------------------------------------------
            # TODO: not implemented yet


        print(f'Finished parsing & computing behavior with {len(errors)} errors')
        for error_msg in errors: print(f'  {error_msg}')

        # check number of files: compare counts (the list itself used to be compared
        # to an int, so this warning fired on every run)
        organized_files = find_files(f"{raw_dir}/../Organized")
        if len(raw_files) != len(organized_files): print('There are missing organized excel files')


    #------------------------------
    # summarize across subjects
    #------------------------------

    if summarize:

        if sample != 'schema_day03': # doesnt have snt

            print('Summarizing behavior')

            # TODO: record n_raw_files / n_organized_files / n_timing_files / n_behavior_files in `datasets`

            # check number of files:
            xlsx_files  = find_files(f"{raw_dir}/../Organized")
            behav_files = find_files(f"{raw_dir}/../Behavior")
            if len(xlsx_files) != len(behav_files): print(f'There are missing behavioral files: {len(behav_files)}!={len(xlsx_files)}')
            snt_preprc.summarize_behavior(behav_files, out_dir=f'{raw_dir}/..')

        # if there is a posttask folder concatenate automatically and output:
        if os.path.exists(f'{raw_dir}/../Posttask'):

            print('Summarizing posttask')

            post_files = find_files(f'{raw_dir}/../Posttask')
            if sample != 'schema_day03':
                if len(post_files) != len(behav_files): print(f'There are missing posttask files: {len(post_files)}!={len(behav_files)}')

            if len(post_files) > 0:
                post_df = pd.concat([pd.read_excel(f) for f in post_files], axis=0)
                post_df = post_df.rename(columns={'Unnamed: 0':'sub_id'})
                post_df.to_excel(f'{raw_dir}/../SNT-posttask_n{len(post_df)}.xlsx', index=False)
            else:
                # pd.concat raises on an empty list; an empty folder is not an error
                print('No posttask files found')

            # TODO: add in dots, self-reports, posttask etc automatically... eg, check if there is another summary sheet available, add in as an argument to merge on sub_id column?

# write the updated 'last_processed' timestamps back to the datasets spreadsheet
datasets.to_excel(f'{main_dir}/SNT-datasets.xlsx', index=False)


CUD: found 80 files
Finished parsing & computing behavior with 0 errors
There are missing organized excel files
Summarizing behavior
HC: found 21 files80
Creating subdirectory for behavior
Finished parsing & computing behavior with 0 errors
There are missing organized excel files
Summarizing behavior
PD: found 61 files21
Creating subdirectory for behavior
Finished parsing & computing behavior with 0 errors
There are missing organized excel files
Summarizing behavior
PTSD: found 58 files
Creating subdirectory for behavior
Finished parsing & computing behavior with 0 errors
There are missing organized excel files
Summarizing behavior
Summarizing 58 of 58

# To add in (steps below are not yet part of the pipeline above)

### RDVs

In [None]:
# Collect the behavior workbooks (.xlsx) stored under <base_dir>/Behavior.
base_dir  = '/Volumes/synapse/projects/SocialSpace/Projects/SNT-fmri_original/Data/SNT'
beh_dir   = base_dir + '/Behavior'
beh_files = glob.glob(f'{beh_dir}/*.xlsx')

# Planned RDV step, not run yet (`beh_file` would come from looping over beh_files):
# if compute_rdvs:
# snt_preprc.compute_rdvs(beh_file, metric='euclidean', output_all=False, out_dir=base_dir)

### Memory + dots

In [None]:
import pandas as pd
import numpy as np

# task version. info:
# Sheet 'CUD' of the task-versions workbook; the code below reads its
# 'v<version>_name' and 'v<version>_dots-name' columns.
synapse_dir = '/Volumes/synapse/projects/SocialSpace/Projects/'
task_versions = pd.read_excel(f'{synapse_dir}/SNT_datasets_task-versions.xlsx', sheet_name='CUD')
# Order rows by character role number so that positions in the name lists follow role order
# (NOTE(review): presumably the same order as snt_info.character_roles - confirm).
task_versions.sort_values(by='character_role_num', inplace=True)


# Only the CUD dataset is handled here; TODO: loop over datasets.
sample   = 'CUD'
data_dir = '/Volumes/synapse/projects/SocialSpace/Projects/SNT-fmri_CUD/Data'
# Per-subject lookup table; the dots section reads its 'sub_id' and 'version' columns.
info     = pd.read_excel(f'{data_dir}/SNT-task_versions.xlsx')


#-----------------------
# version details
#-----------------------



#-----------------------
# memory
#-----------------------

# Correct character role for each memory question, in question order; each
# subject's responses are scored against it below.
# NOTE(review): should be the answers to all the questionnaires..? this will probably be the same for the ptsd sample too?
ans = ['powerful','powerful','boss','neutral','first','assistant','first','second','neutral','powerful',
       'boss','first','neutral','assistant','assistant','neutral','second','powerful','assistant','second',
       'neutral','second','powerful','first','boss','first','boss','assistant','second','boss']

# Load the raw memory responses (first matching workbook); fail with a clear
# message instead of an IndexError when none exists.
raw_memory_files = glob.glob(f'{data_dir}/SNT-memory*_raw.xlsx')
if len(raw_memory_files) == 0:
    raise FileNotFoundError(f'No raw memory file matching {data_dir}/SNT-memory*_raw.xlsx')
mem_df = pd.read_excel(raw_memory_files[0])
n_mem  = len(mem_df)

# Skip when this exact output already exists. The check used to look for a name
# without the '_processed' suffix, which never matched the file written below,
# so the scoring was redone on every run.
memory_fname = f'{data_dir}/Summary/SNT-memory_n{n_mem}_processed.xlsx'
if not os.path.exists(memory_fname):

    memory_rows = []
    for _, sub in mem_df.iterrows():

        # get correct version
        task_ver = task_versions[[c for c in task_versions.columns if f'v{sub.Task_ver}' in c]]
        opts     = [x.lower() for x in task_ver[f'v{sub.Task_ver}_name']]

        # score the answers
        # assumes the first two columns are Sub_id and Task_ver and the rest are responses - TODO confirm
        resps   = [x.lower() for x in list(sub.values[2:])]
        # NOTE(review): `snt_info` is not imported in the visible notebook - confirm where it comes from
        resps   = [snt_info.character_roles[opts.index(r)] for r in resps] # get character role for each response
        correct = (np.array(resps)==np.array(ans)) * 1
        acc     = np.mean(correct)

        memory_rows.append([sub.Sub_id, sub.Task_ver, acc] + list(correct))

    # NOTE(review): `roles` is not defined in the visible notebook; it needs one entry per question - confirm its source
    memory_cols = ['sub_id','task_version','memory_mean'] + [f'memory_{r+1}_{role}' for r,role in enumerate(roles)]
    # one row per subject, built in a single step
    memory_df = pd.DataFrame(memory_rows, columns=memory_cols)
    os.makedirs(f'{data_dir}/Summary', exist_ok=True)
    # index=False matches the other summary writers and avoids a stray index column
    memory_df.to_excel(memory_fname, index=False)

print('Memory processing completed')


#-----------------------
# dots
#-----------------------

# Process each subject's dots image and add it to the dots summary workbook.
dots_dir  = f'{data_dir}/Dots'
dots_jpgs = glob.glob(f'{dots_dir}/Dots*jpg')
print(f'Found {len(dots_jpgs)} dots jpgs')

# append to an existing dots summary if it exists
initial_fname = glob.glob(f'{data_dir}/Summary/SNT-dots_n*.xlsx')
if len(initial_fname) > 0:
    # glob returns a list of paths: [0] is the first path ([0][0] was its first character)
    previous_dots = pd.read_excel(initial_fname[0])
    dots_dfs      = [previous_dots]  # keep a list so new subjects can be appended, then concatenated once
    sub_list      = list(previous_dots['sub_id'].values)
else:
    dots_dfs = []
    sub_list = []

print(f'{len(sub_list)} subjects dots already processed')
for j, jpg in enumerate(dots_jpgs):

    try:

        # subject id is the number between 'Dots_' and '.jpg' in the file name
        sub_id = int(jpg.split('Dots_')[1].split('.jpg')[0])
        if sub_id in sub_list:
            continue

        print(f'{j+1} {jpg}', end="\r")
        df = snt_preprc.process_dots(jpg)[1]

        # get the correct character roles for each dot
        sub_ix   = np.where(info['sub_id'].values == sub_id)[0][0]
        task_v   = info.loc[sub_ix, 'version']
        task_ver = task_versions[[c for c in task_versions.columns if f'v{task_v}' in c]]
        chars    = [x.lower() for x in task_ver[f'v{task_v}_dots-name']]
        # NOTE(review): `chars` is a list, so `.items()` raises AttributeError and every image
        # ends up in the except branch. A {column text: replacement} mapping is needed here;
        # the intended mapping is not visible in this notebook - confirm and fix.
        for k,i in chars.items(): df.columns = df.columns.str.replace(k, i, regex=True) # replace columns

        df.insert(0, 'sub_id', sub_id)
        dots_dfs.append(df)

    except Exception as err:
        # keep going with the next image, but show why this one failed
        print(f'ERROR: {jpg} ({type(err).__name__}: {err})')

if len(dots_dfs) > 0:
    dots_df   = pd.concat(dots_dfs)
    new_fname = f'{data_dir}/Summary/SNT-dots_n{len(dots_df)}.xlsx'
    dots_df.to_excel(new_fname, index=False)
    # replace old w/ new: remove only the summary that was loaded above, and never the
    # file just written (same name when no new subject was added)
    if len(initial_fname) > 0 and os.path.abspath(initial_fname[0]) != os.path.abspath(new_fname):
        os.remove(initial_fname[0])
else:
    # pd.concat raises on an empty list; nothing to write in that case
    print('No dots data to save')

print('Dots processing completed')


#-----------------------
# merge all
#-----------------------

# Gather every file in Summary/, leaving out temporary '~$' lock files and
# previously written 'All-summary' outputs.
summary_files = [
    path for path in glob.glob(f'{data_dir}/Summary/*')
    if not ('~$' in path or 'All-summary' in path)
]
# NOTE(review): `snt_utils` is not imported in the visible notebook - confirm where it comes from.
# Merge the summaries, then put the 'sub_id' and 'dx' columns first.
merged_df = snt_utils.move_cols_to_front(snt_utils.merge_dfs(summary_files), ['sub_id', 'dx'])
merged_df.to_excel(f'{data_dir}/Summary/All-summary_n{len(merged_df)}.xlsx', index=False)