In [2]:
import info, utils, preprocess



In [3]:
import pandas as pd
import numpy as np
import json
import re
import glob
from datetime import date
import math

validated_decisions = pd.read_excel(f'../data/snt_validated_sorted_mem50_n81.xlsx')
validated_decisions = validated_decisions.sort_values(by = 'Decision').reset_index(drop=True)

# TODO : this ^ is diff for schema....
character_roles = info.character_roles + ['neutral']
display(info.decision_trials.head(3))

# the pavlovia files are in csv
csv_dir = '../data/CSVs/'
csvs    = [csv for csv in glob.glob(csv_dir + "/*.csv")]
print(f'Found {len(csvs)} CSVs')

Unnamed: 0,slide_num,cogent_slide_num,scene_num,trial_type,dimension,decision_num,char_decision_num,char_role,char_role_num,cogent_opt1,...,cogent_duration,cogent_onset_2015,cogent_offset_2015,cogent_duration_2015,online_opt1_affil,online_opt2_affil,online_opt1_power,online_opt2_power,Opt1_txt_online,Opt2_txt_online
0,4,slide_4,1,Decision,affil,1,1,First,1,1.0,...,11.991,55.972,67.995,12.023,-1.0,1.0,0.0,0.0,YourememberManhewasapain,YourememberYoureallylikedhimbackthen
1,6,slide_6,1,Decision,affil,2,2,First,1,1.0,...,11.99,72.001,84.027,12.026,1.0,-1.0,0.0,0.0,Hughimforalongmoment,Shakehishandinstead
2,8,slide_8,1,Decision,affil,3,3,First,1,1.0,...,11.99,92.032,104.038,12.006,-1.0,1.0,0.0,0.0,YupItsbeenawhileImstillgettingsettled,YupWeshouldcatchupHowaboutdinnersoon


Found 5 CSVs


# read & clean

In [4]:
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")

    
class ParseCsv:
    
    def __init__(self, csv_fname):

        self.csv      = csv_fname
        self.data     = pd.read_csv(csv_fname)
        self.sub_id   = self.data.prolific_id[0]
        self.task_ver = self.data['task_ver'].values[0]
    
        self.task_functions = {'snt': self.process_snt,
                               'memory': self.process_memory,
                               'dots': self.process_dots, 
                               'forced_choice': self.process_forced_choice}
                               #'judgments': self.process_character_judgments}


        # ordered: ['first', 'second', 'assistant', 'powerful', 'boss', 'neutral']
        self.img_sets = {'OFA': ['OlderFemaleBl_2','OlderMaleW_1','OlderMaleBr_2','OlderMaleW_4','OlderFemaleBr_3','OlderFemaleW_1'],
                            'OFB': ['OlderFemaleW_2','OlderMaleBr_1','OlderMaleW_5','OlderMaleBl_3','OlderFemaleW_3','OlderFemaleBl_1'], 
                            'OFC': ['OlderFemaleBl_2','OlderMaleBr_1','OlderMaleBr_4','OlderMaleW_5','OlderFemaleW_3','OlderFemaleW_1'], 
                            'OFD': ['OlderFemaleW_2','OlderMaleW_1','OlderMaleW_5','OlderMaleBr_3','OlderFemaleBr_3','OlderFemaleBl_1'], 
                            'OMA': ['OlderMaleBr_2','OlderFemaleW_2','OlderFemaleBr_5','OlderFemaleW_3','OlderMaleBr_1','OlderMaleW_5'], 
                            'OMB': ['OlderMaleW_1','OlderFemaleBl_2','OlderFemaleW_1','OlderFemaleBl_3','OlderMaleW_4','OlderMaleBr_4'], 
                            'OMC': ['OlderMaleBr_4','OlderFemaleBl_2','OlderFemaleBl_1','OlderFemaleW_3','OlderMaleW_3','OlderMaleW_5'], 
                            'OMD': ['OlderMaleW_1','OlderFemaleW_2','OlderFemaleW_1','OlderFemaleBr_5','OlderMaleBr_3','OlderMaleBr_4'], 
                            'YFA': ['YoungerFemaleBr_1','YoungerMaleW_4','OlderMaleBr_4','YoungerMaleW_3','OlderFemaleBr_5','OlderFemaleW_1'], 
                            'YFB': ['YoungerFemaleW_3','YoungerMaleBr_2','YoungerMaleW_2','OlderMaleBr_3','OlderFemaleW_4','OlderFemaleBl_1'], 
                            'YFC': ['YoungerFemaleBr_1','YoungerMaleBr_2','OlderMaleBr_4','OlderMaleW_4','OlderFemaleW_3','OlderFemaleW_1'], 
                            'YFD': ['YoungerFemaleW_3','YoungerMaleW_4','OlderMaleW_5','OlderMaleBr_3','OlderFemaleBr_5','OlderFemaleBl_1'],
                            'YMA': ['YoungerMaleBr_2','YoungerFemaleW_3','OlderFemaleBl_1','OlderFemaleW_4','OlderMaleBr_3','YoungerMaleW_2'],
                            'YMB': ['YoungerMaleW_4','YoungerFemaleBr_1','OlderFemaleW_1','OlderFemaleBr_5','YoungerMaleW_3','OlderMaleBr_4'],
                            'YMC': ['YoungerMaleBr_2','YoungerFemaleBr_1','OlderFemaleBl_1','OlderFemaleW_3','OlderMaleW_3','OlderMaleW_5'],
                            'YMD': ['YoungerMaleW_4','YoungerFemaleW_3','OlderFemaleW_1','OlderFemaleBr_3','OlderMaleBr_3','OlderMaleBr_4']}

        self.clean()

    def clean(self):
        
        # data can be two identical rows for some reason
        if self.data.shape[0] > 1: 
            self.data = self.data.iloc[0,:].to_frame().T
        
        ## standardize naming conventions ##

        # make everything lower case
        self.data.columns = map(str.lower, self.data.columns)
        self.data = self.data.apply(lambda x: x.astype(str).str.lower())

        # replace character names in dataframe entries
        replace_substrings = {'newcomb':'powerful', 'hayworth':'boss'}

        if 'O' in self.task_ver or 'Y' in self.task_ver:  # this doesnt apply to adolescent version...
            if 'F' in self.task_ver: 
                order = ['maya','chris','anthony','newcomb','hayworth','kayce']
            else: 
                order = ['chris','maya','kayce','newcomb','hayworth','anthony']
            for name in order: replace_substrings[name] = character_roles[order.index(name)]

        self.data.replace(replace_substrings, inplace=True, regex=True) # replace elements
        
        # replace column headers (incl. w/ character names)
        replace_substrings['.'] = '_'
        replace_substrings['narrative'] = 'snt'
        replace_substrings['demographics'] = 'judgments'
        for k,i in replace_substrings.items():
            self.data.columns = self.data.columns.str.replace(k,i, regex=True)
        
        # race judgments may need to be reworked
        cols = utils.find_pattern(self.data.columns, 'race_*_*')
        if len(cols) > 0: 
            rename = {}
            for col in cols:
                split_ = col.split('_')
                rename[col] = f'judgment_{split_[1]}_{split_[0]}_{split_[2]}'
            self.data.rename(columns=rename, inplace=True)
            
        return self.data
   
    def run_pipeline(self):
        
        possible_tasks = np.array(list(self.task_functions.keys()))
        tasks_mask = np.array([any([task in c for c in self.data.columns]) for task in possible_tasks])
        tasks = possible_tasks[tasks_mask]

        # preprocess the tasks if they completed snt
        post_snt = []
        if 'snt' not in tasks: 
            print(f'Subject does not appear to have completed the SNT')
        else:
            self.task_functions['snt']()
            for task in tasks[1:]:
                post_snt.append(self.task_functions[task]())
        self.post = pd.concat(post_snt, axis=1)
        
        return [self.snt, self.post]

    def process_characters(self):
    
        # simple classes: masculine & feminine, dark skin & light skin

        if 'character_info_' not in self.data.columns: # older version
            img_names = [i.lower() for i in self.img_sets[self.task_ver]]
        else: # newer version
            img_names = [self.data[f'character_info_{r}_img'].values[0].lower() for r in character_roles] # case insensitive

        gender_bool    = [any([ss in i for ss in ['girl','woman','female']]) for i in img_names]
        skincolor_bool = [any([ss in i for ss in ['br','bl','brown','black','dark']]) for i in img_names]

        # make into df
        self.characters = pd.concat([pd.DataFrame(np.array(['feminine' if b  else 'masculine' for b in gender_bool])[np.newaxis], 
                                                    index=[self.sub_id], columns=[f'{r}_gender' for r in character_roles]),
                                     pd.DataFrame(np.array(['brown' if b  else 'white' for b in skincolor_bool])[np.newaxis], 
                                                    index=[self.sub_id], columns=[f'{r}_skincolor' for r in character_roles])], axis=1)
                
        return self.characters
 
    def process_snt(self):
    
        snt_bps  = np.array([int(re.sub('["\]"]', '', d.split(':')[1])) for d in self.data['snt_choices'].values[0].split(',')]) # single column
        snt_opts = self.data['snt_opts_order'].values[0].split('","') # split on delimter
        self.snt = pd.DataFrame(columns=['decision_num', 'button_press', 'decision', 'affil', 'power'])

        for q, question in enumerate(snt_opts):

            # organize
            opt1    = utils.remove_nontext(question.split(';')[1]) # this delimeter might change?
            opt2    = utils.remove_nontext(question.split(';')[2])
            sort_ix = np.argsort((opt1, opt2)) # order options alphabetically

            # parse the choice
            choice   = sort_ix[snt_bps[q] - 1] + 1 # choice -> 1 or 2, depending on alphabetical ordering
            decision = validated_decisions.iloc[q]
            affl = np.array(decision['Opt{}_Affl_Discrete'.format(int(choice))]) # grab the correct option's affil value
            pwr  = np.array(decision['Opt{}_Pwr_Discrete'.format(int(choice))]) # & power
            self.snt.loc[q,:] = [q + 1, snt_bps[q], affl + pwr, affl, pwr]

        snt_rts = np.array([int(utils.remove_nontext(rt)) for rt in self.data['snt_rts'].values[0].split(',')])
        self.snt['reaction_time'] = snt_rts[np.array(validated_decisions['Slide_num']) - 1] / 1000
    
        self.snt = info.decision_trials[['decision_num','dimension','scene_num','char_role_num','char_decision_num']].merge(self.snt, on='decision_num')
        convert_dict = {'decision_num': int,
                        'dimension': str,
                        'scene_num': int,
                        'char_role_num': int,
                        'char_decision_num': int,
                        'button_press': int,
                        'decision': int,
                        'affil': int,
                        'power': int,
                        'reaction_time': float}
        self.snt = self.snt.astype(convert_dict)
        # snt_df.to_excel(f'{self.data_dir}/Task/Organized/SNT_{self.sub_id}.xlsx', index=False)

        return self.snt

    def process_memory(self, version='adult'):

        cols = [c for c in self.data.columns if 'memory' in c]
        if len(cols) == 0:
            print('There are no memory columns in the csv')
            
        else: 

            # correct answers when questions are alphabetically sorted
            # {0: 'first', 1: 'second', 2: 'assistant', 3: 'newcomb', 4: 'hayworth', 5: 'neutral'}
            if version == 'adult':
                corr = [1,4,5,4,5,0,0,0,4,1,3,3,1,4,5,3,1,0,2,5,5,2,2,2,3,3,0,1,2,4] 
                # original? : [1,3,5,3,5,0,0,0,3,1,2,2,1,3,5,2,1,0,4,5,5,4,4,4,2,2,0,1,4,3]...????
            elif version == 'adolescent':
                corr = [1,4,0,5,5,4,0,0,0,4,3,3,3,4,1,5,3,1,2,5,2,5,1,2,2,2,3,0,1,4]
            
            if 'memory_resps' in cols or 'character_memory' in cols: # older version
                # these versions compressed responses into a single column with delimeter
                try: 
                    memory_  = [t.split(';')[1:2] for t in self.data['memory_resps'].values[0].split('","')]
                except: 
                    memory_  = [t.split(';')[1:2] for t in self.data['character_memory'].values[0].split('","')]
                ques_ = [m[0].split(':')[0] for m in memory_]
                resp_ = [m[0].split(':')[1] for m in memory_]

            else: # newer version

                ques_  = self.data[[c for c in cols if 'question' in c]].values[0]
                resp_  = self.data[[c for c in cols if 'resp' in c]].values[0]
            
            memory = sorted(list(zip(ques_, resp_)))
            self.memory = pd.DataFrame(np.zeros((1,6)), columns=[f'memory_{cr}' for cr in character_roles])
            for r, resp in enumerate(memory): 
                if resp[1] == character_roles[corr[r]]: 
                    self.memory[f'memory_{character_roles[corr[r]]}'] += 1/5

            # combine summary & trial x trial
            self.memory['memory_mean'] = np.mean(self.memory.values)
            self.memory['memory_rt']   = np.mean(self.data[[c for c in cols if 'rt' in c]].values[0].astype(float) / 1000)
            memory_resp_df = pd.DataFrame(np.array([r[1] for r in memory]).reshape(1, -1), 
                                          columns=[f'memory_{q + 1 :02d}_{character_roles[r]}' for q, r in enumerate(corr)])

            self.memory = pd.concat([self.memory, memory_resp_df], axis=1)
            self.memory.index = [self.sub_id]
            self.memory.insert(0, 'task_ver', self.data['task_ver'].values[0])
                
            return self.memory
    
    def process_dots(self):

        cols = [c for c in self.data.columns if 'dots' in c]
        if len(cols) == 0:
            print('There are no dots columns in the csv')
            
        else: 
            
            self.dots = pd.DataFrame(index=[self.sub_id], columns=[f'{c}_dots_{d}' for c in character_roles for d in ['affil','power']])

            # rename & standardize 
            if 'dots_resps' in cols: # older version 
                for row in data['dots_resps'].values[0].split(','):
                    split_ = row.split(';')
                    role = utils.remove_nontext(split_[0].split(':')[0])
                    self.dots[f'{role}_dots_affil'] = (float(split_[1].split(':')[1]) - 500)/500
                    self.dots[f'{role}_dots_power'] = (500 - float(split_[2].split(':')[1]))/500

            else: # newer version 
                for role in character_roles:
                    self.dots[f'{role}_dots_affil'] = (float(self.data[f'dots_{role}_affil'].values[0]) - 500)/500
                    self.dots[f'{role}_dots_power'] = (500 - float(self.data[f'dots_{role}_power'].values[0]))/500

            # get means
            for dim in ['affil','power']:
                self.dots[f'dots_{dim}_mean'] = np.mean(self.dots[[c for c in self.dots.columns if dim in c]],1).values[0]
                    
            return self.dots

    def process_forced_choice(self):

        cols = [c for c in self.data.columns if 'forced_choice' in c]
        if len(cols) == 0:
            print('There are no character judgments columns in the csv')
        
        else:
        
            choice_df_ = self.data[cols]
            n_choices  = int(len(choice_df_.columns) / 3) # 3 cols for each trial

            self.forced_choice = pd.DataFrame()
            for t in np.arange(0, n_choices):

                options = choice_df_[f'forced_choice_{t}_comparison'].values[0].split('_&_')
                rt      = float(choice_df_[f'forced_choice_{t}_rt'].values[0])

                # organize the responses
                resp    = float(choice_df_[f'forced_choice_{t}_resp'].values[0]) - 50 # center
                if resp < 0: choice = options[0]
                else:        choice = options[1]

                ans                   = np.array([0, 0])
                options               = sorted(options)
                ans_ix                = options.index(choice)
                ans[ans_ix]           = np.abs(resp)
                ans[np.abs(ans_ix-1)] = -np.abs(resp)

                self.forced_choice.loc[0, [f'{options[0]}_v_{options[1]}_{options[0]}']] = ans[0]
                self.forced_choice.loc[0, [f'{options[0]}_v_{options[1]}_{options[1]}']] = ans[1]   
                self.forced_choice.loc[0, [f'{options[0]}_v_{options[1]}_reaction_time']] = rt

            self.forced_choice.index = [self.sub_id]
            self.forced_choice.columns = ['forced_choice_' + c for c in self.forced_choice.columns]
                
            return self.forced_choice

        
for csv in csvs:
    data = pd.read_csv(csv)
    parser = ParseCsv(csv)
    data_ = parser.data

    # testing snt parsing
    snt, post = parser.run_pipeline()
    # print(f'{csv} snt has shape={snt.shape}')
    # display(snt.head(3))

    # print(f'{csv} post task has shape={post.shape}')

    # testing judgments parsing
    # judgements = parser.process_character_judgments()

    if len([c for c in data_.columns if 'judgments' in c]) == 0:
        print(f'MISSING judgments: {csv}')
        break
        
    # else:
    #     parser = ParseCsv(csv)
    #     data_ = parser.data
    #     characters = parser.process_characters()
    #     snt = parser.process_snt()
    #     memory = parser.process_memory()
    #     dots = parser.process_dots()
    #     ratings = parser.process_character_judgments()
    #     snt, post = parser.run_pipeline()



MISSING judgments: ../data/CSVs/Prolific-initial_older-format.csv


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [5]:
sub_id = '1'
for csv in csvs: 
    
    parser = ParseCsv(csv)
    data = parser.data
    # process_character_judgments(a)

    cols = [ccc for ccc in [cc for cc in [c for c in data.columns if 'judgments' in c] if 'resp' in cc] if 'snt_schema_judgments' not in ccc]
    if len(cols) == 0:
        print('There are no character judgments columns in the csv')
        break 
    else: 
                    
        if len(data[cols]): judgments = data[cols].iloc[0,:].values
        else:               judgments = data[cols].values
        judgment_cols = [('_').join(j.split('_')[1:3]) for j in cols]
        judgments = pd.DataFrame(judgments.reshape(1,-1), 
                                index=[sub_id], columns=judgment_cols)

        # mean of character judgments                   
        rating_dims = np.unique([('_').join(j.split('_')[1:]) for j in judgment_cols if 'self' not in j])
        for dim in rating_dims: 
            judgments[f'{dim}_mean'] = np.mean(judgments[[f'{r}_{dim}' for r in character_roles]], axis=1)[0]

There are no character judgments columns in the csv


In [6]:
# 'character_dimensions', 'character_relationship', 'character_race'
# 
rows = [char.split(';') for char in parser.data['character_relationship'].values[0].split(',')]
for row in rows: 
    role = utils.remove_nontext(row[0])
    print([utils.remove_nontext(r) for r in row[1:]])


['valence100', 'stability100', 'arousal68', 'rt7772']
['valence50', 'stability72', 'arousal1', 'rt6643']
['valence63', 'stability72', 'arousal68', 'rt6838']
['valence64', 'stability93', 'arousal51', 'rt6698']
['valence77', 'stability91', 'arousal70', 'rt7609']
['valence95', 'stability92', 'arousal76', 'rt7303']


In [None]:
# # relationship evaluations
# try:
#     character_relationship = [char.split(';') for char in data['character_relationship'].values[0].split(',')]
# except: 
#     character_relationship = [char.split(';') for char in data['relationship_feelings'].values[0].split(',')]
    
# rel_df = pd.DataFrame(index=[sub_id], columns=[char + '_' + rel for char in character_roles for rel in ['arousal','valence','stability']])
# for char in character_relationship:
#     role = char_roles[char_names.index(remove_nontext(char[0]))]
#     for r in range(1,4):
#         rel = char[r].split(':')[0]
#         rating = float(char[r].split(':')[1])
#         rel_df[role + '_' + rel] = rating
# for dim in ['arousal','valence','stability']:
#     rel_df['mean_' + dim] = np.mean(rel_df[[char + '_' + dim for char in character_roles]].values)

# # character evaluations    
# dims = ['status', 'competence', 'youthful', 'gender', 'similarity', 'approachable', 'dominant', 'likability', 'trustworthy', 'race']
# try:
#     resps = [row.split(';') for row in data['character_dimensions'].values[0].split(',')]
# except: 
#     resps = [row.split(';') for row in data['character_ratings'].values[0].split(',')]

# dim_df = pd.DataFrame(index=[sub_id], columns=[char + '_' + dim for char in char_roles for dim in dims])
# for resp in resps:
#     name = remove_nontext(resp[0])
#     if name != 'attention':
#         role = char_roles[char_names.index(remove_nontext(resp[0]))]
#         dim_df[role + '_' + resp[1].split(':')[0]] = int(resp[1].split(':')[1])

# for dim in ['status', 'competence', 'youthful', 'gender', 'similarity', 'approachable', 'dominant',

#  'likability', 'trustworthy']:
#             dim_df['mean_' + dim] = np.mean(dim_df[[char + '_' + dim for char in char_roles]].values)

#         # add race
#         try:
#             char_race = data['character_race'].values[0].split('","')
#         except:
#             char_race = data['race_categorization'].values[0].split('","')
#         for resp in char_race:
#             role = char_roles[char_names.index(remove_nontext(resp.split(';')[0].split(',')[0]))]
#             dim_df[role + '_race'] = resp.split(';')[1].split(':')[1]


#             snt_good = data['storyline_questions.good'].values
#             snt_bad = data['storyline_questions.bad'].values
#             if 'ug_questions.good' in data:
#                 ug_good = data['ug_questions.good'].values
#                 ug_bad = data['ug_questions.bad'].values
#             else:
#                 ug_good = ['nan']
#                 ug_bad = ['nan']
#             feedback_dfs.append([sub_id, list(snt_good)[0], list(snt_bad)[0], list(ug_good)[0], list(ug_bad)[0]])