In [1]:
import pandas as pd
import common

## 1. Load the collected data into pandas-friendly data structures.

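The Excel spreadsheet contains many sheets; each sheet whose name ends in `.tsv` holds one participant's data, and a separate `Gender` sheet lists the participants' sex.
For these analyses, all trials will be loaded into a single dataframe, with the per-participant metadata collected into a second one.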

In [2]:
def pid_from_sheet(sheet):
    """ Pull characters from sheet name before the first hyphen.

    A sheet name without any hyphen is returned unchanged. (Slicing with
    the -1 that str.find reports for "not found" would silently drop the
    last character instead.)
    """
    return sheet.partition('-')[0]

def row_is_data(neutral, hitmiss):
    """ Return True if row contains raw data.

    A row counts as raw data when `hitmiss`, lower-cased and with any
    surrounding double or single quotes stripped, is 'hit' or 'miss', AND
    `neutral` converts to an integer strictly between 1000 and 9999.

    Anything that cannot be converted to an integer (None, NaN, infinity,
    free text) makes the row "not data" rather than raising.
    """
    outcome = str(hitmiss).lower().strip("\"").strip("'")
    if outcome not in ('hit', 'miss'):
        return False
    try:
        return 1000 < int(neutral) < 9999
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: NaN or non-numeric text; OverflowError: +/-inf.
        return False

def row_is_meta(neutral, hitrate):
    """ Tell whether a row carries metadata rather than trial data.

    True when `neutral` (case-insensitively) reads 'session' or
    'participant', or when the text of `hitrate` (case-insensitively)
    contains 'anxiety' or 'rate'; False otherwise.
    """
    label = str(hitrate).lower()
    return (
        str(neutral).lower() in ('session', 'participant')
        or 'anxiety' in label
        or 'rate' in label
    )

def _cast_or_default(caster, raw):
    """ Return caster(raw), or the caster's zero value (caster()) when raw cannot be converted. """
    try:
        return caster(raw)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None; ValueError: NaN or non-numeric text; OverflowError: +/-inf.
        return caster()


def meta_in_row(row):
    """ Return one metadata (key, value) tuple from one row, with py-friendly naming.

    `row` is indexed by position: row[1] and row[2] hold a
    'session'/'participant' label and its value, row[6] and row[7] hold a
    'hit rate' / 'state anxiety' / 'trait anxiety' label and its value.
    All label comparisons are case-insensitive, matching row_is_meta.

    Returns:
        ('xl_hit_rate', float), ('state_anxiety', int), ('trait_anxiety', int),
        ('session', str) or ('participant', str); numeric values that cannot
        be converted fall back to 0. Returns None when the row holds none of
        these labels.
    """
    label = str(row[6]).lower()
    if 'hit rate' in label:
        return ('xl_hit_rate', _cast_or_default(float, row[7]))
    if 'state anxiety' in label:
        return ('state_anxiety', _cast_or_default(int, row[7]))
    if 'trait anxiety' in label:
        return ('trait_anxiety', _cast_or_default(int, row[7]))

    # Lower-case before comparing so 'Session' / 'Participant' rows, which
    # row_is_meta accepts, are recognized here as well.
    key = str(row[1]).lower()
    if key in ('session', 'participant'):
        return (key, str(row[2]))
    return None

def meta_from_df(df, sheet):
    """ Extract meta-data from the dataframe, return it as a dict.

    Args:
        df: dataframe holding the metadata rows of one sheet. Columns are
            addressed by position: column 5 may carry the 'Hit Rate' label
            as its header, with the value as the header of column 6.
        sheet: the sheet name; its last four characters are dropped to
            form the 'sheet' value, and the part before the first hyphen
            becomes 'pid'.

    Returns:
        dict with 'sheet' and 'pid', every (key, value) pair meta_in_row
        finds in the rows, and defaults for 'session' (1) and
        'participant' (the pid) when the sheet does not provide them.
    """
    meta_dict = {'sheet': sheet[: -4], 'pid': pid_from_sheet(sheet)}
    if str(df.columns[5]).lower() == 'hit rate':
        # For most sheets, meta 'Hit Rate' is embedded in the top row as a column header.
        # but this check is necessary because exactly one sheet moves it down a row. :(
        try:
            meta_dict['xl_hit_rate'] = float(df.columns[6])
        except (TypeError, ValueError):
            # Same fallback meta_in_row uses for an unreadable hit rate.
            meta_dict['xl_hit_rate'] = 0.0
    for row in df.itertuples():
        pair = meta_in_row(row)
        if pair is None:
            # The row passed the metadata filter but carries no recognized
            # label; skip it instead of failing to unpack None.
            continue
        k, v = pair
        meta_dict[k] = v
    # One sheet is missing meta_data, so fill it in.
    meta_dict.setdefault('session', 1)
    meta_dict.setdefault('participant', meta_dict['pid'])
    return meta_dict

def _sex_by_sheet(gender):
    """ Turn the 'Gender' sheet into a sheet-indexed dataframe with one 'sex' column. """
    # The sheet lists names in a 'Female' and a 'Male' column; stack them
    # into one (pid, sex) frame, ignoring the empty cells of the shorter column.
    stacked = pd.concat(
        [
            pd.DataFrame({'pid': gender[column].dropna(), 'sex': code})
            for column, code in (('Female', 'F'), ('Male', 'M'))
        ],
        axis=0,
    )
    # Drop the last four characters of each listed name to get the same
    # key that the trial sheets use for their 'sheet' value.
    stacked['sheet'] = stacked['pid'].apply(lambda x: x[: -4])
    df_sex = stacked.set_index('sheet')[['sex', ]]
    # Remove the duplicate sheet name entered twice in the Excel 'Gender' sheet.
    return df_sex[~df_sex.index.duplicated()]


def harvest_sheets(xlf):
    """ Pull data from each sheet in the workbook, return two dataframes.

    Args:
        xlf: path (or file-like object) of the Excel workbook. Every sheet,
            including 'Gender', is read from this one workbook.

    Returns:
        (df_meta, df_data): df_meta has one row per '.tsv' sheet, indexed
        by sheet name, with the harvested metadata plus 'sex'; df_data has
        one row per trial with the sheet, pid, datetime, the raw columns
        and 'sex'. 'sex' is NaN where the 'Gender' sheet gives no value or
        the workbook has no 'Gender' sheet.
    """
    dataframes = []
    metadicts = []
    cols_to_keep = ['Hit_Miss_raw', 'MaskedStim_raw', 'RT_raw', 'choice_raw', 'neutral']
    df_sex = None

    # Open the workbook once, parse every sheet from that handle, and close
    # it when done, rather than re-opening the file for each sheet.
    with pd.ExcelFile(xlf) as xl:
        sheet_names = list(xl.sheet_names)
        print("{} data sheets were found".format(len(sheet_names)))

        for i, sheet in enumerate(sheet_names):
            if sheet.endswith(".tsv"):
                # Each tsv sheet contains complete results for each of 36 trials per participant
                pid = pid_from_sheet(sheet)
                print("  {:03}. loading {} from sheet '{}'".format(i + 1, pid, sheet))
                df = xl.parse(sheet)
                df['sheet'] = sheet[: -4]
                df['pid'] = pid
                df['datetime'] = sheet[sheet.find('-') + 1: -4]

                # Address the classifier inputs strictly by position. Integer
                # keys on a label-indexed row can resolve as labels (a numeric
                # column header equal to 0, 1 or 5 would be picked instead).
                neutral, hitmiss, hitrate = df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 5]

                # The dataframe, df, has everything from the excel sheet, keep the raw data.
                is_data = pd.Series(
                    [row_is_data(n, h) for n, h in zip(neutral, hitmiss)],
                    index=df.index, dtype=bool,
                )
                dataframes.append(df.loc[is_data, ['sheet', 'pid', 'datetime', ] + cols_to_keep])

                # ... and hand the meta-data rows to meta_from_df.
                is_meta = pd.Series(
                    [row_is_meta(n, r) for n, r in zip(neutral, hitrate)],
                    index=df.index, dtype=bool,
                )
                metadicts.append(meta_from_df(df[is_meta], sheet))
            elif sheet == "Gender":
                # This sheet appears to be the only place with sex information for all participants
                print("  {:03}. loading sex data from sheet '{}'".format(i + 1, sheet))
                df_sex = _sex_by_sheet(xl.parse(sheet))
            else:
                print("       ignoring sheet {}".format(sheet))

    # Concatenate participant metadata into a single dataframe.
    df_meta = pd.concat([pd.DataFrame(metadicts).set_index('sheet'), df_sex, ], axis=1, sort=True)
    if 'sex' not in df_meta.columns:
        # No 'Gender' sheet was found: keep the column so later steps can
        # still look it up, with every value missing.
        df_meta['sex'] = float('nan')

    # Concatenate all trials into a single dataframe, and add sex information.
    # The per-sheet row labels are kept as the index on purpose.
    df_data = pd.concat(dataframes)
    df_data['sex'] = df_data['sheet'].map(df_meta['sex'])

    # Report the sheets actually loaded as data, not every sheet in the workbook.
    print("{} data sheets loaded; {} participants found; {:,} trials.".format(
        len(dataframes), df_meta.shape[0], df_data.shape[0]
    ))

    return df_meta, df_data

In [3]:
""" Read the Excel file and collect most of its data. 
    Save raw-data and meta-data out to separate tsv files.
"""
# harvest_sheets returns the metadata dataframe and the trial dataframe, in
# that order; the input path comes from the project's `common` module.
df_meta, df_data = harvest_sheets(common.source_file)

# Both files are tab-separated. For the trial data, the dataframe index is
# written out under the column name "sequence".
df_meta.to_csv(common.meta_file, sep="\t")
df_data.to_csv(common.data_file, index_label="sequence", sep="\t")

159 data sheets were found
  001. loading jg01 from sheet 'jg01-05-16-2016-15-07.tsv'
  002. loading ap01 from sheet 'ap01-05-16-2016-15-21.tsv'
  003. loading jb01 from sheet 'jb01-05-17-2016-15-12.tsv'
  004. loading kl00 from sheet 'kl00-05-17-2016-15-24.tsv'
  005. loading ka01 from sheet 'ka01-05-18-2016-14-28.tsv'
  006. loading jl01 from sheet 'jl01-05-18-2016-15-10.tsv'
  007. loading js01 from sheet 'js01-05-18-2016-15-23.tsv'
  008. loading bg01 from sheet 'bg01-05-19-2016-15-11.tsv'
  009. loading dz01 from sheet 'dz01-05-19-2016-15-38.tsv'
  010. loading gp01 from sheet 'gp01-05-23-2016-08-02.tsv'
  011. loading ja00 from sheet 'ja00-05-23-2016-15-02.tsv'
  012. loading rv99 from sheet 'rv99-05-24-2016-10-38.tsv'
  013. loading na99 from sheet 'na99-05-24-2016-13-39.tsv'
  014. loading vm99 from sheet 'vm99-05-24-2016-13-54.tsv'
  015. loading bf99 from sheet 'bf99-05-24-2016-15-30.tsv'
  016. loading yq99 from sheet 'yq99-05-24-2016-15-40.tsv'
  017. loading ad99 from shee

In [4]:
# Sanity check: eyeball a few metadata rows. The sample is seeded so a
# Restart & Run All shows the same rows every time.
df_meta.sample(8, random_state=0)

Unnamed: 0_level_0,pid,xl_hit_rate,state_anxiety,trait_anxiety,session,participant,sex
sheet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
mp99-10-18-2016-15-03,mp99,0.555556,35,31,1,mp99,M
ak02-09-27-2016-15-08,ak02,0.444444,31,45,1,ak02,F
jr00-10-04-2016-15-09,jr00,0.583333,50,49,1,jr00,F
ns00-01-18-2018-10-44,ns00,0.555556,52,50,1,ns00,F
ci00-08-02-2017-11-27,ci00,0.472222,35,46,1,ci00,M
ges00-08-02-2017-11-51,ges00,0.555556,46,51,1,ges00,F
hy98-10-13-2016-12-44,hy98,0.416667,38,37,1,hy98,M
ja00-05-23-2016-15-02,ja00,0.472222,0,0,1,ja00,


In [5]:
# Sanity check: eyeball a few trial rows. The sample is seeded so a
# Restart & Run All shows the same rows every time.
df_data.sample(8, random_state=0)

Unnamed: 0,sheet,pid,datetime,Hit_Miss_raw,MaskedStim_raw,RT_raw,choice_raw,neutral,sex
15,pk02-10-18-2016-15-14,pk02,10-18-2016-15-14,'Miss','/Users/researchstudent3/Dropbox/sarika2/IAPS2...,8.866065,'slash',2038,M
19,kk99-01-18-2018-13-01,kk99,01-18-2018-13-01,'Miss','/Users/SarikaArora/Dropbox/sarika2/IAPS2008/6...,4.385511,'z',7140,F
24,yk00-01-18-2018-09-43,yk00,01-18-2018-09-43,'Hit','/Users/scienceresearch2/Dropbox/sarika2/IAPS2...,14.212268,'slash',7233,M
1,vm99-05-24-2016-13-54,vm99,05-24-2016-13-54,'Hit','/Users/SarikaArora/Dropbox/sarika2/IAPS2008/8...,2.377178,'slash',5130,F
24,xh99-10-13-2016-12-19,xh99,10-13-2016-12-19,'Hit','/Users/researchstudent1/Dropbox/sarika2/IAPS2...,1.536788,'slash',7233,M
34,kl99-01-18-2018-10-34,kl99,01-18-2018-10-34,'Miss','/Users/scienceresearch2/Dropbox/sarika2/IAPS2...,2.913352,'slash',2480,M
35,kp01-01-18-2018-13-33,kp01,01-18-2018-13-33,'Miss','/Users/scienceresearch3/Dropbox/sarika2/IAPS2...,17.638926,'slash',5740,M
19,kl00-05-17-2016-15-24,kl00,05-17-2016-15-24,'Hit','/Users/SarikaArora/Dropbox/sarika2/IAPS2008/5...,4.036808,'slash',7140,F


In [6]:
# Error check - list every participant with a missing metadata value. Where the
# sex is unknown, the participant has to be dropped from sex comparisons.
df_meta.loc[df_meta.isna().any(axis='columns')]

Unnamed: 0_level_0,pid,xl_hit_rate,state_anxiety,trait_anxiety,session,participant,sex
sheet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ja00-05-23-2016-15-02,ja00,0.472222,0,0,1,ja00,
km00-08-03-2017-12-32,km00,0.583333,40,43,1,km00,
mk99-10-14-2016-10-37,mk99,0.444444,61,43,1,mk99,


In [7]:
# Error check - only actual data rows were kept, so this is probably fine.
# The only gaps should be the sex information missing for three
# participants (36 * 3 = 108 rows).
df_data.loc[df_data.isna().any(axis='columns')]

Unnamed: 0,sheet,pid,datetime,Hit_Miss_raw,MaskedStim_raw,RT_raw,choice_raw,neutral,sex
0,ja00-05-23-2016-15-02,ja00,05-23-2016-15-02,'Miss','/Users/SarikaArora/Dropbox/sarika2/IAPS2008/6...,1.792425,'slash',7700,
1,ja00-05-23-2016-15-02,ja00,05-23-2016-15-02,'Miss','/Users/SarikaArora/Dropbox/sarika2/IAPS2008/3...,1.699087,'z',5130,
2,ja00-05-23-2016-15-02,ja00,05-23-2016-15-02,'Miss','/Users/SarikaArora/Dropbox/sarika2/IAPS2008/6...,16.663343,'z',9360,
3,ja00-05-23-2016-15-02,ja00,05-23-2016-15-02,'Miss','/Users/SarikaArora/Dropbox/sarika2/IAPS2008/6...,3.151378,'slash',7031,
4,ja00-05-23-2016-15-02,ja00,05-23-2016-15-02,'Hit','/Users/SarikaArora/Dropbox/sarika2/IAPS2008/5...,1.272671,'z',7040,
...,...,...,...,...,...,...,...,...,...
31,km00-08-03-2017-12-32,km00,08-03-2017-12-32,'Miss','/Users/scienceresearch2/Dropbox/sarika2/IAPS2...,5.703349,'slash',5731,
32,km00-08-03-2017-12-32,km00,08-03-2017-12-32,'Hit','/Users/scienceresearch2/Dropbox/sarika2/IAPS2...,5.844194,'z',7547,
33,km00-08-03-2017-12-32,km00,08-03-2017-12-32,'Hit','/Users/scienceresearch2/Dropbox/sarika2/IAPS2...,2.206360,'slash',7060,
34,km00-08-03-2017-12-32,km00,08-03-2017-12-32,'Hit','/Users/scienceresearch2/Dropbox/sarika2/IAPS2...,6.688367,'z',2480,
