In [None]:
""" Preparatory block """
import pandas as pd

## 1. Load the collected data into pandas-friendly data structures.

The Excel spreadsheet contains many sheets, each with one participant's data.
For these analyses, all trials will be loaded into a single dataframe.

In [None]:
# Named columns copied from every Excel sheet into the combined raw-data frame.
cols_to_keep = [
    'Hit_Miss_raw',
    'MaskedStim_raw',
    'RT_raw',
    'choice_raw',
    'neutral',
]

def row_is_data(neutral, hitmiss):
    """ Return True if row contains raw data.

    A row counts as raw data when `hitmiss` reads 'hit' or 'miss'
    (case-insensitive, surrounding quote characters ignored) and `neutral`
    converts to an integer strictly between 1000 and 9999.

    Any `neutral` value that cannot be converted to an integer (None, NaN,
    infinity, free text) makes the row non-data rather than raising.
    """
    label = str(hitmiss).lower().strip("\"").strip("'")
    if label not in ('hit', 'miss'):
        return False
    try:
        return 1000 < int(neutral) < 9999
    except (TypeError, ValueError, OverflowError):
        # int() raises TypeError for None, ValueError for NaN or non-numeric
        # text, and OverflowError for infinity; none of those are data rows.
        return False

def row_is_meta(neutral, hitrate):
    """ Return True if the row contains metadata.

    A row is metadata when `neutral`, lower-cased, is exactly 'session' or
    'participant', or when the lower-cased text of `hitrate` contains
    'anxiety' or 'rate'.
    """
    if str(neutral).lower() in ('session', 'participant'):
        return True
    label = str(hitrate).lower()
    return any(marker in label for marker in ('anxiety', 'rate'))

def meta_in_row(row):
    """ Return one metadata (key, value) tuple from one row

    `row` is read by position: row[6] holds the metadata label and row[7] its
    value, while row[1] / row[2] hold the 'session' / 'participant' label and
    its value. (The caller passes rows from DataFrame.itertuples(), where
    position 0 is the index.)

    Returns ('hit rate', float), ('state anxiety', int), ('trait anxiety', int),
    ('session', str) or ('participant', str). A numeric value that cannot be
    converted falls back to 0.0 / 0. Returns None when the row matches none of
    the known labels.
    """
    label = str(row[6]).lower()
    # (key, converter, fallback) in the order the labels are checked.
    numeric_fields = (
        ('hit rate', float, 0.0),
        ('state anxiety', int, 0),
        ('trait anxiety', int, 0),
    )
    for key, convert, fallback in numeric_fields:
        if key in label:
            try:
                return (key, convert(row[7]))
            except (TypeError, ValueError, OverflowError):
                # Missing (None/NaN), infinite or non-numeric cell: use the default.
                return (key, fallback)

    # Compare case-insensitively, matching how row_is_meta selects these rows.
    name = str(row[1]).lower()
    if name in ('session', 'participant'):
        return (name, str(row[2]))
    return None

def meta_from_df(df, pid):
    """ Extract meta-data from the dataframe, return it as a dict.

    `df` holds the metadata rows of one sheet and `pid` is the participant id
    stored under the 'pid' key. Each row is passed to meta_in_row(); rows it
    does not recognise (it returns None) are skipped instead of aborting the
    whole sheet.
    """
    meta_dict = {'pid': pid}
    if str(df.columns[5]).lower() == 'hit rate':
        # For most sheets, meta 'Hit Rate' is embedded in the top row as a column header,
        # but this check is necessary because exactly one sheet moves it down a row. :(
        meta_dict['hit rate'] = float(df.columns[6])
    # After harvesting the Hit Rate value, give that column a real name.
    # NOTE(review): this rename only touches the local copy, and rows are read
    # by position below, so it does not change the returned dict.
    df = df.rename({df.columns[6]: 'score'}, axis='columns')
    for row in df.itertuples():
        pair = meta_in_row(row)
        if pair is None:
            # The row was selected as metadata but carries no known label.
            continue
        key, value = pair
        meta_dict[key] = value
    return meta_dict

def harvest_sheets(sheets, path="data.xlsx"):
    """ Pull data from each sheet in sheets, return two dataframes.

    sheets: iterable of sheet names; only names ending in '.tsv' are loaded,
        the rest are reported and ignored.
    path: the Excel workbook to read (defaults to the original "data.xlsx").

    Returns (metadata, rawdata): one metadata row per loaded sheet, and the
    raw-data rows of all loaded sheets concatenated, restricted to 'pid',
    'datetime' and cols_to_keep. If no sheet is loaded, rawdata is an empty
    frame with those columns.
    """
    dataframes = []
    metadicts = []

    # Open the workbook once rather than re-reading the file for every sheet.
    with pd.ExcelFile(path) as workbook:
        for i, sheet in enumerate(sheets):
            if not sheet.endswith(".tsv"):
                print("       ignoring sheet {}".format(sheet))
                continue

            print("  {:03}. loading sheet '{}'".format(i + 1, sheet))
            df = workbook.parse(sheet)

            # Assumes sheet names look like '<pid>-<datetime>.tsv'. partition()
            # keeps the whole stem as pid when there is no '-', where find()
            # would return -1 and silently mis-slice the name.
            pid, _, stamp = sheet[:-4].partition('-')
            print("       {}, {}".format(pid, stamp))

            # Classify rows by column position using plain tuples; integer keys
            # on a label-indexed row Series are deprecated in pandas.
            rows = list(df.itertuples(index=False, name=None))
            is_data = pd.Series(
                [row_is_data(r[0], r[1]) for r in rows], index=df.index, dtype=bool
            )
            is_meta = pd.Series(
                [row_is_meta(r[0], r[5]) for r in rows], index=df.index, dtype=bool
            )

            df['pid'] = pid
            df['datetime'] = stamp

            # df has everything from the excel sheet; keep only the raw-data rows and columns.
            dataframes.append(df[is_data][['pid', 'datetime', ] + cols_to_keep])
            # The metadata rows of the same sheet are reduced to a single dict.
            metadicts.append(meta_from_df(df[is_meta], pid))

    if not dataframes:
        # pd.concat() raises ValueError on an empty list.
        return pd.DataFrame(metadicts), pd.DataFrame(columns=['pid', 'datetime', ] + cols_to_keep)
    return pd.DataFrame(metadicts), pd.concat(dataframes)


""" Read the Excel file and collect most of its data. 
    Save raw-data and meta-data out to separate tsv files.
"""
xl = pd.ExcelFile("data.xlsx")
print("{} data sheets were found".format(len(xl.sheet_names)))
df_meta, df_data = harvest_sheets(xl.sheet_names)

# Metadata is written with its default index; raw data labels its index column "sequence".
df_meta.to_csv("metadata.tsv", sep="\t")
df_data.to_csv("rawdata.tsv", index_label="sequence", sep="\t")
# harvest_sheets ignores sheets whose names do not end in '.tsv', so report the
# number actually loaded (one metadata row per loaded sheet), not the number found.
print("{} data sheets loaded into a {}-shaped dataframe with {}-shaped metadata.".format(
    len(df_meta), df_data.shape, df_meta.shape
))
