The purpose of this script is to parse output from PsychoPy into fMRI log files (event timing files for each condition / stimulus, which can later be fed to FEAT). You must run this script separately for each participant.

# Import necessaries

In [None]:
%pylab inline
import pandas as pd
import os
import numpy as np
from IPython.core.display import display, HTML
# Inject a CSS rule that sets the notebook's `.container` element to 80% width
display(HTML("<style>.container { width:80% !important; }</style>"))

# Set pandas' display.max_rows option to 150
pd.set_option('display.max_rows', 150)

In [None]:
# Define top dir. The path is stored in a one-line text file, read relative to the working directory.
# A context manager is used so the file handle is closed as soon as the path has been read.
with open('../top_dir_win.txt') as topDirFile:
    # Strip newlines so the path can be concatenated directly with sub-directories
    topDir = topDirFile.read().replace('\n', '')

In [None]:
## Specify subject and input data
subj = "subject-001"

# Folder holding this subject's behavioural (PsychoPy) output
inFilePath = "".join([topDir, "/behavioural_data/fmri_runs1/", subj, '/'])

# Specify the directory for output (and create it if it doesn't already exist)
outFilePath = "".join([topDir, "/MRIanalyses/", 'assets/', subj, '/', subj, "_log_files/"])
if not os.path.exists(outFilePath):
    os.makedirs(outFilePath)


# Load behavioural data from each run

In [None]:
# Columns to read from the study-phase CSVs
studyCols = [
    'cond',
    'studied_word',
    'studyWord.started',
]

# Columns to read from the test-phase CSV (includes response accuracy)
testCols = [
    'test_cond',
    'tested_word',
    'testWord.started',
    'test_resp.corr',
]

# The easiest way to deal with multiple datasets is to define a function for reading in the data
def readData(run):
    """Read one run's PsychoPy CSV into a DataFrame with standardised columns.

    Uses the module-level `subj`, `inFilePath`, `studyCols` and `testCols`.

    Parameters
    ----------
    run : str
        One of 'study1', 'study2' or 'test'.

    Returns
    -------
    pandas.DataFrame
        The selected columns with rows containing NaNs removed, renamed to
        CONDITION / WORD / WORDON (plus ACC for the test run), and with a
        RUN column holding `run`.

    Raises
    ------
    ValueError
        If `run` is not one of the recognised run names.
    """
    # Study runs and the test run log different columns
    if run in ('study1', 'study2'):
        columns = studyCols
    elif run == 'test':
        columns = testCols
    else:
        raise ValueError("Unknown run %r (expected 'study1', 'study2' or 'test')" % (run,))

    # Work out which file to read. This must be a single if/elif/else chain: if any
    # special case starts a new `if`, the final `else` silently overwrites the paths
    # chosen for the earlier special cases with the default file name.

    # Look for inFile for study1 in 'true first run' folder, for subject-014 only
    # (see behavioural_data/data_collection_notes.txt for explanation)
    if subj == 'subject-014' and run == 'study1':
        inFile = inFilePath + 'true-first-run/subject-014_study2.csv'  # This might look incorrect, but it's not - see data collection notes!

    # Same issue with subject-016
    elif subj == 'subject-016' and run == 'study1':
        inFile = inFilePath + 'true-first-run/subject-016_study2.csv'

    # Had some trouble getting started with subject-018, which led to multiple csv's for study1. See data collection notes
    elif subj == 'subject-018' and run == 'study1':
        inFile = inFilePath + 'subject-018_study1_3.csv'

    # For some reason subject-022's study1 data is labelled study1_1. Must have re-started experiment during setup.
    elif subj == 'subject-022' and run == 'study1':
        inFile = inFilePath + 'subject-022_study1_1.csv'

    # Everyone else follows the standard naming scheme
    else:
        inFile = inFilePath + subj + "_" + run + ".csv"

    df = pd.read_csv(inFile, usecols=columns)

    # Remove NaNs
    df.dropna(inplace=True)

    # Rename columns for convenience (study and test columns map onto the same names)
    df.rename(columns={'cond':'CONDITION', 'studied_word':'WORD', 'studyWord.started': 'WORDON',
                      'test_cond':"CONDITION", 'tested_word':'WORD', 'testWord.started':'WORDON',
                      'test_resp.corr': 'ACC'}, inplace=True)

    # Add run #
    df['RUN'] = run

    return df

# Load both study runs and the test phase (there is only one test run, so it carries no run number)
df_study1, df_study2, df_test = [readData(runName) for runName in ('study1', 'study2', 'test')]


In [None]:
# Give the study-phase data a placeholder ACC column that initially holds each row's WORD
# (a hack: the words are later swapped for that word's test-phase response)
df_study1['ACC'], df_study2['ACC'] = df_study1['WORD'], df_study2['WORD']


In [None]:
# Build a word -> test-phase response lookup
testResps = df_test.set_index('WORD')['ACC'].to_dict()

# Swap each placeholder word in the study-phase ACC column for that word's test response
df_study1.replace(to_replace={'ACC': testResps}, inplace=True)
df_study2.replace(to_replace={'ACC': testResps}, inplace=True)

# Adjust stimulus timing

Note that stim timings are relative to hitting "start" in PsychoPy. However, there is a variable (uncontrolled) period between "start" and the actual onset of trials/scans, because the experiment begins with an instruction/"please wait" screen. Therefore, we need to make timings relative to the onset of trials, which was triggered by the MRI tech hitting "s" as they initiated the scan.

In [None]:
# Define a function to fix stim timing
def correctTimings(run):
    """Add a scan-relative onset column (WORDON_adj) to the DataFrame for `run`.

    Reads the run's PsychoPy .log file, takes the timestamp of the
    "Keypress: s" line (the scan trigger), and subtracts that time plus the
    dummy-scan period from WORDON.

    Uses the module-level `subj` and `inFilePath`, and the module-level
    DataFrame named 'df_' + run, which is modified in place.

    Parameters
    ----------
    run : str
        Run name; 'study1', 'study2' or 'test'.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object as df_<run>, with a WORDON_adj column added.

    Raises
    ------
    ValueError
        If the log file contains no "Keypress: s" line.
    """
    # As in readData, we have to look in a weird place for subject-014 / subject-016 study1
    # NOTE(review): readData also special-cases the study1 CSVs of subject-018 and
    # subject-022; confirm whether their .log files follow the default naming used here.
    if subj == 'subject-014' and run == 'study1':
        log_file = inFilePath + 'true-first-run/subject-014_study2.log'
    elif subj == 'subject-016' and run == 'study1':
        log_file = inFilePath + 'true-first-run/subject-016_study2.log'
    else:
        log_file = inFilePath + subj + "_" + run + '.log'

    # Find the line containing "Keypress: s" (there should only be one instance in the
    # whole run; if there are several, the last one is used). The context manager
    # makes sure the log file is closed again.
    startLine = None
    with open(log_file) as f:
        for line in f:
            if "Keypress: s" in line:
                startLine = line

    if startLine is None:
        raise ValueError('No "Keypress: s" line found in ' + log_file)

    # The timestamp is the first tab-separated field of the log line
    absStart = float(startLine.split("\t")[0])

    # Fetch the module-level df for this run by name (plain lookup instead of eval)
    df = globals()['df_' + run]

    # Subtract the absolute start time from WORDON. Also subtract the time for dummy scans (5 TRs, with each TR being 1.8 s)
    df['WORDON_adj'] = df['WORDON'] - (absStart + (1.8 * 5))

    # Return the adjusted dataframe
    return df
            
# Apply the timing correction to every run. The first adjusted timepoint should always be
# approximately 1.5 seconds, because the first cue (which ought to start at timepoint 0) lasts 1.5 seconds.
df_study1_adj, df_study2_adj, df_test_adj = [
    correctTimings(runName) for runName in ('study1', 'study2', 'test')
]
#dft_adj = correctTimings('3')


In [None]:
# Take a look
# df_test_adj

In [None]:
# Stack the three runs into one frame with a fresh 0..n-1 index, then add the
# word duration (the same 2.5 for every trial)
df = pd.concat(
    [df_study1_adj, df_study2_adj, df_test_adj],
    ignore_index=True,
).assign(wordDuration=2.5)

# df

# Quality checking! 
Confirm that each studied word appears 3 times (foil words appear only once), and that each word is always in the same condition

In [None]:
# Confirm that df contains 210 rows (60 x 2 for the study phase, plus 90 for the test phase)
if len(df) != 210:
    print("WARNING - there are", len(df), "rows in df!")
else:
    print("210 rows in df - all good!")

In [None]:
# Confirm that every non-foil word occurs 3 times (no output unless there is a problem)

# Count each word once up front instead of re-counting the whole column for every row;
# this also means each problem word is reported once rather than once per occurrence.
wordCounts = df['WORD'].value_counts()

for word, nTimes in wordCounts[wordCounts != 3].items():

    # Foil words only occur once, so a word is only a problem if none of its rows
    # are labelled 'foil'. (Foils are skipped silently - reporting them just
    # generates a lot of useless output.)
    if (df.loc[df['WORD'] == word, 'CONDITION'] != 'foil').all():
        print("WARNING WARNING", word, "appears", nTimes, "times WARNING WARNING")


In [None]:
# Confirm that every word is always in the same condition.

# To do this, we need to make a copy of df containing only WORD and CONDITION
df_debug = df[['WORD','CONDITION']].copy()

# Optional: Try artificially messing with word-condition mapping (by shuffling the condition column), to test whether this works
# df_debug['CONDITION'] = np.random.permutation(df_debug['CONDITION'].values)

# Words seen in each condition
aloud_debug = df_debug.loc[df_debug['CONDITION'] == 'aloud', 'WORD'].tolist()
silent_debug = df_debug.loc[df_debug['CONDITION'] == 'silent', 'WORD'].tolist()
foil_debug = df_debug.loc[df_debug['CONDITION'] == 'foil', 'WORD'].tolist()

# Check for any overlap between the three lists. If there is no overlap, we are golden.
as_overlap = set(aloud_debug) & set(silent_debug)
af_overlap = set(aloud_debug) & set(foil_debug)
fs_overlap = set(foil_debug) & set(silent_debug)

# Union of the pairwise overlaps, so a word found in all three conditions is listed only once
overlap = sorted(as_overlap | af_overlap | fs_overlap)

# Everything is fine only if ALL pairwise overlaps are empty. Testing them with `or`
# would report success as soon as any single pair happened not to overlap.
if not overlap:
    print("Each word is always in the same condition")
else:
    print("WARNING:",len(overlap)," word(s) appeared in multiple conditions. These are:", overlap)

In [None]:
# We can also check manually, if desired
# df.sort_values(["WORD","CONDITION"])

# Generate condition-wise output log files for FSL


In [None]:
# Loop through runs and conditions, writing one tab-separated timing file per run/condition pair
conditions = ['aloud','silent', 'foil']
runs = ['study1', 'study2', 'test']

for run in runs:
    for cond in conditions:
        # Take an explicit copy so that adding a column below modifies an independent
        # frame rather than a slice of df (avoids pandas' SettingWithCopyWarning)
        inThisCell = (df['CONDITION'] == cond) & (df['RUN'] == run)
        dfTmp = df.loc[inThisCell, ['WORDON_adj','wordDuration']].copy()

        # Skip empty instances (e.g. there are no foil words in study1 or study2)
        if dfTmp.empty:
            continue

        # Third column is a constant 1
        # NOTE(review): presumably the weight column of FSL's 3-column event format - confirm
        dfTmp['col3'] = 1

        # os.path.join avoids doubling the separator that outFilePath already ends with
        outFile = os.path.join(outFilePath, subj + '_PE_' + str(run) + '_' + cond + '_alltrials.txt')
        dfTmp.to_csv(outFile, sep='\t', header=False, index=False)

# Generate trial-wise output log files for FSL

In [None]:
# Save a separate log file for each word/condition/run combination that actually occurs.

# Restrict to the conditions and runs of interest, then let groupby enumerate only the
# combinations present in the data. This replaces looping over every possible
# word x condition x run triple (most of which are empty) and re-filtering df each time.
dfWanted = df[df['CONDITION'].isin(conditions) & df['RUN'].isin(runs)]

for (word, cond, run), dfTrials in dfWanted.groupby(['WORD', 'CONDITION', 'RUN']):
    # Copy so that adding a column does not touch a slice of df (avoids SettingWithCopyWarning)
    dfTmp = dfTrials[['WORDON_adj','wordDuration']].copy()

    # Third column is a constant 1
    # NOTE(review): presumably the weight column of FSL's 3-column event format - confirm
    dfTmp['col3'] = 1

    # os.path.join avoids doubling the separator that outFilePath already ends with
    outFile = os.path.join(outFilePath, subj + '_PE_' + str(run) + '_' + cond + '_' + word + '.txt')
    dfTmp.to_csv(outFile, sep='\t', header=False, index=False)


In [None]:
# Use this for quality checking
# df.loc[df['WORD']=='holiday']

# Write lists for correct and incorrect words to file

In [None]:
# Keep only test-phase rows for studied (aloud / silent) words - foils are left out
dft = df[(df['RUN'] == 'test') & df['CONDITION'].isin(['aloud','silent'])]

# Split the tested words by response accuracy
corr_words = dft.loc[dft['ACC'] == 1.0, 'WORD'].tolist()
incorr_words = dft.loc[dft['ACC'] == 0.0, 'WORD'].tolist()

# NOTE(review): unlike outFilePath, this path is relative to the working directory
# rather than built from topDir - confirm it points at the intended assets folder
acc_words_path = "../../MRIanalyses/" + 'assets/' + subj + '/'

# Only write to disk when no word is listed as both correct and incorrect
if set(corr_words).isdisjoint(incorr_words):
    for thisList, words in (('corr_words', corr_words), ('incorr_words', incorr_words)):
        fn = acc_words_path + subj + '_PE_' + thisList + '.txt'
        with open(fn, 'w') as f:
            # One word per line
            f.writelines("%s\n" % item for item in words)
else:
    print("WARNING: some words are categorized as both correct and incorrect")
        