In [None]:
import os
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences every warning category for the whole session, e.g. pandas
# deprecation notices and numpy RuntimeWarnings. Consider narrowing the filter to the
# specific categories that are known to be noise.
warnings.filterwarnings("ignore")

Specify path to directory that holds MST files (downloaded from Gorilla).
Enter filename, including file extension.

In [None]:
# Directory that holds the Gorilla MST export. The cells that write CSV files further
# down join their output filenames onto this same directory.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
infilepath = r"C:\Possible_online_studies\NLP_expressive_writing\analysis\Processed_data\MST"
#infilepath = r"C:\Users\testing\Desktop\luzia_testing\EW_study\analysis\Processed_data\MST"
# Name of the export file inside `infilepath`; it is joined with the directory on load.
filename = input("Please enter input filename, including file extension.")

We also specify the task number (1, 2, 3 or 4) for later use. We could extract it from the filename, but that would assume the number is always part of the filename, which is not guaranteed, so I prefer asking for it explicitly.

In [None]:
# Ask for the MST task number and fail fast on anything other than 1-4, so that a typo
# is caught here rather than at the later int() conversion or, worse, being written to
# the results file as a wrong task label. The value is kept as a string.
task_number = input("Please enter the task number (1,2 3 or 4). ").strip()
if task_number not in {"1", "2", "3", "4"}:
    raise ValueError(f"Task number must be 1, 2, 3 or 4; got {task_number!r}.")

Now we load the data file. First of all we specify a list of column names. These are the columns we want to read in. I decided on these by inspection of the Gorilla files.
Some notes on the remaining options: one could specify the data type for all or a subset of the columns when reading in the data. I chose not to do that here, mainly because some columns contain mixed data types and also because we are not really using all of them, anyways (I just wanted to have a look). I did include the low_memory = False option due to the mixed data types, but I doubt this will have made any difference in terms of speed due to the small size of the data frame.

In [None]:
# Columns to read from the Gorilla export; every other column in the file is ignored.
filter_cols = [
    'UTC Timestamp',
    'UTC Date',
    'Local Timestamp',
    'Local Date',
    'Participant Public ID',
    'Participant Status',
    'Trial Number',
    'Reaction Time',
    'Response',
    'Correct',
    'Incorrect',
    'Timed Out',
    'ANSWER',
    'Task Name',
]

In [None]:
# Load only the selected columns of the Gorilla export.
# NOTE(review): `parse_dates` addresses the date columns by position, which is brittle;
# naming the columns would be clearer - confirm which columns are meant before changing.
# NOTE(review): `infer_datetime_format` is deprecated as of pandas 2.0; verify parsing is
# unchanged before dropping it.
mst_csv_path = os.path.join(infilepath, filename)
MST_df = pd.read_csv(
    mst_csv_path,
    usecols=filter_cols,
    parse_dates=[0, 1, 2, 3],
    infer_datetime_format=True,
    dayfirst=True,
    low_memory=False,
)


In [None]:
# Print a per-column overview (non-null counts and dtypes) of the loaded export.
MST_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46838 entries, 0 to 46837
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   UTC Timestamp          46837 non-null  object        
 1   UTC Date               46837 non-null  datetime64[ns]
 2   Local Timestamp        46837 non-null  object        
 3   Local Date             46837 non-null  datetime64[ns]
 4   Participant Public ID  46837 non-null  object        
 5   Participant Status     46837 non-null  object        
 6   Task Name              46837 non-null  object        
 7   Trial Number           46827 non-null  object        
 8   Reaction Time          46716 non-null  object        
 9   Response               21834 non-null  object        
 10  Correct                46837 non-null  float64       
 11  Incorrect              46837 non-null  float64       
 12  Timed Out              1408 non-null   float64       
 13  A

From the Gorilla file, you just get all subjects lumped together, trial by trial. We want to calculate a score for each participant, so as a first step we get a list of the Qualtrics participant IDs (the Participant Public ID column in Gorilla).

In [None]:
# Distinct participant identifiers in order of first appearance; missing IDs are dropped.
public_id_col = MST_df['Participant Public ID']
Participant_ID = public_id_col.dropna().unique()

In [None]:
# Preview the columns that matter for scoring, restricted to rows where all are present.
preview_cols = ['Participant Public ID', 'Response', 'ANSWER', 'Trial Number', 'Task Name']
(
    MST_df
    .filter(items=preview_cols)
    .dropna(how='any')
    .head()
)

Unnamed: 0,Participant Public ID,Response,ANSWER,Trial Number,Task Name
3,R_Q4fzFrqUEdCj7ln,Old,Old,1,MST_set4test
5,R_Q4fzFrqUEdCj7ln,New,New,2,MST_set4test
7,R_Q4fzFrqUEdCj7ln,Old,Similar,3,MST_set4test
11,R_Q4fzFrqUEdCj7ln,Old,Old,5,MST_set4test
13,R_Q4fzFrqUEdCj7ln,New,New,6,MST_set4test


Again, many ways to accomplish the objective of the next step. Basically we want to figure out the number of correct similar (LDI score) and correct old (REC score) trials. We also want to know how many times each participant responded 'old' for new items (REC)/'similar' for new items (LDI). This is to correct for any bias a participant may have for responding 'old'/'similar' overall.
First of all, I am adding columns to the data frame for each of these categories and filling them with 0s.

In [None]:
# Create the four 0/1 indicator columns, all initialised to 0. The matching rows are
# switched to 1 in a separate step.
for indicator_col in ('old_corr', 'old_new', 'similar_corr', 'similar_new'):
    MST_df[indicator_col] = 0

Now we just replace the 0s with 1s as appropriate in each column. There are many ways to do this - you could use where, replace, etc instead of what I've done here.

In [None]:
# Each indicator is set to 1 where the participant's response and the expected answer
# match the (response, answer) pair listed for that column.
given_response = MST_df['Response']
expected_answer = MST_df['ANSWER']
indicator_rules = {
    'old_corr': ('Old', 'Old'),
    'old_new': ('Old', 'New'),
    'similar_corr': ('Similar', 'Similar'),
    'similar_new': ('Similar', 'New'),
}
for indicator_col, (response_value, answer_value) in indicator_rules.items():
    row_mask = (given_response == response_value) & (expected_answer == answer_value)
    MST_df.loc[row_mask, indicator_col] = 1

In [None]:
# Distinct task names found in the export; missing values are ignored.
task_name_col = MST_df['Task Name']
task_names = task_name_col.dropna().unique()

We now calculate the LDI and REC scores for each participant. This is done in a for loop and the results are appended to a list. We are going to use this later to build our results dataframe.

In [None]:
def _safe_rate(count, total):
    """Return count / total, or NaN when total is zero (no rows of that type)."""
    return count / total if total else np.nan


# Build one record per participant, in this order:
#   [participant id, REC numerator, LDI numerator, number of 'Old' rows,
#    number of 'Similar' rows, REC score, LDI score, local date of the first row]
# NOTE(review): the denominators count every row whose ANSWER matches, and the rates for
# responses to 'New' items are divided by the 'Old'/'Similar' counts rather than by the
# number of 'New' rows - confirm this matches the intended REC/LDI definition.
coll_resp = []
for participant_id in Participant_ID:
    Participant_df = MST_df[MST_df['Participant Public ID'] == participant_id]

    old_hits = Participant_df['old_corr'].sum()
    old_false_alarms = Participant_df['old_new'].sum()
    similar_hits = Participant_df['similar_corr'].sum()
    similar_false_alarms = Participant_df['similar_new'].sum()

    rec_part = old_hits - old_false_alarms  # numerator for rec score
    ldi_part = similar_hits - similar_false_alarms  # numerator for ldi score
    targets_part = int((Participant_df['ANSWER'] == 'Old').sum())  # denominator for rec score
    similar_part = int((Participant_df['ANSWER'] == 'Similar').sum())  # denominator for ldi score

    # A participant without any 'Old' / 'Similar' rows gets NaN instead of relying on a
    # numpy divide-by-zero whose warning is suppressed.
    prob_rec = _safe_rate(old_hits, targets_part) - _safe_rate(old_false_alarms, targets_part)  # rec score proper
    prob_ldi = _safe_rate(similar_hits, similar_part) - _safe_rate(similar_false_alarms, similar_part)  # ldi score proper

    # Selected by column name so the lookup does not depend on the column order.
    date = Participant_df['Local Date'].iloc[0]

    # Assemble all the values for this participant and collect them.
    coll_resp.append([participant_id, rec_part, ldi_part, targets_part, similar_part, prob_rec, prob_ldi, date])


In [None]:
# Show only the first few records rather than dumping every participant's entry
# (including their IDs) into the notebook output.
coll_resp[:5]

[['R_Q4fzFrqUEdCj7ln',
  44,
  0,
  128,
  128,
  0.34375,
  0.0,
  Timestamp('2021-06-06 14:56:00')],
 ['R_1GTSsjX5sbMS8hl',
  50,
  4,
  128,
  128,
  0.390625,
  0.03125,
  Timestamp('2021-05-21 11:37:00')],
 ['R_qI7uLRbMYJ0k66B',
  50,
  6,
  128,
  128,
  0.390625,
  0.046875,
  Timestamp('2021-05-15 02:16:00')],
 ['R_DC4q7zU4bMBjk6R',
  58,
  5,
  128,
  128,
  0.453125,
  0.0390625,
  Timestamp('2021-05-08 13:33:00')],
 ['R_3qCqCxGvt9m6ADP',
  50,
  23,
  128,
  128,
  0.390625,
  0.1796875,
  Timestamp('2021-05-04 12:33:00')],
 ['R_31aWb4Oregw3IjC',
  52,
  30,
  128,
  128,
  0.40625,
  0.234375,
  Timestamp('2021-05-03 16:15:00')],
 ['R_31oRMzMUIvnl0Rm',
  36,
  29,
  128,
  128,
  0.28125,
  0.2265625,
  Timestamp('2021-04-27 11:48:00')],
 ['R_1rcdSV4lwwUkW83',
  48,
  48,
  128,
  128,
  0.375,
  0.375,
  Timestamp('2021-04-19 16:54:00')],
 ['R_2ztKBA1jlSaEUfM',
  48,
  25,
  128,
  128,
  0.375,
  0.1953125,
  Timestamp('2021-04-19 11:38:00')],
 ['R_1kIzco66AwdTE4w',
  62,

Now we build the dataframe and add a column to specify the task number.

In [None]:
# One row per participant; the column names follow the order of the values in each record.
ldi_columns = [
    'qualtrics_id',
    'rec_part',
    'ldi_part',
    'targets',
    'lures',
    'rec_prob',
    'ldi_prob',
    'date_time',
]
LDI_df = pd.DataFrame(coll_resp, columns=ldi_columns)

In [None]:
# Tag every row with the task number entered earlier, stored as an integer.
task_type_value = int(task_number)
LDI_df['task_type'] = task_type_value

In [None]:
# Quick look at the first five rows of the results table.
LDI_df.head(n=5)

Unnamed: 0,qualtrics_id,rec_part,ldi_part,targets,lures,rec_prob,ldi_prob,date_time,task_type
0,R_Q4fzFrqUEdCj7ln,44,0,128,128,0.34375,0.0,2021-06-06 14:56:00,4
1,R_1GTSsjX5sbMS8hl,50,4,128,128,0.390625,0.03125,2021-05-21 11:37:00,4
2,R_qI7uLRbMYJ0k66B,50,6,128,128,0.390625,0.046875,2021-05-15 02:16:00,4
3,R_DC4q7zU4bMBjk6R,58,5,128,128,0.453125,0.039062,2021-05-08 13:33:00,4
4,R_3qCqCxGvt9m6ADP,50,23,128,128,0.390625,0.179688,2021-05-04 12:33:00,4


The last thing we need to do before saving the dataframe is to check whether someone had trouble with the task and completed it separately. We need to check this because it would result in a different Qualtrics ID than the one associated with the survey part.

In [None]:
# Read the two ID columns from the file used for the re-completed status check.
only_mst = input("Enter filename for re-completed status check, including file extension: ")
only_mst_path = os.path.join(infilepath, only_mst)
only_mst_df = pd.read_csv(
    only_mst_path,
    skiprows=[0, 2],  # the file's second line becomes the header; first and third are skipped
    usecols=['Response ID', 'Please enter your participant ID'],
)

Sometimes it took them a couple of tries to get through to the task. Because of this, we just filter out those rows where the Qualtrics ID does not appear in the actual Gorilla file.

In [None]:
# Keep only the rows whose Response ID also occurs among the scored participants.
has_gorilla_data = only_mst_df['Response ID'].isin(LDI_df['qualtrics_id'])
only_mst_df = only_mst_df[has_gorilla_data]

We now save this to a new file. We'll need this later on, to deal with the missing Qualtrics IDs from the survey part.

In [None]:
# Write the filtered MST-only table to the data directory, without the index column.
only_mst_outname = input('Enter output filename for MST only, including file extension:')
only_mst_outpath = os.path.join(infilepath, only_mst_outname)
only_mst_df.to_csv(only_mst_outpath, index=False)

Finally, we save the results to a file.

In [None]:
# Write the per-participant results table to the data directory, without the index column.
ldi_outfile_name = input('Please enter a filename for the ldi output file, including the file extension. ')
ldi_outpath = os.path.join(infilepath, ldi_outfile_name)
LDI_df.to_csv(ldi_outpath, index=False)