In [2]:
import pandas as pd
import os
from os.path import isfile, join
import numpy as np
import json
import ast
import matplotlib.pyplot as plt
import warnings
import csv

# Name of the folder (relative to the notebook) that holds the CogRun / Prolific CSV exports.
path = "data"

In [48]:
#get new demographics from Prolific and new data from Cognition Run (single file!), change filenames by changing batch and week number

batch = 0
week = 1


def _read_export(filename):
    """Read one CSV export from the data folder configured in `path`.

    All exports are parsed with the same settings (python engine, minimal quoting),
    so the options live in one place instead of being repeated per file.
    """
    return pd.read_csv(join(path, filename), quoting=csv.QUOTE_MINIMAL, engine='python')


# CogRun task data and Prolific demographics of this batch/week, plus the baseline demographics of the batch
data = _read_export(f"CogRun_data_batch{batch}_week{week}.csv")
demogr = _read_export(f"prolific_demographics_batch{batch}_week{week}.csv")
demogr_bsln = _read_export(f"prolific_demographics_baseline_{batch}.csv")

In [18]:
# Cross-check the two platforms: a participant that shows up in only one of them
# usually points to a technical issue.

cogrun_part = data["PROLIFIC_PID"].unique()
all_part = demogr["Participant id"].unique()

print("In CogRun: ", len(cogrun_part))
print("ALL: ", len(all_part))

cogrun_ids = set(cogrun_part)
prolific_ids = set(all_part)

only_in_prolific = list(prolific_ids.difference(cogrun_ids))
only_in_cogrun = list(cogrun_ids.difference(prolific_ids))  # expected to be empty

print(only_in_prolific)
print(only_in_cogrun)

In CogRun:  33
ALL:  34
['6786c32f329b1d694103f225']
[]


In [38]:
# Set up the (still empty) participant summary table; later cells fill it and save it to csv.

columns = [
    'PROLIFIC_PID',
    'sex',
    'bled',
    'failed_attention_checks',
    'no_press_trials',
    'avg_rt',
    'data rows',
    'status',
    'action',
    'batch',
    'contraception',
    'comment',
]
part = pd.DataFrame(columns=columns)

# quick look at the empty table: column titles and row count
print("Column titles:", part.columns.tolist(), sep="\n")
print("Length: ", len(part))

Column titles:
['PROLIFIC_PID', 'sex', 'bled', 'failed_attention_checks', 'no_press_trials', 'avg_rt', 'data rows', 'status', 'action', 'batch', 'contraception', 'comment']
Length:  0


In [39]:
#adds information to participants summary table
#
# One row per participant of the Prolific export: submission status, sex, number of
# CogRun data rows, a "RETURN" flag when there is too little data and - for participants
# that have CogRun data - the cycle-related screening answers.
# `part` is rebuilt from scratch here, so running this cell twice does not duplicate rows.

MIN_DATA_ROWS = 100  # this many CogRun rows or fewer -> action "RETURN"

cycle = []
no_cycle = [] #all of them should be cycling in the weekly and this should therefore stay empty
records = []  # one dict per participant, turned into `part` in a single step below

for pid in all_part:
    demogr_pid = demogr[demogr["Participant id"] == pid]
    status_val = demogr_pid["Status"].values
    sex = demogr_pid["Sex"].values[0]

    is_pid = data["PROLIFIC_PID"] == pid
    count = int(is_pid.sum())

    if count == 0:
        comment = "No data"
    elif count <= MIN_DATA_ROWS:
        comment = f"only {count} rows of CogRun data"
    else:
        comment = ""
    action = "RETURN" if count <= MIN_DATA_ROWS else ""

    # fields every participant gets; columns missing from a record stay empty (NaN)
    record = {
        "PROLIFIC_PID": pid,
        "status": status_val[0] if len(status_val) > 0 else "UNKNOWN",
        "action": action,
        "comment": comment,
        "data rows": count,
        "batch": batch,
        "sex": sex,
    }

    if pid in only_in_prolific:
        print("check what happened to data of", pid)
    else:
        # screening answers are stored as exact JSON strings in the "response" column
        responses = data.loc[is_pid, "response"]
        bled = (responses == '{"bleeding":["Yes"]}').any()
        not_pregnant = (responses == '{"pregnancy":["No"]}').any()
        no_hormonal = (responses == '{"hormonal_contraceptives":["No"]}').any()
        no_hormone_med = (responses == '{"affect_hormones":["No"]}').any()
        cyc_cond = not_pregnant and no_hormonal and no_hormone_med

        # participants flagged RETURN are not sorted into cycle / no_cycle
        if action != "RETURN":
            if cyc_cond:
                cycle.append(pid)
            else:
                action = "Remove from Cycle group"
                no_cycle.append(pid)
                print(f"No Cycle anymore for {pid} - check reason")

        record["action"] = action
        record["bled"] = 1 if bled else 0
        # 1 unless the participant explicitly answered "No" to hormonal contraceptives
        record["contraception"] = 0 if no_hormonal else 1

    records.append(record)

part = pd.DataFrame(records, columns=columns)

No Cycle anymore for 66a4f108a03ede09bb40255c - check reason
No Cycle anymore for 66952367721687887540318f - check reason
check what happened to data of 6786c32f329b1d694103f225


In [43]:
# Sanity check of the summary table: row count, group sizes and duplicated participant ids.
# Duplicates usually mean the cell above was run twice without re-creating the empty dataframe.

n_participants = len(part)
print("Amount of participants: ", n_participants)

print("Cycle: ", len(cycle))
print("No Cycle: ", len(no_cycle))
print("No Cycle IDs: ", no_cycle)  # look into why these no longer count as cycling

# keep=False marks every occurrence of a repeated id, not only the later ones
is_duplicate = part.duplicated(subset=['PROLIFIC_PID'], keep=False)
duplicates = part.loc[is_duplicate]
print("Duplicates in participants: ", duplicates)

Amount of participants:  34
Cycle:  30
No Cycle:  2
No Cycle IDs:  ['66a4f108a03ede09bb40255c', '66952367721687887540318f']
Duplicates in participants:  Empty DataFrame
Columns: [PROLIFIC_PID, sex, bled, failed_attention_checks, no_press_trials, avg_rt, data rows, status, action, batch, contraception, comment]
Index: []


In [47]:
#check for no press trials and attention checks and add info to participant summary. Check what happened with the ones that don't have data, maybe ask them on Prolific.

attention_failures = [
    '{"attention_check":["Yes"]}',
    '{"attention_check":["Rather not tell"]}'
]

# entries of a trial's key_presses list that mark a press; the entry right before
# such a label is read as the reaction time of that press
PRESS_LABELS = ("smooth", "error", "oops", "while_blue", "while_red")


def _trial_mean_rt(key_presses):
    """Return the mean reaction time of one trial, or None if it contains no press.

    key_presses: the parsed list of one "ctl-task-cpt" row.
    Raises ValueError / TypeError if the list cannot be read as expected.
    """
    sum_rt = 0
    num_rt = 0
    # start at 1: a label in the very first position has no preceding value
    for j in range(1, len(key_presses)):
        if key_presses[j] in PRESS_LABELS:
            num_rt += 1
            sum_rt += float(key_presses[j - 1])
    return sum_rt / num_rt if num_rt != 0 else None


# one entry per row of `part`, in row order; written to the table in one go after the loop
no_press_col = []
avg_rt_col = []
att_fail_col = []

for pid in part["PROLIFIC_PID"]:
    df_pid = data[data["PROLIFIC_PID"] == pid]

    if df_pid.empty:
        print("No data: ", pid)
        no_presses = "no data"
        avg_rt = np.nan
        att_fail = 0
    else:
        no_presses = 0
        rt_list = []

        df_cpt = df_pid[df_pid["trial_type"] == "ctl-task-cpt"]

        for _, row in df_cpt.iterrows():
            try:
                trial_rt = _trial_mean_rt(ast.literal_eval(row["key_presses"]))
            except (ValueError, SyntaxError, TypeError):
                # unreadable key_presses (e.g. NaN): skip the trial but say which one it was
                warnings.warn(
                    f"NaN value in reaction time ignored (participant {pid}, trial_index {row['trial_index']})",
                    Warning,
                )
                continue
            if trial_rt is None:
                no_presses += 1
            else:
                rt_list.append(trial_rt)
        avg_rt = np.mean(rt_list) if rt_list else np.nan

        # attention checks answered with one of the failing responses
        att_fail = int(df_pid["response"].isin(attention_failures).sum())

        # attention item inside the drsp questionnaire: Q13 has to be answered with 5
        df_drsp = df_pid[df_pid["eventType"] == "drsp"]
        for _, row in df_drsp.iterrows():
            try:
                answers = ast.literal_eval(row["responses"])
                q13_passed = int(answers["Q13"]) == 5
            except (ValueError, SyntaxError, TypeError, KeyError):
                # missing or unreadable Q13 answer: cannot be judged, so it is not counted
                continue
            if not q13_passed:
                print("PHQ attention check failed:", pid)
                att_fail += 1

    no_press_col.append(no_presses)
    avg_rt_col.append(avg_rt)
    att_fail_col.append(att_fail)

# Assign whole columns instead of writing cell by cell: "no_press_trials" mixes counts with
# the string "no data", which cannot be written into the NaN-initialised float column
# without an implicit dtype upcast.
part["no_press_trials"] = no_press_col
part["avg_rt"] = avg_rt_col
part["failed_attention_checks"] = att_fail_col

No data:  6786c32f329b1d694103f225
PHQ attention check failed: 60f7c71d1cffec381351b0b3


In [45]:
#creates a csv file with the participant summary

# The no-press / attention-check cell gives every participant a number in
# "failed_attention_checks"; if that column is still completely empty, that cell
# has not been run on the current table and the saved file would miss those metrics.
if len(part) > 0 and part["failed_attention_checks"].isna().all():
    warnings.warn("failed_attention_checks is empty - run the no press / attention check cell before saving")

part.to_csv(join(path, f"participants_batch{batch}_week{week}.csv"), index=False)

In [None]:
#In the file check for the following:
# - Participants with more than 5 no press trials -> ask to return!
# - Participants with more than 2 failed attention checks -> should have failed attention check completion code
# - Participants with 100 or fewer data rows (action "RETURN") -> didn't finish the task and should be asked to return
# => no need to remove anyone from groups at this point, all that performed fine during baseline will be reinvited
# - If action tells you to remove participants from Cycle group, they clicked something that indicates they don't cycle anymore -> check what's the reason in the CogRun data 
# -> if seems valid (e.g. got pregnant) remove from Cycle group in Prolific and include in No Cycle
# -> sometimes they just use a copper IUD and think it's hormonal - it's not so we leave them in
# - Participants who have status "time out" or "returned", but whose data otherwise seems fine -> double check CogRun data and potentially pay them anyway
# - Participants who have 2 data rows + completion code probably cheated (ask Clara or Nace what that means) and may complain that they want to be paid -> decide one by one, but try to return

#In Prolific: 
# - Check for people with completion code "No Code" 
# -> if CogRun data is present, check what completion code they should have gotten, assign to group and pay bonus if applicable
# -> if no or little data in CogRun ask to return

#Sanity checks you can do:
# - Participants grouped to Cycle but contraception = 1 -> check data, keep in Cycle group if Copper IUD or mistake, otherwise change groups
# - Compare if all in Cycle group are cycling and female -> if not, check data or ask participant
# - Check if reaction times seem reasonable