In [137]:
import pandas as pd
import os
from os.path import isfile, join
import numpy as np
import json
import ast
import matplotlib.pyplot as plt
import warnings
import csv

# Presumably the directory that holds the input/output CSV files.
# NOTE(review): the cells visible in this notebook hard-code "data/" in their
# file names instead of using this variable — confirm whether it is still needed.
path = "data"

In [178]:
# Download the new demographics export from Prolific and the new data export from
# Cognition Run (a single file!). Set `batch` so that the file names below match the
# downloaded files, e.g. batch = 1 reads "...baseline_1.csv".

batch = 1

# Both exports are parsed with the same reader settings.
_reader_options = {"quoting": csv.QUOTE_MINIMAL, "engine": "python"}

data = pd.read_csv(f"data/CogRun_data_baseline_{batch}.csv", **_reader_options)
demogr = pd.read_csv(f"data/prolific_demographics_baseline_{batch}.csv", **_reader_options)

# Empty participant summary table; the cells below fill it.
columns = [
    'PROLIFIC_PID',
    'sex',
    'cycling',
    'failed_attention_checks',
    'no_press_trials',
    'avg_rt',
    'data rows',
    'status',
    'group',
    'batch',
    'contraception',
    'menopause',
    'comment',
]
part = pd.DataFrame(columns=columns)

In [179]:
# Unique participant IDs as seen by each platform.
cogrun_part = data["PROLIFIC_PID"].unique()
all_part = demogr["Participant id"].unique()

print("In CogRun: ", len(cogrun_part))
print("ALL: ", len(all_part))

# Participants that appear on only one platform (e.g. due to technical issues).
# NOTE(review): the earlier version of this comment also pointed to an `ingroups` list
# for finding ungrouped participants; no such list is defined in the cells visible here.
_cogrun_ids = set(cogrun_part)
_prolific_ids = set(all_part)

only_in_prolific = list(_prolific_ids - _cogrun_ids)
only_in_cogrun = list(_cogrun_ids - _prolific_ids)

for _one_sided in (only_in_prolific, only_in_cogrun):
    print(_one_sided)

In CogRun:  216
ALL:  224
['615ba31942e97cfd47ad9812', '65968cc09a29fb117cecbae6', '5e425666d872680d82f638e4', '66daf1642fab82cb5a1df01b', '670dbc335bb9e80f94a636ca', '651c15087c67f2bc3f516e86', '60cd29821711a41cf8d42a92', '67cf04ef29c9f59a6b97bd72']
[]


In [180]:
# Quick sanity check of the (still empty) summary table: column names and row count.

print("Column titles:")
print(list(part.columns))
print("Length: ", part.shape[0])

Column titles:
['PROLIFIC_PID', 'sex', 'cycling', 'failed_attention_checks', 'no_press_trials', 'avg_rt', 'data rows', 'status', 'group', 'batch', 'contraception', 'menopause', 'comment']
Length:  0


In [181]:
# Builds the baseline participant summary table `part`: one row per Prolific participant.
#
# For every participant ID in the Prolific demographics export this cell records the
# Prolific status, the number of CogRun data rows, the reported sex (taken from the
# CogRun "assigned_sex" answer, falling back to the Prolific "Sex" column) and, when the
# participant has CogRun data, the cycling / contraception / menopause flags.
# Participants with 100 CogRun rows or fewer are marked "EXCLUDED"; everyone else is put
# into the "Cycle" or "No Cycle" group.
#
# Rows are collected in a list and `part` is rebuilt from that list at the end, so
# re-running this cell replaces the table instead of appending every participant again.

SEX_OPTIONS = ("female", "male", "intersex")
EXCLUSION_ROW_LIMIT = 100  # participants with this many CogRun rows or fewer are excluded


def _has_response(responses, *answers):
    """Return True if any entry of `responses` is exactly equal to one of `answers`."""
    return bool(responses.isin(list(answers)).any())


cycle = []
no_cycle = []
contraception = []
records = []

for pid in all_part:
    demogr_rows = demogr.loc[demogr["Participant id"] == pid]
    status_val = demogr_rows["Status"].values

    # Select this participant's CogRun rows once instead of re-masking `data` per question.
    pid_rows = data.loc[data["PROLIFIC_PID"] == pid]
    responses = pid_rows["response"]
    count = len(pid_rows)

    # Prefer the sex reported in CogRun; otherwise use the value from Prolific.
    assigned_sex = next(
        (sex for sex in SEX_OPTIONS if _has_response(responses, f'{{"assigned_sex":"{sex}"}}')),
        None,
    )
    if assigned_sex is None:
        assigned_sex = demogr_rows["Sex"].values[0]

    if count == 0:
        comment = "No data"
    elif count <= EXCLUSION_ROW_LIMIT:
        comment = f"only {count} rows of CogRun data"
    else:
        comment = ""
    group = "EXCLUDED" if count <= EXCLUSION_ROW_LIMIT else ""

    # Columns not set here (e.g. the quality metrics) stay empty (NaN) for now.
    record = {
        "PROLIFIC_PID": pid,
        "status": status_val[0] if len(status_val) > 0 else "UNKNOWN",
        "comment": comment,
        "data rows": count,
        "batch": batch,
        "sex": assigned_sex,
    }

    if pid in only_in_prolific:
        # Known to Prolific but absent from CogRun: flag for a manual check.
        print("check: ", pid)
    else:
        has_cycle = _has_response(responses, '{"menstrual_cycle":"Yes"}')
        uses_contraception = _has_response(responses, '{"contraceptive_use":["Yes"]}')
        in_menopause_transition = _has_response(responses, '{"menopause_transition":["Yes"]}')

        # Reported cycle length (question 7 of the reproductive_status event) within 21-35.
        length_ok = bool(
            (
                (pid_rows["question_id"] == 7)
                & (pid_rows["eventType"] == "reproductive_status")
                & pd.to_numeric(responses, errors="coerce").between(21, 35)
            ).any()
        )

        # "Cycle" requires every one of these answers to be present.
        cyc_cond = all((
            has_cycle,
            length_ok,
            _has_response(responses, '{"cycle_fluctuation":["Yes, regular cycle lengths"]}'),
            _has_response(responses, '{"menopause_transition":["No"]}'),
            _has_response(responses, '{"pregnant_breastfeeding":["No"]}'),
            _has_response(responses, '{"iud_use":["NO"]}', '{"iud_use":["YES, COPPER IUD"]}'),
            _has_response(responses, '{"hormone_use":["NO"]}'),
            _has_response(responses, '{"gynecologic_conditions":["NO"]}'),
            _has_response(responses, '{"contraceptive_use":["No"]}'),
        ))

        if group != "EXCLUDED":
            if cyc_cond:
                group = "Cycle"
                cycle.append(pid)
            else:
                group = "No Cycle"
                no_cycle.append(pid)

        if uses_contraception:
            contraception.append(pid)

        record.update(
            cycling=int(has_cycle),
            contraception=int(uses_contraception),
            menopause=int(in_menopause_transition),
        )

    record["group"] = group
    records.append(record)

# Object dtype keeps mixed content (numbers, strings, missing values) assignable later on.
part = pd.DataFrame(records, columns=part.columns, dtype=object)

check:  615ba31942e97cfd47ad9812
check:  65968cc09a29fb117cecbae6
check:  60cd29821711a41cf8d42a92
check:  651c15087c67f2bc3f516e86
check:  670dbc335bb9e80f94a636ca
check:  66daf1642fab82cb5a1df01b
check:  67cf04ef29c9f59a6b97bd72
check:  5e425666d872680d82f638e4


In [185]:
# Check that the table length and the group sizes look plausible and take a glimpse at
# the saved information. If duplicates show up, check the cell that fills `part`.

for _label, _value in (
    ("Amount of participants: ", len(part)),
    ("Cycle group: ", len(cycle)),
    ("No Cycle group: ", len(no_cycle)),
    ("Cycle IDs: ", cycle),
    ("Contraception group: ", len(contraception)),
    ("Contraception IDs: ", contraception),
):
    print(_label, _value)

# Every row whose participant ID occurs more than once (all occurrences are kept).
_is_duplicate = part.duplicated(subset=['PROLIFIC_PID'], keep=False)
duplicates = part[_is_duplicate]
print("Duplicates in paricipants: ", duplicates)

# Participants that are in the Cycle group and also reported contraceptive use.
common = set(cycle).intersection(contraception)
print("common: ", common)

Amount of participants:  224
Cycle group:  65
No Cycle group:  138
Cycle IDs:  ['5c63e773f9b6300001c9ce20', '6113ad4d3cf80aa40e77bbb8', '5c4f7b99b33a5300013dbfdd', '671a44d9d93641d1d9e96883', '5cab22cd68194f001511b2d6', '61601e9a6b8306196d2a129c', '60056657125e50465c15533e', '60f41ae35227aa02240ba0d9', '59cb95053306be000195be2e', '5c2b339a85602a00012ffedd', '6736f4a8fe83ffce0936bdc4', '5a9aa66a35237b000112937b', '60292aabf20c7d2af65541a7', '67d163e7b7bcc1e9718e01da', '67b9e921b4b5ee4019522dbb', '650ac282d9578d72800bf303', '60aad516e34718d318bfb44d', '66f4c82536b5c8271f53564c', '5e203085ac30fa331b586690', '67f14975a8c12321ab2d0e61', '614e1c414dff2e589169f360', '67a72a2a346c08cd36e7860a', '65f0831414fd6e9e51e08708', '6172d51173f6962cef6b18cf', '6565f2ba13586fa869dd3b28', '60ad28c6d838565067ab1b6c', '5daae19f8a06b60016d1bc32', '66c5f76d9a214b469f6476c8', '6702f60844f21087e0840e3d', '60c8a4ece55be5a30e0ccc5c', '642cb1f7f42c4e0a6c5d4eef', '664b3888f484550fe6d377bc', '65cb0f21257c8a98f470bb5

In [186]:
# Per-participant quality metrics, written into the participant summary `part`:
#   - no_press_trials:         number of "ctl-task-cpt" trials without any counted key press
#   - avg_rt:                  mean reaction time over all counted key presses of the participant
#   - failed_attention_checks: number of failed IUS / PHQ / PSST attention checks
# Wait until "done" is printed. Check what happened with participants that have no data,
# maybe ask them on Prolific.
#
# Participants without CogRun rows get "no data" / NaN. Every metric is reset for each
# participant, so no value can carry over from the participant processed before.

# Key-press labels that mark a reaction time; the time is read from the preceding entry.
RT_EVENT_LABELS = ("smooth", "error", "oops", "while_blue", "while_red")

# eventType -> (label used in the printed message, attention-check item,
#               predicate that is True when the given answer FAILS the check)
ATTENTION_CHECKS = {
    "ius": ("IUS", "Q08", lambda answer: answer > 1),
    "phq": ("PHQ", "Q07", lambda answer: answer != 2),
    "psst": ("PSST", "Q09", lambda answer: answer > 1),
}

for pid in part["PROLIFIC_PID"]:
    pid_rows = data.loc[data["PROLIFIC_PID"] == pid]

    if pid_rows.empty:
        print("No data: ", pid)
        no_presses = "no data"
        avg_rt = np.nan
        att_fail = np.nan
    else:
        no_presses = 0
        att_fail = 0
        sum_rt = 0.0
        num_rt = 0

        for _, row in pid_rows.iterrows():
            if row["trial_type"] == "ctl-task-cpt":
                try:
                    key_presses = ast.literal_eval(row["key_presses"])
                    # Start at 1: a label at position 0 has no preceding reaction time
                    # (index -1 would silently wrap around to the last entry).
                    trial_rts = [
                        float(key_presses[j - 1])
                        for j in range(1, len(key_presses))
                        if key_presses[j] in RT_EVENT_LABELS
                    ]
                except ValueError:
                    # pid and trial_index go into the filename/lineno slots, so the
                    # warning header reads "<pid>:<trial_index>: Warning: ...".
                    warnings.warn_explicit("NaN value in reaction time ignored", Warning, pid, row["trial_index"])
                else:
                    if trial_rts:
                        sum_rt += sum(trial_rts)
                        num_rt += len(trial_rts)
                    else:
                        no_presses += 1

            check = ATTENTION_CHECKS.get(row["eventType"])
            if check is not None:
                label, item, is_failed = check
                answrs = ast.literal_eval(row["responses"])
                if is_failed(int(answrs[item])):
                    print(f"{label} attention check failed: ", pid)
                    att_fail += 1

        # Average over all trials of this participant; NaN if no key press was counted.
        avg_rt = sum_rt / num_rt if num_rt else np.nan

    row_p = part[part["PROLIFIC_PID"] == pid].index[0]
    for column, value in (
        ("no_press_trials", no_presses),
        ("avg_rt", avg_rt),
        ("failed_attention_checks", att_fail),
    ):
        part.loc[row_p, [column]] = value

print("done")

IUS attention check failed:  67e469d0bbb689eacffa0ee0
No data:  615ba31942e97cfd47ad9812
IUS attention check failed:  65faedca7e439b344f133fdb
No data:  65968cc09a29fb117cecbae6
No data:  60cd29821711a41cf8d42a92
No data:  651c15087c67f2bc3f516e86
No data:  670dbc335bb9e80f94a636ca
No data:  66daf1642fab82cb5a1df01b
No data:  67cf04ef29c9f59a6b97bd72
No data:  5e425666d872680d82f638e4
done


In [160]:
# Write the participant summary to a CSV file (without the index column).

_summary_file = f"data/participants_baseline_{batch}.csv"
part.to_csv(_summary_file, index=False)

In [None]:
#In the file check for the following:
# - Participants with more than 5 no press trials -> ask to return!
# - Participants with more than 2 failed attention checks -> should have failed attention check completion code
# - Participants with 100 or fewer data rows -> didn't finish the task and should be asked to return
# => all of the above should be removed from the Cycle and No Cycle groups; cross-check at the end with Prolific that everyone who returned or timed out was excluded from all groups

#In Prolific: 
# - Check for people with completion code "No Code" 
# -> if CogRun data is present, check what completion code they should have gotten, assign to group and pay bonus if applicable
# -> if no or little data in CogRun ask to return
# - If you are unsure about grouping someone (e.g. due to ambiguous data), you can ask them and then regroup accordingly.

#Sanity checks you can do:
# - check if participants are grouped the same here vs. Prolific 
# - Participants grouped to Cycle but contraception = 1 -> check data, keep in Cycle group if Copper IUD or mistake, otherwise change groups
# - Compare if all in Cycle group are cycling and female -> if not, check data or ask participant
# - Check if reaction times seem reasonable