In [1]:
import os
import re
from collections import Counter
import pandas as pd
import numpy as np


In [2]:
# Root folders of the E4 recordings and of the main Qualtrics export.
input_dir = r"P:\Spironolactone\E4"
main_qualtrics_dir = r"P:\Spironolactone\main_qualtrics"

# Keep only the entries whose lower-cased name starts with "p0" followed by
# two digits (the participant tag).
participant_folders = [
    entry
    for entry in os.listdir(input_dir)
    if re.search("^p[0][0-9][0-9]", entry.lower())
]

In [3]:
def get_participant_num(folder_name):
    """
    Extract the participant number encoded
    in an E4 folder name.

    Parameters
    ----------
    folder_name:    str
        Name of participant e4 data folder

    Returns
    -------
    The digits of the last "pNNN" tag found in the
    lower-cased folder name, as an integer value.

    """
    tags_found = re.findall("p[0][0-9][0-9]", folder_name.lower())
    # pop() takes the last match; strip the leading "p" before converting.
    last_tag = tags_found.pop()
    digits = last_tag[1:]
    return int(digits)

def flag_duplicates(folder_name):
    """
    Flag duplicate E4 folders.
    Happens when E4 recording was
    interrupted for some reason.

    Parameters
    ----------
    folder_name:    list of str (or a single str)
        Names of the participant e4 data folders to check.
        The first four characters of each name are taken as
        the participant tag (e.g. "p012").

    Returns
    -------
    list of duplicate participant numbers (ints), in order
    of first appearance.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(folder_name, str):
        folder_name = [folder_name]
    # Lower-case the tag so "P001..." and "p001..." count as the same
    # participant.
    tags = [name[:4].lower() for name in folder_name]
    dups = [int(tag[1:]) for tag, count in Counter(tags).items() if count > 1]
    return dups

def remove_multindex(in_df, axis, level):
    """
    Remove multi index of specified level
    along specified axis.

    Parameters
    ----------
    in_df:  pd DataFrame
        df to operate on
    axis:   int
        rows = 0
        cols = 1 (any value other than 0 is treated as cols)
    level:  int
        level of multindex to remove.

    Returns
    -------
    new dataframe with specified level of multiindex
    removed; the data and column dtypes are unchanged.
    """
    if axis == 0:
        out_df = in_df.reset_index(level = level, drop = True)
    else:
        # Work out the reduced column labels on an empty frame instead of
        # transposing the data twice: a double transpose forces all columns
        # to a common dtype (e.g. ints become floats or objects).
        new_columns = pd.DataFrame(index = in_df.columns).reset_index(
            level = level, drop = True
        ).index
        out_df = in_df.copy()
        out_df.columns = new_columns
    return out_df
    
def get_rowdiff(values):
    """
    calculate difference
    between rows of series

    Parameters
    ----------
    values: sequence of numbers (e.g. list or pd Series)

    Returns
    -------
        `values` itself when it holds at most one element.
        Otherwise a list of the same length: the first entry
        is the first value unchanged, every later entry is the
        difference between that row and the row before it.
    """
    if len(values)<=1:
        return values
    # Materialise the values so the previous row is looked up by position;
    # indexing a Series with [i-1] is a label lookup and fails on any
    # non-default index.
    ordered = list(values)
    new_vals = [ordered[0]]
    new_vals.extend(curr - prev for prev, curr in zip(ordered, ordered[1:]))
    return new_vals

def find_min_delta(tag_deltas, axis = 0):
    """
    find minimum difference
    between successive tags.

    Parameters
    ----------
    tag_deltas: pd DataFrame
        dataframe of row-to-row differences
    axis:   int
        0 for min row  w/in col
        1 for min col w/in row

    Returns
    -------
        pd Series with the min value along the requested axis
        (one value per column when axis = 0). Missing values
        are ignored.
    """
    # DataFrame.min skips NaN. The builtin min() does not: its result depends
    # on where the NaN sits (a leading NaN makes the whole result NaN).
    min_diffs = tag_deltas.min(axis = axis)
    return min_diffs

def return_likely_doubles(min_deltas, time_delta_df, threshold):
    """
    Pick out the row differences that are small
    enough to be a likely double tag.

    Parameters
    ----------
    min_deltas: pd Series
        minimum difference between rows
        for each participant
    time_delta_df:  pd DataFrame
        dataframe representing differences
        between successive rows
    threshold:  float
        multiplier applied to min_deltas to set
        the acceptance window, e.g. with 1.5 a
        difference counts when it is at most
        1.5*min_deltas for that participant

    Returns
    -------
        time_delta_df with NaN wherever the window
        is exceeded and with all-NaN rows removed.
        Non-nan values represent possible double tags.
    """
    cutoff = min_deltas*threshold
    within_window = time_delta_df<=cutoff
    # Boolean-frame indexing keeps matching cells and blanks the rest to NaN.
    masked_df = time_delta_df[within_window]
    return masked_df.dropna(axis = 0, how = "all")

def get_num_double_tags(double_tag_df):
    """
    Tally how many columns contain each
    number of detected double tags.

    Parameters
    ----------
    double_tag_df:  pd DataFrame
        dataframe with value for likely
        double tags, NaN elsewhere

    Returns
    -------
    pd Series mapping "number of non-NaN entries in a
    column" to "number of columns with that many entries"
    """
    def _count_present(column):
        # Number of non-missing entries in one column.
        return column.notna().sum()

    per_column = double_tag_df.apply(_count_present)
    return per_column.value_counts()

def get_best_thresh(tag_val_counts):
    """
    Find the threshold(s) with the
    highest associated count.

    Parameters
    ----------
    tag_val_counts: list of tuples
        of type [(two_counts, threshold)]

    Returns
    -------
        list of every threshold whose count equals
        the maximum count (more than one entry on ties;
        an advisory message is printed in that case).

    """
    highest = max([pair[0] for pair in tag_val_counts])
    best = []
    for num, thresh in tag_val_counts:
        if num == highest:
            best.append(thresh)
    if len(best) > 1:
        print("More than 1 max val detected. Manual check advised.")
    return best

def detect_missing_doubles(double_tags_df, expected_tags = 2):
    """
    Flag participants with (possible)
    missing double_tags.

    Parameters
    ----------
    double_tags_df: pd DataFrame
        one column per participant, with a value for
        each likely double tag and NaN elsewhere
    expected_tags:  int
        number of double tags each participant should
        have (defaults to 2)

    Returns
    -------
    pd DataFrame with columns "pnum" (the column label of
    double_tags_df) and "num_double_tags" (non-NaN count),
    one row per participant whose count differs from
    expected_tags.
    """
    num_tags = double_tags_df.apply(
                                    lambda x: len(x.dropna())
                                    )
    # Both too few and too many detections are flagged.
    flags_df = num_tags[num_tags != expected_tags]
    flags_df = flags_df.reset_index()
    flags_df.columns = ["pnum","num_double_tags"]
    return flags_df



Read in the tag files for all participants and assemble those without duplicate E4 folders into a single dataframe (one column per participant).

In [32]:
missing_tags = []   # participants whose folder has no tags.csv
pnums = []          # participants whose tags were loaded
tags_dat = []       # one single-column frame per loaded participant
duplicates = flag_duplicates(participant_folders)

for folder in participant_folders:
    pnum = get_participant_num(folder)
    if pnum in duplicates:
        print(f"More than one tag file exists for participant {pnum}. Skipping.")
        continue
    try:
        # NOTE(review): header = 0 treats the first line of tags.csv as a
        # header row and replaces it with `names`, so that line never becomes
        # data. If the files have no header row, the first tag of every
        # participant is dropped - confirm against a raw tags.csv.
        participant_tags = pd.read_csv(os.path.join(input_dir,folder,"tags.csv"),header = 0,names = [pnum])
    except FileNotFoundError:
        print(f"No tags file found for participant {pnum}.Manual check advised.")
        missing_tags.append(pnum)
        continue
    pnums.append(pnum)
    tags_dat.append(participant_tags)

# Every frame already carries its participant number as its only column label,
# so a plain column-wise concat gives one column per participant; shorter
# columns are padded with NaN.
tags_df = pd.concat(tags_dat,axis = 1)
#tags_df = tags_df.apply(lambda x: pd.to_datetime(x))

More than one tag file exists for participant 1. Skipping.
More than one tag file exists for participant 1. Skipping.
More than one tag file exists for participant 10. Skipping.
More than one tag file exists for participant 10. Skipping.
More than one tag file exists for participant 20. Skipping.
More than one tag file exists for participant 20. Skipping.
More than one tag file exists for participant 27. Skipping.
More than one tag file exists for participant 27. Skipping.


Look for likely double tags based on time between successive tags. There is no "hard and fast" criterion, but given the fact that (a) they're only supposed to double-tag twice and (b) double tags are supposed to happen in quick succession (with no other button press supposed to be quicker), the approach used here is based on the following steps:
(i) Look for the smallest row difference for each participant (where participants = columns, rows = tags).
(ii) Look for row differences within a certain multiple of this minimum difference (the multiplier, or threshold, is chosen as the value that maximises the number of participants for whom exactly two double tags are detected).
(iii) Count the number of double tags detected.
(iv) Flag those participants for whom (for a given threshold) either too few or too many double tags were detected.
These participants can then be checked manually, or we can check whether the words 'tag' or 'E4' are mentioned in their session notes.

In [130]:
# Row-to-row differences per participant (the first row keeps the raw first value).
tags_diff_df = tags_df.apply(get_rowdiff)
min_deltas = find_min_delta(tags_diff_df)
thresholds = np.arange(1.5,5,0.5)
all_tag_vals = []
for thresh in thresholds:
    candidate_df = return_likely_doubles(min_deltas, tags_diff_df, thresh)
    tag_vals = get_num_double_tags(candidate_df)
    # .get avoids a KeyError when no participant has exactly two detections
    # at this threshold.
    num_twos = tag_vals.get(2, 0)
    all_tag_vals.append((num_twos,thresh))
max_thresh = get_best_thresh(all_tag_vals)
# get_best_thresh returns a list (several entries on ties, for which it prints
# an advisory). return_likely_doubles needs a single multiplier, so take the
# first entry, i.e. the smallest of the best thresholds.
best_thresh = max_thresh[0]
double_df = return_likely_doubles(min_deltas, tags_diff_df, best_thresh)
detect_missing_doubles_df = detect_missing_doubles(double_df)

In [143]:
# Load participant number and session notes from the Qualtrics export.
# Columns are renamed by name, not by position: read_csv returns usecols
# columns in file order, whatever order they are listed in.
qualtrics_df = (
    pd.read_csv(
        os.path.join(main_qualtrics_dir,"main_dat.csv"),
        skiprows = [1,2],
        usecols = ["DQ-1", "NOTES"],
    )
    .rename(columns = {"DQ-1": "pnum", "NOTES": "session_notes"})
    [["pnum", "session_notes"]]
    # Rows without session notes cannot be searched; drop them.
    .dropna(subset = ["session_notes"])
)

In [144]:
# Participants whose session notes mention the tag button or the E4 device
# (case-insensitive substring match).
substr = ["tag","e4"]
reg_substr = "|".join(substr)
notes_lower = qualtrics_df["session_notes"].str.lower()
mentions_device = notes_lower.str.contains(reg_substr)
qualtrics_df.loc[mentions_device,'pnum']

10     3.0
11     4.0
16    10.0
18    12.0
19    17.0
21    18.0
22     9.0
32    16.0
34    27.0
35    19.0
37    29.0
Name: pnum, dtype: float64

In [145]:
# Participants whose detected double-tag count differs from the expected two.
detect_missing_doubles_df

Unnamed: 0,pnum,num_double_tags
0,6,1
1,11,1
2,12,1
3,13,1
4,14,0
5,16,1
6,24,1
7,29,1
8,30,1
9,32,1
