In [8]:
import os
import re
from collections import Counter
import pandas as pd
import numpy as np
import plotly.express as px


In [2]:
# Root folders on the shared drive: the raw E4 exports (one sub-folder per
# recording) and the main Qualtrics survey export.
input_dir = r"P:\Spironolactone\E4"
main_qualtrics_dir = r"P:\Spironolactone\main_qualtrics"
# os.listdir returns entries in arbitrary order, so sort the names to get a
# reproducible participant (column) order in everything built from this list.
# Only folders whose name starts with a participant id of the form p0NN
# (case-insensitive) are kept.
participant_folders = sorted(
    f for f in os.listdir(input_dir)
    if re.search("^p[0][0-9][0-9]", f.lower())
)

In [3]:
def get_participant_num(folder_name):
    """
    Extract the participant number encoded
    in an E4 folder name.

    Parameters
    ----------
    folder_name:    str
        Name of participant e4 data folder

    Returns
    -------
    int
        The digits following the "p" of the last
        "p0NN" pattern found in the (lower-cased)
        folder name.

    Raises
    ------
    IndexError
        If the folder name contains no "p0NN" pattern.
    """
    matches = re.findall("p[0][0-9][0-9]", folder_name.lower())
    # The last match wins when the pattern occurs more than once.
    last_match = matches[-1]
    digits = last_match[1:]
    return int(digits)

def flag_duplicates(folder_name):
    """
    Flag duplicate E4 folders.
    Happens when E4 recording was
    interrupted for some reason.

    Parameters
    ----------
    folder_name:    list[str]
        Names of the participant e4 data folders.
        Each name must start with the participant
        id ("pNNN"); only the first four characters
        are compared.

    Returns
    -------
    list[int]
        Participant numbers that occur in more than
        one folder name, in order of first appearance.
    """
    # Compare the ids case-insensitively so that "P001..." and "p001..."
    # count as the same participant.
    prefix_counts = Counter(name[:4].lower() for name in folder_name)
    return [int(prefix[1:]) for prefix, count in prefix_counts.items() if count > 1]

def remove_multindex(in_df, axis, level):
    """
    Drop one level of a multi index along
    the requested axis.

    Parameters
    ----------
    in_df:  pd DataFrame
        df to operate on
    axis:   int
        0 operates on the row index,
        any other value on the columns
    level:  int
        level of the multi index to drop

    Returns
    -------
    A new dataframe without the specified
    level; in_df itself is not modified.
    """
    if axis == 0:
        return in_df.reset_index(level = level, drop = True)
    # Columns: transpose so the column labels become the row index,
    # drop the level there, then transpose back.
    transposed = in_df.T
    flattened = transposed.reset_index(level = level, drop = True)
    return flattened.T
    
def get_rowdiff(values):
    """
    Calculate the difference between
    successive entries of a sequence.

    Parameters
    ----------
    values: sequence of numbers
        (list, pd Series, ...) in row order

    Returns
    -------
    If values has zero or one element it is
    returned unchanged. Otherwise a list of the
    same length is returned: the first entry is
    the first value itself, every following entry
    is the value minus its predecessor.
    """
    if len(values) <= 1:
        return values
    # Work on a plain list so that "previous row" is always positional;
    # indexing a Series with i-1 would look up the *label* i-1 instead.
    items = list(values)
    return [items[0]] + [cur - prev for prev, cur in zip(items, items[1:])]

def find_min_delta(tag_deltas, axis = 0):
    """
    find minimum difference
    between successive tags.

    Parameters
    ----------
    tag_deltas: pd DataFrame
        dataframe of row-to-row differences
    axis:   int
        0 for min row  w/in col
        1 for min col w/in row

    Returns
    -------
    pd Series
        min value along the requested axis
        (one value per column for axis = 0),
        ignoring missing values.
    """
    # DataFrame.min skips NaN by default. The builtin min() does not: its
    # result depends on where the NaN sits (a leading NaN is returned as
    # the "minimum").
    min_diffs = tag_deltas.min(axis = axis)
    return min_diffs

def return_likely_doubles(min_deltas, time_delta_df, threshold):
    """
    Keep only those tag-to-tag differences that are
    small enough to be a likely double tag.

    Parameters
    ----------
    min_deltas: pd Series
        minimum difference between rows
        for each participant
    time_delta_df:  pd DataFrame
        dataframe representing differences
        between successive rows
    threshold:  float
        multiplier applied to min_deltas; a
        difference counts as a possible double
        tag when it is <= min_deltas*threshold
        (eg threshold = 1.5 accepts differences
        up to 1.5 times the participant's minimum)

    Returns
    -------
        time_delta_df with NaN wherever the cutoff
        is exceeded and with rows that are NaN for
        every participant removed. Non-nan values
        represent possible double tags.
    """
    cutoff = min_deltas*threshold
    within_window = time_delta_df<=cutoff
    masked = time_delta_df.where(within_window)
    return masked.dropna(how = "all",axis = 0)

def get_num_double_tags(double_tag_df):
    """
    Summarise how many double tags were
    detected per column.

    Parameters
    ----------
    double_tag_df:  pd DataFrame
        dataframe with value for likely
        double tags, NaN elsewhere

    Returns
    -------
    pd Series
        value counts of the per-column number of
        non-NaN entries, ie index = number of double
        tags, value = how many columns have that number.
    """
    tags_per_column = double_tag_df.notna().sum()
    return tags_per_column.value_counts()

def get_best_thresh(tag_val_counts):
    """
    Pick the threshold(s) with the highest
    count.

    Parameters
    ----------
    tag_val_counts: list of tuples
        of type [(two_counts, threshold)]

    Returns
    -------
    list
        Every threshold whose count equals the
        maximum count, in input order. A message
        is printed when more than one threshold
        shares the maximum.
    """
    highest_count = max(pair[0] for pair in tag_val_counts)
    max_threshold = []
    for num, thresh in tag_val_counts:
        if num == highest_count:
            max_threshold.append(thresh)
    if len(max_threshold) > 1:
        print("More than 1 max val detected. Manual check advised.")
    return max_threshold

def detect_missing_doubles(double_tags_df, expected_tags = 2):
    """
    Flag participants with (possible)
    missing or surplus double_tags.

    Parameters
    ----------
    double_tags_df: pd DataFrame
        dataframe with a value for likely double
        tags and NaN elsewhere; one column per
        participant.
    expected_tags:  int
        number of double tags a participant is
        expected to have (default 2).

    Returns
    -------
    pd DataFrame
        one row per participant whose number of
        non-NaN entries differs from expected_tags,
        with columns "pnum" (the column label) and
        "num_double_tags" (the number found).
    """
    num_tags = double_tags_df.notna().sum()
    flags_df = num_tags[num_tags != expected_tags].reset_index()
    flags_df.columns = ["pnum","num_double_tags"]
    return flags_df


def check_pnums(*args):
    """
    Merge several collections of participant
    numbers into one set of pnums worth double
    checking for any reason (duplicates,
    <min tags, qualtrics notes detected).

    Parameters
    ----------
    args:   iterables
        each positional argument is an iterable
        of participant numbers for which a problem
        has been detected.
        Eg: duplicates, below_min

    Returns
    -------
    set
        every participant number that appears in
        at least one of the arguments.
    """
    return {pnum for group in args for pnum in group}

def find_e4_notes(study_session_df,notes_col, pnum_col, keywords):
    """
    Get participant numbers for whom E4 related
    issues were flagged in the data acquisition
    session.

    Parameters
    ----------
    study_session_df:   pd DataFrame
        data frame containing main study session
        data. Must have participant number and session
        notes.
    notes_col, pnum_col:    str
        names of session notes/participant number columns
    keywords:   list[str]
        a list of strings representing keywords to look for
        in the session notes. The keywords are joined with
        "|" and used as a regular expression; matching is
        case-insensitive.

    Returns
    -------
    numpy array of int
        participant numbers for whom at least one keyword
        occurs in the session notes. Rows without notes
        are never flagged.
    """
    reg_substr = "|".join(keywords)
    # case=False: keywords such as "E4" match regardless of capitalisation.
    # na=False: rows with missing notes count as "no match" instead of
    # putting NaN into the boolean mask (which .loc rejects).
    mentions_keyword = study_session_df[notes_col].str.contains(
        reg_substr, case = False, na = False
    )
    flagged_participants = study_session_df.loc[
        mentions_keyword, pnum_col
    ].astype(int).values
    return flagged_participants

def check_double_tags(double_tag_df, num_tags):
    """
    This function lets you inspect participants
    with unusual double tag numbers (below or above
    2).

    Parameters
    ----------
    double_tag_df:  pd DataFrame
        dataframe of likely double tags
    num_tags:   int
                the number of tags to look for
                eg 1, 3, 4...

    Returns
    -------
    A dataframe showing only the cols whose number of
    non-NaN entries equals num_tags.
    OR
    if no column has the specified number of tags, a
    message is printed and this function returns None.
    """
    has_num_tags = double_tag_df.notna().sum() == num_tags
    double_view_df = double_tag_df.loc[:, has_num_tags]
    # Only an empty selection means "nothing found"; a single matching
    # participant is a valid result and must be returned.
    if double_view_df.shape[1] == 0:
        print("No participants found for this number of tags.")
        return None
    return double_view_df

Read in tag files for all participants and assemble those w/o duplicates into a single dataframe. The steps are as follows:
(i) Get folder for participant, extract participant number from folder name.
    If participant pnum has more than one folder, skip this record and print a message to the console.
    This will get printed twice for each participant.
(ii) Retrieve tags file for participant pnum.
    If no tags file found, print a message and move on.
    If the number of tags is less than the minimum number of tags expected (14),
    skip this participant and continue.
(iii) Assemble tags into a single dataframe, with columns = participants and rows = tags.
Note that participants with issues (more than one folder, fewer than the minimum number of tags, no tags file) are stored in duplicates, below_min and missing_tags, respectively.

In [4]:
# Minimum number of tags expected per participant; participants with fewer
# tags are set aside for a manual check.
MIN_TAGS = 14

missing_tags = []   # participants without a tags.csv
pnums = []          # participants whose tags were loaded
tags_dat = []       # one single-column dataframe per loaded participant
below_min = []      # participants with fewer than MIN_TAGS tags
duplicates = flag_duplicates(participant_folders)

for folder in participant_folders:
    pnum = get_participant_num(folder)
    if pnum in duplicates:
        # Printed once per folder, ie at least twice per duplicated participant.
        print(f"More than one tag file exists for participant {pnum}. Skipping.")
        continue
    try:
        # NOTE(review): header=0 together with names= treats the first line
        # of tags.csv as a header and discards it. Confirm the files really
        # start with a header line, otherwise the first tag is dropped.
        participant_tags = pd.read_csv(
            os.path.join(input_dir,folder,"tags.csv"),
            header = 0,
            names = [pnum],
        )
    except FileNotFoundError:
        print(f"No tags file found for participant {pnum}.Manual check advised.")
        missing_tags.append(pnum)
        continue
    if participant_tags.shape[0]<MIN_TAGS:
        print(f"Participant {pnum} recorded fewer than the minimum number of tags. Manual check advised.")
        below_min.append(pnum)
        continue
    pnums.append(pnum)
    tags_dat.append(participant_tags)

# Columns = participants, rows = tags; shorter recordings are padded with NaN.
# The keys add an outer column level; the inner level (the per-file column
# name, also pnum) is then dropped again.
tags_df = pd.concat(tags_dat,axis = 1,keys = pnums)
tags_df = remove_multindex(tags_df,1,1)

More than one tag file exists for participant 1. Skipping.
More than one tag file exists for participant 1. Skipping.
More than one tag file exists for participant 10. Skipping.
More than one tag file exists for participant 10. Skipping.
Participant 12 recorded fewer than the minimum number of tags. Manual check advised.
Participant 14 recorded fewer than the minimum number of tags. Manual check advised.
More than one tag file exists for participant 20. Skipping.
More than one tag file exists for participant 20. Skipping.
More than one tag file exists for participant 27. Skipping.
More than one tag file exists for participant 27. Skipping.


In [54]:
# The plot below needs real timestamps: convert every column with
# pd.to_datetime, reading the values in units of seconds (unit = "s") and
# coercing values that cannot be converted.
tags_as_dt_df = tags_df.apply(pd.to_datetime, errors = 'coerce', unit = "s")


In [57]:
# NOTE(review): `events` is only assigned in a later cell of this notebook, so
# on a fresh kernel (Restart & Run All) this cell raises NameError. It shows
# the expected event label for each tag row.
events

0            Firstbeat
1            RT1_start
2              RT1_end
3                 Drug
4            RT2_start
5              RT2_end
6           Film_start
7             Film_end
8            RT3_start
9              RT3_end
10    DT1_music_starts
11    DT2_music_starts
dtype: object

In [59]:
# One integer per tag row, shared as the y coordinate by all participants.
x_ax = np.arange(tags_df.shape[0])
# Wide-form scatter: every participant column becomes its own trace, with
# the tag time stamp on x and the tag number on y.
fig = px.scatter(
    tags_as_dt_df,
    x = tags_as_dt_df.columns,
    y = x_ax,
    labels = {"value": "UTC tag time stamp", "y": "tag number"},
)
# Dashed reference line at tag number 10 - presumably the tag that marks the
# start of the music section (row 10 of `events`); confirm.
fig.add_hline(y = 10, line_width = 2, line_dash = 'dash')

Look for likely double tags based on time between successive tags. There is no "hard and fast" criterion, but given the fact that (a) they're only supposed to double-tag twice and (b) double tags are supposed to happen in quick succession (with no other button press supposed to be quicker), the approach used here is based on the following steps:
(i) Look for the smallest row difference for each participant (where participants = columns, rows = tags).
(ii) Look for row differences within a certain multiple of this minimum difference (the multiplier, or threshold, is chosen as the value that maximises the number of participants for whom exactly two double tags are detected).
(iii) Count the number of double tags detected.
(iv) Flag those participants for whom (for a given threshold) either too few or too many double tags were detected.
These participants can then be checked manually, or we can check whether the words 'tag' or 'E4' are mentioned in their session notes.

In [5]:
# difference between successive rows (ie tags) within each participant column
tags_diff_df = tags_df.apply(get_rowdiff)
# smallest difference per participant in the df generated above
min_deltas = find_min_delta(tags_diff_df)
# this is a range of thresholds to try out to find the best tag difference
# to detect double tags. The way it works is that min_deltas will be multiplied
# by thresh and we will look for tag time differences below (or equal to) the resulting threshold.
thresholds = np.arange(1.5,5,0.5)
all_tag_vals = []
# try out the different thresholds
for thresh in thresholds:
    double_df = return_likely_doubles(min_deltas, tags_diff_df, thresh)
    tag_vals = get_num_double_tags(double_df)
    # number of participants with exactly two double tags; .get avoids a
    # KeyError when no participant has exactly two at this threshold
    num_twos = tag_vals.get(2, 0)
    all_tag_vals.append((num_twos,thresh))
# get the threshold(s) for which the largest number of participants had two double tags
max_thresh = get_best_thresh(all_tag_vals)
# get_best_thresh returns a list. Multiplying min_deltas by a list only works
# when it holds a single value, so pick one threshold explicitly; on ties the
# smallest (narrowest window) is used.
best_thresh = min(max_thresh)
# return a dataframe of likely double tags where detected, NaN elsewhere
double_df = return_likely_doubles(min_deltas, tags_diff_df, best_thresh)
# get an overview of pnums with double tags less or greater than 2 and the tags detected.
detect_missing_doubles_df = detect_missing_doubles(double_df)

We will now add an events column to the "likely double tag" dataframe. This represents the events that should have been tagged, up to the first double tag that indicates the start of the music section.

In [45]:
# Expected event for each tag row, up to the double tag that marks the
# start of the music section (row 0 = first tag).
events = pd.Series(
    [
        "Firstbeat",
        "RT1_start",
        "RT1_end",
        "Drug",
        "RT2_start",
        "RT2_end",
        "Film_start",
        "Film_end",
        "RT3_start",
        "RT3_end",
        "DT1_music_starts",
        "DT2_music_starts",
    ]
)

# Assignment aligns on the row index, so each remaining tag row of double_df
# receives the label of the same row number in `events`.
double_df["Events"] = events

single_tags = check_double_tags(double_df, 1)
print("\nThe following participants had double tags detected around the start of the music section:\n")
music_start_rows = double_df["Events"].isin(["DT1_music_starts","DT2_music_starts"])
print(double_df.loc[music_start_rows])



The following participants had double tags detected around the start of the music section:

       2     3     4     5     6    8    9  11  13    15  ...    28    29  30  \
10  1.96  0.36   NaN  0.27  1.92  NaN  NaN NaN NaN  5.87  ...  5.91  1.57 NaN   
11   NaN   NaN  0.48   NaN   NaN  5.5  6.4 NaN NaN   NaN  ...   NaN   NaN NaN   

      31  32    33  34    35    36            Events  
10  7.11 NaN  5.94 NaN  5.88  6.12  DT1_music_starts  
11   NaN NaN   NaN NaN   NaN   NaN  DT2_music_starts  

[2 rows x 30 columns]


In [47]:
# Show the expected event label for each tag row (index = tag row number).
events

0            Firstbeat
1            RT1_start
2              RT1_end
3                 Drug
4            RT2_start
5              RT2_end
6           Film_start
7             Film_end
8            RT3_start
9              RT3_end
10    DT1_music_starts
11    DT2_music_starts
dtype: object

Check for notes in the main qualtrics survey containing info regarding E4 tags. Here, we look for the words "tag" or "e4" to flag participants for whom problems with the E4 watch were reported. You can include more keywords by modifying the keywords variable below.

In [134]:
# Load participant number ("DQ-1") and session notes ("NOTES") from the main
# Qualtrics export. skiprows drops the 2nd and 3rd line of the file -
# presumably Qualtrics' additional header rows; confirm against the export.
# The columns are renamed by name, not by position: usecols does not control
# the column order of the result (that follows the file), so assigning a list
# to .columns could silently swap participant numbers and notes.
qualtrics_df = (
    pd.read_csv(
        os.path.join(main_qualtrics_dir,"main_dat.csv"),
        skiprows = [1,2],
        usecols = ["DQ-1", "NOTES"],
    )
    .rename(columns = {"DQ-1": "pnum", "NOTES": "session_notes"})
    # sessions without any notes cannot mention the E4
    .dropna(subset = ["session_notes"])
)
keywords = ["tag","e4"]
flagged_participants = find_e4_notes(qualtrics_df,"session_notes","pnum",keywords)

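We have 4 categories of participants for whom we recorded issues (duplicate folders, missing tag files, fewer than the minimum number of tags, and E4-related session notes). We will now combine these participant numbers into a single collection of participants to check manually.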

In [147]:
# Participants with an unexpected number of double tags are reported on
# their own; they are not part of manual_check_pnums.
print(f"\nThe following participants had fewer or more than the expected number of double tags:\n{list(detect_missing_doubles_df.pnum)}\n")
# Union of all participants flagged for one of the four reasons listed below.
manual_check_pnums = check_pnums(flagged_participants,missing_tags, below_min,duplicates)
print(f"The following participants are worth checking manually:\n{manual_check_pnums}")
print("\nA breakdown of reasons:")
for heading, flagged in (
    ("duplicates", duplicates),
    ("missing tag files", missing_tags),
    ("fewer than min tags (14)", below_min),
    ("session notes mention E4", flagged_participants),
):
    print(f"{heading}:\n{flagged}")


The following participants had fewer or more than the expected number of double tags:
[6, 11, 13, 16, 24, 29, 30, 32, 34, 35]

The following participants are worth checking manually:
{1, 3, 4, 9, 10, 12, 14, 16, 17, 18, 19, 20, 27, 29}

A breakdown of reasons:
duplicates:
[1, 10, 20, 27]
missing tag files:
[]
fewer than min tags (14):
[12, 14]
session notes mention E4:
[ 3  4 10 12 17 18  9 16 27 19 29]
