In [1]:
import os
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from preprocess_modules import utilities_hrv as hrvutils
from preprocess_modules import utilities as dutils

Invoking __init__.py for preprocess_modules


In [108]:
def select_columns_main(in_df, select_list):
    """
    Reduce a dataframe to a chosen set of columns.
    Main qualtrics version.

    Parameters
    ----------
    in_df:  pd DataFrame
        dataframe to take the columns from
    select_list: list[str]
        column names to keep, in the order
        they should appear in the result

    Returns
    -------
    dataframe holding only the columns named
    in select_list (all rows are kept)
    """
    return in_df.loc[:, select_list]

def remove_newlines(in_df, rounds = 1):
    """
    Remove newlines from text.
    if rounds = 1, just strip leading and trailing
    whitespace (which includes newlines) from
    every text cell.
    for any other value of rounds, additionally
    replace newlines inside the text with a
    single space.

    Cells that are not strings (e.g. NaN for an
    unanswered item, or numbers) are returned
    unchanged instead of raising.

    Parameters
    ----------
    in_df:  pd DataFrame
        input dataframe
    rounds: int
        1 or more rounds, see above

    Returns
    -------
    new dataframe with newlines removed;
    in_df itself is not modified
    """
    replace_inner = rounds != 1

    def _clean(cell):
        # only text can carry newlines; leave everything else as it is
        if not isinstance(cell, str):
            return cell
        cell = cell.strip()
        if replace_inner:
            cell = cell.replace("\n", " ")
        return cell

    # Column-wise Series.map instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return in_df.apply(lambda col: col.map(_clean))

def make_dict(keys, values):
    """
    Build the lookup used to score a questionnaire.

    Parameters
    ----------
    keys:   list[str]
        response labels as they appear
        in qualtrics (any letter case)
    values: list[int]
        score belonging to each label,
        in the same order as keys

    Returns
    -------
        dict mapping each lower-cased label
        to its score. Pairing is positional;
        surplus entries in the longer of the
        two inputs are ignored.
    """
    lowered = []
    for label in keys:
        lowered.append(label.lower())
    return {label: score for label, score in zip(lowered, values)}

def filter_cols(in_df, substr):
    """
    Find the columns belonging to one questionnaire.

    Parameters
    ----------
    in_df:  pd DataFrame
        input dataframe
    substr: str
        text that must occur somewhere in a
        column name for it to be selected

    Returns
    -------
        list of matching column names, in the
        order they occur in in_df
    """
    matches = []
    # iterating a dataframe yields its column labels
    for name in in_df:
        if substr in name:
            matches.append(name)
    return matches

def repl_numeric(sub_df, key_dict):
    """
    Replace strings with numeric score.

    Each text cell is lower-cased and looked up
    in key_dict. Missing cells (NaN/None) stay
    missing rather than raising; note that
    pandas sums skip them by default, so check
    completeness first if that matters.

    Parameters
    ----------
    sub_df: pd DataFrame
        dataframe containing only cols
        referring to given survey
    key_dict:   dict
        dictionary containing key-value pairs
        for replacement; keys must be lower case

    Returns
    -------
        new dataframe containing numeric
        representations of questionnaire responses.

    Raises
    ------
    KeyError
        if a response has no entry in key_dict
    """
    def _score(cell):
        # unanswered items are passed through untouched
        if not isinstance(cell, str) and pd.isna(cell):
            return cell
        try:
            return key_dict[cell.lower()]
        except KeyError:
            # name the offending response so it can be traced in the data
            raise KeyError(
                f"response {cell!r} has no entry in key_dict; "
                f"known responses: {list(key_dict)}"
            ) from None

    # Column-wise Series.map instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return sub_df.apply(lambda col: col.map(_score))

def strip_unwanted(sub_df,reg_exp):
    """
    Remove unwanted elements from string.

    Every match of reg_exp is deleted from each
    text cell and trailing whitespace is then
    stripped. Cells that are not strings
    (e.g. NaN) are returned unchanged.

    Parameters
    ----------
    sub_df: pd DataFrame
        dataframe containing questionnaire
        scores in string format
    reg_exp:    str
        regular expression; used as the
        pattern of a re substitution with ""

    Returns
    -------
        new dataframe w/out unwanted content
    """
    # compile once instead of once per cell
    pattern = re.compile(reg_exp)

    def _strip(cell):
        if not isinstance(cell, str):
            return cell
        return pattern.sub("", cell).rstrip()

    # Column-wise Series.map instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return sub_df.apply(lambda col: col.map(_strip))

def preprocess_subdf(in_df,substr,keys,values,rem_nl = None, strip_re = None):
    """
    Turn one questionnaire's text responses into numbers.

    Parameters
    ----------
    in_df: pd DataFrame
        main qualtrics dataframe
    substr: str
        substring shared by the column names
        of the questionnaire of interest
    keys:   list[str]
        response labels as they appear in in_df
    values: list[int]
        numeric score for each label in keys
    rem_nl: optional
        any value other than None triggers
        remove_newlines(..., rounds = 2) on the
        selected columns before scoring
    strip_re: str, optional
        pattern handed to strip_unwanted(), eg
        for removing bracketed text from the
        responses before scoring

    Returns
    -------
        dataframe with the selected columns,
        responses replaced by their numeric score.
    """
    wanted_cols = filter_cols(in_df,substr)
    scored = in_df.loc[:, wanted_cols]

    # optional clean-up steps, applied in this order
    strip_newlines = rem_nl is not None
    apply_regex = strip_re is not None
    if strip_newlines:
        scored = remove_newlines(scored,rounds = 2)
    if apply_regex:
        scored = strip_unwanted(scored,strip_re)

    score_lookup = make_dict(keys, values)
    return repl_numeric(scored,score_lookup)

def change_header(in_df,col_names,val_range):
    """
    Pull out a group of columns and relabel them.

    Giving several groups the same labels lets
    them be combined element-wise (e.g. multiplied)
    because pandas aligns on column labels.

    Parameters
    ----------
    in_df:  pd DataFrame
    col_names:  list[str]
        columns of in_df to extract
    val_range:  array
        new labels, one per entry of
        col_names and in the same order

    Returns
    -------
    the extracted columns as a new dataframe
    whose column labels are val_range
    """
    return in_df.loc[:,col_names].set_axis(val_range, axis = 1)


In [134]:
main_dir = r"P:\Spironolactone\main_qualtrics"
# Load the export (file rows 1 and 2, counted from 0, are skipped so only the
# first row serves as header), run it through the project's clean-up helpers
# and index the result by the "DQ-1" column.
qualtrics_df = (
    pd.read_csv(os.path.join(main_dir, "main_dat21.csv"), skiprows=[1,2])
    .pipe(dutils.remove_incomplete_rows, "Finished")
    .pipe(hrvutils.remove_invalid_records, "DQ-1", [1])
    .pipe(hrvutils.remove_duplicate_participants, "DQ-1")
    .set_index("DQ-1")
)

In [135]:
# Response labels (lower case) for each questionnaire, ordered from the
# lowest to the highest score. Scores start at 1 and increase by 1 per label.

# childhood trauma questionnaire
ctq_keys = [
    "never true",
    "rarely true",
    "sometimes true",
    "often true",
    "very often true",
]
ctq_values = np.arange(1, len(ctq_keys) + 1)

# STAI
stai_keys = [
    "almost never",
    "sometimes",
    "often",
    "almost always",
]
stai_values = np.arange(1, len(stai_keys) + 1)

# ERQ
erq_keys = [
    "strongly disagree",
    "disagree",
    "slightly disagree",
    "neither agree nor disagree",
    "slightly agree",
    "agree",
    "strongly agree",
]
erq_values = np.arange(1, len(erq_keys) + 1)

# PANAS
panas_keys = [
    "very slightly or not at all",
    "a little",
    "moderately",
    "quite a bit",
    "extremely",
]
panas_values = np.arange(1, len(panas_keys) + 1)

# CADSS
cadss_keys = [
    "not at all",
    "slightly",
    "moderately",
    "considerably",
    "extremely",
]
cadss_values = np.arange(1, len(cadss_keys) + 1)

In [136]:
# One numeric-score frame per questionnaire. `substr` picks the columns whose
# name contains it; `keys`/`values` define the label -> score mapping.
stai_df = preprocess_subdf(qualtrics_df, substr = "622_", keys = stai_keys, values = stai_values)
ctq_df = preprocess_subdf(qualtrics_df, substr = "Q409_", keys = ctq_keys, values = ctq_values)
# any non-None rem_nl switches on newline removal before scoring
erq_df = preprocess_subdf(qualtrics_df, substr = "ERQ_", keys = erq_keys, values = erq_values, rem_nl = 2)
# the three PANAS administrations live under different column prefixes
panas_t1_df = preprocess_subdf(qualtrics_df, substr = "PANAS", keys = panas_keys, values = panas_values)
panas_t2_df = preprocess_subdf(qualtrics_df, substr = "Q511_", keys = panas_keys, values = panas_values)
panas_t3_df = preprocess_subdf(qualtrics_df, substr = "Q528_", keys = panas_keys, values = panas_values)
# the regex deletes any "(...)" group from a response before it is scored
cadss_df = preprocess_subdf(
    qualtrics_df,
    substr = "Q594_",
    keys = cadss_keys,
    values = cadss_values,
    strip_re = r"\([^)]*\)",
)

Now score surveys.

In [137]:
# Item numbers as counted from 1; the lists below hold the matching
# zero-based column positions for use with .iloc.
panas_pos_item_numbers = (1, 3, 5, 9, 10, 12, 14, 16, 17, 19)
panas_neg_item_numbers = (2, 4, 6, 7, 8, 11, 13, 15, 18, 20)
pos_affect_items_panas = [number - 1 for number in panas_pos_item_numbers]
neg_affect_items_panas = [number - 1 for number in panas_neg_item_numbers]

In [144]:
# PANAS
# For each of the three administrations, sum the negative-affect and the
# positive-affect item columns per row.
panas_frames = [panas_t1_df, panas_t2_df, panas_t3_df]
panas_neg_scores = [
    frame.iloc[:, neg_affect_items_panas].sum(axis = 1) for frame in panas_frames
]
panas_pos_scores = [
    frame.iloc[:, pos_affect_items_panas].sum(axis = 1) for frame in panas_frames
]
# zero-based labels, one per administration (not used for the summary columns)
labels = ['_'.join(["panas", str(position)]) for position in range(len(panas_frames))]

# Summary columns alternate negative / positive for t1, t2, t3.
sum_dict_keys = [
    "panas_t1_neg", "panas_t1_pos",
    "panas_t2_neg", "panas_t2_pos",
    "panas_t3_neg", "panas_t3_pos",
]
sum_dict_vals = []
for neg_total, pos_total in zip(panas_neg_scores, panas_pos_scores):
    sum_dict_vals.extend([neg_total, pos_total])
panas_dict = dict(zip(sum_dict_keys, sum_dict_vals))
panas_summary_df = pd.DataFrame(data = panas_dict)

In [188]:
# STAI. Some items are reverse scored. You can account for this at an earlier stage (when replacing the string values).
# Because the preprocessing was already done, the relevant values are flipped here instead.
#
# The flip is done on a copy so that stai_df keeps the scores exactly as
# produced by preprocessing. Writing the flipped values back into stai_df
# would make this cell undo its own work every second time it is run.

# Zero-based column positions of the reverse-scored items
# (item numbers 21..40 map to positions 0..19).
reverse_items = [val-21 for val in [21,23,26,27,30,33,34,36,39]]

# On a scale running from low to high, reversed = low + high - original
# (for scores 1..4 this maps 1<->4 and 2<->3).
stai_flip_total = stai_values.min() + stai_values.max()
stai_reversed_df = stai_df.copy()
stai_reversed_df.iloc[:,reverse_items] = stai_flip_total - stai_df.iloc[:,reverse_items]

stai_scored = stai_reversed_df.sum(axis = 1).rename("stai_trait")

In [189]:
# ERQ
# Two subscale sums per row; the lists hold zero-based column positions
# within erq_df.
# NOTE(review): presumably "cra" = cognitive reappraisal and "sup" =
# suppression - confirm the positions against the ERQ version administered.
cra_items = [0,3,5,6,8]
sup_items = [1,2,4,7]
erq_cra, erq_sup = (
    erq_df.iloc[:, positions].sum(axis = 1).rename(label)
    for positions, label in ((cra_items, "erq_cra"), (sup_items, "erq_sup"))
)

Acute diary

In [344]:
# Keep only the columns whose name contains "Q195".
acute_diary_df = qualtrics_df.filter(like="Q195", axis="columns")

In [345]:
# Diary column names carry a "#<n>_" tag with n = 1..4.
# use_cols[n-1] is the list of column names carrying tag n.
search_strings = ["".join(["#",str(num),"_"]) for num in np.arange(1,5)]
use_cols = [filter_cols(acute_diary_df,tag) for tag in search_strings]

In [346]:
# Turn "0" / 0 into missing values across the whole diary frame.
acute_diary_df = acute_diary_df.replace("0",np.nan).replace(0,np.nan)
# Replace the first column group (use_cols[0]) by an integer indicator:
# 1 where the cell holds a value, 0 where it is missing.
# This is just an indicator as to whether or not they reported an intrusion.
intrusion_flags = acute_diary_df.loc[:,use_cols[0]].notnull().astype(int)
# The columns are swapped out with assign() rather than written through
# `.loc[:, cols] = ...`: the latter writes into the existing columns and,
# on recent pandas, keeps their old (object) dtype, so the flags would not
# end up as true integer columns.
acute_diary_df = acute_diary_df.assign(
    **{col: intrusion_flags[col] for col in use_cols[0]}
)

In [347]:
# Give the four column groups identical labels 1..12 so that the frames
# line up column by column when they are multiplied.
memory_ids = np.arange(1,13)
int_content_df, int_frequency_df, int_viv_df, int_dist_df = (
    change_header(acute_diary_df,group,memory_ids) for group in use_cols
)

# indicator (0/1) times frequency, per labelled column
int_count = int_content_df*int_frequency_df
# frequency-weighted vividness / distress, summed across the 12 columns per row
weighted_vividness = int_frequency_df*int_viv_df
weighted_distress = int_frequency_df*int_dist_df
viv_load = weighted_vividness.sum(axis = 1).rename("vividness_load")
dist_load = weighted_distress.sum(axis = 1).rename("distress_load")
int_count = int_count.rename(columns = lambda col: "_".join(["memory",str(col)]))

In [349]:
# Give the third and fourth column groups readable names:
# "vividness_1".."vividness_12" and "distress_1".."distress_12".
old_names = use_cols[2:]
new_names = ["vividness", "distress"]

for prefix, current in zip(new_names, old_names):
    numbered = ["_".join([prefix,str(val)]) for val in np.arange(1,13)]
    acute_diary_df = acute_diary_df.rename(columns = dict(zip(current,numbered)))

In [352]:
# Everything from column position 24 onward is carried over unchanged.
# NOTE(review): presumably the first 24 columns are the 12 indicator and
# 12 frequency columns already summarised in int_count - confirm the
# column order of the export.
carried_over = acute_diary_df.iloc[:,24:]
acute_diary_scores_df = pd.concat([int_count,carried_over],axis = 1).assign(
    # row total over every column whose name contains "memory"
    sum_ints = lambda scores: scores.filter(like = "memory",axis = 1).sum(axis = 1),
    vividness_load = viv_load,
    distress_load = dist_load,
)

In [354]:
# Put every score side by side; pandas aligns the pieces on their row index.
score_parts = [panas_summary_df, stai_scored, erq_cra, erq_sup, acute_diary_scores_df]
all_scores_df = pd.concat(score_parts, axis = "columns")

In [356]:
# The row index is the "DQ-1" column of the qualtrics export (set via
# set_index when the data were loaded). It has to be written out: with
# index = False the file would contain scores with no way of telling which
# row belongs to which record.
all_scores_df.to_csv(
    os.path.join(r"P:\Spironolactone","study_day_scores.csv"),
    index = True,
    index_label = "DQ-1",
)