In [1]:
import os
import datetime
import pandas as pd
import numpy as np

In [119]:
# Data locations on the P: drive.
main_dir = r"P:\Spironolactone\main_qualtrics"
hrv_dir = r"P:\Spironolactone\Firstbeat"
e4_dirs = os.listdir(r"P:\Spironolactone\E4")
# Firstbeat exports: csv files whose name starts with "p" or "P".
# Note the extension check is case-sensitive while the prefix check is not.
hrv_files = [
    fname
    for fname in os.listdir(hrv_dir)
    if fname.endswith('.csv') and fname.lower().startswith('p')
]

In [110]:
# Qualtrics column names to load and the names used in the rest of the
# notebook; the two lists are paired by position.
col_list =  ["Status","DQ-1","Firstbeat_on_time","baseline start","baseline end","Q645","Q646","FILM-START","Q648","Q649"]
new_names = ["response_type","Participant_number","Firstbeat_start","RT1_start","RT1_end","RT2_start","RT2_end","Film_start","RT3_start","RT3_end"]
# skiprows=[1,2] drops the two lines directly below the header line
# (presumably Qualtrics' extra description rows -- TODO confirm).
# read_csv ignores the order of `usecols` and returns columns in file order,
# so select the columns by name and rename through an explicit mapping
# instead of assigning `new_names` positionally.
qualtrics_df = (
    pd.read_csv(os.path.join(main_dir,"main_dat.csv"),usecols =col_list,skiprows= [1,2])
    [col_list]
    .rename(columns = dict(zip(col_list,new_names)))
)

In [85]:
def remove_invalid_records(in_df, exclude_pnums = None, max_pnum = 100):
    """
    Remove records with an invalid or excluded participant number.

    A record is kept only if its ``Participant_number`` is not NaN,
    is strictly below ``max_pnum`` and is not listed in ``exclude_pnums``.

    Parameters
    ----------
    in_df:  pd DataFrame
        dataframe to operate on; must have a
        ``Participant_number`` column
    exclude_pnums: list-like of int, optional
        participant numbers to drop
        (eg if their records are excluded
        from further analysis)
    max_pnum: int or float
        exclusive upper bound for valid participant
        numbers (default 100)

    Returns
    -------
        new dataframe w/o invalid records (in_df is not modified)
    """
    pnums = in_df["Participant_number"]
    # NaN < max_pnum is already False; notna() is kept to make the rule explicit
    keep = pnums.notna() & (pnums < max_pnum)
    # `is not None` instead of truthiness so numpy arrays / Series are accepted
    if exclude_pnums is not None:
        keep = keep & ~pnums.isin(list(exclude_pnums))
    # boolean mask rather than drop-by-label: unaffected by repeated index labels
    return in_df[keep]
    
def remove_duplicate_participants(in_df):
    """
    Keep a single record per participant.

    For every participant number that occurs more than once, the
    record with the fewest NaN values is kept and the other records
    of that participant are removed. If several records tie for the
    fewest NaNs, the last of them is kept.
    Records whose participant number is NaN are never treated as
    duplicates of each other and are left untouched.

    Parameters
    ----------
    in_df:  pd Dataframe
        must have a ``Participant_number`` column

    Returns
    -------
        new dataframe with at most one record per
        (non-NaN) participant number
    """
    pnums = in_df["Participant_number"]
    # keep=False flags every member of a duplicated group, not only the repeats
    dup_mask = (pnums.duplicated(keep = False) & pnums.notna()).to_numpy()
    nan_counts = in_df.isna().sum(axis = 1).to_numpy()
    # work with row positions so that repeated index labels cannot
    # cause unrelated rows to be dropped
    keep = np.ones(len(in_df), dtype = bool)
    for dup in pnums[dup_mask].unique():
        positions = np.flatnonzero((pnums == dup).to_numpy())
        counts = nan_counts[positions]
        # most complete record; last one on ties
        best = positions[np.flatnonzero(counts == counts.min())[-1]]
        keep[positions] = False
        keep[best] = True
    return in_df[keep]


In [111]:
# Clean the Qualtrics table: drop invalid records (participant 1 is excluded),
# then resolve duplicated participant records.
qualtrics_df = (
    qualtrics_df
    .pipe(remove_invalid_records, exclude_pnums = [1])
    .pipe(remove_duplicate_participants)
)


In [126]:
import warnings
def select_hrv_record(pnum,hrv_files):
    """
    select hrv file for participant

    The participant number of a file is read from characters 1-3 of
    its name (ie ``int(file[1:4])``). Files for which these characters
    are not a number are ignored.

    Parameters
    ----------
    pnum:   int or float
        participant number whos record
        needs to be retrieved
    hrv_files:  list[str]
        list of hrv files

    Returns
    -------
        name of hrv file for participant pnum (str).
        If several files match, a warning is issued and the
        first match (in the order of hrv_files) is returned.

    Raises
    ------
    IndexError
        if no file matches pnum
    """
    recs = []
    for file in hrv_files:
        try:
            file_pnum = int(file[1:4])
        except ValueError:
            # name does not carry a participant number; cannot be a match
            continue
        if file_pnum == pnum:
            recs.append(file)
    if not recs:
        raise IndexError(f"No HRV file found for participant {pnum}.")
    if len(recs)>1:
        warnings.warn("Found more than one file for this participant.\nManual check advised.")
    return recs[0]
    
    

In [173]:
# Timestamp columns are the ones with "start" or "end" in their name.
time_cols = [name for name in qualtrics_df.columns if "start" in name or "end" in name]
print(time_cols)
# Convert each timestamp column to datetime; a column that cannot be parsed
# is left unchanged and reported for a manual check.
for col in time_cols:
    try:
        qualtrics_df[col] = qualtrics_df[col].pipe(pd.to_datetime)
    except ValueError:
        print(f"{col} cannot be converted to datetime format. Manual check advised.")

['Firstbeat_start', 'RT1_start', 'RT1_end', 'RT2_start', 'RT2_end', 'Film_start', 'RT3_start', 'RT3_end']
RT2_start cannot be converted to datetime format. Manual check advised.


In [175]:
# Time elapsed between the Firstbeat start tag and the start of RT1, per record.
qualtrics_df["RT1_start"] - qualtrics_df["Firstbeat_start"]

9    0 days 00:55:00
10   0 days 00:37:00
11   0 days 00:39:00
12   0 days 00:46:00
14   0 days 00:33:00
15   0 days 00:46:00
16   0 days 00:39:00
17   0 days 01:00:00
18   0 days 00:53:00
19   0 days 00:39:00
20   0 days 00:31:00
21   0 days 00:37:00
22   0 days 00:32:00
23   0 days 00:38:00
24   0 days 00:31:00
25   0 days 00:47:00
26   0 days 00:41:00
27   0 days 00:41:00
28   0 days 00:39:00
29   0 days 00:28:00
30   0 days 00:32:00
31   0 days 00:43:00
32   0 days 00:47:00
33   0 days 00:39:00
34   0 days 00:42:00
35   0 days 00:34:00
36   0 days 00:34:00
37   0 days 00:43:00
38   0 days 00:33:00
39   0 days 00:39:00
40   0 days 00:33:00
41   0 days 00:28:00
42   0 days 00:31:00
43   0 days 00:28:00
44   0 days 00:17:00
45   0 days 00:35:00
46   0 days 00:35:00
47   0 days 00:41:00
48   0 days 00:45:00
49   0 days 00:40:00
dtype: timedelta64[ns]

In [128]:
# HRV file name for participant 3 (used as an example record).
my_rec = select_hrv_record(pnum = 3, hrv_files = hrv_files)

In [None]:
class FirsbeatTimings:
    """
    get firstbeat relative timings

    Stores the name and directory of one Firstbeat HRV csv file.
    NOTE: work in progress -- get_rel_times is a stub and
    get_firstbeat_times only loads the file so far.
    """
    def __init__(self,hrv_filename,hrv_filepath):
        """
        Parameters
        ----------
        hrv_filename:   str
            name of the HRV csv file
        hrv_filepath:   str
            directory containing hrv_filename (the two are
            combined with os.path.join when the file is read)
        """
        self.filename = hrv_filename
        self.filepath = hrv_filepath

    def get_rel_times(self,pnum,in_df):
        """
        get relative times to identify HRV sections

        TODO: not implemented yet; currently returns None.
        """

    def get_firstbeat_times(self,starttime = None, endtime = None, tag_name = None):
        """
        Get start time in Firstbeat ref time

        TODO: unfinished -- the HRV file is loaded but the arguments
        are not used yet and nothing is returned.

        Parameters
        ----------
        starttime:  interval start time (rel time from qualtrics firstbeat tag)
        endtime:    interval end time (rel time from qualtrics firstbeat tag)
        tag_name:   str
            indicator of tag type (eg "film_start").
            Defaults to None only because a parameter without a default
            may not follow parameters with defaults; callers are
            expected to pass it.

        Returns
        -------
            None (for now)
        """
        # Skip the first four lines of the file; header = 0 together with
        # names replaces the header line that follows them with "IB_intervals".
        hrv_frame = pd.read_csv(os.path.join(self.filepath,self.filename),
                                header = 0, names = ["IB_intervals"],
                                skiprows= np.arange(0,4))
        
        


In [138]:
# Load the selected HRV record: skip the first four lines and replace the
# header line that follows them with a single "IB_intervals" column name.
my_file = pd.read_csv(
    os.path.join(hrv_dir, my_rec),
    skiprows = np.arange(4),
    header = 0,
    names = ["IB_intervals"],
)