In [2]:
import pandas as pd
import numpy as np

# Import Dataset

In [57]:
# Load the REDCap export file ('Redcap Export.csv') into the `redcap` DataFrame
redcap = pd.read_csv('Redcap Export.csv')
# Preview the first five rows
redcap.head(5)

Unnamed: 0,record_id,redcap_event_name,redcap_repeat_instrument,redcap_repeat_instance,subjectid,subject_referral,covid_group,interventiongroup,subject_information_complete,mmrc_score,...,pdf_outcomemeasures,pdf_complete,diary_sessions_1,diary_sessions_2,diary_sessions_3,diary_sessions_4,diary_mip_1,diary_mip_2,diary_mip_3,training_diary_complete
0,1,screening_arm_1,,,1.0,Physical Therapy,1.0,,2.0,2.0,...,,,,,,,,,,
1,1,visit_2_arm_1,,,,,,,,0.0,...,pcs01v2 measures.pdf,2.0,14.0,14.0,14.0,14.0,65.0,83.0,80.0,2.0
2,1,baseline_arm_1,,1.0,,,,,,2.0,...,pcs01v1 measures.pdf,2.0,,,,,,,,
3,2,screening_arm_1,,,2.0,Physical Therapy,1.0,,0.0,2.0,...,,,,,,,,,,
4,2,visit_2_arm_1,,,,,,,,2.0,...,pcs02v2 measures.pdf,2.0,,,,,,,,0.0


In [58]:
# Cleaning pipeline for the raw REDCap export.
# NOTE: this cell reassigns `redcap` and drops 'redcap_event_name' at the end, so
# re-run the load cell before running this cell a second time.

# Step 1. Extract covid_group and interventiongroup from 'screening_arm_1' rows
# and map them to all rows with the same record_id

# Create a small DataFrame containing just screening_arm_1 rows
screening_data = redcap[redcap['redcap_event_name'] == 'screening_arm_1']

# Build record_id -> value lookups for covid_group and interventiongroup
covid_group_map = screening_data.set_index('record_id')['covid_group'].to_dict()
interventiongroup_map = screening_data.set_index('record_id')['interventiongroup'].to_dict()

# Vectorised lookup by record_id (replaces the row-wise apply). combine_first
# keeps a row's own value wherever the screening lookup has no value to offer.
redcap['covid_group'] = redcap['record_id'].map(covid_group_map).combine_first(redcap['covid_group'])
redcap['interventiongroup'] = (
    redcap['record_id'].map(interventiongroup_map).combine_first(redcap['interventiongroup'])
)

# Step 2. Remove 'screening_arm_1' rows (keep baseline and visit 2 only).
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
redcap = redcap[redcap['redcap_event_name'].isin(['baseline_arm_1', 'visit_2_arm_1'])].copy()

# Step 3. Create the time_point column (1 = baseline, 2 = visit 2)
time_point_map = {
    'baseline_arm_1': 1,
    'visit_2_arm_1': 2
}
redcap['time_point'] = redcap['redcap_event_name'].map(time_point_map)

# ---------------------------------------------------------------------------- #

# NOTE: Remember to update RedCAP (FIX THIS)
# Step 4. Manually overwrite interventiongroup using your dictionary.
# record_ids absent from the dictionary keep the value assigned in Step 1.
manual_interventiongroup = {
    1:1, 2:0, 3:1, 4:0, 5:0, 6:0, 7:1, 8:1, 9:0, 10:1,
    11:1, 13:1, 14:0, 15:1, 16:0, 17:0, 18:1, 20:0, 21:1
}
redcap['interventiongroup'] = redcap['record_id'].map(manual_interventiongroup).combine_first(redcap['interventiongroup'])

# Temporarily set Covid_group = 1 for record_id 7
redcap.loc[redcap['record_id'] == 7, 'covid_group'] = 1
# NOTE: Remember to delete this after next RedCap download (FIX THIS)

# ---------------------------------------------------------------------------- #

# Set subject_female (1 = Female, 0 = Male, NaN = anything else) from the
# time_point 1 row, then carry it forward to later rows of the same record_id.
# Sorting first guarantees the time_point 1 row precedes the others in each group.
redcap = redcap.sort_values(['record_id', 'time_point'])
time_point_1_mask = redcap['time_point'] == 1
redcap.loc[time_point_1_mask, 'subject_female'] = np.where(
    redcap.loc[time_point_1_mask, 'subject_gender'] == 'Female', 1,
    np.where(redcap.loc[time_point_1_mask, 'subject_gender'] == 'Male', 0, np.nan)
)
redcap['subject_female'] = redcap.groupby('record_id')['subject_female'].ffill()


# Remove all columns where every value is NaN
redcap = redcap.dropna(axis=1, how='all')

# Drop unnecessary columns. errors='ignore' because the all-NaN removal above
# may already have dropped 'redcap_repeat_instance'.
redcap = redcap.drop(columns=['redcap_event_name', 'redcap_repeat_instance'], errors='ignore')

# Remove all columns that end with '_complete'
redcap = redcap.loc[:, ~redcap.columns.str.endswith('_complete')]



In [29]:

# Write the cleaned frame to disk, leaving the row index out of the file
redcap.to_csv('cleaned_redcap_data.csv', index=False)

# Preview the first 30 rows
redcap.head(n=30)


Unnamed: 0,record_id,covid_group,interventiongroup,mmrc_score,pcfs_score,subject_dob,subject_gender,subject_ethnicity,health_ldl,health_hdl,...,pdf_outcomemeasures,diary_sessions_1,diary_sessions_2,diary_sessions_3,diary_sessions_4,diary_mip_1,diary_mip_2,diary_mip_3,time_point,subject_female
1,1,1.0,1.0,0.0,1.0,,,,,,...,pcs01v2 measures.pdf,14.0,14.0,14.0,14.0,65.0,83.0,80.0,2,
2,1,1.0,1.0,2.0,2.0,1992-04-16,Female,Latino,,,...,pcs01v1 measures.pdf,,,,,,,,1,1.0
4,2,1.0,0.0,2.0,2.0,,,,,,...,pcs02v2 measures.pdf,,,,,,,,2,
5,2,1.0,0.0,2.0,2.0,1940-03-13,Female,White,91.0,60.0,...,pcs02v1 measures.pdf,,,,,,,,1,1.0
7,3,1.0,1.0,3.0,2.0,,,,,,...,pcs03v2 measures.pdf,14.0,14.0,14.0,14.0,89.0,108.0,120.0,2,
8,3,1.0,1.0,4.0,4.0,1976-11-24,Female,Hispanic,,,...,pcs03v1 measures.pdf,,,,,,,,1,1.0
10,4,1.0,0.0,0.0,1.0,,,,,,...,,,,,,,,,2,
11,4,1.0,0.0,1.0,2.0,1986-04-30,Male,White,,,...,,,,,,,,,1,0.0
13,5,1.0,0.0,3.0,3.0,,,,,,...,,,,,,,,,2,
14,5,1.0,0.0,3.0,3.0,1963-12-26,,,,,...,,,,,,,,,1,


# Respiratory Strength

In [None]:
def calculate_mip_predicted1(row):
    """Predicted MIP for one row, using sex-specific regression equations.

    Equation source:
    https://www.atsjournals.org/doi/full/10.1164/ajrccm.158.5.9712006

    Reads ``data_age``, ``data_kilograms`` and ``subject_female`` from ``row``
    (1 = female, 0 = male). ``data_centimeters`` is read only for females.

    Returns the predicted value as a float, or None when ``subject_female`` is
    neither 0 nor 1, when a required key is absent, or when a value cannot be
    converted to float.
    """
    try:
        years = float(row['data_age'])
        kilograms = float(row['data_kilograms'])
        sex_code = row['subject_female']

        if sex_code == 1:
            # Only the female equation includes a height term
            centimeters = float(row['data_centimeters'])
            return 171 - 0.694 * years + 0.861 * kilograms - 0.743 * centimeters
        if sex_code == 0:
            return 126 - 1.028 * years + 0.343 * kilograms
        # NaN or any other code: sex unknown, no prediction
        return None
    except (KeyError, ValueError, TypeError):
        # Missing column or value that float() cannot convert
        return None
    

def calculate_mip_predicted2(row):
    """Predicted MIP for one row from age and weight, by sex.

    Equation source: https://pubmed.ncbi.nlm.nih.gov/25141521/

    Reads ``data_age``, ``data_kilograms`` and ``subject_female`` from ``row``
    (1 = female, 0 = male).

    Returns the predicted value as a float, or None when ``subject_female`` is
    neither 0 nor 1, when a required key is absent, or when a value cannot be
    converted to float.
    """
    try:
        years = float(row['data_age'])
        kilograms = float(row['data_kilograms'])
        sex_code = row['subject_female']

        if sex_code == 1:
            return 77.57 - 0.59 * years + 0.62 * kilograms
        if sex_code == 0:
            return 124.39 - 0.91 * years + 0.63 * kilograms
        # NaN or any other code: sex unknown, no prediction
        return None
    except (KeyError, ValueError, TypeError):
        # Missing column or value that float() cannot convert
        return None
    
# Add both predicted-MIP columns; each equation is evaluated once per row
redcap['mip_predicted1'] = redcap.apply(calculate_mip_predicted1, axis='columns')
redcap['mip_predicted2'] = redcap.apply(calculate_mip_predicted2, axis='columns')

In [None]:

# Check results: observed MIP maxima next to the predictions and their ratios.
# NOTE: the mip_*_max and *_percentpredict_* columns selected here are created by
# the trial-summary and percent-of-predicted cells, which must have run first.
redcap.loc[:, [
    'record_id',
    'mip_pre_max',
    'mip_post_max',
    'mip_predicted1',
    'mip_predicted2',
    'mip_pre_max_percentpredict_1',
    'mip_pre_max_percentpredict_2',
    'mip_post_max_percentpredict_1',
    'mip_post_max_percentpredict_2',
]].head(30)

# TODO: export results to a new CSV file (this cell contains no export code)

Unnamed: 0,record_id,mip_pre_max,mip_post_max,mip_predicted1,mip_predicted2,mip_pre_max_percentpredict_1,mip_pre_max_percentpredict_2,mip_post_max_percentpredict_1,mip_post_max_percentpredict_2
2,1,52.0,31.0,96.834,105.81,0.537001,0.491447,0.320135,0.292978
1,1,106.0,94.0,91.1068,103.826,1.16347,1.020939,1.031756,0.905361
5,2,49.0,45.0,47.1697,68.992,1.038802,0.710227,0.954002,0.65225
4,2,47.0,56.0,47.4162,69.116,0.991222,0.680016,1.181031,0.810232
8,3,69.0,73.0,116.469,119.682,0.592432,0.576528,0.626776,0.60995
7,3,136.0,128.0,114.9581,118.38,1.18304,1.148843,1.113449,1.081264
11,4,56.0,47.0,118.2519,147.329,0.473565,0.380102,0.397457,0.319014
10,4,71.0,66.0,119.3152,149.282,0.595062,0.47561,0.553157,0.442116
14,5,36.0,29.0,,,,,,
13,5,48.0,43.0,,,,,,


In [64]:
# Define measures and suffixes
measures = ['mip', 'fit', 'smip', 'pif', 'sindex', 'volume']
suffixes = ['max', 'min', 'mean']

# Row-wise max / min / mean over the three trials of each measure, computed
# separately for the pre and the post trial sets (missing trials are skipped).
# A measure/phase is summarised only when all three trial columns exist.
for measure in measures:
    for phase in ['pre', 'post']:
        trial_cols = [f"imt_{measure}_{i}_{phase}" for i in [1, 2, 3]]
        if all(col in redcap.columns for col in trial_cols):
            redcap[f"{measure}_{phase}_max"] = redcap[trial_cols].max(axis=1, skipna=True)
            redcap[f"{measure}_{phase}_min"] = redcap[trial_cols].min(axis=1, skipna=True)
            redcap[f"{measure}_{phase}_mean"] = redcap[trial_cols].mean(axis=1, skipna=True)
        else:
            print(f"⚠️ Warning: Missing {phase} columns for {measure}")

# Now calculate fatigue and fatigue percent
for measure in measures:
    for suffix in suffixes:
        pre_col = f"{measure}_pre_{suffix}"
        post_col = f"{measure}_post_{suffix}"

        if pre_col in redcap.columns and post_col in redcap.columns:
            # Fatigue: pre - post
            fatigue_col = f"{measure}_{suffix}_fatigue"
            redcap[fatigue_col] = redcap[pre_col] - redcap[post_col]

            # Fatigue Percent: (pre - post) / pre * 100
            # A pre value of 0 has no meaningful percentage, so the denominator
            # is masked to NaN instead of letting the division produce +/-inf.
            fatigue_percent_col = f"{measure}_{suffix}_fatigue_percent"
            nonzero_pre = redcap[pre_col].replace(0, np.nan)
            redcap[fatigue_percent_col] = (redcap[fatigue_col] / nonzero_pre) * 100
        else:
            print(f"⚠️ Warning: Missing columns for {measure}_{suffix}")

In [65]:
# Express every MIP summary (pre/post x max/min/mean) relative to each predicted MIP
predicted_vars = ['predicted1', 'predicted2']  # expects columns 'mip_predicted1' and 'mip_predicted2'

for suffix in suffixes:
    for timepoint in ['pre', 'post']:
        measure_col = f"mip_{timepoint}_{suffix}"
        for pred_var in predicted_vars:
            pred_col = f"mip_{pred_var}"
            # Output name ends with the last character of the predictor name (1 or 2)
            percentpredict_col = f"mip_{timepoint}_{suffix}_percentpredict_{pred_var[-1]}"

            if measure_col not in redcap.columns or pred_col not in redcap.columns:
                print(f"⚠️ Warning: Missing columns for {measure_col} or {pred_col}")
                continue

            # Stored as a plain ratio (observed / predicted); it is not multiplied by 100
            redcap[percentpredict_col] = redcap[measure_col] / redcap[pred_col]


In [66]:
# Preview the first five rows
redcap.head(5)

Unnamed: 0,record_id,covid_group,interventiongroup,mmrc_score,pcfs_score,subject_dob,subject_gender,subject_ethnicity,health_ldl,health_hdl,...,mip_post_max_percentpredict_1,mip_post_max_percentpredict_2,mip_pre_min_percentpredict_1,mip_pre_min_percentpredict_2,mip_post_min_percentpredict_1,mip_post_min_percentpredict_2,mip_pre_mean_percentpredict_1,mip_pre_mean_percentpredict_2,mip_post_mean_percentpredict_1,mip_post_mean_percentpredict_2
2,1,1.0,1.0,2.0,2.0,1992-04-16,Female,Latino,,,...,0.320135,0.292978,0.47504,0.434742,0.289155,0.264625,0.499136,0.456794,0.309809,0.283527
1,1,1.0,1.0,0.0,1.0,,,,,,...,1.031756,0.905361,0.834186,0.731994,0.812234,0.712731,1.05005,0.921413,0.940288,0.825098
5,2,1.0,0.0,2.0,2.0,1940-03-13,Female,White,91.0,60.0,...,0.954002,0.65225,0.826802,0.565283,0.848002,0.579777,0.961069,0.657081,0.911602,0.623261
4,2,1.0,0.0,2.0,2.0,,,,,,...,1.181031,0.810232,0.695965,0.477458,0.927953,0.636611,0.822504,0.564269,1.033402,0.708953
8,3,1.0,1.0,4.0,4.0,1976-11-24,Female,Hispanic,,,...,0.626776,0.60995,0.334853,0.325864,0.463643,0.451196,0.437885,0.426129,0.535192,0.520825


# Subjective Questionnaires

## Woods Mental Fatigue Inventory

In [8]:
# Items that make up the Woods total score
woods_columns = [
    'woods_concentration', 'woods_decisions', 'woods_confusion',
    'woods_memory', 'woods_words', 'woods_takethingsin',
    'woods_processingspeed', 'woods_thoughtsmixed', 'woods_muzzy'
]

# Create the new column 'woods_sum'. Missing items are skipped; min_count=1
# leaves the total as NaN when every item is missing, so an unanswered
# questionnaire is not recorded as a score of 0 (and is excluded from the
# statistics below).
redcap['woods_sum'] = redcap[woods_columns].sum(axis=1, min_count=1)

# Display mean, min, max, and std of the 'woods_sum' column
print("Woods Sum Statistics:")
print(f"Mean: {redcap['woods_sum'].mean()}")
print(f"Min: {redcap['woods_sum'].min()}")
print(f"Max: {redcap['woods_sum'].max()}")
print(f"Std: {redcap['woods_sum'].std()}")


Woods Sum Statistics:
Mean: 16.483870967741936
Min: 1.0
Max: 31.0
Std: 8.958686539673606


## Fatigue Severity Scale (FSS)

In [9]:
# Items that make up the FSS total score
fss_columns = [
    'fss_motivation', 'fss_exercise', 'fss_easily',
    'fss_functioning', 'fss_problems', 'fss_sustained',
    'fss_duties', 'fss_disabling', 'fss_social',
]

# Total score per row. sum() skips missing items, so a row with no answers totals 0.
# NOTE(review): confirm that treating missing items as 0 is intended.
redcap['fss_sum'] = redcap[fss_columns].sum(axis=1)

# fss_dichotomous: 1 when the total is strictly greater than 36, otherwise 0
redcap['fss_dichotomous'] = redcap['fss_sum'].gt(36).astype('int64')

# Descriptive statistics of 'fss_sum': mean, min, max and std
print("FSS Sum Statistics:")
print(f"Mean: {redcap['fss_sum'].mean()}")
print(f"Min: {redcap['fss_sum'].min()}")
print(f"Max: {redcap['fss_sum'].max()}")
print(f"Std: {redcap['fss_sum'].std()}")

FSS Sum Statistics:
Mean: 38.67741935483871
Min: 10.0
Max: 62.0
Std: 16.682100380895672


## PEM (DSQ)

In [10]:
# Reference: https://pmc.ncbi.nlm.nih.gov/articles/PMC6165517/
# A frequency and severity score of 2, 2 on any items 1–5 is indicative of PEM.

# Frequency items; each one pairs by position with the severity item below
freq_cols = [
    'dsq_heavy_freq',
    'dsq_nextday_freq',
    'dsq_mentallytired_freq',
    'dsq_minexercise_freq',
    'dsq_drained_freq',
]

# Severity items, in the same order as freq_cols
severity_cols = [
    'dsq_heavy_severity',
    'dsq_nextday_severity',
    'dsq_mentallytired_severity',
    'dsq_minexercise_severity',
    'dsq_drained_severity',
]

# Per-row totals: frequency items, severity items, and the two combined
redcap['dsq_freq_sum'] = redcap[freq_cols].sum(axis=1)
redcap['dsq_severity_sum'] = redcap[severity_cols].sum(axis=1)
redcap['dsq_sum'] = redcap['dsq_freq_sum'].add(redcap['dsq_severity_sum'])

# Dichotomous indicator (1 if any item has freq >=2 AND severity >=2)
def compute_dsq_dichotomous(row):
    """Return 1 when any paired item has frequency >= 2 and severity >= 2, else 0.

    Items are paired by position across the cell-level ``freq_cols`` and
    ``severity_cols`` lists. A NaN value fails the ``>= 2`` comparison, so a
    pair with a missing answer never counts.
    """
    return int(any(
        (row[freq_name] >= 2) and (row[severity_name] >= 2)
        for freq_name, severity_name in zip(freq_cols, severity_cols)
    ))

# Evaluate the indicator once per row
redcap['dsq_dichotomous'] = redcap.apply(compute_dsq_dichotomous, axis='columns')

# Frequency counts of the dichotomous indicator
print("DSQ Dichotomous Frequency Counts:")
print(redcap['dsq_dichotomous'].value_counts())

# Mean, min and max of the 'dsq_sum' column
print("DSQ Sum Statistics:")
print(f"Mean: {redcap['dsq_sum'].mean()}")
print(f"Min: {redcap['dsq_sum'].min()}")
print(f"Max: {redcap['dsq_sum'].max()}")



DSQ Dichotomous Frequency Counts:
dsq_dichotomous
1    19
0    12
Name: count, dtype: int64
DSQ Sum Statistics:
Mean: 15.290322580645162
Min: 0.0
Max: 38.0


## Pittsburgh Sleep Quality Index (PSQI)

In [11]:
# Keep the raw Q2 latency (assumed to be the original minutes) in 'psqi_latency_raw'.
# 'psqi_latency' is overwritten later in this cell with the 0-3 component score,
# so the raw copy is taken only when it does not exist yet: re-running the cell
# must not replace the raw values with already-scored ones.
if 'psqi_latency_raw' not in redcap.columns:
    redcap['psqi_latency_raw'] = redcap['psqi_latency']

# Recode Q2 (raw latency in minutes) into 0-3 scale
def recode_q2(x):
    """Recode a sleep latency in minutes into the 0-3 PSQI sub-score.

    Bands: <= 15 -> 0, > 15 to 30 -> 1, > 30 to 60 -> 2, > 60 -> 3.
    The bands are contiguous, so fractional values such as 30.5 land in the
    next band up rather than falling through to the top score, and exactly
    15 minutes scores 0.

    Returns NaN when ``x`` is null.
    """
    if pd.isnull(x):
        return np.nan
    if x <= 15:
        return 0
    if x <= 30:
        return 1
    if x <= 60:
        return 2
    return 3  # > 60 minutes

# Q2 sub-score: raw latency recoded element by element
redcap['psqi_latency_q2'] = redcap['psqi_latency_raw'].map(recode_q2)

# Q5a sub-score: 'psqi_latency30' is already on the 0-3 scale and is copied unchanged
redcap['psqi_latency_q5a'] = redcap['psqi_latency30']

# Row-wise total of the two latency sub-scores (missing values are skipped)
redcap['psqi_latency_sum'] = redcap[['psqi_latency_q2', 'psqi_latency_q5a']].sum(axis=1)

# Map the summed value into the final component score
def map_latency_component(x):
    """Convert the summed latency sub-scores into the 0-3 component score.

    0 -> 0, 1-2 -> 1, 3-4 -> 2, and any other non-null value -> 3.
    Returns NaN when ``x`` is null.
    """
    if pd.isnull(x):
        return np.nan
    if x == 0:
        return 0
    if 1 <= x <= 2:
        return 1
    if 3 <= x <= 4:
        return 2
    # Everything else, including the expected 5-6 range
    return 3

# Overwrite 'psqi_latency' with the 0-3 component score derived from the summed sub-scores
redcap['psqi_latency'] = redcap['psqi_latency_sum'].map(map_latency_component)

# 3. Sleep Duration
def score_sleep_duration(hours):
    """Score hours of sleep on the 0-3 scale.

    >= 7 -> 0, 6 to < 7 -> 1, 5 to < 6 -> 2, < 5 -> 3.

    Returns NaN when ``hours`` is null, matching the other scoring helpers in
    this cell. Without this guard a missing value fails every comparison and
    would receive the worst score (3).
    """
    if pd.isnull(hours):
        return np.nan
    if hours >= 7:
        return 0
    elif 6 <= hours < 7:
        return 1
    elif 5 <= hours < 6:
        return 2
    else:
        return 3

# Duration component: score 'psqi_hours' element by element
redcap['psqi_duration'] = redcap['psqi_hours'].map(score_sleep_duration)

# 4. Sleep Efficiency
def hhmm_to_decimal(time_val):
    """Turn a clock time written as an HHMM number into decimal hours.

    The value is split into whole hundreds (hours) and the remainder (minutes),
    e.g. 2230 -> 22 h 30 min -> 22.5. Returns NaN when ``time_val`` is null.
    """
    if pd.isnull(time_val):
        return np.nan
    whole_hours, leftover_minutes = divmod(time_val, 100)
    return whole_hours + leftover_minutes / 60.0

# --- Sleep start / end times (HHMM) expressed as decimal hours ---
redcap['sleepstart_decimal'] = redcap['psqi_sleepstart'].map(hhmm_to_decimal)
redcap['sleepend_decimal'] = redcap['psqi_sleepend'].map(hhmm_to_decimal)

# --- Calculate Time in Bed (handling overnight shifts) ---
def calculate_time_in_bed(start, end):
    """Hours between ``start`` and ``end``, both given in decimal hours.

    When ``end`` is not later than ``start`` the interval is taken to cross
    midnight and 24 hours are added. Returns NaN if either argument is null.
    """
    if pd.isnull(start) or pd.isnull(end):
        return np.nan
    elapsed = end - start
    return elapsed + 24 if elapsed <= 0 else elapsed

# Time in bed per row, from the decimal start and end times
redcap['time_in_bed'] = redcap.apply(
    lambda record: calculate_time_in_bed(
        record['sleepstart_decimal'],
        record['sleepend_decimal'],
    ),
    axis='columns',
)

# --- Sleep efficiency: reported hours asleep as a percentage of time in bed ---
redcap['sleep_efficiency'] = redcap['psqi_hours'].div(redcap['time_in_bed']).mul(100)

# --- Score PSQI Component 4: Habitual Sleep Efficiency ---
def score_sleep_efficiency(efficiency):
    """Score a sleep-efficiency percentage on the 0-3 scale.

    >= 85 -> 0, 75 to < 85 -> 1, 65 to < 75 -> 2, < 65 -> 3.
    Returns NaN when ``efficiency`` is null.
    """
    if pd.isnull(efficiency):
        return np.nan
    # Thresholds are checked from highest to lowest, so each test only needs
    # the lower bound of its band.
    if efficiency >= 85:
        return 0
    if efficiency >= 75:
        return 1
    if efficiency >= 65:
        return 2
    return 3

# Efficiency component: score 'sleep_efficiency' element by element
redcap['psqi_efficiency'] = redcap['sleep_efficiency'].map(score_sleep_efficiency)

# 5. Sleep Disturbances: nine items are summed, then banded into 0-3
disturbance_items = [
    'psqi_wake', 'psqi_bathroom', 'psqi_breathe',
    'psqi_snore', 'psqi_cold', 'psqi_hot',
    'psqi_dreams', 'psqi_pain', 'psqi_other',
]

# TEMPORARY FIX: NEED TO REMOVE LATER AND FIX DATA ENTRY
# Non-numeric 'psqi_other' entries are coerced to NaN and then counted as 0.
redcap['psqi_other'] = pd.to_numeric(redcap['psqi_other'], errors='coerce').fillna(0)

redcap['psqi_disturbances_raw'] = redcap[disturbance_items].sum(axis=1)

# Right-inclusive bands: 0 -> 0, 1-9 -> 1, 10-18 -> 2, 19-27 -> 3
redcap['psqi_disturbances'] = pd.cut(
    redcap['psqi_disturbances_raw'],
    bins=[-1, 0, 9, 18, 27],
    labels=[0, 1, 2, 3],
).astype(int)


# 6. Use of Sleep Medications: copied unchanged from 'psqi_medicine'
redcap['psqi_medication'] = redcap['psqi_medicine']

# 7. Daytime Dysfunction: two items are summed, then banded into 0-3
redcap['psqi_dysfunction_raw'] = redcap[['psqi_sleepy', 'psqi_enthusiasm']].sum(axis=1)

# Right-inclusive bands: 0 -> 0, 1-2 -> 1, 3-4 -> 2, 5-6 -> 3
redcap['psqi_dysfunction'] = pd.cut(
    redcap['psqi_dysfunction_raw'],
    bins=[-1, 0, 2, 4, 6],
    labels=[0, 1, 2, 3],
).astype(int)

# 8. Calculate PSQI Sum: row-wise total of the seven component scores
component_cols = [
    'psqi_quality',
    'psqi_latency',
    'psqi_duration',
    'psqi_efficiency',
    'psqi_disturbances',
    'psqi_medication',
    'psqi_dysfunction',
]
redcap['psqi_sum'] = redcap[component_cols].sum(axis=1)

# 9. Dichotomous classification: 0 = Good sleep, 1 = Poor sleep (total above 5)
redcap['psqi_dichotomous'] = redcap['psqi_sum'].gt(5).astype(int)


# The seven components followed by the total
components = component_cols + ['psqi_sum']

# Summary stats: mean, min, max
summary_stats = redcap[components].agg(['mean', 'min', 'max'])
print("Summary statistics for PSQI components and sum:")
print(summary_stats)

# Frequency counts for the dichotomous variable (missing values counted as well)
dichotomous_counts = redcap['psqi_dichotomous'].value_counts(dropna=False)
print("\nFrequency counts for psqi_dichotomous:")
print(dichotomous_counts)


Summary statistics for PSQI components and sum:
      psqi_quality  psqi_latency  psqi_duration  psqi_efficiency  \
mean      1.387097      1.419355       0.935484          0.83871   
min       0.000000      0.000000       0.000000          0.00000   
max       3.000000      3.000000       3.000000          3.00000   

      psqi_disturbances  psqi_medication  psqi_dysfunction  psqi_sum  
mean           1.483871         0.935484           1.16129   8.16129  
min            0.000000         0.000000           0.00000   1.00000  
max            3.000000         3.000000           3.00000  18.00000  

Frequency counts for psqi_dichotomous:
psqi_dichotomous
1    21
0    10
Name: count, dtype: int64


## BDI-TDI

In [12]:
# BDI / TDI totals. Missing items are skipped; min_count=1 leaves a total as NaN
# when all three of its items are missing on a row, so a row on which an
# instrument has no answers is not recorded as a score of 0.

# Compute BDI Score
redcap['bdi_sum'] = redcap[['bdi_functional', 'bdi_task', 'bdi_effort']].sum(axis=1, min_count=1)

# Compute TDI Score
redcap['tdi_sum'] = redcap[['tdi_functional', 'tdi_task', 'tdi_effort']].sum(axis=1, min_count=1)


## ODI / NDI

In [13]:
# ODI / NDI totals. Missing items are skipped (skipna=True); min_count=1 leaves
# a total as NaN when every item of that index is missing on a row, so an
# unanswered questionnaire is not recorded as a score of 0.

# Sum ODI columns
odi_cols = [
    'odi_intensity', 'odi_personalcare', 'odi_lifting', 'odi_walking',
    'odi_sitting', 'odi_standing', 'odi_sleeping', 'odi_sex',
    'odi_social', 'odi_traveling'
]
redcap['odi_sum'] = redcap[odi_cols].sum(axis=1, skipna=True, min_count=1)

# Sum NDI columns
ndi_cols = [
    'ndi_intensity', 'ndi_personalcare', 'ndi_lifting', 'ndi_work',
    'ndi_headaches', 'ndi_concentration', 'ndi_sleeping',
    'ndi_driving', 'ndi_reading', 'ndi_recreation'
]
redcap['ndi_sum'] = redcap[ndi_cols].sum(axis=1, skipna=True, min_count=1)


## Psychology Questionnaire

In [14]:
# Item columns for each screener
anxiety_cols = ['gad_anxious', 'gad_worrying']
depression_cols = ['phq_hopeless', 'phq_anhedonia']
ptsd_cols = [
    'ptsd_nightmares', 'ptsd_intrusive', 'ptsd_startled',
    'ptsd_detached', 'ptsd_guilty',
]

# Row-wise totals; missing items are skipped, so a row with no answers totals 0.
# NOTE(review): such rows are then flagged 0 below — confirm this is intended.
redcap['anxiety_sum'] = redcap[anxiety_cols].sum(axis=1, skipna=True)
redcap['depression_sum'] = redcap[depression_cols].sum(axis=1, skipna=True)
redcap['ptsd_sum'] = redcap[ptsd_cols].sum(axis=1, skipna=True)

# Dichotomous flags (1 = total at or above the cutoff, 0 = below it)
# PTSD-5: 3 most sensitive, 5 most specific, 4 most efficient https://pmc.ncbi.nlm.nih.gov/articles/PMC5023594/
# PHQ-2: >=2 --> https://pubmed.ncbi.nlm.nih.gov/33026888/, https://jamanetwork.com/journals/jama/fullarticle/2766865
# GAD-2: >=3 --> https://www.sciencedirect.com/science/article/abs/pii/S0003999318303903, https://www.sciencedirect.com/science/article/abs/pii/S0163834315002406, https://pmc.ncbi.nlm.nih.gov/articles/PMC6163062/, https://pmc.ncbi.nlm.nih.gov/articles/PMC7306644/
redcap['anxiety_dichotomous'] = redcap['anxiety_sum'].ge(3).astype(int)
redcap['depression_dichotomous'] = redcap['depression_sum'].ge(2).astype(int)
redcap['ptsd_dichotomous'] = redcap['ptsd_sum'].ge(3).astype(int)


## SF-PA

In [15]:
# Define the columns for SFPA
sfpa_cols = [
    'sfpa_vigorous', 'sfpa_moderate', 'sfpa_lifting',
    'sfpa_stairs2', 'sfpa_stairs1', 'sfpa_stooping',
    'sfpa_walkingmile', 'sfpa_walkingblocks2',
    'sfpa_walkingblocks1', 'sfpa_bathingdress'
]

# Calculate the sum score. Missing items are skipped (skipna=True); min_count=1
# leaves the total as NaN when every item is missing on a row, so an unanswered
# questionnaire is not recorded as a score of 0.
redcap['sfpa_sum'] = redcap[sfpa_cols].sum(axis=1, skipna=True, min_count=1)

# Export

In [16]:
# Preview the first 30 rows with all derived scores
redcap.head(n=30)

Unnamed: 0,record_id,covid_group,interventiongroup,mmrc_score,pcfs_score,subject_dob,subject_gender,subject_ethnicity,health_ldl,health_hdl,...,tdi_sum,odi_sum,ndi_sum,anxiety_sum,depression_sum,ptsd_sum,anxiety_dichotomous,depression_dichotomous,ptsd_dichotomous,sfpa_sum
1,1,1.0,1.0,0.0,1.0,,,,,,...,9.0,4.0,7.0,2.0,0.0,7.0,0,0,1,27.0
2,1,1.0,1.0,2.0,2.0,1992-04-16,Female,Latino,,,...,0.0,7.0,9.0,3.0,3.0,7.0,1,1,1,24.0
4,2,1.0,0.0,2.0,2.0,,,,,,...,5.0,5.0,0.0,2.0,1.0,10.0,0,0,1,22.0
5,2,1.0,0.0,2.0,2.0,1940-03-13,Female,White,91.0,60.0,...,0.0,6.0,0.0,3.0,2.0,10.0,1,1,1,23.0
7,3,1.0,1.0,3.0,2.0,,,,,,...,7.0,4.0,6.0,1.0,2.0,5.0,0,1,1,22.0
8,3,1.0,1.0,4.0,4.0,1976-11-24,Female,Hispanic,,,...,0.0,17.0,21.0,2.0,4.0,5.0,0,1,1,17.0
10,4,1.0,0.0,0.0,1.0,,,,,,...,0.0,5.0,15.0,5.0,3.0,6.0,1,1,1,29.0
11,4,1.0,0.0,1.0,2.0,1986-04-30,Male,White,,,...,0.0,4.0,15.0,4.0,4.0,7.0,1,1,1,28.0
13,5,1.0,0.0,3.0,3.0,,,,,,...,-5.0,23.0,23.0,4.0,3.0,7.0,1,1,1,18.0
14,5,1.0,0.0,3.0,3.0,1963-12-26,,,,,...,0.0,23.0,27.0,3.0,4.0,6.0,1,1,1,18.0


In [17]:
# df_CS: independent copy of the rows with time_point == 1
df_CS = redcap.loc[redcap['time_point'].eq(1)].copy()

# df_RCT: independent copy of the rows with covid_group == 1
df_RCT = redcap.loc[redcap['covid_group'].eq(1)].copy()

# Write both subsets to CSV, leaving the row index out of the files
df_CS.to_csv('df_CS.csv', index=False)
df_RCT.to_csv('df_RCT.csv', index=False)


In [18]:
# Print all columns in df_RCT that begin with hrv
hrv_columns = [col for col in df_RCT.columns if col.startswith('hrv')]
print("Columns in df_RCT that begin with 'hrv':")
# The list itself was never printed before, so the cell showed only the header
print(hrv_columns)

Columns in df_RCT that begin with 'hrv':
