In [12]:
import pandas as pd
import numpy as np

# Import Dataset

In [13]:
# Import RedcapExport.csv as redcap
redcap = pd.read_csv('Redcap Export.csv')
# Display the first 5 rows of the dataframe
redcap.head()

Unnamed: 0,record_id,redcap_event_name,redcap_repeat_instrument,redcap_repeat_instance,subjectid,subject_referral,covid_group,interventiongroup,subject_information_complete,mmrc_score,...,pdf_outcomemeasures,pdf_complete,diary_sessions_1,diary_sessions_2,diary_sessions_3,diary_sessions_4,diary_mip_1,diary_mip_2,diary_mip_3,training_diary_complete
0,1,screening_arm_1,,,1.0,Physical Therapy,1.0,,2.0,2.0,...,,,,,,,,,,
1,1,visit_2_arm_1,,,,,,,,0.0,...,pcs01v2 measures.pdf,2.0,14.0,14.0,14.0,14.0,65.0,83.0,80.0,2.0
2,1,baseline_arm_1,,1.0,,,,,,2.0,...,pcs01v1 measures.pdf,2.0,,,,,,,,
3,2,screening_arm_1,,,2.0,Physical Therapy,1.0,,0.0,2.0,...,,,,,,,,,,
4,2,visit_2_arm_1,,,,,,,,2.0,...,pcs02v2 measures.pdf,2.0,,,,,,,,0.0


In [14]:
# Step 1. Extract covid_group and interventiongroup from 'screening_arm_1' rows
# and map them to all rows with the same record_id

# Create a small DataFrame containing just screening_arm_1 rows
screening_data = redcap[redcap['redcap_event_name'] == 'screening_arm_1']

# Build a mapping dictionary for covid_group and interventiongroup
covid_group_map = screening_data.set_index('record_id')['covid_group'].to_dict()
interventiongroup_map = screening_data.set_index('record_id')['interventiongroup'].to_dict()

# Fill missing values in covid_group and interventiongroup columns across all rows
redcap['covid_group'] = redcap.apply(
    lambda row: covid_group_map.get(row['record_id'], row['covid_group']),
    axis=1
)
redcap['interventiongroup'] = redcap.apply(
    lambda row: interventiongroup_map.get(row['record_id'], row['interventiongroup']),
    axis=1
)

# Step 2. Remove 'screening_arm_1' rows
redcap = redcap[redcap['redcap_event_name'].isin(['baseline_arm_1', 'visit_2_arm_1'])]

# Step 3. Create the time_point column
time_point_map = {
    'baseline_arm_1': 1,
    'visit_2_arm_1': 2
}
redcap['time_point'] = redcap['redcap_event_name'].map(time_point_map)

# ---------------------------------------------------------------------------- #

# NOTE: Remember to update RedCAP (FIX THIS)
# Step 4. Manually overwrite interventiongroup using your dictionary
manual_interventiongroup = {
    1:1, 2:0, 3:1, 4:0, 5:0, 6:0, 7:1, 8:1, 9:0, 10:1,
    11:1, 13:1, 14:0, 15:1, 16:0, 17:0, 18:1, 20:0, 21:1
}
redcap['interventiongroup'] = redcap['record_id'].map(manual_interventiongroup).combine_first(redcap['interventiongroup'])

# Add the name mapping (from your previous request)
name_mapping = {
    1: 'michelle', 2: 'gay', 3: 'maria', 4: 'kevin', 5: 'jimmy',
    6: 'blanca', 7: 'joyti', 8: 'mary', 9: 'monika', 10: 'carlos',
    13: 'michael', 14: 'howard', 15: 'alyssa', 16: 'alice',
    18: 'anjie', 21: 'morgan'
}
redcap['name'] = redcap['record_id'].map(name_mapping)

# ---------------------------------------------------------------------------- #

# Set subject_female
redcap = redcap.sort_values(['record_id', 'time_point'])
time_point_1_mask = redcap['time_point'] == 1
redcap.loc[time_point_1_mask, 'subject_female'] = np.where(
    redcap.loc[time_point_1_mask, 'subject_gender'] == 'Female', 1,
    np.where(redcap.loc[time_point_1_mask, 'subject_gender'] == 'Male', 0, np.nan)
)
redcap['subject_female'] = redcap.groupby('record_id')['subject_female'].ffill()


# Remove all columns where every value is NaN
redcap = redcap.dropna(axis=1, how='all')

# Drop unnecessary columns
redcap = redcap.drop(columns=['redcap_event_name', 'redcap_repeat_instance'])

# Remove all columns that end with '_complete'
redcap = redcap.loc[:, ~redcap.columns.str.endswith('_complete')]

# Remove all columns that end with '_csv'
redcap = redcap.loc[:, ~redcap.columns.str.endswith('_csv')]



In [15]:

# Export the cleaned DataFrame to a new CSV file
redcap.to_csv('cleaned_redcap_data.csv', index=False)

redcap.head(30)


Unnamed: 0,record_id,covid_group,interventiongroup,mmrc_score,pcfs_score,subject_dob,subject_gender,subject_ethnicity,health_smoking,health_history,...,diary_sessions_1,diary_sessions_2,diary_sessions_3,diary_sessions_4,diary_mip_1,diary_mip_2,diary_mip_3,time_point,name,subject_female
2,1,1.0,1.0,2.0,2.0,1992-04-16,Female,Latino,0.0,hypothyroid,...,,,,,,,,1,michelle,1.0
1,1,1.0,1.0,0.0,1.0,,,,,,...,14.0,14.0,14.0,14.0,65.0,83.0,80.0,2,michelle,1.0
5,2,1.0,0.0,2.0,2.0,1940-03-13,Female,White,0.0,lung cancer 2016,...,,,,,,,,1,gay,1.0
4,2,1.0,0.0,2.0,2.0,,,,,,...,,,,,,,,2,gay,1.0
8,3,1.0,1.0,4.0,4.0,1976-11-24,Female,Hispanic,,asthma,...,,,,,,,,1,maria,1.0
7,3,1.0,1.0,3.0,2.0,,,,,,...,14.0,14.0,14.0,14.0,89.0,108.0,120.0,2,maria,1.0
11,4,1.0,0.0,1.0,2.0,1986-04-30,Male,White,,"anxiety, depression",...,,,,,,,,1,kevin,0.0
10,4,1.0,0.0,0.0,1.0,,,,,,...,,,,,,,,2,kevin,0.0
14,5,1.0,0.0,3.0,3.0,1963-12-26,Male,White,,,...,,,,,,,,1,jimmy,0.0
13,5,1.0,0.0,3.0,3.0,,,,,,...,,,,,,,,2,jimmy,0.0


# Respiratory Strength

In [16]:
def calculate_mip_predicted1(row):
    # https://www.atsjournals.org/doi/full/10.1164/ajrccm.158.5.9712006
    try:
        age = float(row['data_age'])
        weight = float(row['data_kilograms'])
        
        # Check gender using subject_female (1=Female, 0=Male, NaN=unknown)
        if row['subject_female'] == 0:  # Male (since 0 = Male)
            return 126 - (1.028 * age) + (0.343 * weight)
        elif row['subject_female'] == 1:  # Female (since 1 = Female)
            height = float(row['data_centimeters'])  # Ensure 'height_cm' exists
            return 171 - (0.694 * age) + (0.861 * weight) - (0.743 * height)
        else:
            return None  # Unknown gender (NaN or other)
    except (KeyError, ValueError, TypeError):
        return None  # Missing data or invalid type
    

def calculate_mip_predicted2(row):
    # https://pubmed.ncbi.nlm.nih.gov/25141521/
    try:
        age = float(row['data_age'])
        weight = float(row['data_kilograms'])
        
        if row['subject_female'] == 0:  # Male
            return 124.39 - (0.91 * age) + (0.63 * weight)
        elif row['subject_female'] == 1:  # Female
            return 77.57 - (0.59 * age) + (0.62 * weight)
        else:
            return None  # Unknown gender
    except (KeyError, ValueError, TypeError):
        return None  # Missing data or invalid type
    
# Compute predicted values
redcap['mip_predicted1'] = redcap.apply(calculate_mip_predicted1, axis=1)
redcap['mip_predicted2'] = redcap.apply(calculate_mip_predicted2, axis=1)

In [17]:
# Define measures and suffixes
measures = ['mip', 'fit', 'smip', 'pif', 'sindex', 'volume']
suffixes = ['max', 'min', 'mean']

# Calculate max, min, mean for pre and post columns
for measure in measures:
    # Pre columns
    pre_cols = [f"imt_{measure}_{i}_pre" for i in [1, 2, 3]]
    if all(col in redcap.columns for col in pre_cols):
        redcap[f"{measure}_pre_max"] = redcap[pre_cols].max(axis=1, skipna=True)
        redcap[f"{measure}_pre_min"] = redcap[pre_cols].min(axis=1, skipna=True)
        redcap[f"{measure}_pre_mean"] = redcap[pre_cols].mean(axis=1, skipna=True)
    else:
        print(f"⚠️ Warning: Missing pre columns for {measure}")

    # Post columns
    post_cols = [f"imt_{measure}_{i}_post" for i in [1, 2, 3]]
    if all(col in redcap.columns for col in post_cols):
        redcap[f"{measure}_post_max"] = redcap[post_cols].max(axis=1, skipna=True)
        redcap[f"{measure}_post_min"] = redcap[post_cols].min(axis=1, skipna=True)
        redcap[f"{measure}_post_mean"] = redcap[post_cols].mean(axis=1, skipna=True)
    else:
        print(f"⚠️ Warning: Missing post columns for {measure}")

# Now calculate fatigue and fatigue percent
for measure in measures:
    for suffix in suffixes:
        pre_col = f"{measure}_pre_{suffix}"
        post_col = f"{measure}_post_{suffix}"

        if pre_col in redcap.columns and post_col in redcap.columns:
            # Fatigue: pre - post
            fatigue_col = f"{measure}_{suffix}_fatigue"
            redcap[fatigue_col] = redcap[pre_col] - redcap[post_col]

            # Fatigue Percent: (pre - post) / pre * 100
            fatigue_percent_col = f"{measure}_{suffix}_fatigue_percent"
            redcap[fatigue_percent_col] = ((redcap[pre_col] - redcap[post_col]) / redcap[pre_col]) * 100
        else:
            print(f"⚠️ Warning: Missing columns for {measure}_{suffix}")

In [18]:
# Calculate percent of predicted values for MIP only
predicted_vars = ['predicted1', 'predicted2']  # these should exist in your dataset: 'mip_predicted1', 'mip_predicted2'

for suffix in suffixes:
    for timepoint in ['pre', 'post']:
        measure_col = f"mip_{timepoint}_{suffix}"
        for pred_var in predicted_vars:
            pred_col = f"mip_{pred_var}"
            percentpredict_col = f"mip_{timepoint}_{suffix}_percentpredict_{pred_var[-1]}"
            
            if measure_col in redcap.columns and pred_col in redcap.columns:
                redcap[percentpredict_col] = redcap[measure_col] / redcap[pred_col]
            else:
                print(f"⚠️ Warning: Missing columns for {measure_col} or {pred_col}")


In [19]:
redcap[['record_id', 'name', 'mip_pre_max', 'mip_post_max', 'mip_predicted1', 'mip_predicted2', 'mip_pre_max_percentpredict_1', 'mip_pre_max_percentpredict_2', 'mip_post_max_percentpredict_1', 'mip_post_max_percentpredict_2']].head(30)

Unnamed: 0,record_id,name,mip_pre_max,mip_post_max,mip_predicted1,mip_predicted2,mip_pre_max_percentpredict_1,mip_pre_max_percentpredict_2,mip_post_max_percentpredict_1,mip_post_max_percentpredict_2
2,1,michelle,52.0,31.0,96.834,105.81,0.537001,0.491447,0.320135,0.292978
1,1,michelle,106.0,94.0,91.1068,103.826,1.16347,1.020939,1.031756,0.905361
5,2,gay,49.0,45.0,47.1697,68.992,1.038802,0.710227,0.954002,0.65225
4,2,gay,47.0,56.0,47.4162,69.116,0.991222,0.680016,1.181031,0.810232
8,3,maria,69.0,73.0,116.469,119.682,0.592432,0.576528,0.626776,0.60995
7,3,maria,136.0,128.0,114.9581,118.38,1.18304,1.148843,1.113449,1.081264
11,4,kevin,56.0,47.0,118.2519,147.329,0.473565,0.380102,0.397457,0.319014
10,4,kevin,71.0,66.0,119.3152,149.282,0.595062,0.47561,0.553157,0.442116
14,5,jimmy,36.0,29.0,89.6687,117.327,0.401478,0.306835,0.323413,0.247172
13,5,jimmy,48.0,43.0,88.3996,114.996,0.542989,0.417406,0.486428,0.373926


In [20]:
# Test W
test_row = {
    'data_age': 31,
    'subject_female': 1,
    'data_kilograms': 63.5,
    'data_centimeters': 157.5
}

# Assuming you've already defined calculate_mip_predicted1 and calculate_mip_predicted2 as above:

predicted1 = calculate_mip_predicted1(test_row)
predicted2 = calculate_mip_predicted2(test_row)

print(f"MIP Predicted 1: {predicted1:.2f} cmH2O")
print(f"MIP Predicted 2: {predicted2:.2f} cmH2O")


MIP Predicted 1: 87.14 cmH2O
MIP Predicted 2: 98.65 cmH2O


In [21]:
redcap.head()

Unnamed: 0,record_id,covid_group,interventiongroup,mmrc_score,pcfs_score,subject_dob,subject_gender,subject_ethnicity,health_smoking,health_history,...,mip_post_max_percentpredict_1,mip_post_max_percentpredict_2,mip_pre_min_percentpredict_1,mip_pre_min_percentpredict_2,mip_post_min_percentpredict_1,mip_post_min_percentpredict_2,mip_pre_mean_percentpredict_1,mip_pre_mean_percentpredict_2,mip_post_mean_percentpredict_1,mip_post_mean_percentpredict_2
2,1,1.0,1.0,2.0,2.0,1992-04-16,Female,Latino,0.0,hypothyroid,...,0.320135,0.292978,0.47504,0.434742,0.289155,0.264625,0.499136,0.456794,0.309809,0.283527
1,1,1.0,1.0,0.0,1.0,,,,,,...,1.031756,0.905361,0.834186,0.731994,0.812234,0.712731,1.05005,0.921413,0.940288,0.825098
5,2,1.0,0.0,2.0,2.0,1940-03-13,Female,White,0.0,lung cancer 2016,...,0.954002,0.65225,0.826802,0.565283,0.848002,0.579777,0.961069,0.657081,0.911602,0.623261
4,2,1.0,0.0,2.0,2.0,,,,,,...,1.181031,0.810232,0.695965,0.477458,0.927953,0.636611,0.822504,0.564269,1.033402,0.708953
8,3,1.0,1.0,4.0,4.0,1976-11-24,Female,Hispanic,,asthma,...,0.626776,0.60995,0.334853,0.325864,0.463643,0.451196,0.437885,0.426129,0.535192,0.520825


# CPET

## Predicted VO2 Max

In [22]:
# Convert kilograms to pounds
redcap['data_pounds'] = redcap['data_kilograms'] * 2.20462


# https://www.sciencedirect.com/science/article/abs/pii/S0033062017300476 (also https://www.ajconline.org/article/S0002-9149(17)30873-1/abstract)
redcap['cpet_vo2max_predicted'] = (
    79.9
    - (0.39 * redcap['data_age'])
    - (13.7 * redcap['subject_female'])
    - (0.127 * redcap['data_pounds'])
)

# Percent of predicted VO2 max
redcap['cpet_vo2max_percentpredicted'] = (
    redcap['cpet_vo2peak_relative'] / redcap['cpet_vo2max_predicted']
) * 100


In [23]:
redcap[['record_id', 'name', 'cpet_vo2peak_relative', 'cpet_vo2max_predicted', 'cpet_vo2max_percentpredicted']].head(30)

Unnamed: 0,record_id,name,cpet_vo2peak_relative,cpet_vo2max_predicted,cpet_vo2max_percentpredicted
2,1,michelle,25.0,32.441008,77.062957
1,1,michelle,29.8,33.336965,89.39026
5,2,gay,14.5,14.932876,97.101185
4,2,gay,14.1,14.876879,94.777943
8,3,maria,13.3,15.673506,84.856571
7,3,maria,16.4,16.261478,100.851838
11,4,kevin,31.9,39.517211,80.724321
10,4,kevin,29.3,38.649252,75.810006
14,5,jimmy,19.0,34.57902,54.946613
13,5,jimmy,21.1,35.614971,59.244749


In [24]:
redcap[['record_id', 'name', 'mip_pre_max_percentpredict_1']].head(30)

Unnamed: 0,record_id,name,mip_pre_max_percentpredict_1
2,1,michelle,0.537001
1,1,michelle,1.16347
5,2,gay,1.038802
4,2,gay,0.991222
8,3,maria,0.592432
7,3,maria,1.18304
11,4,kevin,0.473565
10,4,kevin,0.595062
14,5,jimmy,0.401478
13,5,jimmy,0.542989


# FMD

In [25]:
# Predicted FMD values: https://pubmed.ncbi.nlm.nih.gov/35709326/

# Subjective Questionnaires

## Woods Mental Fatigue Inventory

In [26]:
# List of columns to sum
woods_columns = [
    'woods_concentration', 'woods_decisions', 'woods_confusion',
    'woods_memory', 'woods_words', 'woods_takethingsin',
    'woods_processingspeed', 'woods_thoughtsmixed', 'woods_muzzy'
]

# Create the new column 'woods_sum'
redcap['woods_sum'] = redcap[woods_columns].sum(axis=1)

# Display mean, min, max, and std of the 'woods_sum' column
print("Woods Sum Statistics:")
print(f"Mean: {redcap['woods_sum'].mean()}")
print(f"Min: {redcap['woods_sum'].min()}")
print(f"Max: {redcap['woods_sum'].max()}")
print(f"Std: {redcap['woods_sum'].std()}")


Woods Sum Statistics:
Mean: 15.36111111111111
Min: 0.0
Max: 31.0
Std: 9.103069428268304


## Fatigue Severity Scale (FSS)

In [27]:
# List of columns to sum
fss_columns = ['fss_motivation', 'fss_exercise', 'fss_easily', 'fss_functioning',
       'fss_problems', 'fss_sustained', 'fss_duties', 'fss_disabling',
       'fss_social']

# Create the new column 'fss_sum'
redcap['fss_sum'] = redcap[fss_columns].sum(axis=1)

# Create a new column fss_dichotomous if fss_sum is greater than 36
redcap['fss_dichotomous'] = redcap['fss_sum'].apply(lambda x: 1 if x > 36 else 0)

# Print the mean, min, max, and std of the 'fss_sum' column
print("FSS Sum Statistics:")
print(f"Mean: {redcap['fss_sum'].mean()}")
print(f"Min: {redcap['fss_sum'].min()}")
print(f"Max: {redcap['fss_sum'].max()}")
print(f"Std: {redcap['fss_sum'].std()}")

FSS Sum Statistics:
Mean: 37.55555555555556
Min: 10.0
Max: 63.0
Std: 17.332783874075073


## PEM (DSQ)

In [28]:
# https://pmc.ncbi.nlm.nih.gov/articles/PMC6165517/
# A frequency and severity score of 2, 2 on any items 1–5 is indicative of PEM.

# Lists of columns
freq_cols = [
    'dsq_heavy_freq', 'dsq_nextday_freq', 'dsq_mentallytired_freq',
    'dsq_minexercise_freq', 'dsq_drained_freq'
]

severity_cols = [
    'dsq_heavy_severity', 'dsq_nextday_severity', 'dsq_mentallytired_severity',
    'dsq_minexercise_severity', 'dsq_drained_severity'
]

# Sum of frequency
redcap['dsq_freq_sum'] = redcap[freq_cols].sum(axis=1)

# Sum of severity
redcap['dsq_severity_sum'] = redcap[severity_cols].sum(axis=1)

# Sum of both frequency and severity
redcap['dsq_sum'] = redcap['dsq_freq_sum'] + redcap['dsq_severity_sum']

# Dichotomous indicator (1 if any item has freq >=2 AND severity >=2)
def compute_dsq_dichotomous(row):
    for freq_col, sev_col in zip(freq_cols, severity_cols):
        if (row[freq_col] >= 2) and (row[sev_col] >= 2):
            return 1
    return 0

redcap['dsq_dichotomous'] = redcap.apply(compute_dsq_dichotomous, axis=1)

# Print the frequency counts for dsq_dichotomous
print("DSQ Dichotomous Frequency Counts:")
print(redcap['dsq_dichotomous'].value_counts())

# Print the mean, min, max, and std of the 'dsq_sum' column
print("DSQ Sum Statistics:")
print(f"Mean: {redcap['dsq_sum'].mean()}")
print(f"Min: {redcap['dsq_sum'].min()}")
print(f"Max: {redcap['dsq_sum'].max()}")



DSQ Dichotomous Frequency Counts:
dsq_dichotomous
1    21
0    15
Name: count, dtype: int64
DSQ Sum Statistics:
Mean: 14.36111111111111
Min: 0.0
Max: 38.0


  redcap['dsq_freq_sum'] = redcap[freq_cols].sum(axis=1)
  redcap['dsq_severity_sum'] = redcap[severity_cols].sum(axis=1)
  redcap['dsq_sum'] = redcap['dsq_freq_sum'] + redcap['dsq_severity_sum']
  redcap['dsq_dichotomous'] = redcap.apply(compute_dsq_dichotomous, axis=1)


## Pittsburgh Sleep Index (PSQI)

In [29]:
# Keep the raw Q2 latency in a separate column (assuming it's the original minutes)
# If 'psqi_latency' currently holds raw minutes, just rename it to 'psqi_latency_raw'
redcap['psqi_latency_raw'] = redcap['psqi_latency']

# Recode Q2 (raw latency in minutes) into 0-3 scale
def recode_q2(x):
    if pd.isnull(x):
        return np.nan
    elif x < 15:
        return 0
    elif 15 <= x <= 30:
        return 1
    elif 31 <= x <= 60:
        return 2
    else:  # > 60 minutes
        return 3

redcap['psqi_latency_q2'] = redcap['psqi_latency_raw'].apply(recode_q2)

# Q5a is already coded as 0-3, keep it as is in 'psqi_latency30'
redcap['psqi_latency_q5a'] = redcap['psqi_latency30']

# Sum the two recoded latency subscores
redcap['psqi_latency_sum'] = redcap[['psqi_latency_q2', 'psqi_latency_q5a']].sum(axis=1)

# Map the summed value into the final component score
def map_latency_component(x):
    if pd.isnull(x):
        return np.nan
    elif x == 0:
        return 0
    elif 1 <= x <= 2:
        return 1
    elif 3 <= x <= 4:
        return 2
    else:  # 5-6
        return 3

redcap['psqi_latency'] = redcap['psqi_latency_sum'].apply(map_latency_component)

# 3. Sleep Duration
def score_sleep_duration(hours):
    if hours >= 7:
        return 0
    elif 6 <= hours < 7:
        return 1
    elif 5 <= hours < 6:
        return 2
    else:
        return 3

redcap['psqi_duration'] = redcap['psqi_hours'].apply(score_sleep_duration)

# 4. Sleep Efficiency
def hhmm_to_decimal(time_val):
    """
    Converts HHMM integer to decimal hours.
    E.g. 2230 -> 22 + 30/60 = 22.5
    """
    if pd.isnull(time_val):
        return np.nan
    hours = time_val // 100
    minutes = time_val % 100
    return hours + minutes / 60.0

# --- Convert psqi_sleepstart and psqi_sleepend to decimal hours ---
redcap['sleepstart_decimal'] = redcap['psqi_sleepstart'].apply(hhmm_to_decimal)
redcap['sleepend_decimal'] = redcap['psqi_sleepend'].apply(hhmm_to_decimal)

# --- Calculate Time in Bed (handling overnight shifts) ---
def calculate_time_in_bed(start, end):
    if pd.isnull(start) or pd.isnull(end):
        return np.nan
    time_in_bed = end - start
    if time_in_bed <= 0:
        time_in_bed += 24
    return time_in_bed

redcap['time_in_bed'] = redcap.apply(
    lambda row: calculate_time_in_bed(row['sleepstart_decimal'], row['sleepend_decimal']), axis=1
)

# --- Calculate Sleep Efficiency ---
redcap['sleep_efficiency'] = (redcap['psqi_hours'] / redcap['time_in_bed']) * 100

# --- Score PSQI Component 4: Habitual Sleep Efficiency ---
def score_sleep_efficiency(efficiency):
    if pd.isnull(efficiency):
        return np.nan
    elif efficiency >= 85:
        return 0
    elif 75 <= efficiency < 85:
        return 1
    elif 65 <= efficiency < 75:
        return 2
    else:
        return 3

redcap['psqi_efficiency'] = redcap['sleep_efficiency'].apply(score_sleep_efficiency)

# 5. Sleep Disturbances
disturbance_items = [
    'psqi_wake', 'psqi_bathroom', 'psqi_breathe', 'psqi_snore',
    'psqi_cold', 'psqi_hot', 'psqi_dreams', 'psqi_pain', 'psqi_other'
]

# TEMPORARY FIX: NEED TO REMOVE LATER AND FIX DATA ENTRY
redcap['psqi_other'] = pd.to_numeric(redcap['psqi_other'], errors='coerce').fillna(0)

redcap['psqi_disturbances_raw'] = redcap[disturbance_items].sum(axis=1)

redcap['psqi_disturbances'] = pd.cut(
    redcap['psqi_disturbances_raw'],
    bins=[-1, 0, 9, 18, 27],  # Adjusted bins
    labels=[0, 1, 2, 3]
).astype(int)


# 6. Use of Sleep Medications
redcap['psqi_medication'] = redcap['psqi_medicine']

# 7. Daytime Dysfunction
redcap['psqi_dysfunction_raw'] = redcap[['psqi_sleepy', 'psqi_enthusiasm']].sum(axis=1)
redcap['psqi_dysfunction'] = pd.cut(
    redcap['psqi_dysfunction_raw'],
    bins=[-1, 0, 2, 4, 6],
    labels=[0, 1, 2, 3]
).astype(int)

# 8. Calculate PSQI Sum
component_cols = [
    'psqi_quality', 'psqi_latency', 'psqi_duration', 
    'psqi_efficiency', 'psqi_disturbances', 
    'psqi_medication', 'psqi_dysfunction'
]
redcap['psqi_sum'] = redcap[component_cols].sum(axis=1)

# 9. Dichotomous classification: 0 = Good sleep, 1 = Poor sleep
redcap['psqi_dichotomous'] = (redcap['psqi_sum'] > 5).astype(int)


# List of all component columns plus the sum
components = [
    'psqi_quality', 'psqi_latency', 'psqi_duration', 'psqi_efficiency',
    'psqi_disturbances', 'psqi_medication', 'psqi_dysfunction', 'psqi_sum'
]

# Summary stats: mean, min, max
summary_stats = redcap[components].agg(['mean', 'min', 'max'])
print("Summary statistics for PSQI components and sum:")
print(summary_stats)

# Frequency counts for the dichotomous variable
dichotomous_counts = redcap['psqi_dichotomous'].value_counts(dropna=False)
print("\nFrequency counts for psqi_dichotomous:")
print(dichotomous_counts)


Summary statistics for PSQI components and sum:
      psqi_quality  psqi_latency  psqi_duration  psqi_efficiency  \
mean      1.305556      1.416667       0.972222         0.828571   
min       0.000000      0.000000       0.000000         0.000000   
max       3.000000      3.000000       3.000000         3.000000   

      psqi_disturbances  psqi_medication  psqi_dysfunction   psqi_sum  
mean           1.472222         0.916667          1.083333   7.972222  
min            0.000000         0.000000          0.000000   1.000000  
max            3.000000         3.000000          3.000000  18.000000  

Frequency counts for psqi_dichotomous:
psqi_dichotomous
1    24
0    12
Name: count, dtype: int64


  redcap['psqi_latency_raw'] = redcap['psqi_latency']
  redcap['psqi_latency_q2'] = redcap['psqi_latency_raw'].apply(recode_q2)
  redcap['psqi_latency_q5a'] = redcap['psqi_latency30']
  redcap['psqi_latency_sum'] = redcap[['psqi_latency_q2', 'psqi_latency_q5a']].sum(axis=1)
  redcap['psqi_duration'] = redcap['psqi_hours'].apply(score_sleep_duration)
  redcap['sleepstart_decimal'] = redcap['psqi_sleepstart'].apply(hhmm_to_decimal)
  redcap['sleepend_decimal'] = redcap['psqi_sleepend'].apply(hhmm_to_decimal)
  redcap['time_in_bed'] = redcap.apply(
  redcap['sleep_efficiency'] = (redcap['psqi_hours'] / redcap['time_in_bed']) * 100
  redcap['psqi_efficiency'] = redcap['sleep_efficiency'].apply(score_sleep_efficiency)
  redcap['psqi_disturbances_raw'] = redcap[disturbance_items].sum(axis=1)
  redcap['psqi_disturbances'] = pd.cut(
  redcap['psqi_medication'] = redcap['psqi_medicine']
  redcap['psqi_dysfunction_raw'] = redcap[['psqi_sleepy', 'psqi_enthusiasm']].sum(axis=1)
  redcap['psqi_dysf

## BDI-TDI

In [30]:
# Compute BDI Score
redcap['bdi_sum'] = redcap[['bdi_functional', 'bdi_task', 'bdi_effort']].sum(axis=1)

# Compute TDI Score
redcap['tdi_sum'] = redcap[['tdi_functional', 'tdi_task', 'tdi_effort']].sum(axis=1)


  redcap['bdi_sum'] = redcap[['bdi_functional', 'bdi_task', 'bdi_effort']].sum(axis=1)
  redcap['tdi_sum'] = redcap[['tdi_functional', 'tdi_task', 'tdi_effort']].sum(axis=1)


## ODI / NDI

In [31]:
# Sum ODI columns
odi_cols = [
    'odi_intensity', 'odi_personalcare', 'odi_lifting', 'odi_walking',
    'odi_sitting', 'odi_standing', 'odi_sleeping', 'odi_sex',
    'odi_social', 'odi_traveling'
]
redcap['odi_sum'] = redcap[odi_cols].sum(axis=1, skipna=True)

# Sum NDI columns
ndi_cols = [
    'ndi_intensity', 'ndi_personalcare', 'ndi_lifting', 'ndi_work',
    'ndi_headaches', 'ndi_concentration', 'ndi_sleeping',
    'ndi_driving', 'ndi_reading', 'ndi_recreation'
]
redcap['ndi_sum'] = redcap[ndi_cols].sum(axis=1, skipna=True)


  redcap['odi_sum'] = redcap[odi_cols].sum(axis=1, skipna=True)
  redcap['ndi_sum'] = redcap[ndi_cols].sum(axis=1, skipna=True)


## Psychology Questionnaire

In [32]:
# Define the columns for each measure
anxiety_cols = ['gad_anxious', 'gad_worrying']
depression_cols = ['phq_hopeless', 'phq_anhedonia']
ptsd_cols = ['ptsd_nightmares', 'ptsd_intrusive', 'ptsd_startled', 'ptsd_detached', 'ptsd_guilty']

# Calculate sum scores
redcap['anxiety_sum'] = redcap[anxiety_cols].sum(axis=1, skipna=True)
redcap['depression_sum'] = redcap[depression_cols].sum(axis=1, skipna=True)
redcap['ptsd_sum'] = redcap[ptsd_cols].sum(axis=1, skipna=True)

# Create dichotomous variables
# PTSD-5: 3 most sensitive, 5 most specific, 4 most efficient https://pmc.ncbi.nlm.nih.gov/articles/PMC5023594/ 
# PHq-2: >=2 --> https://pubmed.ncbi.nlm.nih.gov/33026888/, https://jamanetwork.com/journals/jama/fullarticle/2766865
# GAD-2: >=3 --> https://www.sciencedirect.com/science/article/abs/pii/S0003999318303903, https://www.sciencedirect.com/science/article/abs/pii/S0163834315002406, https://pmc.ncbi.nlm.nih.gov/articles/PMC6163062/, https://pmc.ncbi.nlm.nih.gov/articles/PMC7306644/
redcap['anxiety_dichotomous'] = (redcap['anxiety_sum'] >= 3).astype(int)
redcap['depression_dichotomous'] = (redcap['depression_sum'] >= 2).astype(int)
redcap['ptsd_dichotomous'] = (redcap['ptsd_sum'] >= 3).astype(int)


  redcap['anxiety_sum'] = redcap[anxiety_cols].sum(axis=1, skipna=True)
  redcap['depression_sum'] = redcap[depression_cols].sum(axis=1, skipna=True)
  redcap['ptsd_sum'] = redcap[ptsd_cols].sum(axis=1, skipna=True)
  redcap['anxiety_dichotomous'] = (redcap['anxiety_sum'] >= 3).astype(int)
  redcap['depression_dichotomous'] = (redcap['depression_sum'] >= 2).astype(int)
  redcap['ptsd_dichotomous'] = (redcap['ptsd_sum'] >= 3).astype(int)


## SF-PA

In [33]:
# Define the columns for SFPA
sfpa_cols = [
    'sfpa_vigorous', 'sfpa_moderate', 'sfpa_lifting',
    'sfpa_stairs2', 'sfpa_stairs1', 'sfpa_stooping',
    'sfpa_walkingmile', 'sfpa_walkingblocks2',
    'sfpa_walkingblocks1', 'sfpa_bathingdress'
]

# Calculate the sum score
redcap['sfpa_sum'] = redcap[sfpa_cols].sum(axis=1, skipna=True)

  redcap['sfpa_sum'] = redcap[sfpa_cols].sum(axis=1, skipna=True)


# Export

In [34]:
redcap.head(30)

Unnamed: 0,record_id,covid_group,interventiongroup,mmrc_score,pcfs_score,subject_dob,subject_gender,subject_ethnicity,health_smoking,health_history,...,tdi_sum,odi_sum,ndi_sum,anxiety_sum,depression_sum,ptsd_sum,anxiety_dichotomous,depression_dichotomous,ptsd_dichotomous,sfpa_sum
2,1,1.0,1.0,2.0,2.0,1992-04-16,Female,Latino,0.0,hypothyroid,...,0.0,7.0,9.0,3.0,3.0,7.0,1,1,1,24.0
1,1,1.0,1.0,0.0,1.0,,,,,,...,9.0,4.0,7.0,2.0,0.0,7.0,0,0,1,27.0
5,2,1.0,0.0,2.0,2.0,1940-03-13,Female,White,0.0,lung cancer 2016,...,0.0,6.0,0.0,3.0,2.0,10.0,1,1,1,23.0
4,2,1.0,0.0,2.0,2.0,,,,,,...,5.0,5.0,0.0,2.0,1.0,10.0,0,0,1,22.0
8,3,1.0,1.0,4.0,4.0,1976-11-24,Female,Hispanic,,asthma,...,0.0,17.0,21.0,2.0,4.0,5.0,0,1,1,17.0
7,3,1.0,1.0,3.0,2.0,,,,,,...,7.0,4.0,6.0,1.0,2.0,5.0,0,1,1,22.0
11,4,1.0,0.0,1.0,2.0,1986-04-30,Male,White,,"anxiety, depression",...,0.0,4.0,15.0,4.0,4.0,7.0,1,1,1,28.0
10,4,1.0,0.0,0.0,1.0,,,,,,...,0.0,5.0,15.0,5.0,3.0,6.0,1,1,1,29.0
14,5,1.0,0.0,3.0,3.0,1963-12-26,Male,White,,,...,0.0,23.0,27.0,3.0,4.0,6.0,1,1,1,18.0
13,5,1.0,0.0,3.0,3.0,,,,,,...,-5.0,23.0,23.0,4.0,3.0,7.0,1,1,1,18.0


In [35]:
# Create DataFrame for time_point = 1
df_CS = redcap[redcap['time_point'] == 1].copy()

# Create DataFrame for Covid_group = 1
df_RCT = redcap[redcap['covid_group'] == 1].copy()

# Export to CSV files
df_CS.to_csv('df_CS.csv', index=False)
df_RCT.to_csv('df_RCT.csv', index=False)


In [36]:
# Print the mean scores of the following variables grouped by interventiongroup for time_point = 1:
variables_to_mean = [
    'mip_pre_max', 'mip_post_max', 'mip_predicted1', 'mip_predicted2',
    'mip_pre_max_percentpredict_1', 'mip_pre_max_percentpredict_2',
    'mip_post_max_percentpredict_1', 'mip_post_max_percentpredict_2',
    'mmrc_score', 'data_age'
]

# Group by interventiongroup and calculate mean for each variable (only include time_point = 1)
temp = redcap[redcap['time_point'] == 1]  # Filter for time_point = 1
mean_scores = temp.groupby('interventiongroup')[variables_to_mean].mean().reset_index()
# Print the mean scores DataFrame
print("Mean Scores by Intervention Group:")
print(mean_scores)

Mean Scores by Intervention Group:
   interventiongroup  mip_pre_max  mip_post_max  mip_predicted1  \
0                0.0         42.0     42.857143       77.229057   
1                1.0         62.8     61.700000       96.154400   

   mip_predicted2  mip_pre_max_percentpredict_1  mip_pre_max_percentpredict_2  \
0      100.743857                      0.582006                      0.428028   
1      112.565200                      0.659478                      0.558729   

   mip_post_max_percentpredict_1  mip_post_max_percentpredict_2  mmrc_score  \
0                       0.589917                       0.441567    2.142857   
1                       0.640868                       0.542852    1.800000   

    data_age  
0  57.428571  
1  38.800000  
