In [28]:
import pandas as pd
# Display options only change how frames are rendered in the notebook, not the data.
# NOTE(review): with both limits set to None, displaying `redcap` prints every row and
# column; prefer .head() or pd.option_context(...) where a full dump is not needed.
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)  # Show all rows


import numpy as np

# Import Dataset

In [29]:
# Load the raw REDCap export
redcap = pd.read_csv('Redcap Export.csv')

# Step 1. Extract covid_group and interventiongroup from 'screening_arm_1' rows
# and map them to all rows with the same record_id

# Create a small DataFrame containing just screening_arm_1 rows
screening_data = redcap[redcap['redcap_event_name'] == 'screening_arm_1']

# Build a mapping dictionary for covid_group and interventiongroup
covid_group_map = screening_data.set_index('record_id')['covid_group'].to_dict()
interventiongroup_map = screening_data.set_index('record_id')['interventiongroup'].to_dict()

# Fill covid_group and interventiongroup across all rows from the screening values.
# Vectorised lookup: the screening value wins when it is present; otherwise the row keeps
# its own value (a missing screening value no longer overwrites existing data with NaN).
redcap['covid_group'] = (
    redcap['record_id'].map(covid_group_map).fillna(redcap['covid_group'])
)
redcap['interventiongroup'] = (
    redcap['record_id'].map(interventiongroup_map).fillna(redcap['interventiongroup'])
)

# Step 2. Remove 'screening_arm_1' rows (keep only the two study visits).
# .copy() makes the result an independent frame so the column assignments below
# do not write into a slice of the raw export (avoids SettingWithCopyWarning).
redcap = redcap[redcap['redcap_event_name'].isin(['baseline_arm_1', 'visit_2_arm_1'])].copy()

# Step 3. Create the time_point column (1 = baseline, 2 = visit 2)
time_point_map = {
    'baseline_arm_1': 1,
    'visit_2_arm_1': 2
}
redcap['time_point'] = redcap['redcap_event_name'].map(time_point_map)

# ---------------------------------------------------------------------------- #

# NOTE: Remember to update RedCAP (FIX THIS)
# Step 4. Manually overwrite interventiongroup using the dictionary below.
# Records that are not listed keep the value obtained in Step 1.
manual_interventiongroup = {
    1:1, 2:0, 3:1, 4:0, 5:0, 6:0, 7:1, 8:1, 9:0, 10:1,
    13:1, 14:0, 15:0, 16:0, 17:0, 18:1, 20:0, 21:1, 25:1, 29:1
}
redcap['interventiongroup'] = redcap['record_id'].map(manual_interventiongroup).combine_first(redcap['interventiongroup'])

# Attach a name to every record_id.
# NOTE(review): these look like participant names (PII) hard-coded in the notebook and
# written to every exported CSV -- confirm this is permitted, or load the mapping from a
# protected file and drop the column before sharing.
name_mapping = {
    1: 'michelle', 2: 'gay', 3: 'maria', 4: 'kevin', 5: 'jimmy',
    6: 'blanca', 7: 'joyti', 8: 'mary', 9: 'monika', 10: 'carlos', 11: 'marty',
    12: 'bri', 13: 'michael', 14: 'howard', 15: 'alyssa', 16: 'alice', 17: 'tatum',
    18: 'anjie', 19: 'shavonne', 20: 'sam', 21: 'morgan', 22: 'michael', 23: 'margaret',
    24: 'kloe', 25: 'wendy', 26: 'ethel', 27: 'conor', 28: 'chris', 29: 'jay', 30: 'nancy h'
}
redcap['name'] = redcap['record_id'].map(name_mapping)

# ---------------------------------------------------------------------------- #

# Remove all columns where every value is NaN
redcap = redcap.dropna(axis=1, how='all')

# Drop unnecessary columns. errors='ignore' because the all-NaN drop above may already
# have removed 'redcap_repeat_instance', which would otherwise raise a KeyError here.
redcap = redcap.drop(columns=['redcap_event_name', 'redcap_repeat_instance'], errors='ignore')

# Remove all columns that end with '_complete'
redcap = redcap.loc[:, ~redcap.columns.str.endswith('_complete')]

# Remove all columns that end with '_csv'
redcap = redcap.loc[:, ~redcap.columns.str.endswith('_csv')]

# Export the cleaned DataFrame to a new CSV file
redcap.to_csv('cleaned_redcap_data.csv', index=False)


# Demographics

## Gender

In [30]:
# Derive subject_female (1 = 'Female', 0 = 'Male', NaN for anything else) from the
# time_point 1 rows, then carry it forward to the later rows of the same record_id.
redcap = redcap.sort_values(['record_id', 'time_point'])
time_point_1_mask = redcap['time_point'] == 1
baseline_gender = redcap.loc[time_point_1_mask, 'subject_gender']
redcap.loc[time_point_1_mask, 'subject_female'] = np.select(
    [(baseline_gender == 'Female').to_numpy(), (baseline_gender == 'Male').to_numpy()],
    [1, 0],
    default=np.nan,
)
# Rows are sorted by time_point within record_id, so ffill copies the baseline value forward.
redcap['subject_female'] = redcap.groupby('record_id')['subject_female'].ffill()


## Race

In [31]:
import pandas as pd
import numpy as np

# Step 1 — Propagate each participant's recorded ethnicity to all of their rows
def _fill_within_record(values):
    """Forward-fill, then back-fill, the values belonging to one record_id."""
    return values.ffill().bfill()


ethnicity_by_record = redcap.groupby('record_id')['subject_ethnicity']
redcap['subject_ethnicity'] = ethnicity_by_record.transform(_fill_within_record)

# Step 2 — Mapping function for categories
def map_ethnicity(value):
    """Map a free-text race/ethnicity entry to one of the reporting categories.

    Matching is case-insensitive substring matching, evaluated in a fixed order
    (the first hit wins):

    1. missing value                                   -> 'Unknown'
    2. contains ',', 'more than' or ' and '            -> 'More than One Race'
    3. 'hispanic' / 'latino' / 'latina'                -> 'Hispanic or Latino'
    4. 'american indian' / 'alaska'                    -> 'American Indian/Alaskan Native'
    5. 'caucasian'                                     -> 'White'
    6. 'filipino' / 'indian' / 'asian'                 -> 'Asian'
    7. 'pacific islander' / 'hawaiian'                 -> 'Native Hawaiian or Pacific Islander'
    8. 'black' / 'african'                             -> 'Black or African American'
    9. 'white'                                         -> 'White'
    10. anything else                                  -> 'Unknown'

    Parameters
    ----------
    value : object
        Raw cell value; non-string values are converted with ``str()``.

    Returns
    -------
    str
        One of the category labels listed above.
    """
    if pd.isna(value):
        return 'Unknown'
    # Coerce instead of calling .strip() directly so a numeric cell does not raise.
    val = str(value).strip().lower()

    # Check multi-race first
    if ',' in val or 'more than' in val or ' and ' in val:
        return 'More than One Race'

    # Single-race checks. Order matters:
    #  * 'american indian' must be tested before the bare 'indian' keyword;
    #  * 'caucasian' must be tested before 'asian', because the string
    #    'caucasian' contains 'asian' and would otherwise be classified as Asian;
    #  * 'alaska' (rather than 'alaskan') also matches the wording 'Alaska Native'.
    # NOTE(review): negated entries such as 'Non-Hispanic' still match 'hispanic';
    # confirm no such wording occurs in the export.
    keyword_categories = (
        (('hispanic', 'latino', 'latina'), 'Hispanic or Latino'),
        (('american indian', 'alaska'), 'American Indian/Alaskan Native'),
        (('caucasian',), 'White'),
        (('filipino', 'indian', 'asian'), 'Asian'),
        (('pacific islander', 'hawaiian'), 'Native Hawaiian or Pacific Islander'),
        (('black', 'african'), 'Black or African American'),
        (('white',), 'White'),
    )
    for keywords, category in keyword_categories:
        if any(keyword in val for keyword in keywords):
            return category
    return 'Unknown'

# Categorise every row's ethnicity text
redcap['subject_ethnicity_categorized'] = redcap['subject_ethnicity'].map(map_ethnicity)

# Step 3 — Cross-tabulate category by gender, with rows in a fixed reporting order
categories_order = [
    'Hispanic or Latino',
    'American Indian/Alaskan Native',
    'Asian',
    'Native Hawaiian or Pacific Islander',
    'Black or African American',
    'White',
    'More than One Race',
    'Unknown'
]

# Count of rows per (category, gender) pair.
# NOTE(review): this counts rows, not participants; it matches one-per-participant only
# if subject_gender is filled on a single row per record_id -- confirm.
ethnicity_by_gender = redcap.pivot_table(
    index='subject_ethnicity_categorized',
    columns='subject_gender',
    aggfunc='size',
    fill_value=0
)

# Reindexing adds the categories nobody selected (as NaN), hence the fill + integer cast.
final_table = ethnicity_by_gender.reindex(categories_order)
final_table = final_table.fillna(0).astype(int)
final_table = final_table.rename_axis(index=None, columns=None)

print(final_table)


                                     Female  Male
Hispanic or Latino                        4     1
American Indian/Alaskan Native            0     0
Asian                                     3     1
Native Hawaiian or Pacific Islander       0     0
Black or African American                 3     1
White                                     6     8
More than One Race                        3     0
Unknown                                   0     0


# Respiratory Strength

In [32]:
def calculate_mip_predicted1(row):
    """Predicted MIP from age, weight and (for females) height.

    Equation source: https://www.atsjournals.org/doi/full/10.1164/ajrccm.158.5.9712006

    Reads 'data_age', 'data_kilograms', 'subject_female' (0 = male, 1 = female) and,
    for females only, 'data_centimeters' from ``row``.

    Returns the predicted value as a float, or None when the sex code is neither
    0 nor 1 or when a required field is missing or cannot be converted to float.
    """
    try:
        age = float(row['data_age'])
        weight = float(row['data_kilograms'])
        sex_code = row['subject_female']

        if sex_code == 0:  # male
            return 126 - (1.028 * age) + (0.343 * weight)
        if sex_code == 1:  # female; height is only needed by this equation
            height = float(row['data_centimeters'])
            return 171 - (0.694 * age) + (0.861 * weight) - (0.743 * height)
    except (KeyError, ValueError, TypeError):
        return None  # missing data or invalid type
    return None  # unknown sex (NaN or any other code)
    

def calculate_mip_predicted2(row):
    """Predicted MIP from age and weight, with sex-specific coefficients.

    Equation source: https://pubmed.ncbi.nlm.nih.gov/25141521/

    Reads 'data_age', 'data_kilograms' and 'subject_female' (0 = male, 1 = female)
    from ``row``.

    Returns the predicted value as a float, or None when the sex code is neither
    0 nor 1 or when a field is missing or cannot be converted to float.
    """
    try:
        age = float(row['data_age'])
        weight = float(row['data_kilograms'])
        sex_code = row['subject_female']

        if sex_code == 0:  # male
            return 124.39 - (0.91 * age) + (0.63 * weight)
        if sex_code == 1:  # female
            return 77.57 - (0.59 * age) + (0.62 * weight)
    except (KeyError, ValueError, TypeError):
        return None  # missing data or invalid type
    return None  # unknown sex

    
def calculate_mip_predicted3(row):
    """Predicted MIP from age alone, with sex-specific coefficients.

    Equation source: J. A. Evans and W. A. Whitelaw, "The assessment of maximal
    respiratory mouth pressures in adults," Respiratory Care, vol. 54, no. 10,
    pp. 1348-1359, 2009.

    Reads 'data_age' and 'subject_female' (0 = male, 1 = female) from ``row``.
    Body weight is not part of this equation and is deliberately not read, so a
    missing or unparsable weight does not suppress the prediction.

    Returns the predicted value as a float, or None when the sex code is neither
    0 nor 1 or when age is missing or cannot be converted to float.
    """
    try:
        age = float(row['data_age'])

        if row['subject_female'] == 0:  # Male
            return 120 - (0.41 * age)
        elif row['subject_female'] == 1:  # Female
            return 108 - (0.61 * age)
        else:
            return None  # Unknown gender
    except (KeyError, ValueError, TypeError):
        return None  # Missing data or invalid type

def calculate_sindex_predicted(row):
    """Predicted S-Index from weight, age and height, with a sex-specific intercept.

    Equation source:
    https://www.jornaldepneumologia.com.br/details/4136/en-US/maximal-dynamic-inspiratory-pressure--s-index-prediction-values-and-diagnosis-accuracy

    Reads 'data_age', 'data_kilograms', 'data_centimeters' and 'subject_female'
    (0 = male, 1 = female) from ``row``.

    Returns the predicted value as a float, or None when the sex code is neither
    0 nor 1 or when a field is missing or cannot be converted to float.
    """
    try:
        age = float(row['data_age'])
        weight = float(row['data_kilograms'])
        height = float(row['data_centimeters'])
        sex_code = row['subject_female']

        # The two equations differ only in their intercept.
        if sex_code == 0:  # male
            return -32.3 + (0.47 * weight) - (0.39 * age) + (0.79 * height)
        if sex_code == 1:  # female
            return -54 + (0.47 * weight) - (0.39 * age) + (0.79 * height)
    except (KeyError, ValueError, TypeError):
        return None  # missing data or invalid type
    return None  # unknown sex
    
# Compute each predicted-value column by applying its equation row by row
predicted_value_functions = {
    'mip_predicted1': calculate_mip_predicted1,
    'mip_predicted2': calculate_mip_predicted2,
    'mip_predicted3': calculate_mip_predicted3,
    'sindex_predicted': calculate_sindex_predicted,
}
for predicted_col, predictor in predicted_value_functions.items():
    redcap[predicted_col] = redcap.apply(predictor, axis=1)

In [33]:
# Define measures (just the regular ones)
measures = ['mip', 'pif', 'sindex', 'volume']  # 'smip' is handled separately

# Best (maximum) of the three trials, before ('pre') and after ('post'), per measure.
# max(skipna=True) ignores missing trials; a row with no trials at all yields NaN.
for measure in measures:
    for timepoint in ('pre', 'post'):
        trial_cols = [f"imt_{measure}_{i}_{timepoint}" for i in [1, 2, 3]]
        if all(col in redcap.columns for col in trial_cols):
            redcap[f"{measure}_{timepoint}_max"] = redcap[trial_cols].max(axis=1, skipna=True)
        else:
            print(f"⚠️ Warning: Missing {timepoint} columns for {measure}")

# Fatigue (pre - post) and fatigue percent ((pre - post) / pre * 100), on the maxima only
for measure in measures:
    pre_col = f"{measure}_pre_max"
    post_col = f"{measure}_post_max"

    if pre_col in redcap.columns and post_col in redcap.columns:
        fatigue_col = f"{measure}_max_fatigue"
        fatigue = redcap[pre_col] - redcap[post_col]
        redcap[fatigue_col] = fatigue

        # A zero baseline makes the percentage undefined; treat it as missing instead of
        # letting the division produce +/-inf (same guard as the SMIP fatigue percent).
        fatigue_percent_col = f"{measure}_max_fatigue_percent"
        redcap[fatigue_percent_col] = (fatigue / redcap[pre_col].replace(0, np.nan)) * 100
    else:
        print(f"⚠️ Warning: Missing columns for {measure}_max")

In [34]:
# First, calculate max SMIP from the three trials
smip_pre_cols = [f"imt_smip_{i}_pre" for i in [1, 2, 3]]
smip_post_cols = [f"imt_smip_{i}_post" for i in [1, 2, 3]]

if all(col in redcap.columns for col in smip_pre_cols + smip_post_cols):
    # Best SMIP value per row, before and after
    redcap['smip_pre_max'] = redcap[smip_pre_cols].max(axis=1, skipna=True)
    redcap['smip_post_max'] = redcap[smip_post_cols].max(axis=1, skipna=True)

    def get_trial_number(row, cols):
        """Return the trial number (1-3) whose column in ``cols`` holds the row's maximum.

        The number is taken from the column name ('imt_smip_2_pre' -> 2); ties go to
        the first column. Returns NaN when the row has no usable value in any of
        ``cols``: idxmax on an all-missing Series either returns NaN (on which
        ``.split`` fails) or raises, depending on the pandas version.
        """
        trial_values = pd.to_numeric(row[cols], errors='coerce')
        if trial_values.isna().all():
            return np.nan
        return int(trial_values.idxmax().split('_')[2])

    redcap['max_smip_pre_trial'] = redcap.apply(lambda row: get_trial_number(row, smip_pre_cols), axis=1)
    redcap['max_smip_post_trial'] = redcap.apply(lambda row: get_trial_number(row, smip_post_cols), axis=1)

    # For each row, take id / slopesmip / fit from the same trial that produced the max SMIP.
    # Column-wise selection replaces the row-by-row iterrows/.at loop; rows without a
    # best trial stay NaN.
    for timepoint in ('pre', 'post'):
        best_trial = redcap[f'max_smip_{timepoint}_trial']
        for variable in ('id', 'slopesmip', 'fit'):
            selected = pd.Series(np.nan, index=redcap.index)
            for trial in [1, 2, 3]:
                selected = selected.mask(
                    best_trial == trial,
                    redcap[f'imt_{variable}_{trial}_{timepoint}'],
                )
            redcap[f'{variable}_{timepoint}_max'] = selected

    # Fatigue for SMIP (pre - post), and as a percentage of pre (zero baseline -> NaN)
    redcap['smip_max_fatigue'] = redcap['smip_pre_max'] - redcap['smip_post_max']
    redcap['smip_max_fatigue_percent'] = ((redcap['smip_pre_max'] - redcap['smip_post_max']) / redcap['smip_pre_max'].replace(0, np.nan)) * 100

    # Pre - post difference for the variables taken from the best-SMIP trial
    redcap['id_max_fatigue'] = redcap['id_pre_max'] - redcap['id_post_max']
    redcap['slopesmip_max_fatigue'] = redcap['slopesmip_pre_max'] - redcap['slopesmip_post_max']
    redcap['fit_max_fatigue'] = redcap['fit_pre_max'] - redcap['fit_post_max']

else:
    print("⚠️ Warning: Missing SMIP columns")

In [35]:
# Percent of predicted for MIP: best trial divided by each predicted value, times 100
predicted_vars = ['predicted1', 'predicted2', 'predicted3']  # columns 'mip_predicted1' .. 'mip_predicted3'

for timepoint in ['pre', 'post']:
    measure_col = f"mip_{timepoint}_max"  # only the max is used
    for pred_var in predicted_vars:
        pred_col = f"mip_{pred_var}"
        if measure_col not in redcap.columns or pred_col not in redcap.columns:
            print(f"⚠️ Warning: Missing columns for {measure_col} or {pred_col}")
            continue
        # The output column is suffixed with the equation number (last character of pred_var)
        equation_number = pred_var[-1]
        redcap[f"mip_{timepoint}_max_percentpredict_{equation_number}"] = (
            redcap[measure_col] / redcap[pred_col] * 100
        )

# Percent of predicted for S-Index: a single predicted column, so no numeric suffix
predicted_vars = ['predicted']

for timepoint in ['pre', 'post']:
    measure_col = f"sindex_{timepoint}_max"  # only the max is used
    for pred_var in predicted_vars:
        pred_col = f"sindex_{pred_var}"
        if measure_col not in redcap.columns or pred_col not in redcap.columns:
            print(f"⚠️ Warning: Missing columns for {measure_col} or {pred_col}")
            continue
        redcap[f"sindex_{timepoint}_max_percentpredict"] = (
            redcap[measure_col] / redcap[pred_col] * 100
        )

# CPET

## Predicted VO2 Max

In [36]:
# Body weight in pounds (the prediction equation below takes weight in pounds)
redcap['data_pounds'] = redcap['data_kilograms'] * 2.20462


# Predicted VO2 peak from age, sex (subject_female: 0/1) and weight in pounds.
# https://www.sciencedirect.com/science/article/abs/pii/S0033062017300476 (also https://www.ajconline.org/article/S0002-9149(17)30873-1/abstract)
age_term = 0.39 * redcap['data_age']
sex_term = 13.7 * redcap['subject_female']
weight_term = 0.127 * redcap['data_pounds']
redcap['cpet_vo2peak_predicted'] = 79.9 - age_term - sex_term - weight_term

# Measured relative VO2 peak as a percentage of the predicted value
measured_vo2peak = redcap['cpet_vo2peak_relative']
redcap['cpet_vo2peak_percentpredicted'] = measured_vo2peak / redcap['cpet_vo2peak_predicted'] * 100


## Ventilation / Gas Exchange / Cardiac Classifications

In [37]:
# Classification rules (each flag is 1 when its rule holds, otherwise 0):
#   gas exchange: PETCO2 at V-slope < 36 and VE/VCO2 slope >= 34
#   ventilation:  PETCO2 at V-slope < 36 and VE/VCO2 slope <  34
#   cardiac:      O2 pulse < 10
# NOTE(review): comparisons with a missing value are False, so rows lacking CPET data
# are flagged 0 (same as "no impairment") -- confirm that is intended.
low_petco2 = redcap["cpet_petco2_at_vslope"] < 36
vevco2_slope = redcap["cpet_vevco2slope_peak"]

gas_exchange_mask = low_petco2 & (vevco2_slope >= 34)
ventilation_mask = low_petco2 & (vevco2_slope < 34)
cardiac_mask = (redcap["cpet_o2pulse"] < 10)

# Boolean masks converted to 0/1 integer columns
redcap["cpet_gasexchangeimpairment"] = gas_exchange_mask.astype('int64')
redcap["cpet_ventilationimpairment"] = ventilation_mask.astype('int64')
redcap["cpet_cardiacimpairment"] = cardiac_mask.astype('int64')

print("\n--- Results ---")
print(redcap[["cpet_gasexchangeimpairment", "cpet_ventilationimpairment", "cpet_cardiacimpairment"]].sum())


--- Results ---
cpet_gasexchangeimpairment     1
cpet_ventilationimpairment    40
cpet_cardiacimpairment        13
dtype: int64


## PetCo2 Change

In [38]:
# PETCO2 change: the peak value minus the value at V-slope
petco2_at_vslope = redcap['cpet_petco2_at_vslope']
redcap['cpet_petco2_change'] = redcap['cpet_petco2_peak'].sub(petco2_at_vslope)

# FMD

In [39]:
# Predicted FMD values: https://pubmed.ncbi.nlm.nih.gov/35709326/

# Autonomic

In [40]:
# autonomic_dysfunction = 1 only when all three conditions hold, otherwise 0:
#   resting (sitting) heart rate > 75,
#   heart-rate rise from rest to peak (cpet_hr_peak - cpet_hr_restingsit) < 89,
#   cpet_hrr1 < 25.
# NOTE(review): a missing input makes its comparison False, so incomplete rows get 0.
resting_hr_above_75 = redcap['cpet_hr_restingsit'] > 75
hr_rise_below_89 = (redcap['cpet_hr_peak'] - redcap['cpet_hr_restingsit']) < 89
hrr1_below_25 = redcap['cpet_hrr1'] < 25

all_criteria_met = resting_hr_above_75 & hr_rise_below_89 & hrr1_below_25
redcap['autonomic_dysfunction'] = all_criteria_met.astype(int)

# Subjective Questionnaires

## Woods Mental Fatigue Inventory

In [41]:
# Items of the Woods Mental Fatigue Inventory that make up the total
woods_columns = [
    'woods_concentration', 'woods_decisions', 'woods_confusion',
    'woods_memory', 'woods_words', 'woods_takethingsin',
    'woods_processingspeed', 'woods_thoughtsmixed', 'woods_muzzy'
]

# Row-wise total. NOTE(review): sum() skips missing items, so an unanswered
# questionnaire totals 0 rather than NaN -- confirm that is acceptable.
redcap['woods_sum'] = redcap[woods_columns].sum(axis=1)

# Report mean, min, max and standard deviation of the total
print("Woods Sum Statistics:")
for label, statistic in (('Mean', 'mean'), ('Min', 'min'), ('Max', 'max'), ('Std', 'std')):
    print(f"{label}: {getattr(redcap['woods_sum'], statistic)()}")


Woods Sum Statistics:
Mean: 14.909090909090908
Min: 0.0
Max: 31.0
Std: 9.174834595348761


## Fatigue Severity Scale (FSS)

In [42]:
# Items of the Fatigue Severity Scale that make up the total
fss_columns = ['fss_motivation', 'fss_exercise', 'fss_easily', 'fss_functioning',
       'fss_problems', 'fss_sustained', 'fss_duties', 'fss_disabling',
       'fss_social']

# Row-wise total (missing items are skipped by sum())
redcap['fss_sum'] = redcap[fss_columns].sum(axis=1)

# fss_dichotomous: 1 when the total is strictly greater than 36, otherwise 0
redcap['fss_dichotomous'] = redcap['fss_sum'].gt(36).astype('int64')

# Report mean, min, max and standard deviation of the total
print("FSS Sum Statistics:")
for label, statistic in (('Mean', 'mean'), ('Min', 'min'), ('Max', 'max'), ('Std', 'std')):
    print(f"{label}: {getattr(redcap['fss_sum'], statistic)()}")

FSS Sum Statistics:
Mean: 38.31818181818182
Min: 10.0
Max: 63.0
Std: 16.55684040084933


## PEM (DSQ)

In [43]:
# https://pmc.ncbi.nlm.nih.gov/articles/PMC6165517/
# PEM is indicated when any of items 1-5 has frequency >= 2 together with severity >= 2.

# Frequency and severity columns, listed in the same item order so they can be paired
freq_cols = [
    'dsq_heavy_freq', 'dsq_nextday_freq', 'dsq_mentallytired_freq',
    'dsq_minexercise_freq', 'dsq_drained_freq'
]

severity_cols = [
    'dsq_heavy_severity', 'dsq_nextday_severity', 'dsq_mentallytired_severity',
    'dsq_minexercise_severity', 'dsq_drained_severity'
]

# Totals: frequency, severity, and the two combined (missing items are skipped by sum())
redcap['dsq_freq_sum'] = redcap[freq_cols].sum(axis=1)
redcap['dsq_severity_sum'] = redcap[severity_cols].sum(axis=1)
redcap['dsq_sum'] = redcap['dsq_freq_sum'] + redcap['dsq_severity_sum']


def compute_dsq_dichotomous(row):
    """Return 1 if any item has frequency >= 2 and severity >= 2 in ``row``, else 0."""
    item_pairs = zip(freq_cols, severity_cols)
    return int(any((row[freq] >= 2) and (row[sev] >= 2) for freq, sev in item_pairs))


redcap['dsq_dichotomous'] = redcap.apply(compute_dsq_dichotomous, axis=1)

# Frequency counts for the dichotomous indicator
print("DSQ Dichotomous Frequency Counts:")
print(redcap['dsq_dichotomous'].value_counts())

# Mean, min and max of the combined total
print("DSQ Sum Statistics:")
for label, statistic in (('Mean', 'mean'), ('Min', 'min'), ('Max', 'max')):
    print(f"{label}: {getattr(redcap['dsq_sum'], statistic)()}")



DSQ Dichotomous Frequency Counts:
dsq_dichotomous
1    26
0    18
Name: count, dtype: int64
DSQ Sum Statistics:
Mean: 14.454545454545455
Min: 0.0
Max: 38.0


## Pittsburgh Sleep Index (PSQI)

In [44]:
# Keep the raw sleep-latency minutes in 'psqi_latency_raw'.
# 'psqi_latency' is overwritten later in this cell with the 0-3 component score, so the
# copy is only taken while 'psqi_latency_raw' does not exist yet; re-running the cell
# would otherwise copy component scores into the "raw" column and score them as minutes.
if 'psqi_latency_raw' not in redcap.columns:
    redcap['psqi_latency_raw'] = redcap['psqi_latency']

# Recode Q2 (raw latency in minutes) into 0-3 scale
def recode_q2(x):
    """Recode PSQI question 2 (minutes to fall asleep) to its 0-3 subscore.

    Bands follow the PSQI scoring form: <= 15 min -> 0, 16-30 -> 1, 31-60 -> 2,
    > 60 -> 3. Upper bounds are inclusive and contiguous, so exactly 15 minutes
    scores 0 and fractional values (e.g. 30.5) fall into the next band instead of
    dropping through to 3.

    Parameters
    ----------
    x : float
        Latency in minutes; may be missing.

    Returns
    -------
    int or float
        0, 1, 2 or 3; NaN when ``x`` is missing.
    """
    if pd.isnull(x):
        return np.nan
    if x <= 15:
        return 0
    if x <= 30:
        return 1
    if x <= 60:
        return 2
    return 3  # > 60 minutes

# Q2 subscore: raw latency minutes recoded to 0-3
redcap['psqi_latency_q2'] = redcap['psqi_latency_raw'].map(recode_q2)

# Q5a subscore: 'psqi_latency30' is already coded 0-3, so it is copied unchanged
redcap['psqi_latency_q5a'] = redcap['psqi_latency30']

# Total of the two latency subscores (missing subscores are skipped by sum())
latency_subscore_cols = ['psqi_latency_q2', 'psqi_latency_q5a']
redcap['psqi_latency_sum'] = redcap[latency_subscore_cols].sum(axis=1)

# Convert the summed latency subscores into the final component score
def map_latency_component(x):
    """Map the Q2 + Q5a latency sum to the 0-3 latency component score.

    0 -> 0, 1-2 -> 1, 3-4 -> 2, any other value -> 3; NaN when ``x`` is missing.
    """
    if pd.isnull(x):
        return np.nan
    if x == 0:
        return 0
    for component, (low, high) in ((1, (1, 2)), (2, (3, 4))):
        if low <= x <= high:
            return component
    return 3  # sums of 5-6 (and anything outside the bands above)

# Component 2 (sleep latency): replaces the raw 'psqi_latency' values with the 0-3 score
redcap['psqi_latency'] = redcap['psqi_latency_sum'].map(map_latency_component)

# 3. Sleep Duration
def score_sleep_duration(hours):
    """Score the PSQI sleep-duration component from hours of sleep.

    >= 7 h -> 0, 6 to < 7 h -> 1, 5 to < 6 h -> 2, < 5 h -> 3.

    Returns NaN when ``hours`` is missing. Without this guard every comparison with
    NaN is False and a missing answer would fall through to the worst score (3).
    """
    if pd.isnull(hours):
        return np.nan
    if hours >= 7:
        return 0
    elif 6 <= hours < 7:
        return 1
    elif 5 <= hours < 6:
        return 2
    else:
        return 3

# Component 3 (sleep duration), scored from the reported hours of sleep
redcap['psqi_duration'] = redcap['psqi_hours'].map(score_sleep_duration)

# 4. Sleep Efficiency
def hhmm_to_decimal(time_val):
    """Convert a clock time written as the number HHMM into decimal hours.

    The hundreds and above are the hours and the last two digits the minutes,
    e.g. 2230 -> 22 + 30/60 = 22.5. Returns NaN when ``time_val`` is missing.
    """
    if pd.isnull(time_val):
        return np.nan
    whole_hours, leftover_minutes = divmod(time_val, 100)
    return whole_hours + leftover_minutes / 60.0

# --- Express psqi_sleepstart / psqi_sleepend (HHMM numbers) as decimal hours ---
for hhmm_col, decimal_col in (
    ('psqi_sleepstart', 'sleepstart_decimal'),
    ('psqi_sleepend', 'sleepend_decimal'),
):
    redcap[decimal_col] = redcap[hhmm_col].map(hhmm_to_decimal)

# --- Time in bed, allowing for nights that cross midnight ---
def calculate_time_in_bed(start, end):
    """Hours between ``start`` and ``end``, both given in decimal hours.

    When ``end`` is not later than ``start`` the interval is taken to wrap past
    midnight and 24 hours are added (equal times therefore give 24).
    Returns NaN when either value is missing.
    """
    if pd.isnull(start) or pd.isnull(end):
        return np.nan
    elapsed = end - start
    return elapsed + 24 if elapsed <= 0 else elapsed

# Time in bed per row, from the decimal sleep start / end times
redcap['time_in_bed'] = [
    calculate_time_in_bed(start, end)
    for start, end in zip(redcap['sleepstart_decimal'], redcap['sleepend_decimal'])
]

# --- Sleep efficiency: reported hours asleep as a percentage of time in bed ---
hours_asleep = redcap['psqi_hours']
redcap['sleep_efficiency'] = hours_asleep / redcap['time_in_bed'] * 100

# --- PSQI Component 4: Habitual Sleep Efficiency ---
def score_sleep_efficiency(efficiency):
    """Score sleep efficiency (percent) on the 0-3 component scale.

    >= 85 -> 0, 75 to < 85 -> 1, 65 to < 75 -> 2, below 65 -> 3;
    NaN when ``efficiency`` is missing.
    """
    if pd.isnull(efficiency):
        return np.nan
    # Thresholds are checked from highest to lowest; the first one reached sets the score.
    for lower_bound, score in ((85, 0), (75, 1), (65, 2)):
        if efficiency >= lower_bound:
            return score
    return 3

# Component 4 (habitual sleep efficiency)
redcap['psqi_efficiency'] = redcap['sleep_efficiency'].map(score_sleep_efficiency)


def _band_to_component(raw_scores, bin_edges):
    """Cut a raw item sum into the 0-3 component score using right-inclusive bins."""
    return pd.cut(raw_scores, bins=bin_edges, labels=[0, 1, 2, 3]).astype(int)


# 5. Sleep Disturbances
disturbance_items = [
    'psqi_wake', 'psqi_bathroom', 'psqi_breathe', 'psqi_snore',
    'psqi_cold', 'psqi_hot', 'psqi_dreams', 'psqi_pain', 'psqi_other'
]

# TEMPORARY FIX: NEED TO REMOVE LATER AND FIX DATA ENTRY
# Non-numeric 'psqi_other' entries are coerced to NaN and then counted as 0.
redcap['psqi_other'] = pd.to_numeric(redcap['psqi_other'], errors='coerce').fillna(0)

redcap['psqi_disturbances_raw'] = redcap[disturbance_items].sum(axis=1)

# Raw sum 0 -> 0, 1-9 -> 1, 10-18 -> 2, 19-27 -> 3
redcap['psqi_disturbances'] = _band_to_component(
    redcap['psqi_disturbances_raw'], [-1, 0, 9, 18, 27]
)


# 6. Use of Sleep Medications (copied as-is from 'psqi_medicine')
redcap['psqi_medication'] = redcap['psqi_medicine']

# 7. Daytime Dysfunction: raw sum 0 -> 0, 1-2 -> 1, 3-4 -> 2, 5-6 -> 3
dysfunction_items = ['psqi_sleepy', 'psqi_enthusiasm']
redcap['psqi_dysfunction_raw'] = redcap[dysfunction_items].sum(axis=1)
redcap['psqi_dysfunction'] = _band_to_component(
    redcap['psqi_dysfunction_raw'], [-1, 0, 2, 4, 6]
)

# 8. Calculate PSQI Sum over the seven components
# NOTE(review): sum() skips missing components, so an incomplete row still gets a total.
component_cols = [
    'psqi_quality', 'psqi_latency', 'psqi_duration',
    'psqi_efficiency', 'psqi_disturbances',
    'psqi_medication', 'psqi_dysfunction'
]
redcap['psqi_sum'] = redcap[component_cols].sum(axis=1)

# 9. Dichotomous classification: 0 = Good sleep, 1 = Poor sleep (total above 5)
# https://www.psychiatry.pitt.edu/sites/default/files/inline-files/PSQI%20Article.pdf
# https://pmc.ncbi.nlm.nih.gov/articles/PMC11973415/
redcap['psqi_dichotomous'] = redcap['psqi_sum'].gt(5).astype(int)


# All component columns plus the sum, in reporting order
components = component_cols + ['psqi_sum']

# Summary stats: mean, min, max
summary_stats = redcap[components].agg(['mean', 'min', 'max'])
print("Summary statistics for PSQI components and sum:")
print(summary_stats)

# Frequency counts for the dichotomous variable (missing values counted too)
dichotomous_counts = redcap['psqi_dichotomous'].value_counts(dropna=False)
print("\nFrequency counts for psqi_dichotomous:")
print(dichotomous_counts)


Summary statistics for PSQI components and sum:
      psqi_quality  psqi_latency  psqi_duration  psqi_efficiency  \
mean      1.363636      1.454545       0.977273         0.860465   
min       0.000000      0.000000       0.000000         0.000000   
max       3.000000      3.000000       3.000000         3.000000   

      psqi_disturbances  psqi_medication  psqi_dysfunction  psqi_sum  
mean                1.5         0.954545          1.159091      8.25  
min                 0.0         0.000000          0.000000      1.00  
max                 3.0         3.000000          3.000000     18.00  

Frequency counts for psqi_dichotomous:
psqi_dichotomous
1    30
0    14
Name: count, dtype: int64


## Dyspnea

In [45]:
# BDI and TDI totals: sum of the functional, task and effort items (missing items skipped)
for scale in ('bdi', 'tdi'):
    scale_items = [f'{scale}_functional', f'{scale}_task', f'{scale}_effort']
    redcap[f'{scale}_sum'] = redcap[scale_items].sum(axis=1)


def _dichotomize_mmrc(score):
    """Return 1 when ``score`` >= 2, 0 for any other non-missing score, NaN when missing."""
    if score >= 2:
        return 1
    if pd.notnull(score):
        return 0
    return np.nan


# mmrc_dichotomous from 'mmrc_score' (cut-off of 2)
# https://pubmed.ncbi.nlm.nih.gov/34670858/
# https://pmc.ncbi.nlm.nih.gov/articles/PMC4541543/
# https://publications.ersnet.org/content/erjor/9/6/00592-2023
# https://journalpulmonology.org/en-the-copd-assessment-test-modified-articulo-S2531043721001197
redcap['mmrc_dichotomous'] = redcap['mmrc_score'].map(_dichotomize_mmrc)

## ODI / NDI

In [46]:
# Items of each disability index
odi_cols = [
    'odi_intensity', 'odi_personalcare', 'odi_lifting', 'odi_walking',
    'odi_sitting', 'odi_standing', 'odi_sleeping', 'odi_sex',
    'odi_social', 'odi_traveling'
]
ndi_cols = [
    'ndi_intensity', 'ndi_personalcare', 'ndi_lifting', 'ndi_work',
    'ndi_headaches', 'ndi_concentration', 'ndi_sleeping',
    'ndi_driving', 'ndi_reading', 'ndi_recreation'
]

# Row-wise totals; missing items are skipped
for total_col, item_cols in (('odi_sum', odi_cols), ('ndi_sum', ndi_cols)):
    redcap[total_col] = redcap[item_cols].sum(axis=1, skipna=True)


## Psychology Questionnaire

In [47]:
# Define the columns for each measure
anxiety_cols = ['gad_anxious', 'gad_worrying']
depression_cols = ['phq_hopeless', 'phq_anhedonia']
ptsd_cols = ['ptsd_nightmares', 'ptsd_intrusive', 'ptsd_startled', 'ptsd_detached', 'ptsd_guilty']

# The saved output of this cell echoes the last assignment below, which is how pandas
# reports "DataFrame is highly fragmented" (PerformanceWarning) after many single-column
# inserts. Taking a copy consolidates the frame before more columns are added; the data
# itself is unchanged.
redcap = redcap.copy()

# Calculate sum scores (missing items are skipped)
redcap['anxiety_sum'] = redcap[anxiety_cols].sum(axis=1, skipna=True)
redcap['depression_sum'] = redcap[depression_cols].sum(axis=1, skipna=True)
redcap['ptsd_sum'] = redcap[ptsd_cols].sum(axis=1, skipna=True)

# Create dichotomous variables (1 = at or above the cut-off, 0 = below)
# PTSD-5: 3 most sensitive, 5 most specific, 4 most efficient https://pmc.ncbi.nlm.nih.gov/articles/PMC5023594/
# PHQ-2: >=2 --> https://pubmed.ncbi.nlm.nih.gov/33026888/, https://jamanetwork.com/journals/jama/fullarticle/2766865
# GAD-2: >=3 --> https://www.sciencedirect.com/science/article/abs/pii/S0003999318303903, https://www.sciencedirect.com/science/article/abs/pii/S0163834315002406, https://pmc.ncbi.nlm.nih.gov/articles/PMC6163062/, https://pmc.ncbi.nlm.nih.gov/articles/PMC7306644/
redcap['anxiety_dichotomous'] = (redcap['anxiety_sum'] >= 3).astype(int)
redcap['depression_dichotomous'] = (redcap['depression_sum'] >= 2).astype(int)
redcap['ptsd_dichotomous'] = (redcap['ptsd_sum'] >= 3).astype(int)


  redcap['ptsd_dichotomous'] = (redcap['ptsd_sum'] >= 3).astype(int)


## SF-PA

In [48]:
# Define the columns for SFPA
sfpa_cols = [
    'sfpa_vigorous', 'sfpa_moderate', 'sfpa_lifting',
    'sfpa_stairs2', 'sfpa_stairs1', 'sfpa_stooping',
    'sfpa_walkingmile', 'sfpa_walkingblocks2',
    'sfpa_walkingblocks1', 'sfpa_bathingdress'
]

# Recode each item (1 -> 0, 2 -> 50, 3 -> 100) and score the scale as the mean of the
# items that were answered.
# https://www.physio-pedia.com/36-Item_Short_Form_Survey_(SF-36)
# With all ten items answered this equals the previous sum / 10. Dividing the sum by a
# fixed 10 counted every unanswered item as 0, lowering the score of partially answered
# questionnaires. A row with no answers now gets NaN instead of 0.
sfpa_recoded = redcap[sfpa_cols].replace({1: 0, 2: 50, 3: 100})
sfpa_score = sfpa_recoded.mean(axis=1, skipna=True)

# The saved output of this cell echoes the assignment below, which is how pandas reports
# "DataFrame is highly fragmented" (PerformanceWarning); copying consolidates the frame
# before the new column is inserted.
redcap = redcap.copy()
redcap['sfpa_sum'] = sfpa_score

  redcap['sfpa_sum'] = (


# Export

In [49]:
# Cross-sectional dataset: rows with time_point == 1
is_time_point_1 = redcap['time_point'] == 1
df_CS = redcap.loc[is_time_point_1].copy()

# RCT dataset: rows with covid_group == 1
is_covid_group_1 = redcap['covid_group'] == 1
df_RCT = redcap.loc[is_covid_group_1].copy()

# Write the two subsets and the full frame to CSV, without the index column
for frame, csv_path in ((df_CS, 'df_CS.csv'), (df_RCT, 'df_RCT.csv'), (redcap, 'df_full.csv')):
    frame.to_csv(csv_path, index=False)
