In [18]:
library(tidyverse)
library(effectsize)
library(pROC)
library(dplyr)


# Import Dataset

In [66]:
df <- read.csv("C:/Users/ZCooper/Documents/GitHub/longcovid/df_full.csv")
df_cs <- read.csv("C:/Users/ZCooper/Documents/GitHub/longcovid/df_CS.csv")
df_rct <- read.csv("C:/Users/ZCooper/Documents/GitHub/longcovid/df_RCT.csv")
df_rct_wide <- read.csv("C:/Users/ZCooper/Documents/GitHub/longcovid/df_RCT_wide.csv")

# Select Indices

In [70]:
# list of respiratory indices (taken at baseline)
vars <- c("mip_pre_max", "mip_pre_max_percentpredict_1", "mip_pre_max_percentpredict_2", "mip_post_max_percentpredict_1", "mip_post_max_percentpredict_2", "smip_pre_max", "slopesmip_pre_max", "id_pre_max", "pif_pre_max", 
          "volume_pre_max", "sindex_pre_max", "sindex_pre_max_percentpredict", "fit_pre_max", "mip_post_max", "smip_post_max", "slopesmip_post_max", "id_post_max", "pif_post_max", 
          "volume_post_max", "sindex_post_max", "sindex_post_max_percentpredict", "fit_post_max")

# Discriminatory Validity

## PCS vs Healthy Controls

### ROC AUC + Youden J + Sensitivity/Specificity

In [55]:
# ---- packages ---- 
library(dplyr)
library(pROC)
library(binom)     # for exact binomial CIs on sens/spec
library(effectsize)

# ---- loop ---- 
roc_tbl <- list() 

for (v in vars) {
  r <- roc(df_cs$covid_group, df_cs[[v]], quiet = TRUE, direction = ">")
  auc_ci <- ci.auc(r)
  
  # Youden cutpoint
  best <- coords(r, "best", best.method = "youden",
                 ret = c("threshold","sensitivity","specificity","tp","tn","fp","fn"))
  

  # Extract values more safely
  tp_val <- as.numeric(best$tp[1])  # Use $ notation and take first element
  fn_val <- as.numeric(best$fn[1])
  tn_val <- as.numeric(best$tn[1]) 
  fp_val <- as.numeric(best$fp[1])
  
  # Check for NA values
  if(any(is.na(c(tp_val, fn_val, tn_val, fp_val)))) {
    print("Warning: NA values detected")
    next
  }
  
  # sens/spec CIs
  sens_ci <- binom.exact(tp_val, tp_val + fn_val)
  spec_ci <- binom.exact(tn_val, tn_val + fp_val)
  
  # Cohen's d (LC vs HC)
  d_val <- cohens_d(as.formula(paste(v, "~ covid_group")), data = df_cs)$Cohens_d
   
  roc_tbl[[v]] <- tibble(
    Variable = v,
    AUC = as.numeric(auc(r)),
    AUC_low = as.numeric(auc_ci[1]),
    AUC_high = as.numeric(auc_ci[3]),
    Youden_J = as.numeric(best$sensitivity[1] + best$specificity[1] - 1),
    Cutpoint = as.numeric(best$threshold[1]),
    Sens = as.numeric(best$sensitivity[1]),
    Sens_low = sens_ci$lower,
    Sens_high = sens_ci$upper,
    Spec = as.numeric(best$specificity[1]),
    Spec_low = spec_ci$lower,
    Spec_high = spec_ci$upper,
    Cohens_d = d_val
  )
}

roc_results <- bind_rows(roc_tbl) %>% arrange(desc(AUC))
print(roc_results, width=Inf)

1: Missing values detected. NAs dropped. 
2: Missing values detected. NAs dropped. 
3: Missing values detected. NAs dropped. 
4: Missing values detected. NAs dropped. 


# A tibble: 20 × 13
   Variable                        AUC AUC_low AUC_high Youden_J Cutpoint
   <chr>                         <dbl>   <dbl>    <dbl>    <dbl>    <dbl>
 1 mip_pre_max_percentpredict_2  0.975   0.922    1        0.9      72.0 
 2 mip_pre_max_percentpredict_1  0.96    0.893    1        0.85     86.6 
 3 mip_post_max_percentpredict_2 0.955   0.884    1        0.8      83.2 
 4 mip_post_max                  0.955   0.886    1        0.85     77.5 
 5 mip_post_max_percentpredict_1 0.95    0.878    1        0.75     96.2 
 6 mip_pre_max                   0.93    0.791    1        0.9      76.5 
 7 pif_pre_max                   0.865   0.704    1        0.7       3.65
 8 sindex_pre_max                0.862   0.700    1        0.7      63.5 
 9 pif_post_max                  0.855   0.687    1        0.7       4.05
10 smip_pre_max                  0.85    0.698    1        0.65    419   
11 sindex_post_max               0.848   0.674    1        0.7      69   
12 smip_post_max  

### Compare AUCs (DeLong)

In [56]:
# Example: compare S-Index vs MIP
r_mip <- roc(df_cs$covid_group, df_cs$mip_pre_max, quiet=TRUE)
r_sidx <- roc(df_cs$covid_group, df_cs$sindex_pre_max, quiet=TRUE)
roc.test(r_mip, r_sidx, method = "delong")


	DeLong's test for two correlated ROC curves

data:  r_mip and r_sidx
Z = 1.4467, p-value = 0.148
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
 -0.02394503  0.15894503
sample estimates:
AUC of roc1 AUC of roc2 
     0.9300      0.8625 


## Dyspnea vs No Dyspnea

### ROC AUC + Youden

In [57]:
# ---- packages ---- 
library(dplyr)
library(pROC)
library(binom)     # for exact binomial CIs on sens/spec
library(effectsize)

# ---- loop ---- 
roc_tbl <- list() 

for (v in vars) {
  r <- roc(df_cs$mmrc_dichotomous, df_cs[[v]], quiet = TRUE, direction = ">")
  auc_ci <- ci.auc(r)
  
  # Youden cutpoint
  best <- coords(r, "best", best.method = "youden",
                 ret = c("threshold","sensitivity","specificity","tp","tn","fp","fn"))
  
  # Extract values more safely
  tp_val <- as.numeric(best$tp[1])  # Use $ notation and take first element
  fn_val <- as.numeric(best$fn[1])
  tn_val <- as.numeric(best$tn[1]) 
  fp_val <- as.numeric(best$fp[1])
  
  # Check for NA values
  if(any(is.na(c(tp_val, fn_val, tn_val, fp_val)))) {
    print("Warning: NA values detected")
    next
  }
  
  # sens/spec CIs
  sens_ci <- binom.exact(tp_val, tp_val + fn_val)
  spec_ci <- binom.exact(tn_val, tn_val + fp_val)
  
  # Cohen's d (Dyspnea vs No Dyspnea)
  d_val <- cohens_d(as.formula(paste(v, "~ mmrc_dichotomous")), data = df_cs)$Cohens_d
   
  roc_tbl[[v]] <- tibble(
    Variable = v,
    AUC = as.numeric(auc(r)),
    AUC_low = as.numeric(auc_ci[1]),
    AUC_high = as.numeric(auc_ci[3]),
    Youden_J = as.numeric(best$sensitivity[1] + best$specificity[1] - 1),
    Cutpoint = as.numeric(best$threshold[1]),
    Sens = as.numeric(best$sensitivity[1]),
    Sens_low = sens_ci$lower,
    Sens_high = sens_ci$upper,
    Spec = as.numeric(best$specificity[1]),
    Spec_low = spec_ci$lower,
    Spec_high = spec_ci$upper,
    Cohens_d = d_val
  )
}

roc_results <- bind_rows(roc_tbl) %>% arrange(desc(AUC))
print(roc_results, width=Inf)

1: Missing values detected. NAs dropped. 
2: Missing values detected. NAs dropped. 
3: Missing values detected. NAs dropped. 
4: Missing values detected. NAs dropped. 


# A tibble: 20 × 13
   Variable                        AUC AUC_low AUC_high Youden_J Cutpoint
   <chr>                         <dbl>   <dbl>    <dbl>    <dbl>    <dbl>
 1 mip_post_max                  0.891   0.779    1        0.688    75   
 2 sindex_post_max               0.877   0.756    0.999    0.688    67.5 
 3 pif_post_max                  0.868   0.742    0.994    0.688     3.95
 4 mip_post_max_percentpredict_1 0.866   0.740    0.992    0.580    48.8 
 5 mip_post_max_percentpredict_2 0.857   0.726    0.988    0.616    65.5 
 6 smip_pre_max                  0.848   0.694    1        0.688   419   
 7 id_pre_max                    0.847   0.680    1        0.667     5.26
 8 fit_pre_max                   0.817   0.649    0.985    0.679    13   
 9 pif_pre_max                   0.808   0.650    0.966    0.625     3.65
10 sindex_pre_max                0.806   0.646    0.965    0.625    63.5 
11 smip_post_max                 0.804   0.638    0.970    0.616   302   
12 mip_pre_max    

### Compare AUCs (Delong)

In [58]:
# Example: compare S-Index vs MIP
r_mip <- roc(df_cs$mmrc_dichotomous, df_cs$mip_pre_max, quiet=TRUE)
r_sidx <- roc(df_cs$mmrc_dichotomous, df_cs$sindex_pre_max, quiet=TRUE)
print(roc.test(r_mip, r_sidx, method = "delong"))



	DeLong's test for two correlated ROC curves

data:  r_mip and r_sidx
Z = -0.25375, p-value = 0.7997
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
 -0.1752564  0.1350779
sample estimates:
AUC of roc1 AUC of roc2 
  0.7857143   0.8058036 



# Convergent Validity

## BDI vs Index

In [74]:
# Initialize list to store results
convergent_results <- list()

# Loop through each respiratory variable
for (v in vars) {
  
  # Create complete cases dataset for this analysis
  temp_df <- df_cs[complete.cases(df_cs[, c("bdi_sum", v, "data_age", "subject_female")]), ]
  
  # Skip if not enough data
  if (nrow(temp_df) < 10) {
    message("Skipping ", v, ": insufficient data (n = ", nrow(temp_df), ")")
    next
  }
  
  # 1. CORRELATION ANALYSIS (Spearman - good for BDI Likert scale)
  cor_test <- cor.test(temp_df[[v]], temp_df$bdi_sum, method = "pearson")
  
  # 2. LINEAR REGRESSION (controlling for age and sex)
  formula <- as.formula(paste("bdi_sum ~", v, "+ data_age + subject_female"))
  lm_model <- lm(formula, data = temp_df)
  lm_summary <- summary(lm_model)
  
  # 3. CHECK REGRESSION ASSUMPTIONS
  # Normality of residuals
  shapiro_test <- shapiro.test(residuals(lm_model))
  
  # 4. EXTRACT KEY RESULTS
  convergent_results[[v]] <- list(
    variable = v,
    n = nrow(temp_df),
    
    # Correlation results
    correlation = list(
      rho = cor_test$estimate,
      p_value = cor_test$p.value,
      ci_low = cor_test$conf.int[1],
      ci_high = cor_test$conf.int[2]
    ),
    
    # Regression results
    regression = list(
      formula = format(formula),
      r_squared = lm_summary$r.squared,
      adj_r_squared = lm_summary$adj.r.squared,
      f_statistic = lm_summary$fstatistic[1],
      f_p_value = pf(lm_summary$fstatistic[1], 
                     lm_summary$fstatistic[2], 
                     lm_summary$fstatistic[3], 
                     lower.tail = FALSE)
    ),
    
    # Key coefficient for the respiratory variable (adjusted for age/sex)
    main_effect = list(
      estimate = lm_summary$coefficients[v, "Estimate"],
      std_error = lm_summary$coefficients[v, "Std. Error"],
      t_value = lm_summary$coefficients[v, "t value"],
      p_value = lm_summary$coefficients[v, "Pr(>|t|)"],
      conf_low = confint(lm_model)[v, 1],
      conf_high = confint(lm_model)[v, 2],
      standardized_beta = lm_summary$coefficients[v, "Estimate"] * sd(temp_df[[v]]) / sd(temp_df$bdi_sum)
    ),
    
    # Assumption checks
    assumptions = list(
      residuals_normal = shapiro_test$p.value > 0.05,
      shapiro_p = shapiro_test$p.value
    )
  )
}

# Create a clean summary table for interpretation
summary_table <- do.call(rbind, lapply(convergent_results, function(x) {
  data.frame(
    Variable = x$variable,
    N = x$n,
    Correlation_rho = round(x$correlation$rho, 3),
    Correlation_p = round(x$correlation$p_value, 3),
    Beta = round(x$main_effect$estimate, 3),
    Std_Beta = round(x$main_effect$standardized_beta, 3),
    SE = round(x$main_effect$std_error, 3),
    Beta_p = round(x$main_effect$p_value, 3),
    CI_95 = paste0("[", round(x$main_effect$conf_low, 2), ", ", 
                   round(x$main_effect$conf_high, 2), "]"),
    R_squared = round(x$regression$r_squared, 3),
    Residuals_normal = x$assumptions$residuals_normal
  )
}))

# Sort by absolute correlation strength (strongest relationships first)
summary_table <- summary_table[order(-abs(summary_table$Correlation_rho)), ]

# Print the results
print(summary_table)

                                                     Variable  N
mip_post_max_percentpredict_2   mip_post_max_percentpredict_2 30
mip_post_max_percentpredict_1   mip_post_max_percentpredict_1 30
mip_pre_max_percentpredict_2     mip_pre_max_percentpredict_2 30
mip_post_max                                     mip_post_max 30
sindex_post_max_percentpredict sindex_post_max_percentpredict 30
mip_pre_max_percentpredict_1     mip_pre_max_percentpredict_1 30
sindex_pre_max_percentpredict   sindex_pre_max_percentpredict 30
smip_pre_max                                     smip_pre_max 30
mip_pre_max                                       mip_pre_max 30
fit_pre_max                                       fit_pre_max 30
smip_post_max                                   smip_post_max 30
id_pre_max                                         id_pre_max 25
pif_post_max                                     pif_post_max 30
sindex_post_max                               sindex_post_max 30
fit_post_max             

## Lasso Logistic

In [77]:
library(glmnet)

# predictors matrix (complete cases across chosen variables)
dat_lasso <- df_cs %>% select(all_of(resp_vars), covid_group) %>% na.omit()
x <- as.matrix(dat_lasso %>% select(-covid_group))
y <- dat_lasso$covid_group

set.seed(123)
cvfit <- cv.glmnet(x, y, family = "binomial", alpha = 1, nfolds = 10, standardize = TRUE)
cvfit$lambda.min; cvfit$lambda.1se

# non-zero coefficients at the more parsimonious lambda.1se
coef_1se <- coef(cvfit, s = "lambda.1se")
keep <- which(as.numeric(coef_1se) != 0)
lasso_keep <- rownames(coef_1se)[keep]
print(lasso_keep)

# predicted probs and AUC
pred <- as.numeric(predict(cvfit, newx = x, s = "lambda.1se", type = "response"))
auc(cvfit_roc <- roc(y, pred, quiet=TRUE))
ci.auc(cvfit_roc)

# Check coefficients at lambda.min
coef_min <- coef(cvfit, s = "lambda.min")
print(coef_min)


[1] "(Intercept)"  "mip_pre_max"  "mip_post_max"
13 x 1 sparse Matrix of class "dgCMatrix"
                 lambda.min
(Intercept)      5.25944715
mip_pre_max     -0.04614875
smip_pre_max     .         
pif_pre_max      .         
sindex_pre_max   .         
volume_pre_max   .         
fit_pre_max      .         
mip_post_max    -0.01717136
smip_post_max    .         
pif_post_max     .         
sindex_post_max  .         
volume_post_max  .         
fit_post_max     .         


# Intervention Responsiveness

In [22]:
# Get all variable names that end with "_pre"
pre_vars <- grep("_pre$", names(df_rct_wide), value = TRUE)

resp_results <- data.frame()

for (pre_col in pre_vars) {
  # corresponding post column
  post_col <- sub("_pre$", "_post", pre_col)
  
  if (post_col %in% names(df_rct_wide)) {
    # compute paired Cohen's d
    d_val <- cohens_d(df_rct_wide[[post_col]], df_rct_wide[[pre_col]], paired = TRUE)$Cohens_d
    
    # clean variable name for display
    var_name <- sub("_pre$", "", pre_col)
    
    resp_results <- rbind(resp_results,
                          data.frame(Variable = var_name,
                                     Responsiveness_d = d_val))
  }
}

print(resp_results)


For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more options.
For paired samples, 'repeated_measures_d()' provides more opti



                       Variable Responsiveness_d
1                   fmd_percent     0.7508984545
2         fmd_diameter_baseline     0.0098471710
3          fmd_diameter_maximum     0.1436378964
4            fmd_shear_baseline     0.2645379954
5             fmd_shear_maximum     0.1225486725
6         fmd_velocity_baseline     0.3631545859
7          fmd_velocity_maximum     0.2883041109
8       fmd_shear_areatomaximum     0.1873395054
9                 fmd_shear_auc    -0.1868607981
10             cpet_sbp_sitting    -0.1958397687
11             cpet_dbp_sitting    -0.1832146524
12        cpet_vo2peak_relative     0.2715451447
13        cpet_vo2max_predicted    -0.2832324977
14 cpet_vo2max_percentpredicted     0.2998120087
15        cpet_vo2peak_absolute     0.2475695868
16                     cpet_rer     0.6711071934
17                 cpet_o2pulse     0.1820428375
18                 cpet_vt_peak     0.1877844029
19                 cpet_rr_peak     0.4466161231
20                 c