In [4]:
library(stringr)
library(readr)
library(tibble)
library(psychometric)
library(dplyr)

In [5]:
#' Fit a generalized linear model from a formula supplied as a string.
#'
#' @param data_df Data handed to `glm()` as its `data` argument.
#' @param formula_str Model formula as a character string; it is converted
#'   with `stats::as.formula()`.
#' @param family Error family, forwarded unchanged to `glm()`.
#' @return The fitted model object returned by `glm()`.
fit_glm <- function(data_df, formula_str, family) {
  # Arguments are passed by name; glm() matches them to the same formals.
  glm(
    data = data_df,
    family = family,
    formula = stats::as.formula(formula_str)
  )
}

In [6]:
#' Turn the coefficient table of a fitted model into a data frame.
#'
#' @param fit A fitted model whose `summary()` carries a `coefficients`
#'   matrix with four columns (as `summary()` of a `glm` fit does).
#' @return A data.frame with one row per coefficient and the columns
#'   `variable` (coefficient name), `estimate`, `SE`, `z_or_t_value`, `P`.
fit_to_df <- function(fit) {
  # Extract by exact name: `$coeff` only worked through partial matching.
  coef_mat <- summary(fit)[["coefficients"]]
  # The five column names assigned below assume name + four statistics.
  stopifnot(is.matrix(coef_mat), ncol(coef_mat) == 4L)

  fit_df <- data.frame(
    variable = rownames(coef_mat),
    coef_mat,
    row.names = NULL,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  colnames(fit_df) <- c("variable", "estimate", "SE", "z_or_t_value", "P")
  fit_df
}


In [7]:
#' Deviance-based R-squared of a fitted GLM.
#'
#' Computed as `1 - deviance / null.deviance`, with both quantities read
#' from `summary(glm_fit)`.
#'
#' @param glm_fit A fitted model whose summary exposes `deviance` and
#'   `null.deviance`.
#' @return The value `1 - deviance / null.deviance`.
glm_fit_to_R2 <- function(glm_fit) {
  fit_summary <- summary(glm_fit)
  residual_dev <- fit_summary[["deviance"]]
  null_dev <- fit_summary[["null.deviance"]]
  1 - residual_dev / null_dev
}

In [8]:
#' Build a regression formula string with an explicit intercept.
#'
#' Every name is wrapped in `quote_char` so that non-syntactic column names
#' survive parsing. With no predictors the result is the intercept-only
#' formula `<response> ~ 1` rather than a dangling `+`.
#'
#' @param response Name of the response variable.
#' @param predictors Character vector of predictor names; may be empty or
#'   `NULL`.
#' @param quote_char String placed on both sides of each name.
#' @return A string of the form "`y` ~ 1 + `x1` + `x2`".
compose_regression_formula_str <- function(
    response, predictors, quote_char = "`"
) {
  quote_name <- function(name) paste0(quote_char, name, quote_char)

  rhs_terms <- "1"
  # Guard on length: paste0() would turn an empty vector into one "``" term.
  if (length(predictors) > 0) {
    rhs_terms <- c(rhs_terms, quote_name(predictors))
  }

  sprintf(
    "%s ~ %s",
    quote_name(response),
    paste(rhs_terms, collapse = " + ")
  )
}

In [9]:
#' R-squared of a Gaussian GLM with a confidence interval and a p-value.
#'
#' Fits `response ~ 1 + predictors` with `fit_glm()`, computes the
#' deviance-based R-squared with `glm_fit_to_R2()` and passes it to
#' `psychometric::CI.Rsq()` for the confidence interval.
#'
#' @param data Data frame containing the response and predictor columns.
#' @param response Name of the response column.
#' @param predictors Character vector of predictor column names.
#' @param level Confidence level forwarded to `CI.Rsq()`.
#' @return A one-row data frame with the columns `response`, `predictors`
#'   (names joined by "+"), `metric` (always "R2"), `eval`, `l_eval`,
#'   `u_eval`, `P` and `n`.
eval_R2_CI <- function(data, response, predictors, level = .95) {
  glm_fit <- fit_glm(
    data,
    compose_regression_formula_str(response, predictors),
    "gaussian"
  )

  # p-values of the coefficients whose (unquoted) name is a predictor
  predictor_p <- glm_fit %>%
    fit_to_df() %>%
    mutate(variable = str_replace_all(variable, "`", "")) %>%
    filter(variable %in% predictors) %>%
    pull(P)
  # we extract the smallest p-value across multiple predictors for now;
  # NA (not min()'s Inf-with-warning) when no coefficient row matches
  P_val <- if (length(predictor_p) > 0) min(predictor_p) else NA_real_

  # compute the r-squared value
  rsq <- glm_fit_to_R2(glm_fit)

  # Use the number of observations actually fitted: glm() may drop rows
  # with missing values, so nrow(data) can overstate the sample size.
  n_obs <- stats::nobs(glm_fit)

  # call psychometric::CI.Rsq() to compute confidence interval
  # https://rdrr.io/cran/psychometric/man/CI.Rsq.html
  # https://rdrr.io/cran/psychometric/src/R/CI.Rsq.R
  CI.Rsq(rsq, n = n_obs, k = length(predictors), level = level) %>%
    # format the resulting table
    dplyr::select(-SErsq) %>%
    mutate(
      metric = "R2",
      response = response,
      predictors = paste(predictors, collapse = "+"),
      P = P_val,
      n = n_obs
    ) %>%
    rename("eval" = "Rsq", "l_eval" = "LCL", "u_eval" = "UCL") %>%
    dplyr::select(response, predictors, metric, `eval`, l_eval, u_eval, P, n)
}

In [11]:
# For each phenotype ID: read its phenotype table and its `prs/best_<ID>.sscore`
# score table, join them on FID/IID, and evaluate the R2 of PHENO ~ SCORE1_AVG.
phenos <- c("INI1003063", "INI20030780", "INI30120")
results <- lapply(phenos, function(trait) {
  pheno_tbl <- read_tsv(
    paste0("/home/lucytian/data/5_SBayesRC/pheno/test_", trait, ".pheno.txt"),
    show_col_types = FALSE
  )

  # Keep only the ID and score columns; `#FID` is renamed to match the join key.
  score_tbl <- read_tsv(
    paste0("prs/best_", trait, ".sscore"),
    show_col_types = FALSE
  )
  score_tbl <- dplyr::select(
    score_tbl,
    all_of(c("#FID", "IID", "SCORE1_AVG"))
  )
  score_tbl <- dplyr::rename(score_tbl, FID = `#FID`)

  joined <- inner_join(pheno_tbl, score_tbl, by = c("FID", "IID"))

  r2_row <- eval_R2_CI(
    joined,
    response = "PHENO",
    predictors = c("SCORE1_AVG")
  )
  # Tag the row with the phenotype it belongs to.
  r2_row[["pheno_id"]] <- trait
  r2_row
})

In [14]:
# Stack the per-phenotype result rows held in the list `results` into a
# single table for display.
bind_rows(results)

response,predictors,metric,eval,l_eval,u_eval,P,n,pheno_id
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>
PHENO,SCORE1_AVG,R2,0.0002678754,-0.001679318,0.002215068,0.5914158,1078,INI1003063
PHENO,SCORE1_AVG,R2,0.0366149091,0.015184875,0.058044943,8.859983e-11,1130,INI20030780
PHENO,SCORE1_AVG,R2,0.0054496222,-0.002996711,0.013895956,0.01212565,1154,INI30120


In [15]:
# Same evaluation as for the `prs/best_<ID>.sscore` files, but reading the
# `prs/best_<ID>_exclude_APOE.sscore` score table for a single phenotype ID.
phenos <- c("INI20030780")
results <- lapply(phenos, function(trait) {
  pheno_tbl <- read_tsv(
    paste0("/home/lucytian/data/5_SBayesRC/pheno/test_", trait, ".pheno.txt"),
    show_col_types = FALSE
  )

  # Keep only the ID and score columns; `#FID` is renamed to match the join key.
  score_tbl <- read_tsv(
    paste0("prs/best_", trait, "_exclude_APOE.sscore"),
    show_col_types = FALSE
  )
  score_tbl <- dplyr::select(
    score_tbl,
    all_of(c("#FID", "IID", "SCORE1_AVG"))
  )
  score_tbl <- dplyr::rename(score_tbl, FID = `#FID`)

  joined <- inner_join(pheno_tbl, score_tbl, by = c("FID", "IID"))

  r2_row <- eval_R2_CI(
    joined,
    response = "PHENO",
    predictors = c("SCORE1_AVG")
  )
  # Tag the row with the phenotype it belongs to.
  r2_row[["pheno_id"]] <- trait
  r2_row
})

In [16]:
# Display `results`: the list returned by lapply(), with one element per
# entry of `phenos`.
results

response,predictors,metric,eval,l_eval,u_eval,P,n,pheno_id
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>
PHENO,SCORE1_AVG,R2,0.02183757,0.005033786,0.03864135,6.054855e-07,1130,INI20030780
