In [1]:
library(stringr)
library(readr)
library(tibble)
library(psychometric)
library(dplyr)

Loading required package: dplyr


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: multilevel

Loading required package: nlme


Attaching package: ‘nlme’


The following object is masked from ‘package:dplyr’:

    collapse


Loading required package: MASS


Attaching package: ‘MASS’


The following object is masked from ‘package:dplyr’:

    select


Loading required package: purrr



In [2]:
# MASS is attached after dplyr and masks `select` (see the load messages
# above); list every object named "select" to confirm which ones exist.
utils::getAnywhere("select")

3 differing objects matching ‘select’ were found
in the following places
  package:MASS
  package:dplyr
  namespace:dplyr
  namespace:tidyselect
  namespace:MASS
Use [] to view one of them

In [3]:
#' Fit a generalized linear model from a formula given as a string.
#'
#' @param data_df Data frame holding the response and predictor columns.
#' @param formula_str Model formula as a single string, e.g. "y ~ 1 + x".
#' @param family Error distribution / link, passed straight to `glm()`.
#' @return The fitted `glm` object.
fit_glm <- function(data_df, formula_str, family) {
    # The string is parsed into a formula here so callers can build it
    # programmatically (e.g. with back-quoted column names).
    glm(
        formula = stats::as.formula(formula_str),
        family = family,
        data = data_df
    )
}


In [4]:
#' Convert the coefficient table of a fitted model into a data frame.
#'
#' @param fit A fitted model whose `summary()` carries a four-column
#'   `coefficients` matrix (as `lm` / `glm` summaries do).
#' @return A data frame with one row per coefficient and the columns
#'   `variable`, `estimate`, `SE`, `z_or_t_value` and `P`.
fit_to_df <- function(fit) {
    # Extract by exact name: `$coeff` only worked through partial matching.
    coef_mat <- summary(fit)[["coefficients"]]
    fit_df <- data.frame(
        variable = rownames(coef_mat),
        coef_mat,
        row.names = NULL,
        check.names = FALSE,
        stringsAsFactors = FALSE
    )
    # The test-statistic column is a z or a t value depending on the family,
    # hence the neutral name.
    colnames(fit_df) <- c("variable", "estimate", "SE", "z_or_t_value", "P")
    fit_df
}


In [5]:
#' Deviance-based R-squared of a fitted GLM.
#'
#' @param glm_fit A fitted `glm` object.
#' @return `1 - deviance / null.deviance`, taken from the model summary.
glm_fit_to_R2 <- function(glm_fit) {
    fit_summary <- summary(glm_fit)
    residual_dev <- fit_summary[["deviance"]]
    null_dev <- fit_summary[["null.deviance"]]
    1 - residual_dev / null_dev
}

In [6]:
#' Build a regression formula string with an explicit intercept.
#'
#' @param response Name of the response column.
#' @param predictors Character vector of predictor column names. May be
#'   empty, in which case an intercept-only formula is returned.
#' @param quote_char Character wrapped around every name so that
#'   non-syntactic column names survive formula parsing.
#' @return A single string such as "`y` ~ 1 + `a` + `b`".
compose_regression_formula_str <- function(
    response, predictors, quote_char="`"
) {
    quoted_response <- paste0(quote_char, response, quote_char)
    # Without predictors the old template produced a dangling "+", which
    # cannot be parsed as a formula; fall back to the intercept-only model.
    if (length(predictors) == 0) {
        return(sprintf("%s ~ 1", quoted_response))
    }
    # paste0() is vectorised, so no per-element apply is needed.
    quoted_terms <- paste0(quote_char, predictors, quote_char)
    sprintf(
        "%s ~ 1 + %s",
        quoted_response,
        paste(quoted_terms, collapse = " + ")
    )
}

In [7]:
#' R-squared of a Gaussian regression with a confidence interval.
#'
#' Fits `response ~ 1 + predictors` with a Gaussian GLM, computes the
#' deviance-based R-squared and passes it to `psychometric::CI.Rsq()`.
#'
#' @param data Data frame containing the response and predictor columns.
#' @param response Name of the response column.
#' @param predictors Character vector of predictor column names.
#' @param level Confidence level handed to `CI.Rsq()`.
#' @return A one-row data frame with the columns `response`, `predictors`,
#'   `metric`, `eval`, `l_eval`, `u_eval`, `P` and `n`. `n` is the number of
#'   observations actually used in the fit.
eval_R2_CI <- function(data, response, predictors, level=.95) {
    glm_fit <- fit_glm(
        data,
        compose_regression_formula_str(response, predictors),
        "gaussian"
    )
    # glm() drops rows with missing values by default, so the sample size
    # for the interval must come from the fit rather than from nrow(data).
    n_obs <- stats::nobs(glm_fit)
    # get the p-value
    # we extract the smallest p-value across multiple predictors for now
    P_val <- glm_fit %>%
        fit_to_df() %>%
        dplyr::mutate(variable = str_replace_all(variable, "`", "")) %>%
        dplyr::filter(variable %in% predictors) %>%
        dplyr::pull(P) %>%
        min()
    # compute the r-squared value
    rsq <- glm_fit_to_R2(glm_fit)
    # call psychometric::CI.Rsq() to compute confidence interval
    # https://rdrr.io/cran/psychometric/man/CI.Rsq.html
    # https://rdrr.io/cran/psychometric/src/R/CI.Rsq.R
    CI.Rsq(rsq, n = n_obs, k = length(predictors), level = level) %>%
        # format the resulting table; dplyr:: is spelled out because MASS
        # masks select()
        dplyr::select(-SErsq) %>%
        dplyr::mutate(
            metric = "R2",
            response = response,
            predictors = paste(predictors, collapse = "+"),
            P = P_val,
            n = n_obs
        ) %>%
        dplyr::rename("eval" = "Rsq", "l_eval" = "LCL", "u_eval" = "UCL") %>%
        dplyr::select(
            response, predictors, metric, `eval`, l_eval, u_eval, P, n
        )
}


In [18]:
# NOTE(review): every relative path in this notebook resolves against this
# working directory, so it must be set before any file is read.
setwd("/home/lucytian/data/4_polypred")

# Phenotype table and polygenic score table for one phenotype.
pheno <- "test_pheno/INI20030780.pheno.txt" %>% read_tsv()
prs <- "PRS/INI20030780_exclude_APOE.pred.prs" %>% read_tsv()

[1mRows: [22m[34m1130[39m [1mColumns: [22m[34m3[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[32mdbl[39m (3): FID, IID, PHENO

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m1213[39m [1mColumns: [22m[34m3[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[32mdbl[39m (3): FID, IID, PRS

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [19]:
# Keep only the individuals present in both tables, matched on FID and IID.
df <- inner_join(pheno, prs, by = c("FID", "IID"))

In [20]:
# R-squared (with confidence interval) of PHENO regressed on the PRS.
df %>% eval_R2_CI(response = "PHENO", predictors = "PRS")

response,predictors,metric,eval,l_eval,u_eval,P,n
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
PHENO,PRS,R2,0.01178495,-0.000686278,0.02425619,0.0002561644,1130


In [11]:
phenos <- c("INI1003063", "INI20030780", "INI30120", "INI50030700")

# Evaluate each phenotype against its own PRS file and stack the one-row
# summaries into a single table, tagged with the phenotype ID.
results <- bind_rows(lapply(phenos, function(pheno_id) {
    pheno_tbl <- read_tsv(
        paste0("test_pheno/", pheno_id, ".pheno.txt"),
        show_col_types = FALSE
    )
    prs_tbl <- read_tsv(
        paste0("PRS/", pheno_id, ".pred.prs"),
        show_col_types = FALSE
    )

    merged <- inner_join(pheno_tbl, prs_tbl, by = c("FID", "IID"))

    scored <- eval_R2_CI(merged, response = "PHENO", predictors = c("PRS"))
    scored[["pheno_id"]] <- pheno_id
    scored
}))

In [12]:
# Display the combined per-phenotype R2 table built from `phenos`.
results

response,predictors,metric,eval,l_eval,u_eval,P,n,pheno_id
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>
PHENO,PRS,R2,0.001468721,-0.00308525,0.006022691,0.2086498,1078,INI1003063
PHENO,PRS,R2,0.024756296,0.006918141,0.042594452,1.058424e-07,1130,INI20030780
PHENO,PRS,R2,0.001725589,-0.003045054,0.006496232,0.1584734,1154,INI30120
PHENO,PRS,R2,0.005557166,-0.003045596,0.014159928,0.01203659,1134,INI50030700
