In [1]:
library(stringr)
library(readr)
library(tibble)
library(psychometric)
library(dplyr)

Loading required package: dplyr


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: multilevel

Loading required package: nlme


Attaching package: ‘nlme’


The following object is masked from ‘package:dplyr’:

    collapse


Loading required package: MASS


Attaching package: ‘MASS’


The following object is masked from ‘package:dplyr’:

    select


Loading required package: purrr



In [2]:
#' Fit a generalized linear model from a formula given as text.
#'
#' @param data_df Data frame holding the variables named in the formula.
#' @param formula_str Model formula as a single string; it is parsed with
#'   `stats::as.formula()` before fitting.
#' @param family Error family, forwarded unchanged to `glm()`.
#' @return The fitted object returned by `glm()`.
fit_glm <- function(data_df, formula_str, family){
    glm(
        formula = stats::as.formula(formula_str),
        data = data_df,
        family = family
    )
}


In [3]:
#' Convert the coefficient table of a fitted model into a data frame.
#'
#' @param fit A fitted model whose `summary()` exposes a four-column
#'   `coefficients` matrix (e.g. a `glm` fit).
#' @return A data frame with one row per coefficient and the columns
#'   `variable`, `estimate`, `SE`, `z_or_t_value` and `P`; row names are
#'   reset to the default integer sequence.
fit_to_df <- function(fit){
    # Use the full element name: `$coeff` would only resolve through partial
    # matching of `coefficients`.
    coef_df <- as.data.frame(summary(fit)$coefficients)
    # Move the term names from the row names into a leading character column.
    fit_df <- data.frame(
        variable = rownames(coef_df),
        coef_df,
        check.names = FALSE,
        stringsAsFactors = FALSE
    )
    rownames(fit_df) <- NULL
    colnames(fit_df) <- c("variable", "estimate", "SE", "z_or_t_value", "P")
    fit_df
}


In [4]:
#' Deviance-based R-squared of a fitted GLM.
#'
#' @param glm_fit A fitted `glm` object.
#' @return `1 - deviance / null.deviance`, both taken from `summary(glm_fit)`.
glm_fit_to_R2 <- function(glm_fit) {
    fit_summary <- summary(glm_fit)
    residual_dev <- fit_summary$deviance
    null_dev <- fit_summary$null.deviance
    1 - residual_dev / null_dev
}

In [5]:
#' Build the text of a regression formula with quoted variable names.
#'
#' @param response Name of the response variable.
#' @param predictors Character vector of predictor names. When it is empty
#'   (or `NULL`) an intercept-only formula is produced.
#' @param quote_char Character wrapped around every variable name; the
#'   default backtick keeps non-syntactic column names usable in a formula.
#' @return A single string of the form
#'   "`response` ~ 1 + `p1` + `p2`", or "`response` ~ 1" without predictors.
compose_regression_formula_str <- function(
    response, predictors, quote_char="`"
) {
    wrap_in_quotes <- function(term) paste0(quote_char, term, quote_char)
    # Guard the empty case explicitly: paste0() recycles a zero-length
    # argument to "", which would otherwise add a bogus empty quoted term.
    quoted_predictors <- if (length(predictors) == 0) {
        character(0)
    } else {
        wrap_in_quotes(predictors)
    }
    # The explicit intercept is always the first right-hand-side term.
    rhs <- paste(c("1", quoted_predictors), collapse = " + ")
    sprintf("%s ~ %s", wrap_in_quotes(response), rhs)
}

In [6]:
#' R-squared of a Gaussian regression with its confidence interval.
#'
#' Fits `response ~ 1 + predictors` with `fit_glm()`, computes the
#' deviance-based R-squared and passes it to `psychometric::CI.Rsq()`.
#'
#' @param data Data frame containing the response and predictor columns.
#' @param response Name of the response column.
#' @param predictors Character vector of predictor column names.
#' @param level Confidence level forwarded to `CI.Rsq()`.
#' @return A one-row data frame with the columns `response`, `predictors`
#'   (names joined by "+"), `metric` (always "R2"), `eval`, `l_eval`,
#'   `u_eval`, `P` (smallest coefficient p-value among the predictors) and
#'   `n` (number of observations used in the fit).
eval_R2_CI <- function(data, response, predictors, level=.95) {
    glm_fit <- fit_glm(
        data,
        compose_regression_formula_str(response, predictors),
        "gaussian"
    )
    # Under R's default na.action, glm() drops rows with missing values, so
    # the sample size must come from the fit and not from nrow(data);
    # otherwise the interval would be computed for the wrong n.
    n_used <- stats::nobs(glm_fit)
    # get the p-value
    P_val <- glm_fit %>% fit_to_df() %>%
        # coefficient names of non-syntactic columns keep their backticks
        mutate(variable = str_replace_all(variable, "`", "")) %>%
        filter(variable %in% predictors) %>%
        pull(P) %>%
        # we extract the smallest p-values across multiple predictors for now
        min()
    # compute the r-squared value
    rsq <- glm_fit_to_R2(glm_fit)
    # call psychometric::CI.Rsq() to compute confidence interval
    # https://rdrr.io/cran/psychometric/man/CI.Rsq.html
    # https://rdrr.io/cran/psychometric/src/R/CI.Rsq.R
    CI.Rsq(rsq, n = n_used, k = length(predictors), level = level) %>%
        # format the resulting table
        # (dplyr:: prefix because MASS masks select())
        dplyr::select(-SErsq) %>%
        mutate(
            metric = "R2",
            response = response,
            predictors = paste(predictors, collapse = "+"),
            P = P_val,
            n = n_used
        ) %>%
        rename("eval"="Rsq", "l_eval"="LCL", "u_eval"="UCL") %>%
        dplyr::select(response, predictors, metric, `eval`, l_eval, u_eval, P, n)
}


In [9]:
# Evaluate the tuned PRS of every phenotype: R2 of PHENO ~ SCORE with its
# confidence interval, computed on individuals not listed in Afr_val.keep.
# All relative paths below resolve against this working directory.
setwd('/home/lucytian/data/5_SBayesRC')

phenos <- c("INI1003063", "INI20030780", "INI30120", "INI50030700")

# The keep-file does not depend on the phenotype, so read it once instead of
# once per iteration. (Presumably the IDs used for tuning/validation - TODO
# confirm.)
val_file <- 'Afr_val.keep'
val <- read_tsv(val_file, col_names = c("FID","IID"), show_col_types = FALSE)

results <- lapply(phenos, function(p) {
    pheno_file <- paste0("pheno/test_", p, ".pheno.txt")
    prs_file <- paste0("prs/tuned_afr_wb_", p, ".score.txt")
    pheno <- read_tsv(pheno_file, show_col_types = FALSE)
    prs <- read_tsv(prs_file, show_col_types = FALSE)

    # Drop every individual listed in the keep-file from the scores.
    prs_test <- prs %>%
      anti_join(val, by = c("FID","IID"))

    # Keep only individuals that have both a phenotype and a score.
    df <- pheno %>%
      inner_join(prs_test, by = c("FID", "IID"))

    out <- eval_R2_CI(df, response = "PHENO", predictors = c("SCORE"))
    out$pheno_id <- p
    out
}) %>%
  bind_rows()

In [28]:
# Save the `results` table as a tab-separated file; the path is relative to
# the current working directory.
# NOTE(review): the cell's execution count (28) is out of document order -
# confirm `results` held the intended table when this was run.
write_tsv(results, "prs/sbayesrc_results.tsv")

In [10]:
# Print the per-phenotype R2 table.
results

response,predictors,metric,eval,l_eval,u_eval,P,n,pheno_id
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>
PHENO,SCORE,R2,0.0001130869,-0.0011522785,0.001378452,0.7272713,1078,INI1003063
PHENO,SCORE,R2,0.1432530185,0.1055567439,0.180949293,8.398954e-40,1130,INI20030780
PHENO,SCORE,R2,0.0136304569,0.0003823721,0.026878542,7.027497e-05,1154,INI30120
PHENO,SCORE,R2,0.0072372872,-0.0025635857,0.01703816,0.004145929,1134,INI50030700


In [38]:
# Re-run the PRS evaluation for three phenotypes; INI20030780 reads its
# scores from the "exclude_APOE" score file, the others from the standard
# tuned score file.
# All relative paths below resolve against this working directory.
setwd('/home/lucytian/data/5_SBayesRC')

phenos <- c("INI20030780", "INI30120", "INI50030700")

# The keep-file does not depend on the phenotype, so read it once instead of
# once per iteration. (Presumably the IDs used for tuning/validation - TODO
# confirm.)
val_file <- 'Afr_val.keep'
val <- read_tsv(val_file, col_names = c("FID","IID"), show_col_types = FALSE)

results <- lapply(phenos, function(p) {
    pheno_file <- paste0("pheno/test_", p, ".pheno.txt")
    # NOTE(review): there is no separator between "exclude_APOE" and the
    # phenotype id in this file name - confirm it matches the file on disk.
    prs_file <- if (p == "INI20030780") {
        paste0("prs/tuned_afr_wb_exclude_APOE", p, ".score.txt")
    } else {
        paste0("prs/tuned_afr_wb_", p, ".score.txt")
    }
    pheno <- read_tsv(pheno_file, show_col_types = FALSE)
    prs <- read_tsv(prs_file, show_col_types = FALSE)

    # Drop every individual listed in the keep-file from the scores.
    prs_test <- prs %>%
      anti_join(val, by = c("FID","IID"))

    # Keep only individuals that have both a phenotype and a score.
    df <- pheno %>%
      inner_join(prs_test, by = c("FID", "IID"))

    out <- eval_R2_CI(df, response = "PHENO", predictors = c("SCORE"))
    out$pheno_id <- p
    out
}) %>%
  bind_rows()

In [39]:
# Print the R2 table of the re-run (INI20030780 scored from the exclude_APOE file).
results

response,predictors,metric,eval,l_eval,u_eval,P,n,pheno_id
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>
PHENO,SCORE,R2,0.024846097,0.006977263,0.04271493,1.003151e-07,1130,INI20030780
PHENO,SCORE,R2,0.013630457,0.0003823721,0.02687854,7.027497e-05,1154,INI30120
PHENO,SCORE,R2,0.007237287,-0.0025635857,0.01703816,0.004145929,1134,INI50030700


In [None]:
# Read the comparison table for display only; the result is not assigned.
read_tsv("~/data/4_polypred/final_comparison.tsv", show_col_types = FALSE)

In [None]:
### exclude the SNPs