Skip to content

Commit

Permalink
Merge pull request #34 from nhejazi/biocparallel-future-dev
Browse files Browse the repository at this point in the history
Parallelization via future and BiocParallel
  • Loading branch information
nhejazi authored Aug 10, 2017
2 parents da26e25 + a5e80ff commit f494b63
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 96 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
branches:
only:
- master
- develop
- /.*/-dev

env:
global:
Expand Down
76 changes: 38 additions & 38 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,56 +2,56 @@ Package: biotmle
Title: Targeted Learning for Biomarker Discovery with Moderated Statistics
Version: 1.1.0
Authors@R: c(
person("Nima", "Hejazi", email = "nhejazi@berkeley.edu",
role = c("aut", "cre", "cph")),
person("Alan", "Hubbard", email = "hubbard@berkeley.edu", role = "aut"),
person("Weixin", "Cai", email = "wcai@berkeley.edu", role = "ctb")
)
person("Nima", "Hejazi", email = "nhejazi@berkeley.edu",
role = c("aut", "cre", "cph")),
person("Alan", "Hubbard", email = "hubbard@berkeley.edu", role = "aut"),
person("Weixin", "Cai", email = "wcai@berkeley.edu", role = "ctb")
)
Author: Nima Hejazi [aut, cre, cph]
Maintainer: Nima Hejazi <nhejazi@berkeley.edu>
Description: This package facilitates the discovery of biomarkers from
biological sequencing data (e.g., microarrays, RNA-seq) based on the
associations of potential biomarkers with exposure and outcome variables by
implementing an estimation procedure that combines a generalization of
moderated statistics with asymptotically linear statistical parameters
estimated via targeted minimum loss-based estimation (TMLE).
biological sequencing data (e.g., microarrays, RNA-seq) based on the
associations of potential biomarkers with exposure and outcome variables by
implementing an estimation procedure that combines a generalization of
moderated statistics with asymptotically linear statistical parameters
estimated via targeted minimum loss-based estimation (TMLE).
Depends:
R (>= 3.3)
R (>= 3.3)
License: file LICENSE
URL: https://github.com/nhejazi/biotmle
BugReports: https://github.com/nhejazi/biotmle/issues
Encoding: UTF-8
LazyData: true
Imports:
tmle,
limma,
foreach,
parallel,
doParallel,
ggplot2,
wesanderson,
magrittr,
stats,
Matrix,
methods,
DBI,
dplyr,
SummarizedExperiment,
superheat,
SuperLearner,
biotmleData
dplyr,
magrittr,
ggplot2,
superheat,
wesanderson,
doFuture,
future,
stats,
Matrix,
methods,
DBI,
limma,
BiocParallel,
SummarizedExperiment,
biotmleData,
SuperLearner,
tmle
Suggests:
testthat,
knitr,
rmarkdown,
BiocStyle
testthat,
knitr,
rmarkdown,
BiocStyle
Remotes:
nhejazi/biotmleData
nhejazi/biotmleData
VignetteBuilder: knitr
RoxygenNote: 6.0.1
biocViews:
GeneExpression,
DifferentialExpression,
Sequencing,
Microarray,
RNASeq
GeneExpression,
DifferentialExpression,
Sequencing,
Microarray,
RNASeq
11 changes: 7 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,22 @@ exportClasses(bioTMLE)
exportClasses(data.frame_OR_EList)
importClassesFrom(SummarizedExperiment,SummarizedExperiment)
importClassesFrom(limma,EList)
importFrom(BiocParallel,DoparParam)
importFrom(BiocParallel,bplapply)
importFrom(BiocParallel,register)
importFrom(SummarizedExperiment,SummarizedExperiment)
importFrom(SummarizedExperiment,assay)
importFrom(SummarizedExperiment,colData)
importFrom(SummarizedExperiment,rowData)
importFrom(doParallel,registerDoParallel)
importFrom(doFuture,registerDoFuture)
importFrom(dplyr,arrange)
importFrom(dplyr,filter)
importFrom(dplyr,mutate)
importFrom(dplyr,select)
importFrom(dplyr,slice)
importFrom(foreach,"%dopar%")
importFrom(foreach,foreach)
importFrom(future,multiprocess)
importFrom(future,plan)
importFrom(future,sequential)
importFrom(ggplot2,aes)
importFrom(ggplot2,geom_histogram)
importFrom(ggplot2,geom_point)
Expand All @@ -42,7 +46,6 @@ importFrom(limma,voomWithQualityWeights)
importFrom(magrittr,"%>%")
importFrom(methods,setClass)
importFrom(methods,setClassUnion)
importFrom(parallel,detectCores)
importFrom(stats,quantile)
importFrom(superheat,superheat)
importFrom(tmle,tmle)
Expand Down
79 changes: 47 additions & 32 deletions R/biomarkertmle.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,18 @@ utils::globalVariables(c("gene","assay<-"))
#' next-generation sequencing (NGS) experiment (e.g., RNA-seq). The
#' default setting assumes continuous expression measures as generated by
#' microarray-type platforms.
#' @param parallel (logical, numeric) - whether to use or the number of cores to
#' be used when the TMLE-based estimation procedure is parallelized.
#' @param parallel (logical) - whether to parallelize the estimation
#' procedure. Evaluation is always dispatched through
#' \code{BiocParallel::bplapply}, with the backend chosen via \code{future}:
#' if \code{TRUE}, the plan is set to \code{future::multiprocess} (unless
#' \code{future_param} names another plan); if \code{FALSE}, the plan is set to
#' \code{future::sequential} and a warning is issued.
#' @param future_param (character) - name of the \code{future} plan to use
#' when \code{parallel = TRUE}, given without the \code{future::} prefix
#' (e.g., \code{"multisession"}); it is ignored when \code{parallel = FALSE}.
#' For a list of the available types, please consult the documentation for
#' \code{future::plan}. The default (\code{NULL}) silently invokes
#' \code{future::multiprocess}. Be careful if changing this setting.
#' @param family (character) - specification of error family: "binomial" or
#' "gaussian".
#' @param subj_ids (numeric vector) - subject IDs to be passed directly to
Expand All @@ -29,10 +39,10 @@ utils::globalVariables(c("gene","assay<-"))
#' @param Q_lib (char vector) - library of learning algorithms to be used in
#' fitting the "Q" step of the standard TMLE procedure.
#'
#' @importFrom parallel detectCores
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach "%dopar%"
#' @importFrom SummarizedExperiment assay colData rowData SummarizedExperiment
#' @importFrom BiocParallel register DoparParam bplapply
#' @importFrom future plan multiprocess sequential
#' @importFrom doFuture registerDoFuture
#'
#' @return S4 object of class \code{biotmle}, generated by sub-classing
#' \code{SummarizedExperiment}, with additional slots containing
Expand All @@ -57,18 +67,19 @@ utils::globalVariables(c("gene","assay<-"))
#'
#' varInt_index <- which(names(colData(illuminaData)) %in% "benzene")
#'
#' biomarkerTMLEout <- biomarkertmle(se = illuminaData[1, ],
#' biomarkerTMLEout <- biomarkertmle(se = illuminaData[1:2, ],
#' varInt = varInt_index,
#' parallel = 1,
#' parallel = FALSE,
#' family = "gaussian",
#' g_lib = c("SL.mean"),
#' Q_lib = c("SL.mean")
#' g_lib = c("SL.mean", "SL.glm"),
#' Q_lib = "SL.mean"
#' )
#'
biomarkertmle <- function(se,
varInt,
ngscounts = FALSE,
parallel = TRUE,
future_param = NULL,
family = "gaussian",
subj_ids = NULL,
g_lib = c("SL.glm", "SL.randomForest", "SL.nnet",
Expand Down Expand Up @@ -108,19 +119,20 @@ biomarkertmle <- function(se,
#=============================================================================
# set up parallelization based on input
# ============================================================================
if (class(parallel) == "numeric") doParallel::registerDoParallel(parallel)
if (class(parallel) == "logical") {
nCores <- parallel::detectCores()
if (nCores > 1) {
doParallel::registerDoParallel(nCores)
doFuture::registerDoFuture()
if (parallel == TRUE) {
if (!is.null(future_param)) {
future::plan(eval(paste0("future::", future_param)))
} else {
warning("option 'parallel' is set to TRUE but only 1 core detected.")
}
if (parallel == FALSE) {
stop("parallelization set to FALSE: the estimation procedure will not
run to completion in any sort of timely fashion.")
future::plan(future::multiprocess)
}
} else if (parallel == FALSE) {
warning(paste("Sequential evaluation is strongly discouraged.",
"\n Proceed with caution."))
future::plan(future::sequential)
}
BiocParallel::register(BiocParallel::DoparParam(), default = TRUE)

#=============================================================================
# TMLE procedure to identify biomarkers based on an EXPOSURE
# ============================================================================
Expand All @@ -146,21 +158,24 @@ biomarkertmle <- function(se,
W <- as.numeric(rep(1, length(A)))
}

# perform multi-level TMLE-based estimation for genes as Y
biomarkerTMLEout <- foreach::foreach(gene = seq_len(ncol(Y)),
.combine = cbind) %dopar% {
print(paste("Estimating target parameter for", gene, "of", ncol(Y)))
out <- biomarkerTMLE_exposure(Y = Y[, gene],
W = W,
A = A,
a = unique(A),
g_lib = g_lib,
Q_lib = Q_lib,
family = family,
subj_ids = subj_ids
)
# coerce matrix of baseline covariates to numeric
if (!all(unique(sapply(W, class)) == "numeric")) {
W <- as.data.frame(sapply(W, as.numeric))
}

# perform multi-level TMLE (of the ATE) for genes as Y
biomarkerTMLEout <- BiocParallel::bplapply(Y[, seq_along(Y)],
biomarkerTMLE_exposure,
W = W,
A = A,
a = unique(A),
g_lib = g_lib,
Q_lib = Q_lib,
family = family,
subj_ids = subj_ids
)
biomarkerTMLEout <- do.call(cbind.data.frame, biomarkerTMLEout)

if (ngscounts) {
voom_out$E <- t(biomarkerTMLEout)
biotmle@tmleOut <- voom_out
Expand Down
5 changes: 2 additions & 3 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,18 @@ cache:
branches:
only:
- master
- develop
- /*/-dev

environment:
global:
WARNINGS_ARE_ERRORS: 0
USE_RTOOLS: true
GITHUB_PAT:
secure: NOkWY7zLKevHO11BcijFIDC95MS6o0J+Oh1rCsnUFEmjIR26uW9JKy/HMwasCi7x

build_script:
- travis-tool.sh install_deps
- travis-tool.sh install_github nhejazi/biotmleData
- travis-tool.sh install_bioc SummarizedExperiment
- travis-tool.sh install_bioc BiocParallel
- travis-tool.sh install_bioc BiocStyle
- travis-tool.sh install_bioc limma
- travis-tool.sh install_bioc_deps
Expand Down
29 changes: 20 additions & 9 deletions man/biomarkertmle.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions tests/testthat/test_biomarkertmle.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ colData(illuminaData) <- colData(illuminaData) %>%

varInt_index <- which(names(colData(illuminaData)) %in% "benzene")

biomarkerTMLEout <- biomarkertmle(se = illuminaData[1, ],
biomarkerTMLEout <- biomarkertmle(se = illuminaData[1:2, ],
varInt = varInt_index,
parallel = 1,
parallel = FALSE,
family = "gaussian",
g_lib = c("SL.mean"),
Q_lib = c("SL.mean")
g_lib = c("SL.mean", "SL.glm"),
Q_lib = "SL.mean"
)

################################################################################
Expand All @@ -39,7 +39,7 @@ test_that("biomarkertmle object is of appropriate custom class", {
})

test_that("biomarkertmle output is consistent using example data", {
expect_equal(assay(biomarkerTMLEout)[, c(17, 83, 117)],
expect_equal(assay(biomarkerTMLEout)[1, c(17, 83, 117)],
c(360.7073, 375.9316, 319.3649))
})

Expand Down
8 changes: 4 additions & 4 deletions tests/testthat/test_modtest_ic.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ colData(illuminaData) <- colData(illuminaData) %>%

varInt_index <- which(names(colData(illuminaData)) %in% "benzene")

biomarkerTMLEout <- biomarkertmle(se = illuminaData[1, ],
biomarkerTMLEout <- biomarkertmle(se = illuminaData[1:2, ],
varInt = varInt_index,
parallel = 1,
parallel = FALSE,
family = "gaussian",
g_lib = c("SL.mean"),
Q_lib = c("SL.mean")
g_lib = c("SL.mean", "SL.glm"),
Q_lib = "SL.mean"
)

limmaTMLEout <- modtest_ic(biotmle = biomarkerTMLEout)
Expand Down

0 comments on commit f494b63

Please sign in to comment.