Merge pull request #34 from nhejazi/biocparallel-future-dev

Parallelization via future and BiocParallel
nhejazi · Aug 10, 2017 · f494b63 · f494b63
2 parents da26e25 + a5e80ff
commit f494b63
Show file tree

Hide file tree

Showing 9 changed files with 124 additions and 96 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,7 +1,7 @@
 branches:
   only:
   - master
-  - develop
+  - /.*/-dev
 
 env:
   global:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,56 +2,56 @@ Package: biotmle
 Title: Targeted Learning for Biomarker Discovery with Moderated Statistics
 Version: 1.1.0
 Authors@R: c(
-    person("Nima", "Hejazi", email = "nhejazi@berkeley.edu",
-           role = c("aut", "cre", "cph")),
-    person("Alan", "Hubbard", email = "hubbard@berkeley.edu", role = "aut"),
-    person("Weixin", "Cai", email = "wcai@berkeley.edu", role = "ctb")
-    )
+  person("Nima", "Hejazi", email = "nhejazi@berkeley.edu",
+         role = c("aut", "cre", "cph")),
+  person("Alan", "Hubbard", email = "hubbard@berkeley.edu", role = "aut"),
+  person("Weixin", "Cai", email = "wcai@berkeley.edu", role = "ctb")
+  )
 Author: Nima Hejazi [aut, cre, cph]
 Maintainer: Nima Hejazi <nhejazi@berkeley.edu>
 Description: This package facilitates the discovery of biomarkers from
-    biological sequencing data (e.g., microarrays, RNA-seq) based on the
-    associations of potential biomarkers with exposure and outcome variables by
-    implementing an estimation procedure that combines a generalization of
-    moderated statistics with asymptotically linear statistical parameters
-    estimated via targeted minimum loss-based estimation (TMLE).
+  biological sequencing data (e.g., microarrays, RNA-seq) based on the
+  associations of potential biomarkers with exposure and outcome variables by
+  implementing an estimation procedure that combines a generalization of
+  moderated statistics with asymptotically linear statistical parameters
+  estimated via targeted minimum loss-based estimation (TMLE).
 Depends:
-    R (>= 3.3)
+  R (>= 3.3)
 License: file LICENSE
 URL: https://github.com/nhejazi/biotmle
 BugReports: https://github.com/nhejazi/biotmle/issues
 Encoding: UTF-8
 LazyData: true
 Imports:
-    tmle,
-    limma,
-    foreach,
-    parallel,
-    doParallel,
-    ggplot2,
-    wesanderson,
-    magrittr,
-    stats,
-    Matrix,
-    methods,
-    DBI,
-    dplyr,
-    SummarizedExperiment,
-    superheat,
-    SuperLearner,
-    biotmleData
+  dplyr,
+  magrittr,
+  ggplot2,
+  superheat,
+  wesanderson,
+  doFuture,
+  future,
+  stats,
+  Matrix,
+  methods,
+  DBI,
+  limma,
+  BiocParallel,
+  SummarizedExperiment,
+  biotmleData,
+  SuperLearner,
+  tmle
 Suggests:
-    testthat,
-    knitr,
-    rmarkdown,
-    BiocStyle
+  testthat,
+  knitr,
+  rmarkdown,
+  BiocStyle
 Remotes:
-    nhejazi/biotmleData
+  nhejazi/biotmleData
 VignetteBuilder: knitr
 RoxygenNote: 6.0.1
 biocViews:
-    GeneExpression,
-    DifferentialExpression,
-    Sequencing,
-    Microarray,
-    RNASeq
+  GeneExpression,
+  DifferentialExpression,
+  Sequencing,
+  Microarray,
+  RNASeq
diff --git a/NAMESPACE b/NAMESPACE
@@ -11,18 +11,22 @@ exportClasses(bioTMLE)
 exportClasses(data.frame_OR_EList)
 importClassesFrom(SummarizedExperiment,SummarizedExperiment)
 importClassesFrom(limma,EList)
+importFrom(BiocParallel,DoparParam)
+importFrom(BiocParallel,bplapply)
+importFrom(BiocParallel,register)
 importFrom(SummarizedExperiment,SummarizedExperiment)
 importFrom(SummarizedExperiment,assay)
 importFrom(SummarizedExperiment,colData)
 importFrom(SummarizedExperiment,rowData)
-importFrom(doParallel,registerDoParallel)
+importFrom(doFuture,registerDoFuture)
 importFrom(dplyr,arrange)
 importFrom(dplyr,filter)
 importFrom(dplyr,mutate)
 importFrom(dplyr,select)
 importFrom(dplyr,slice)
-importFrom(foreach,"%dopar%")
-importFrom(foreach,foreach)
+importFrom(future,multiprocess)
+importFrom(future,plan)
+importFrom(future,sequential)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,geom_histogram)
 importFrom(ggplot2,geom_point)
@@ -42,7 +46,6 @@ importFrom(limma,voomWithQualityWeights)
 importFrom(magrittr,"%>%")
 importFrom(methods,setClass)
 importFrom(methods,setClassUnion)
-importFrom(parallel,detectCores)
 importFrom(stats,quantile)
 importFrom(superheat,superheat)
 importFrom(tmle,tmle)

diff --git a/R/biomarkertmle.R b/R/biomarkertmle.R
@@ -16,8 +16,18 @@ utils::globalVariables(c("gene","assay<-"))
 #'        next-generation sequencing (NGS) experiment (e.g., RNA-seq). The
 #'        default setting assumes continuous expression measures as generated by
 #'        microarray-type platforms.
-#' @param parallel (logical, numeric) - whether to use or the number of cores to
-#'        be used when the TMLE-based estimation procedure is parallelized.
+#' @param parallel (logical) - whether or not to use parallelization in the
+#'        estimation procedure. Invoking parallelization happens through a
+#'        combination of calls to \code{future} and \code{BiocParallel}. If this
+#'        argument is set to \code{TRUE}, \code{future::multiprocess} is used,
+#'        and if \code{FALSE}, \code{future::sequential} is used, alongside
+#'        \code{BiocParallel::bplapply}. Other options for evaluation through
+#'        futures may be invoked by setting the argument \code{future_param}.
+#' @param future_param (character) - name of the \code{future} strategy to use
+#'        when \code{parallel = TRUE} (ignored otherwise). For the available
+#'        strategies, please consult the documentation for \code{future::plan}.
+#'        The default (\code{NULL}) silently invokes
+#'        \code{future::multiprocess}. Be careful if changing this setting.
 #' @param family (character) - specification of error family: "binomial" or
 #'        "gaussian".
 #' @param subj_ids (numeric vector) - subject IDs to be passed directly to
@@ -29,10 +39,10 @@ utils::globalVariables(c("gene","assay<-"))
 #' @param Q_lib (char vector) - library of learning algorithms to be used in
 #'        fitting the "Q" step of the standard TMLE procedure.
 #'
-#' @importFrom parallel detectCores
-#' @importFrom doParallel registerDoParallel
-#' @importFrom foreach foreach "%dopar%"
 #' @importFrom SummarizedExperiment assay colData rowData SummarizedExperiment
+#' @importFrom BiocParallel register DoparParam bplapply
+#' @importFrom future plan multiprocess sequential
+#' @importFrom doFuture registerDoFuture
 #'
 #' @return S4 object of class \code{biotmle}, generated by sub-classing
 #'         \code{SummarizedExperiment}, with additional slots containing
@@ -57,18 +67,19 @@ utils::globalVariables(c("gene","assay<-"))
 #'
 #' varInt_index <- which(names(colData(illuminaData)) %in% "benzene")
 #'
-#' biomarkerTMLEout <- biomarkertmle(se = illuminaData[1, ],
+#' biomarkerTMLEout <- biomarkertmle(se = illuminaData[1:2, ],
 #'                                   varInt = varInt_index,
-#'                                   parallel = 1,
+#'                                   parallel = FALSE,
 #'                                   family = "gaussian",
-#'                                   g_lib = c("SL.mean"),
-#'                                   Q_lib = c("SL.mean")
+#'                                   g_lib = c("SL.mean", "SL.glm"),
+#'                                   Q_lib = "SL.mean"
 #'                                  )
 #'
 biomarkertmle <- function(se,
                           varInt,
                           ngscounts = FALSE,
                           parallel = TRUE,
+                          future_param = NULL,
                           family = "gaussian",
                           subj_ids = NULL,
                           g_lib = c("SL.glm", "SL.randomForest", "SL.nnet",
@@ -108,19 +119,20 @@ biomarkertmle <- function(se,
   #=============================================================================
   # set up parallelization based on input
   # ============================================================================
-  if (class(parallel) == "numeric") doParallel::registerDoParallel(parallel)
-  if (class(parallel) == "logical") {
-    nCores <- parallel::detectCores()
-    if (nCores > 1) {
-      doParallel::registerDoParallel(nCores)
+  doFuture::registerDoFuture()
+  if (parallel == TRUE) {
+    if (!is.null(future_param)) {
+      future::plan(eval(paste0("future::", future_param)))
     } else {
-      warning("option 'parallel' is set to TRUE but only 1 core detected.")
-    }
-    if (parallel == FALSE) {
-      stop("parallelization set to FALSE: the estimation procedure will not
-           run to completion in any sort of timely fashion.")
+      future::plan(future::multiprocess)
     }
+  } else if (parallel == FALSE) {
+    warning(paste("Sequential evaluation is strongly discouraged.",
+                  "\n Proceed with caution."))
+    future::plan(future::sequential)
   }
+  BiocParallel::register(BiocParallel::DoparParam(), default = TRUE)
+
   #=============================================================================
   # TMLE procedure to identify biomarkers based on an EXPOSURE
   # ============================================================================
@@ -146,21 +158,24 @@ biomarkertmle <- function(se,
     W <- as.numeric(rep(1, length(A)))
   }
 
-  # perform multi-level TMLE-based estimation for genes as Y
-  biomarkerTMLEout <- foreach::foreach(gene = seq_len(ncol(Y)),
-                                       .combine = cbind) %dopar% {
-    print(paste("Estimating target parameter for", gene, "of", ncol(Y)))
-    out <- biomarkerTMLE_exposure(Y = Y[, gene],
-                                  W = W,
-                                  A = A,
-                                  a = unique(A),
-                                  g_lib = g_lib,
-                                  Q_lib = Q_lib,
-                                  family = family,
-                                  subj_ids = subj_ids
-                                 )
+  # coerce matrix of baseline covariates to numeric
+  if (!all(unique(sapply(W, class)) == "numeric")) {
+    W <- as.data.frame(sapply(W, as.numeric))
   }
 
+  # perform multi-level TMLE (of the ATE) for genes as Y
+  biomarkerTMLEout <- BiocParallel::bplapply(Y[, seq_along(Y)],
+                                             biomarkerTMLE_exposure,
+                                             W = W,
+                                             A = A,
+                                             a = unique(A),
+                                             g_lib = g_lib,
+                                             Q_lib = Q_lib,
+                                             family = family,
+                                             subj_ids = subj_ids
+                                            )
+  biomarkerTMLEout <- do.call(cbind.data.frame, biomarkerTMLEout)
+
   if (ngscounts) {
     voom_out$E <- t(biomarkerTMLEout)
     biotmle@tmleOut <- voom_out

diff --git a/appveyor.yml b/appveyor.yml
@@ -17,19 +17,18 @@ cache:
 branches:
   only:
     - master
-    - develop
+    - /*/-dev
 
 environment:
   global:
     WARNINGS_ARE_ERRORS: 0
     USE_RTOOLS: true
-    GITHUB_PAT:
-      secure: NOkWY7zLKevHO11BcijFIDC95MS6o0J+Oh1rCsnUFEmjIR26uW9JKy/HMwasCi7x
 
 build_script:
   - travis-tool.sh install_deps
   - travis-tool.sh install_github nhejazi/biotmleData
   - travis-tool.sh install_bioc SummarizedExperiment
+  - travis-tool.sh install_bioc BiocParallel
   - travis-tool.sh install_bioc BiocStyle
   - travis-tool.sh install_bioc limma
   - travis-tool.sh install_bioc_deps

diff --git a/man/biomarkertmle.Rd b/man/biomarkertmle.Rd
diff --git a/tests/testthat/test_biomarkertmle.R b/tests/testthat/test_biomarkertmle.R
@@ -18,12 +18,12 @@ colData(illuminaData) <- colData(illuminaData) %>%
 
 varInt_index <- which(names(colData(illuminaData)) %in% "benzene")
 
-biomarkerTMLEout <- biomarkertmle(se = illuminaData[1, ],
+biomarkerTMLEout <- biomarkertmle(se = illuminaData[1:2, ],
                                   varInt = varInt_index,
-                                  parallel = 1,
+                                  parallel = FALSE,
                                   family = "gaussian",
-                                  g_lib = c("SL.mean"),
-                                  Q_lib = c("SL.mean")
+                                  g_lib = c("SL.mean", "SL.glm"),
+                                  Q_lib = "SL.mean"
                                  )
 
 ################################################################################
@@ -39,7 +39,7 @@ test_that("biomarkertmle object is of appropriate custom class", {
 })
 
 test_that("biomarkertmle output is consistent using example data", {
-  expect_equal(assay(biomarkerTMLEout)[, c(17, 83, 117)],
+  expect_equal(assay(biomarkerTMLEout)[1, c(17, 83, 117)],
                c(360.7073, 375.9316, 319.3649))
 })
 

diff --git a/tests/testthat/test_biomarkertmle_exposure.R → tests/testthat/test_exposure_biomarkertmle.R b/tests/testthat/test_biomarkertmle_exposure.R → tests/testthat/test_exposure_biomarkertmle.R
diff --git a/tests/testthat/test_modtest_ic.R b/tests/testthat/test_modtest_ic.R
@@ -17,12 +17,12 @@ colData(illuminaData) <- colData(illuminaData) %>%
 
 varInt_index <- which(names(colData(illuminaData)) %in% "benzene")
 
-biomarkerTMLEout <- biomarkertmle(se = illuminaData[1, ],
+biomarkerTMLEout <- biomarkertmle(se = illuminaData[1:2, ],
                                   varInt = varInt_index,
-                                  parallel = 1,
+                                  parallel = FALSE,
                                   family = "gaussian",
-                                  g_lib = c("SL.mean"),
-                                  Q_lib = c("SL.mean")
+                                  g_lib = c("SL.mean", "SL.glm"),
+                                  Q_lib = "SL.mean"
                                  )
 
 limmaTMLEout <- modtest_ic(biotmle = biomarkerTMLEout)