pachterlab · pimentel · Nov 10, 2017 · Oct 23, 2017 · Oct 23, 2017 · Oct 23, 2017
diff --git a/R/matrix.R b/R/matrix.R
@@ -35,10 +35,18 @@ sleuth_to_matrix <- function(obj, which_df, which_units) {
   if ( !(which_df %in% c("obs_norm", "obs_raw")) ) {
     stop("Invalid object")
   }
-  if ( !(which_units %in% c("tpm", "est_counts")) ) {
+  if ( !(which_units %in% c("tpm", "est_counts", "scaled_reads_per_base")) ) {
     stop("Invalid units")
   }
 
+  which_units <- check_quant_mode(obj, which_units)
+
+  if (obj$gene_mode && which_df == "obs_raw") {
+    warning("This object is in gene mode, and the raw values are ",
+            "transcripts. Using 'obs_norm' instead.")
+    which_df <- "obs_norm"
+  }
+
   data <- as.data.frame(obj[[which_df]])
 
   res <- list()

diff --git a/R/model.R b/R/model.R
@@ -269,30 +269,37 @@ tests.sleuth <- function(obj, lrt = TRUE, wt = TRUE) {
 
 }
 
-#' Extract Wald test results from a sleuth object
+#' Extract Wald or Likelihood Ratio test results from a sleuth object
 #'
-#' This function extracts Wald test results from a sleuth object.
+#' This function extracts Wald or Likelihood Ratio test results from a sleuth object.
 #'
 #' @param obj a \code{sleuth} object
 #' @param test a character string denoting the test to extract. Possible tests can be found by using \code{models(obj)}.
-#' @param which_model a character string denoting the model. If extracting a wald test, use the model name. If extracting a likelihood ratio test, use 'lrt'.
+#' @param test_type 'wt' for Wald test or 'lrt' for Likelihood Ratio test.
+#' @param which_model a character string denoting the model. If extracting a wald test, use the model name.
+#'   Not used if extracting a likelihood ratio test.
 #' @param rename_cols if \code{TRUE} will rename some columns to be shorter and
-#' consistent with vignette
+#'   consistent with vignette
 #' @param show_all if \code{TRUE} will show all transcripts (not only the ones
 #' passing filters). The transcripts that do not pass filters will have
 #' \code{NA} values in most columns.
 #' @return a \code{data.frame} with the following columns:
 #' @return target_id: transcript name, e.g. "ENSXX#####" (dependent on the transcriptome used in kallisto)
+#' @return ...: if there is a target mapping data frame, all of the annotations columns are added from \code{obj$target_mapping} before the other columns.
 #' @return pval: p-value of the chosen model
 #' @return qval: false discovery rate adjusted p-value, using Benjamini-Hochberg (see \code{\link{p.adjust}})
-#' @return b: 'beta' value (effect size). Technically a biased estimator of the fold change
-#' @return se_b: standard error of the beta
+#' @return test_stat (LRT only): Chi-squared test statistic (likelihood ratio test). Only seen with Likelihood Ratio test results.
+#' @return rss (LRT only): the residual sum of squares under the "null model". Only seen with Likelihood Ratio test results.
+#' @return degrees_free (LRT only): the degrees of freedom (equal to difference between the two models). Only seen with Likelihood Ratio test results.
+#' @return b (Wald only): 'beta' value (effect size). Technically a biased estimator of the fold change. Only seen with Wald test results.
+#' @return se_b (Wald only): standard error of the beta. Only seen with Wald test results.
 #' @return mean_obs: mean of natural log counts of observations
 #' @return var_obs: variance of observation
-#' @return tech_var: technical variance of observation from the bootstraps
+#' @return tech_var: technical variance of observation from the bootstraps (named 'sigma_q_sq' if rename_cols is \code{FALSE})
 #' @return sigma_sq: raw estimator of the variance once the technical variance has been removed
 #' @return smooth_sigma_sq: smooth regression fit for the shrinkage estimation
 #' @return final_simga_sq: max(sigma_sq, smooth_sigma_sq); used for covariance estimation of beta
+#'   (named 'smooth_sigma_sq_pmax' if rename_cols is \code{FALSE})
 #' @seealso \code{\link{sleuth_wt}} and \code{\link{sleuth_lrt}} to compute tests, \code{\link{models}} to
 #' view which models, \code{\link{tests}} to view which tests were performed (and can be extracted)
 #' @examples
@@ -325,6 +332,20 @@ sleuth_results <- function(obj, test, test_type = 'wt',
   res <- NULL
   if (test_type == 'lrt') {
     res <- get_test(obj, test, type = 'lrt')
+    res <- dplyr::select(res,
+      target_id,
+      pval,
+      qval,
+      test_stat,
+      rss,
+      degrees_free,
+      mean_obs,
+      var_obs,
+      sigma_q_sq,
+      sigma_sq,
+      smooth_sigma_sq,
+      smooth_sigma_sq_pmax
+      )
   } else {
     res <- get_test(obj, test, 'wt', which_model)
     res <- dplyr::select(res,
@@ -360,8 +381,8 @@ sleuth_results <- function(obj, test, test_type = 'wt',
 
   if ( !is.null(obj$target_mapping) && !obj$gene_mode) {
     res <- dplyr::left_join(
-      data.table::as.data.table(res),
       data.table::as.data.table(obj$target_mapping),
+      data.table::as.data.table(res),
       by = 'target_id')
   }
 
@@ -375,9 +396,12 @@ sleuth_results <- function(obj, test, test_type = 'wt',
     # this line uses dplyr's "left_join" syntax for "by"
     # to match "target_id" from the "res" table,
     # and the gene_column from the target_mapping table.
-    res <- dplyr::left_join(data.table::as.data.table(res),
-                            data.table::as.data.table(target_mapping),
-                            by = c("target_id" = obj$gene_column))
+    by_col <- "target_id"
+    names(by_col) <- obj$gene_column
+    res <- dplyr::left_join(data.table::as.data.table(target_mapping),
+                            data.table::as.data.table(res),
+                            by = by_col)
+    names(res)[1] <- "target_id"
   }
 
   res <- as_df(res)

diff --git a/R/read_write.R b/R/read_write.R
@@ -221,7 +221,7 @@ get_kallisto_path <- function(path) {
       output$ext <- "tsv"
       output$path <- file.path(path, "abundance.tsv")
     } else {
-      stop(path, 'exists, but does not contain kallisto output (abundance.h5)')
+      stop(path, ' exists, but does not contain kallisto output (abundance.h5)')
     }
   } else if ( file.exists(path) ){
     # make an assumption that the user has kept the correct extension
@@ -234,7 +234,7 @@ get_kallisto_path <- function(path) {
     } else if (ext == 'tsv') {
       output$ext <- 'tsv'
     } else {
-      stop("'", path, "' exists, is not a recognized extension")
+      stop("'", path, "' exists, but does not have a recognized extension")
     }
     output$path <- path
   } else {

diff --git a/R/sleuth.R b/R/sleuth.R
@@ -237,6 +237,16 @@ sleuth_prep <- function(
 
   obs_raw <- dplyr::bind_rows(lapply(kal_list, function(k) k$abundance))
 
+  counts_test <- data.table::as.data.table(obs_raw)
+  counts_test <- counts_test[, total = .(total = sum(est_counts)), by = "sample"]
+  if (any(counts_test$total == 0)) {
+    zero_names <- counts_test$sample[which(counts_test$total == 0)]
+    formatted_names <- paste(zero_names, collapse = ", ")
+    warning("At least one sample have no reads aligned. ",
+            "Here are the samples with zero counts:\n",
+            formatted_names)
+  }
+
   design_matrix <- NULL
   if (is(full_model, 'formula')) {
     design_matrix <- model.matrix(full_model, sample_to_covariates)
@@ -249,6 +259,18 @@ sleuth_prep <- function(
 
   if (!is.null(full_model)) {
     rownames(design_matrix) <- sample_to_covariates$sample
+    # check if the resulting design_matrix is singular (i.e. non-invertible)
+    # followed the suggested method found here: https://stackoverflow.com/a/24962470 
+    M <- t(design_matrix) %*% design_matrix
+    det_mod <- determinant(M)$modulus
+    if(!is.finite(det_mod)) {
+      stop("The full model you provided seems to result in a singular design matrix. ",
+           "This frequently happens when one of the covariates is a linear ",
+           "combination of one or more other covariates (e.g. one covariate ",
+           "yields identical groupings as another covariate). Check your ",
+           "sample_to_covariates table and your full model.")
+    }
+    rm(M, det_mod)
   }
 
   obs_raw <- dplyr::arrange(obs_raw, target_id, sample)
@@ -259,7 +281,8 @@ sleuth_prep <- function(
   if (!is.null(target_mapping)) {
     tmp_names <- data.frame(target_id = kal_list[[1]]$abundance$target_id,
       stringsAsFactors = FALSE)
-    target_mapping <- check_target_mapping(tmp_names, target_mapping)
+    target_mapping <- check_target_mapping(tmp_names, target_mapping,
+                                           !is.null(aggregation_column))
     rm(tmp_names)
   }
 
@@ -457,7 +480,14 @@ sleuth_prep <- function(
     # if mclapply results in an error (a warning is shown), then print error and stop
     error_status <- sapply(bs_results, function(x) is(x, "try-error"))
     if (any(error_status)) {
-      print(attributes(bs_results[error_status])$condition)
+      error_msgs <- sapply(which(error_status), function(i) {
+        bad_run <- bs_results[[i]]
+        samp_name <- sample_to_covariates$sample[i]
+        trace <- .traceback(bad_run)
+        paste0("Sample '", samp_name, "' had this error message: ", trace[1])
+      })
+      formatted_error <- paste(error_msgs, collapse = "")
+      message(formatted_error)
       stop("At least one core from mclapply had an error. See the above error message(s) for more details.")
     }
 
@@ -519,11 +549,16 @@ check_kal_pack <- function(kal_list) {
 }
 
 # this function is mostly to deal with annoying ENSEMBL transcript names that
-# have a training .N to keep track of version number
-#
+# have a trailing .N to keep track of version number
+# 
+# this also checks to see if there are duplicate entries for any target IDs
+# and issues a warning if sleuth prep is in transcript mode, but stops if
+# sleuth prep is in gene mode, since duplicate entries creates problems when
+# doing the aggregation
+# 
 # @return the target_mapping if an intersection is found. a target_mapping that
 # matches \code{t_id} if no matching is found
-check_target_mapping <- function(t_id, target_mapping) {
+check_target_mapping <- function(t_id, target_mapping, gene_mode) {
   t_id <- data.table::as.data.table(t_id)
   target_mapping <- data.table::as.data.table(target_mapping)
 
@@ -557,6 +592,24 @@ check_target_mapping <- function(t_id, target_mapping) {
       ' please check obj$target_mapping to ensure this new mapping is correct.'))
   }
 
+  if(any(duplicated(target_mapping$target_id))) {
+    indices <- which(duplicated(target_mapping$target_id))
+    duplicated_ids <- target_mapping$target_id[indices]
+    formatted_ids <- paste(dupliated_ids, collapse = ", ")
+    if(gene_mode) {
+      stop("There is at least one duplicated target ID in the target mapping. ",
+           "Since sleuth prep is in gene aggregation mode, any duplicated ",
+           "entries will cause errors during gene aggregation. Here is a list ",
+           "of all the duplicated target IDs:\n", formatted_ids)
+    } else {
+      warning("There is at least one duplicated target ID in the target ",
+              "mapping. Since sleuth prep is not in gene aggregation mode, ",
+              "duplicated entries will not cause errors during preparation, ",
+              "but may cause unexpected behavior when making any plots or ",
+              "tables. Here is a list of all the duplicated target IDs:\n",
+              formatted_ids)
+    }
+  }
   target_mapping
 }
 
@@ -825,7 +878,7 @@ sleuth_gene_table <- function(obj, test, test_type = 'lrt', which_model = 'full'
   filter_empty <- !filter_empty
   popped_gene_table <- popped_gene_table[filter_empty, ]
 
-  popped_gene_table
+  adf(popped_gene_table)
 }