Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes to various miscellaneous issues (#73, #84, #97, #122, #135, #142) #144

Merged
merged 9 commits into from
Nov 10, 2017
10 changes: 9 additions & 1 deletion R/matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,18 @@ sleuth_to_matrix <- function(obj, which_df, which_units) {
if ( !(which_df %in% c("obs_norm", "obs_raw")) ) {
stop("Invalid object")
}
if ( !(which_units %in% c("tpm", "est_counts")) ) {
if ( !(which_units %in% c("tpm", "est_counts", "scaled_reads_per_base")) ) {
stop("Invalid units")
}

which_units <- check_quant_mode(obj, which_units)

if (obj$gene_mode && which_df == "obs_raw") {
warning("This object is in gene mode, and the raw values are ",
"transcripts. Using 'obs_norm' instead.")
which_df <- "obs_norm"
}

data <- as.data.frame(obj[[which_df]])

res <- list()
Expand Down
46 changes: 35 additions & 11 deletions R/model.R
Original file line number Diff line number Diff line change
Expand Up @@ -269,30 +269,37 @@ tests.sleuth <- function(obj, lrt = TRUE, wt = TRUE) {

}

#' Extract Wald test results from a sleuth object
#' Extract Wald or Likelihood Ratio test results from a sleuth object
#'
#' This function extracts Wald test results from a sleuth object.
#' This function extracts Wald or Likelihood Ratio test results from a sleuth object.
#'
#' @param obj a \code{sleuth} object
#' @param test a character string denoting the test to extract. Possible tests can be found by using \code{models(obj)}.
#' @param which_model a character string denoting the model. If extracting a wald test, use the model name. If extracting a likelihood ratio test, use 'lrt'.
#' @param test_type 'wt' for Wald test or 'lrt' for Likelihood Ratio test.
#' @param which_model a character string denoting the model. If extracting a wald test, use the model name.
#' Not used if extracting a likelihood ratio test.
#' @param rename_cols if \code{TRUE} will rename some columns to be shorter and
#' consistent with vignette
#' consistent with vignette
#' @param show_all if \code{TRUE} will show all transcripts (not only the ones
#' passing filters). The transcripts that do not pass filters will have
#' \code{NA} values in most columns.
#' @return a \code{data.frame} with the following columns:
#' @return target_id: transcript name, e.g. "ENSXX#####" (dependent on the transcriptome used in kallisto)
#' @return ...: if there is a target mapping data frame, all of the annotations columns are added from \code{obj$target_mapping} before the other columns.
#' @return pval: p-value of the chosen model
#' @return qval: false discovery rate adjusted p-value, using Benjamini-Hochberg (see \code{\link{p.adjust}})
#' @return b: 'beta' value (effect size). Technically a biased estimator of the fold change
#' @return se_b: standard error of the beta
#' @return test_stat (LRT only): Chi-squared test statistic (likelihood ratio test). Only seen with Likelihood Ratio test results.
#' @return rss (LRT only): the residual sum of squares under the "null model". Only seen with Likelihood Ratio test results.
#' @return degrees_free (LRT only): the degrees of freedom (equal to difference between the two models). Only seen with Likelihood Ratio test results.
#' @return b (Wald only): 'beta' value (effect size). Technically a biased estimator of the fold change. Only seen with Wald test results.
#' @return se_b (Wald only): standard error of the beta. Only seen with Wald test results.
#' @return mean_obs: mean of natural log counts of observations
#' @return var_obs: variance of observation
#' @return tech_var: technical variance of observation from the bootstraps
#' @return tech_var: technical variance of observation from the bootstraps (named 'sigma_q_sq' if rename_cols is \code{FALSE})
#' @return sigma_sq: raw estimator of the variance once the technical variance has been removed
#' @return smooth_sigma_sq: smooth regression fit for the shrinkage estimation
#' @return final_simga_sq: max(sigma_sq, smooth_sigma_sq); used for covariance estimation of beta
#' (named 'smooth_sigma_sq_pmax' if rename_cols is \code{FALSE})
#' @seealso \code{\link{sleuth_wt}} and \code{\link{sleuth_lrt}} to compute tests, \code{\link{models}} to
#' view which models, \code{\link{tests}} to view which tests were performed (and can be extracted)
#' @examples
Expand Down Expand Up @@ -325,6 +332,20 @@ sleuth_results <- function(obj, test, test_type = 'wt',
res <- NULL
if (test_type == 'lrt') {
res <- get_test(obj, test, type = 'lrt')
res <- dplyr::select(res,
target_id,
pval,
qval,
test_stat,
rss,
degrees_free,
mean_obs,
var_obs,
sigma_q_sq,
sigma_sq,
smooth_sigma_sq,
smooth_sigma_sq_pmax
)
} else {
res <- get_test(obj, test, 'wt', which_model)
res <- dplyr::select(res,
Expand Down Expand Up @@ -360,8 +381,8 @@ sleuth_results <- function(obj, test, test_type = 'wt',

if ( !is.null(obj$target_mapping) && !obj$gene_mode) {
res <- dplyr::left_join(
data.table::as.data.table(res),
data.table::as.data.table(obj$target_mapping),
data.table::as.data.table(res),
by = 'target_id')
}

Expand All @@ -375,9 +396,12 @@ sleuth_results <- function(obj, test, test_type = 'wt',
# this line uses dplyr's "left_join" syntax for "by"
# to match "target_id" from the "res" table,
# and the gene_column from the target_mapping table.
res <- dplyr::left_join(data.table::as.data.table(res),
data.table::as.data.table(target_mapping),
by = c("target_id" = obj$gene_column))
by_col <- "target_id"
names(by_col) <- obj$gene_column
res <- dplyr::left_join(data.table::as.data.table(target_mapping),
data.table::as.data.table(res),
by = by_col)
names(res)[1] <- "target_id"
}

res <- as_df(res)
Expand Down
4 changes: 2 additions & 2 deletions R/read_write.R
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ get_kallisto_path <- function(path) {
output$ext <- "tsv"
output$path <- file.path(path, "abundance.tsv")
} else {
stop(path, 'exists, but does not contain kallisto output (abundance.h5)')
stop(path, ' exists, but does not contain kallisto output (abundance.h5)')
}
} else if ( file.exists(path) ){
# make an assumption that the user has kept the correct extension
Expand All @@ -234,7 +234,7 @@ get_kallisto_path <- function(path) {
} else if (ext == 'tsv') {
output$ext <- 'tsv'
} else {
stop("'", path, "' exists, is not a recognized extension")
stop("'", path, "' exists, but does not have a recognized extension")
}
output$path <- path
} else {
Expand Down
65 changes: 59 additions & 6 deletions R/sleuth.R
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,16 @@ sleuth_prep <- function(

obs_raw <- dplyr::bind_rows(lapply(kal_list, function(k) k$abundance))

counts_test <- data.table::as.data.table(obs_raw)
counts_test <- counts_test[, total = .(total = sum(est_counts)), by = "sample"]
if (any(counts_test$total == 0)) {
zero_names <- counts_test$sample[which(counts_test$total == 0)]
formatted_names <- paste(zero_names, collapse = ", ")
warning("At least one sample have no reads aligned. ",
"Here are the samples with zero counts:\n",
formatted_names)
}

design_matrix <- NULL
if (is(full_model, 'formula')) {
design_matrix <- model.matrix(full_model, sample_to_covariates)
Expand All @@ -249,6 +259,18 @@ sleuth_prep <- function(

if (!is.null(full_model)) {
rownames(design_matrix) <- sample_to_covariates$sample
# check if the resulting design_matrix is singular (i.e. non-invertible)
# followed the suggested method found here: https://stackoverflow.com/a/24962470
M <- t(design_matrix) %*% design_matrix
det_mod <- determinant(M)$modulus
if(!is.finite(det_mod)) {
stop("The full model you provided seems to result in a singular design matrix. ",
"This frequently happens when one of the covariates is a linear ",
"combination of one or more other covariates (e.g. one covariate ",
"yields identical groupings as another covariate). Check your ",
"sample_to_covariates table and your full model.")
}
rm(M, det_mod)
}

obs_raw <- dplyr::arrange(obs_raw, target_id, sample)
Expand All @@ -259,7 +281,8 @@ sleuth_prep <- function(
if (!is.null(target_mapping)) {
tmp_names <- data.frame(target_id = kal_list[[1]]$abundance$target_id,
stringsAsFactors = FALSE)
target_mapping <- check_target_mapping(tmp_names, target_mapping)
target_mapping <- check_target_mapping(tmp_names, target_mapping,
!is.null(aggregation_column))
rm(tmp_names)
}

Expand Down Expand Up @@ -457,7 +480,14 @@ sleuth_prep <- function(
# if mclapply results in an error (a warning is shown), then print error and stop
error_status <- sapply(bs_results, function(x) is(x, "try-error"))
if (any(error_status)) {
print(attributes(bs_results[error_status])$condition)
error_msgs <- sapply(which(error_status), function(i) {
bad_run <- bs_results[[i]]
samp_name <- sample_to_covariates$sample[i]
trace <- .traceback(bad_run)
paste0("Sample '", samp_name, "' had this error message: ", trace[1])
})
formatted_error <- paste(error_msgs, collapse = "")
message(formatted_error)
stop("At least one core from mclapply had an error. See the above error message(s) for more details.")
}

Expand Down Expand Up @@ -519,11 +549,16 @@ check_kal_pack <- function(kal_list) {
}

# this function is mostly to deal with annoying ENSEMBL transcript names that
# have a training .N to keep track of version number
#
# have a trailing .N to keep track of version number
#
# this also checks to see if there are duplicate entries for any target IDs
# and issues a warning if sleuth prep is in transcript mode, but stops if
# sleuth prep is in gene mode, since duplicate entries creates problems when
# doing the aggregation
#
# @return the target_mapping if an intersection is found. a target_mapping that
# matches \code{t_id} if no matching is found
check_target_mapping <- function(t_id, target_mapping) {
check_target_mapping <- function(t_id, target_mapping, gene_mode) {
t_id <- data.table::as.data.table(t_id)
target_mapping <- data.table::as.data.table(target_mapping)

Expand Down Expand Up @@ -557,6 +592,24 @@ check_target_mapping <- function(t_id, target_mapping) {
' please check obj$target_mapping to ensure this new mapping is correct.'))
}

if(any(duplicated(target_mapping$target_id))) {
indices <- which(duplicated(target_mapping$target_id))
duplicated_ids <- target_mapping$target_id[indices]
formatted_ids <- paste(dupliated_ids, collapse = ", ")
if(gene_mode) {
stop("There is at least one duplicated target ID in the target mapping. ",
"Since sleuth prep is in gene aggregation mode, any duplicated ",
"entries will cause errors during gene aggregation. Here is a list ",
"of all the duplicated target IDs:\n", formatted_ids)
} else {
warning("There is at least one duplicated target ID in the target ",
"mapping. Since sleuth prep is not in gene aggregation mode, ",
"duplicated entries will not cause errors during preparation, ",
"but may cause unexpected behavior when making any plots or ",
"tables. Here is a list of all the duplicated target IDs:\n",
formatted_ids)
}
}
target_mapping
}

Expand Down Expand Up @@ -825,7 +878,7 @@ sleuth_gene_table <- function(obj, test, test_type = 'lrt', which_model = 'full'
filter_empty <- !filter_empty
popped_gene_table <- popped_gene_table[filter_empty, ]

popped_gene_table
adf(popped_gene_table)
}


Expand Down