Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve obs_to_matrix with data.table #210

Open
wants to merge 12 commits into
base: devel
Choose a base branch
from
1 change: 0 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ Imports:
dplyr,
data.table,
tidyr,
reshape2,
rhdf5,
parallel,
lazyeval,
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ export(design_matrix)
export(enclosed_brush)
export(excluded_ids)
export(extract_model)
export(gene_from_gene)
export(get_bootstrap_summary)
export(get_bootstraps)
export(get_quantile)
Expand All @@ -40,6 +41,7 @@ export(log_transform)
export(melt_bootstrap_sleuth)
export(models)
export(norm_factors)
export(head)
export(plot_bootstrap)
export(plot_fld)
export(plot_group_density)
Expand Down Expand Up @@ -86,3 +88,4 @@ importFrom(lazyeval,interp)
importFrom(lazyeval,lazy)
importFrom(rhdf5,h5write)
importFrom(rhdf5,h5write.default)
importFrom(utils, head)
1 change: 1 addition & 0 deletions R/bootstrap.R
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ process_bootstrap <- function(i, samp_name, kal_path,
mappings)
# this step undoes the tidying to get back a matrix format
# target_ids here are now the aggregation column ids
scaled_bs <- data.table::as.data.table(scaled_bs)
bs_mat <- data.table::dcast(scaled_bs, sample ~ target_id,
value.var = "scaled_reads_per_base")
# this now has the same format as the transcript matrix
Expand Down
6 changes: 6 additions & 0 deletions R/fix_head.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

## ---- head
#' Fix unexported head error.
#'
#' @export head
head <- utils::head
6 changes: 4 additions & 2 deletions R/matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@
#' @param which_df character vector of length one. Which type of data to use
#' ("obs_norm" or "obs_raw")
#' @param which_units character vector of length one. Which units to use ("tpm"
#' or "est_counts")
#' @return a matrix which contains a matrix of target_ids and transcript expression in \code{which_units}
#' or "est_counts" (for transcript-level analyses) or "scaled_reads_per_base" (for gene-level analyses))
#' @return a matrix which contains a matrix of target_ids and transcript (or gene) expression in \code{which_units}.
#' Note this currently does not support returning raw values for gene-level counts or TPMs.
#' @examples
#' sleuth_matrix <- sleuth_to_matrix(sleuth_obj, 'obs_norm', 'tpm')
#' head(sleuth_matrix) # look at first 5 transcripts, sorted by name
#' @export
#' importFrom utils head
sleuth_to_matrix <- function(obj, which_df, which_units) {
if ( !(which_df %in% c("obs_norm", "obs_raw")) ) {
stop("Invalid object")
Expand Down
6 changes: 3 additions & 3 deletions R/plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -1035,13 +1035,13 @@ plot_transcript_heatmap <- function(obj,

if (units == 'tpm') {
tabd_df <- dplyr::select(tabd_df, target_id, sample, tpm)
tabd_df <- reshape2::dcast(tabd_df, target_id ~sample, value.var = 'tpm')
tabd_df <- data.table::dcast(tabd_df, target_id ~sample, value.var = 'tpm')
} else if (units == 'est_counts') {
tabd_df <- dplyr::select(tabd_df, target_id, sample, est_counts)
tabd_df <- reshape2::dcast(tabd_df, target_id ~sample, value.var = 'est_counts')
tabd_df <- data.table::dcast(tabd_df, target_id ~sample, value.var = 'est_counts')
} else if (units == 'scaled_reads_per_base') {
tabd_df <- dplyr::select(tabd_df, target_id, sample, scaled_reads_per_base)
tabd_df <- reshape2::dcast(tabd_df, target_id ~sample,
tabd_df <- data.table::dcast(tabd_df, target_id ~sample,
value.var = 'scaled_reads_per_base')
} else {
stop("Didn't recognize the following unit: ", units)
Expand Down
50 changes: 49 additions & 1 deletion R/sleuth.R
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,8 @@ kallisto_table <- function(obj,
# @return a matrix with the appropriate names
obs_to_matrix <- function(obj, value_name) {

obs_counts <- reshape2::dcast(obj$obs_norm, target_id ~ sample,
obj$obs_norm <- data.table::as.data.table(obj$obs_norm)
obs_counts <- data.table::dcast(obj$obs_norm, target_id ~ sample,
value.var = value_name)

obs_counts <- as.data.frame(obs_counts)
Expand Down Expand Up @@ -1091,6 +1092,7 @@ summary.sleuth <- function(obj, covariates = TRUE) {
#' head(sleuth_genes) # show info for first 5 genes
#' sleuth_genes[1:5, 6] # show transcripts for first 5 genes
#' @export
#' @importFrom utils head
sleuth_gene_table <- function(obj, test, test_type = 'lrt', which_model = 'full', which_group = 'ens_gene') {

if (is.null(obj$target_mapping)) {
Expand Down Expand Up @@ -1150,6 +1152,52 @@ transcripts_from_gene <- function(obj, test, test_type,
table$target_id[table[, 2] == gene_name]
}

#' Get the gene ID using other gene identifiers
#'
#' Get the \code{target_id} of a gene using other gene identifiers.
#' The identifiers found under the \code{obj$gene_column} are often
#' difficult to remember (e.g. ensembl gene ID, ENSG00000111640).
#' This function allows a user to find that difficult-to-remember
#' identifier using more-easily-remembered identifiers, such as
#' gene symbol (e.g. "GAPDH").
#'
#' @param obj a \code{sleuth} object
#' @param gene_colname the name of the column containing 'gene_name'.
#' This parameter refers to the name of the column that the gene you are searching for appears in.
#' Check the column names using \code{colnames(obj$target_mapping)}.
#' @param gene_name a string containing the name of the gene you are interested in.
#' @return a character vector containing the \code{target_id} of the gene, found under
#' \code{obj$gene_column} within \code{obj$target_mapping}.
#' If the column name provided is the same as \code{obj$gene_column}, and the
#' gene_name used is found, that gene_name will be returned.
#' @examples
#' \dontrun{gene_from_gene(obj, "gene_symbol", "GAPDH")}
#' @export
gene_from_gene <- function(obj, gene_colname, gene_name) {

if (!obj$gene_mode) {
stop("this sleuth object is in transcript mode. Please use 'transcripts_from_gene' instead.")
}

table <- as.data.frame(obj$target_mapping)
if (gene_colname == obj$gene_column) {
if (!(gene_name %in% table[, eval(parse(text = obj$gene_column))])) {
stop("Couldn't find gene ", gene_name)
} else {
return(gene_name)
}
}

table <- unique(dplyr::select_(table, obj$gene_column, gene_colname))
if (!(gene_name %in% table[, 2])) {
stop("Couldn't find gene ", gene_name)
}
hits <- unique(table[table[,2] == gene_name, 1])
if (length(hits) > 1) {
warning("there was more than one gene ID that matched this identifier; taking the first one")
}
hits[1]
}

#' Change sleuth transform counts function
#'
Expand Down
34 changes: 34 additions & 0 deletions man/gene_from_gene.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions man/sleuth_to_matrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.