From a47431df80c9b1ef9dbb4d97517759f234e1cba1 Mon Sep 17 00:00:00 2001 From: olapuentesantana Date: Mon, 4 Mar 2024 21:13:25 +0100 Subject: [PATCH] TPM (no-scaling) and wmean method --- NAMESPACE | 8 +- R/compute_TF_activity.R | 37 +++++---- R/compute_pathway_activity.R | 129 +++++++++++++------------------ man/compute_TF_activity.Rd | 11 ++- man/compute_pathway_activity.Rd | 4 +- vignettes/easier_user_manual.Rmd | 18 ++--- 6 files changed, 97 insertions(+), 110 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 2b3ebcd..f0a4e00 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,20 +17,16 @@ import(magrittr) importFrom(BiocParallel,MulticoreParam) importFrom(BiocParallel,bplapply) importFrom(BiocParallel,register) -importFrom(DESeq2,DESeqDataSetFromMatrix) -importFrom(DESeq2,estimateDispersions) -importFrom(DESeq2,estimateSizeFactors) -importFrom(DESeq2,getVarianceStabilizedData) importFrom(ROCR,performance) importFrom(ROCR,plot) importFrom(ROCR,prediction) importFrom(coin,pvalue) importFrom(decoupleR,get_collectri) +importFrom(decoupleR,get_dorothea) +importFrom(decoupleR,get_progeny) importFrom(decoupleR,run_wmean) importFrom(dplyr,filter) importFrom(easierData,get_HGNC_annotation) -importFrom(easierData,get_TCGA_mean_pancancer) -importFrom(easierData,get_TCGA_sd_pancancer) importFrom(easierData,get_cor_scores_genes) importFrom(easierData,get_group_lrpairs) importFrom(easierData,get_intercell_networks) diff --git a/R/compute_TF_activity.R b/R/compute_TF_activity.R index 20e6ba3..2a459a3 100644 --- a/R/compute_TF_activity.R +++ b/R/compute_TF_activity.R @@ -13,13 +13,13 @@ #' @import magrittr #' @importFrom stats na.exclude #' @importFrom dplyr filter -#' @importFrom decoupleR get_collectri run_wmean +#' @importFrom decoupleR get_collectri get_dorothea run_wmean #' @importFrom tidyr pivot_wider #' @importFrom tibble column_to_rownames -#' @importFrom easierData get_TCGA_mean_pancancer get_TCGA_sd_pancancer #' #' @param RNA_tpm data.frame containing TPM values with HGNC symbols #' in rows and samples in columns. +#' @param regulon_net string indicating the regulon network to be used. #' @param verbose logical value indicating whether to display messages #' about the number of regulated #' genes found in the gene expression data provided. @@ -49,16 +49,15 @@ #' #' # Computation of TF activity (Garcia-Alonso et al., Genome Res, 2019) #' tf_activity <- compute_TF_activity( -#' RNA_tpm = RNA_tpm +#' RNA_tpm = RNA_tpm, +#' regulon_net = "collectri" #' ) compute_TF_activity <- function(RNA_tpm = NULL, + regulon_net = c("collectri", "dorothea"), verbose = TRUE) { # Some checks if (is.null(RNA_tpm)) stop("TPM gene expression data not found") - - # Retrieve internal data - TCGA_mean_pancancer <- suppressMessages(easierData::get_TCGA_mean_pancancer()) - TCGA_sd_pancancer <- suppressMessages(easierData::get_TCGA_sd_pancancer()) + if (length(regulon_net) > 1) stop("Please select just one regulon network") # Gene expression data tpm <- RNA_tpm @@ -67,13 +66,6 @@ compute_TF_activity <- function(RNA_tpm = NULL, # HGNC symbols are required if (any(grep("ENSG00000", genes))) stop("hgnc gene symbols are required", call. = FALSE) - # Log transformed expression matrix (log2[tpm+1]): - # expression matrix scaled and recentered. - #gene_expr <- calc_z_score(t(tpm), - # mean = TCGA_mean_pancancer, - # sd = TCGA_sd_pancancer - #) - gene_expr <- t(tpm) # redefine gene names to match TF-target network E <- t(gene_expr) @@ -81,9 +73,20 @@ compute_TF_activity <- function(RNA_tpm = NULL, rownames(E) <- newNames # collectTRI network - net <- decoupleR::get_collectri(organism='human', split_complexes=FALSE) + if(regulon_net == "collectri"){ + + net <- decoupleR::get_collectri(organism='human', split_complexes=FALSE) + + # dorothea network + }else if(regulon_net == "dorothea"){ + + net <- decoupleR::get_dorothea(organism='human', + levels = c("A", "B", "C"), + weight_dict = list(A = 1, B = 1, C = 1, D = 1)) + + } + all_regulated_transcripts <- unique(net$target) - all_tfs <- unique(net$source) # check what is the percentage of genes we have in our data genes_kept <- intersect(rownames(E), all_regulated_transcripts) @@ -102,7 +105,7 @@ compute_TF_activity <- function(RNA_tpm = NULL, E <- E[!is.na(apply(E, 1, sum)), ] E <- E[!is.infinite(apply(E, 1, sum)), ] - # TF activity: run viper + # TF activity: run wmean tf_activity_df <- decoupleR::run_wmean(mat = E, net = net, .source='source', diff --git a/R/compute_pathway_activity.R b/R/compute_pathway_activity.R index b3ad337..59c5d29 100644 --- a/R/compute_pathway_activity.R +++ b/R/compute_pathway_activity.R @@ -14,13 +14,15 @@ #' Biophysica Acta (BBA) - Gene Regulatory Mechanisms. 2019. #' DOI: 10.1016/j.bbagrm.2019.194431. #' -#' @importFrom DESeq2 DESeqDataSetFromMatrix estimateSizeFactors estimateDispersions -#' getVarianceStabilizedData #' @importFrom stats na.exclude +#' @importFrom dplyr filter #' @importFrom progeny progeny getModel +#' @importFrom decoupleR get_progeny run_wmean +#' @importFrom tidyr pivot_wider +#' @importFrom tibble column_to_rownames #' @importFrom easierData get_cor_scores_genes #' -#' @param RNA_counts data.frame containing raw counts values with HGNC +#' @param RNA_tpm data.frame containing TPM values with HGNC #' gene symbols as row names and samples identifiers as column names. #' @param remove_sig_genes_immune_response logical value indicating #' whether to remove signature genes involved in the derivation of @@ -59,92 +61,71 @@ #' RNA_counts = RNA_counts, #' remove_sig_genes_immune_response = TRUE #' ) -compute_pathway_activity <- function(RNA_counts = NULL, +compute_pathway_activity <- function(RNA_tpm = NULL, remove_sig_genes_immune_response = TRUE, verbose = TRUE) { # Some checks - if (is.null(RNA_counts)) stop("Counts gene expression data not found") - + if (is.null(RNA_tpm)) stop("TPM gene expression data not found") + # Retrieve internal data cor_scores_genes <- suppressMessages(easierData::get_cor_scores_genes()) # Gene expression data - raw_counts <- RNA_counts - genes <- rownames(raw_counts) - + tpm <- RNA_tpm + genes <- rownames(tpm) + # HGNC symbols are required - if (any(grep("ENSG00000", genes))) { - stop("Hgnc gene symbols are required", - call. = FALSE - ) - } - - # Remove list of genes used to build proxy's of ICB response - if (remove_sig_genes_immune_response) { - if (verbose) message("Removing signature genes of hallmarks of immune response \n") - idy <- stats::na.exclude(match(cor_scores_genes, rownames(raw_counts))) - raw_counts <- raw_counts[-idy, ] - } - - # Integers are required for "DESeq2" - if (is.integer(raw_counts) == FALSE) { - raw_counts_integer <- apply(raw_counts, 2, as.integer) - rownames(raw_counts_integer) <- rownames(raw_counts) - } else { - raw_counts_integer <- raw_counts - } - - # Variance stabilizing transformation (DESeq2 package) - # Integer count matrix, a data frame with the sample info, - # design =~1 to consider all samples as part of the same group. - - # Column data: - colData <- data.frame(id = colnames(raw_counts_integer)) - - if (verbose) message("Gene counts normalization with DESeq2:") - # Construction a DESeqDataSet: (Forced all to be data.frames($ operator)) - dset <- DESeq2::DESeqDataSetFromMatrix( - countData = raw_counts_integer, - colData = colData, - design = ~1 - ) - - # Variance stabilization transformation - dset <- DESeq2::estimateSizeFactors(dset) - dset <- DESeq2::estimateDispersions(dset) - gene_expr <- DESeq2::getVarianceStabilizedData(dset) - rownames(gene_expr) <- rownames(raw_counts_integer) + if (any(grep("ENSG00000", genes))) stop("hgnc gene symbols are required", call. = FALSE) + + gene_expr <- t(tpm) + # redefine gene names to match TF-target network + E <- t(gene_expr) + newNames <- gsub(".", "-", rownames(E), fixed = TRUE) + rownames(E) <- newNames + + ## progeny network + net <- decoupleR::get_progeny(organism = 'human', top = 100) - # Pathways activity - pathway_activity <- progeny::progeny(gene_expr, - scale = FALSE, - organism = "Human", verbose = verbose - ) + all_regulated_transcripts <- unique(net$target) # check what is the percentage of genes we have in our data - model_pathways <- progeny::getModel(organism = "Human", top = 100) - full_pathway_sig <- unique(unlist(lapply( - colnames(model_pathways), - function(pathway) { - top_genes_pathway <- rownames(model_pathways)[apply( - model_pathways, 2, - function(X) X != 0 - )[, pathway]] - return(top_genes_pathway) - } - ))) - genes_kept <- intersect(rownames(gene_expr), full_pathway_sig) - genes_left <- setdiff(full_pathway_sig, rownames(gene_expr)) - total_genes <- length(genes_left) + length(genes_kept) + genes_kept <- intersect(rownames(E), all_regulated_transcripts) + genes_left <- setdiff(all_regulated_transcripts, rownames(E)) + + # check what is the percentage of regulated transcripts that we have in our data if (verbose) { message( - "Pathway signature genes found in data set: ", - length(genes_kept), "/", - total_genes, " (", - round(length(genes_kept) / total_genes, 3) * 100, - "%)" + "Regulated transcripts found in data set: ", length(genes_kept), "/", + length(all_regulated_transcripts), " (", + round(length(genes_kept) / length(all_regulated_transcripts), 3) * 100, "%)" ) } + + # Remove list of genes used to build proxy's of ICB response + if (remove_sig_genes_immune_response) { + if (verbose) message("Removing signature genes of hallmarks of immune response \n") + idy <- stats::na.exclude(match(cor_scores_genes, rownames(E))) + E <- E[-idy, ] + } + + # Remove genes with all NA/Inf values + E <- E[!is.na(apply(E, 1, sum)), ] + E <- E[!is.infinite(apply(E, 1, sum)), ] + + # Pathway activity: run wmean + pathway_activity_df <- decoupleR::run_wmean(mat = E, + net = net, + .source='source', + .target='target', + .mor='weight', + minsize = 5) + # To matrix + pathway_activity <- pathway_activity_df %>% + dplyr::filter(statistic == "wmean") %>% + tidyr::pivot_wider(id_cols = condition, + names_from = source, + values_from = score) %>% + tibble::column_to_rownames("condition") if (verbose) message("\nPathway activity scores computed! \n") return(as.data.frame(pathway_activity)) diff --git a/man/compute_TF_activity.Rd b/man/compute_TF_activity.Rd index 3001208..814aef6 100644 --- a/man/compute_TF_activity.Rd +++ b/man/compute_TF_activity.Rd @@ -5,12 +5,18 @@ \title{Compute transcription factor activity from gene expression using DoRothEA} \usage{ -compute_TF_activity(RNA_tpm = NULL, verbose = TRUE) +compute_TF_activity( + RNA_tpm = NULL, + regulon_net = c("collectri", "dorothea"), + verbose = TRUE +) } \arguments{ \item{RNA_tpm}{data.frame containing TPM values with HGNC symbols in rows and samples in columns.} +\item{regulon_net}{string indicating the regulon network to be used.} + \item{verbose}{logical value indicating whether to display messages about the number of regulated genes found in the gene expression data provided.} @@ -44,7 +50,8 @@ RNA_tpm <- RNA_tpm[, colnames(RNA_tpm) \%in\% pat_subset] # Computation of TF activity (Garcia-Alonso et al., Genome Res, 2019) tf_activity <- compute_TF_activity( - RNA_tpm = RNA_tpm + RNA_tpm = RNA_tpm, + regulon_net = "collectri" ) } \references{ diff --git a/man/compute_pathway_activity.Rd b/man/compute_pathway_activity.Rd index b8edc23..862fdf1 100644 --- a/man/compute_pathway_activity.Rd +++ b/man/compute_pathway_activity.Rd @@ -5,13 +5,13 @@ \title{Compute pathway activity from gene expression using PROGENy} \usage{ compute_pathway_activity( - RNA_counts = NULL, + RNA_tpm = NULL, remove_sig_genes_immune_response = TRUE, verbose = TRUE ) } \arguments{ -\item{RNA_counts}{data.frame containing raw counts values with HGNC +\item{RNA_tpm}{data.frame containing TPM values with HGNC gene symbols as row names and samples identifiers as column names.} \item{remove_sig_genes_immune_response}{logical value indicating diff --git a/vignettes/easier_user_manual.Rmd b/vignettes/easier_user_manual.Rmd index e13f7ae..3cbfdda 100644 --- a/vignettes/easier_user_manual.Rmd +++ b/vignettes/easier_user_manual.Rmd @@ -251,15 +251,15 @@ head(cell_fractions) By applying PROGENy [@HOLLAND2020194431; @Schubert2018] method to count data from RNA-seq, the activity of 14 signaling pathways can be inferred as in the chunk below. ```{r, eval=TRUE} -pathway_activities <- compute_pathway_activity(RNA_counts = RNA_counts, - remove_sig_genes_immune_response = TRUE) +pathway_activities <- compute_pathway_activity(RNA_tpm = RNA_tpm, + remove_sig_genes_immune_response = FALSE) head(pathway_activities) ``` The call above infers pathway activity as a linear transformation of gene expression data. Since some pathway signature genes were also used to compute scores of immune response (output variable in EaSIeR model). With `remove_sig_genes_immune_response` set to `TRUE` (default), the overlapping genes are removed from the pathway signature genes used to infer pathway activities. By applying DoRothEA [@Garcia-Alonso01082019] method to TPM data from RNA-seq, the activity of 118 TFs can be inferred as follows: ```{r, eval=TRUE} -tf_activities <- compute_TF_activity(RNA_tpm = RNA_tpm) +tf_activities <- compute_TF_activity(RNA_tpm = RNA_tpm, regulon_net = "dorothea") head(tf_activities[,1:5]) ``` @@ -293,13 +293,13 @@ The `cancer_type` provided should match one of the cancer-specific models availa The optimized models can be retrieved from `easierData` package using the function `get_opt_models()`. -```{r, eval=TRUE} +```{r, eval=FALSE} predictions <- predict_immune_response(pathways = pathway_activities, immunecells = cell_fractions, tfs = tf_activities, lrpairs = lrpair_weights, ccpairs = ccpair_scores, - cancer_type = cancer_type, + cancer_type = cancer_type, verbose = TRUE) ``` @@ -325,7 +325,7 @@ Since both immune response and TMB are essential for an effective immunotherapy These two strategies to combine immune response and TMB require the definition of a certain weight or penalty beforehand (`weight_penalty`). The default weight or penalty is 0.5. -```{r, eval=TRUE} +```{r, eval=FALSE} output_eval_with_resp <- assess_immune_response(predictions_immune_response = predictions, patient_response = patient_ICBresponse, RNA_tpm = RNA_tpm, @@ -353,7 +353,7 @@ This is a usual case where we might have a cancer dataset with bulk RNA-seq data In this likely scenario, an score of likelihood of immune response can be assigned to each patient by omitting the argument `patient_response` within the function `assess_immune_response`. -```{r, eval=TRUE} +```{r, eval=FALSE} output_eval_no_resp <- assess_immune_response(predictions_immune_response = predictions, TMB_values = TMB, easier_with_TMB = "weighted_average", @@ -376,7 +376,7 @@ output_eval_no_resp[[2]] We can further retrieve the easier score and also, the refined scores obtained by integrating easier score and TMB via `retrieve_easier_score`. -```{r, eval=TRUE} +```{r, eval=FALSE} easier_derived_scores <- retrieve_easier_score(predictions_immune_response = predictions, TMB_values = TMB, easier_with_TMB = c("weighted_average", @@ -394,7 +394,7 @@ The option `patient_label` allows to make a two-level comparison by providing a In order to leverage the cancer-specific biomarker weights inferred from model training, you need to specify again which `cancer_type` the bulk RNA-seq data belongs to. As before, the selected `cancer_type` should be included in the list described above. -```{r, eval=TRUE} +```{r, eval=FALSE} output_biomarkers <- explore_biomarkers(pathways = pathway_activities, immunecells = cell_fractions, tfs = tf_activities,