From a47431df80c9b1ef9dbb4d97517759f234e1cba1 Mon Sep 17 00:00:00 2001
From: olapuentesantana <olapuente@cnio.es>
Date: Mon, 4 Mar 2024 21:13:25 +0100
Subject: [PATCH] TPM (no-scaling) and wmean method

---
 NAMESPACE                        |   8 +-
 R/compute_TF_activity.R          |  37 +++++----
 R/compute_pathway_activity.R     | 129 +++++++++++++------------------
 man/compute_TF_activity.Rd       |  11 ++-
 man/compute_pathway_activity.Rd  |   4 +-
 vignettes/easier_user_manual.Rmd |  18 ++---
 6 files changed, 97 insertions(+), 110 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 2b3ebcd..f0a4e00 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -17,20 +17,16 @@ import(magrittr)
 importFrom(BiocParallel,MulticoreParam)
 importFrom(BiocParallel,bplapply)
 importFrom(BiocParallel,register)
-importFrom(DESeq2,DESeqDataSetFromMatrix)
-importFrom(DESeq2,estimateDispersions)
-importFrom(DESeq2,estimateSizeFactors)
-importFrom(DESeq2,getVarianceStabilizedData)
 importFrom(ROCR,performance)
 importFrom(ROCR,plot)
 importFrom(ROCR,prediction)
 importFrom(coin,pvalue)
 importFrom(decoupleR,get_collectri)
+importFrom(decoupleR,get_dorothea)
+importFrom(decoupleR,get_progeny)
 importFrom(decoupleR,run_wmean)
 importFrom(dplyr,filter)
 importFrom(easierData,get_HGNC_annotation)
-importFrom(easierData,get_TCGA_mean_pancancer)
-importFrom(easierData,get_TCGA_sd_pancancer)
 importFrom(easierData,get_cor_scores_genes)
 importFrom(easierData,get_group_lrpairs)
 importFrom(easierData,get_intercell_networks)
diff --git a/R/compute_TF_activity.R b/R/compute_TF_activity.R
index 20e6ba3..2a459a3 100644
--- a/R/compute_TF_activity.R
+++ b/R/compute_TF_activity.R
@@ -13,13 +13,13 @@
 #' @import magrittr
 #' @importFrom stats na.exclude
 #' @importFrom dplyr filter
-#' @importFrom decoupleR get_collectri run_wmean
+#' @importFrom decoupleR get_collectri get_dorothea run_wmean
 #' @importFrom tidyr pivot_wider
 #' @importFrom tibble column_to_rownames
-#' @importFrom easierData get_TCGA_mean_pancancer get_TCGA_sd_pancancer
 #'
 #' @param RNA_tpm data.frame containing TPM values with HGNC symbols
 #' in rows and samples in columns.
+#' @param regulon_net string indicating the regulon network to be used.
 #' @param verbose logical value indicating whether to display messages
 #' about the number of regulated
 #' genes found in the gene expression data provided.
@@ -49,16 +49,15 @@
 #'
 #' # Computation of TF activity (Garcia-Alonso et al., Genome Res, 2019)
 #' tf_activity <- compute_TF_activity(
-#'   RNA_tpm = RNA_tpm
+#'   RNA_tpm = RNA_tpm,
+#'   regulon_net = "collectri"
 #' )
 compute_TF_activity <- function(RNA_tpm = NULL,
+                                regulon_net = c("collectri", "dorothea"),
                                 verbose = TRUE) {
   # Some checks
   if (is.null(RNA_tpm)) stop("TPM gene expression data not found")
-
-  # Retrieve internal data
-  TCGA_mean_pancancer <- suppressMessages(easierData::get_TCGA_mean_pancancer())
-  TCGA_sd_pancancer <- suppressMessages(easierData::get_TCGA_sd_pancancer())
+  if (length(regulon_net) > 1) stop("Please select just one regulon network")
 
   # Gene expression data
   tpm <- RNA_tpm
@@ -67,13 +66,6 @@ compute_TF_activity <- function(RNA_tpm = NULL,
   # HGNC symbols are required
   if (any(grep("ENSG00000", genes))) stop("hgnc gene symbols are required", call. = FALSE)
 
-  # Log transformed expression matrix (log2[tpm+1]):
-  # expression matrix scaled and recentered.
-  #gene_expr <- calc_z_score(t(tpm),
-  #  mean = TCGA_mean_pancancer,
-  #  sd = TCGA_sd_pancancer
-  #)
-
   gene_expr <- t(tpm)
   # redefine gene names to match TF-target network
   E <- t(gene_expr)
@@ -81,9 +73,20 @@ compute_TF_activity <- function(RNA_tpm = NULL,
   rownames(E) <- newNames
 
   # collectTRI network
-  net <- decoupleR::get_collectri(organism='human', split_complexes=FALSE)
+  if(regulon_net == "collectri"){
+    
+    net <- decoupleR::get_collectri(organism='human', split_complexes=FALSE)
+  
+  # dorothea network
+  }else if(regulon_net == "dorothea"){
+    
+    net <- decoupleR::get_dorothea(organism='human', 
+                                   levels = c("A", "B", "C"),
+                                   weight_dict = list(A = 1, B = 1, C = 1, D = 1))
+    
+  }
+
   all_regulated_transcripts <- unique(net$target)
-  all_tfs <- unique(net$source)
 
   # check what is the percentage of genes we have in our data
   genes_kept <- intersect(rownames(E), all_regulated_transcripts)
@@ -102,7 +105,7 @@ compute_TF_activity <- function(RNA_tpm = NULL,
   E <- E[!is.na(apply(E, 1, sum)), ]
   E <- E[!is.infinite(apply(E, 1, sum)), ]
 
-  # TF activity: run viper
+  # TF activity: run wmean
   tf_activity_df <- decoupleR::run_wmean(mat = E,
                                          net = net,
                                          .source='source',
diff --git a/R/compute_pathway_activity.R b/R/compute_pathway_activity.R
index b3ad337..59c5d29 100644
--- a/R/compute_pathway_activity.R
+++ b/R/compute_pathway_activity.R
@@ -14,13 +14,15 @@
 #' Biophysica Acta (BBA) - Gene Regulatory Mechanisms. 2019.
 #' DOI: 10.1016/j.bbagrm.2019.194431.
 #'
-#' @importFrom DESeq2 DESeqDataSetFromMatrix estimateSizeFactors estimateDispersions
-#' getVarianceStabilizedData
 #' @importFrom stats na.exclude
+#' @importFrom dplyr filter
 #' @importFrom progeny progeny getModel
+#' @importFrom decoupleR get_progeny run_wmean
+#' @importFrom tidyr pivot_wider
+#' @importFrom tibble column_to_rownames
 #' @importFrom easierData get_cor_scores_genes
 #'
-#' @param RNA_counts data.frame containing raw counts values with HGNC
+#' @param RNA_tpm data.frame containing TPM values with HGNC
 #' gene symbols as row names and samples identifiers as column names.
 #' @param remove_sig_genes_immune_response logical value indicating
 #' whether to remove signature genes involved in the derivation of
@@ -59,92 +61,71 @@
 #'   RNA_counts = RNA_counts,
 #'   remove_sig_genes_immune_response = TRUE
 #' )
-compute_pathway_activity <- function(RNA_counts = NULL,
+compute_pathway_activity <- function(RNA_tpm = NULL,
                                      remove_sig_genes_immune_response = TRUE,
                                      verbose = TRUE) {
   # Some checks
-  if (is.null(RNA_counts)) stop("Counts gene expression data not found")
-
+  if (is.null(RNA_tpm)) stop("TPM gene expression data not found")
+  
   # Retrieve internal data
   cor_scores_genes <- suppressMessages(easierData::get_cor_scores_genes())
 
   # Gene expression data
-  raw_counts <- RNA_counts
-  genes <- rownames(raw_counts)
-
+  tpm <- RNA_tpm
+  genes <- rownames(tpm)
+  
   # HGNC symbols are required
-  if (any(grep("ENSG00000", genes))) {
-    stop("Hgnc gene symbols are required",
-      call. = FALSE
-    )
-  }
-
-  # Remove list of genes used to build proxy's of ICB response
-  if (remove_sig_genes_immune_response) {
-    if (verbose) message("Removing signature genes of hallmarks of immune response \n")
-    idy <- stats::na.exclude(match(cor_scores_genes, rownames(raw_counts)))
-    raw_counts <- raw_counts[-idy, ]
-  }
-
-  # Integers are required for "DESeq2"
-  if (is.integer(raw_counts) == FALSE) {
-    raw_counts_integer <- apply(raw_counts, 2, as.integer)
-    rownames(raw_counts_integer) <- rownames(raw_counts)
-  } else {
-    raw_counts_integer <- raw_counts
-  }
-
-  # Variance stabilizing transformation (DESeq2 package)
-  # Integer count matrix, a data frame with the sample info,
-  # design =~1 to consider all samples as part of the same group.
-
-  # Column data:
-  colData <- data.frame(id = colnames(raw_counts_integer))
-
-  if (verbose) message("Gene counts normalization with DESeq2:")
-  # Construction a DESeqDataSet: (Forced all to be data.frames($ operator))
-  dset <- DESeq2::DESeqDataSetFromMatrix(
-    countData = raw_counts_integer,
-    colData = colData,
-    design = ~1
-  )
-
-  # Variance stabilization transformation
-  dset <- DESeq2::estimateSizeFactors(dset)
-  dset <- DESeq2::estimateDispersions(dset)
-  gene_expr <- DESeq2::getVarianceStabilizedData(dset)
-  rownames(gene_expr) <- rownames(raw_counts_integer)
+  if (any(grep("ENSG00000", genes))) stop("hgnc gene symbols are required", call. = FALSE)
+  
+  gene_expr <- t(tpm)
+  # redefine gene names to match TF-target network
+  E <- t(gene_expr)
+  newNames <- gsub(".", "-", rownames(E), fixed = TRUE)
+  rownames(E) <- newNames
+  
+  ## progeny network
+  net <- decoupleR::get_progeny(organism = 'human', top = 100)
 
-  # Pathways activity
-  pathway_activity <- progeny::progeny(gene_expr,
-    scale = FALSE,
-    organism = "Human", verbose = verbose
-  )
+  all_regulated_transcripts <- unique(net$target)
 
   # check what is the percentage of genes we have in our data
-  model_pathways <- progeny::getModel(organism = "Human", top = 100)
-  full_pathway_sig <- unique(unlist(lapply(
-    colnames(model_pathways),
-    function(pathway) {
-      top_genes_pathway <- rownames(model_pathways)[apply(
-        model_pathways, 2,
-        function(X) X != 0
-      )[, pathway]]
-      return(top_genes_pathway)
-    }
-  )))
-  genes_kept <- intersect(rownames(gene_expr), full_pathway_sig)
-  genes_left <- setdiff(full_pathway_sig, rownames(gene_expr))
-  total_genes <- length(genes_left) + length(genes_kept)
+  genes_kept <- intersect(rownames(E), all_regulated_transcripts)
+  genes_left <- setdiff(all_regulated_transcripts, rownames(E))
+  
+  # check what is the percentage of regulated transcripts that we have in our data
   if (verbose) {
     message(
-      "Pathway signature genes found in data set: ",
-      length(genes_kept), "/",
-      total_genes, " (",
-      round(length(genes_kept) / total_genes, 3) * 100,
-      "%)"
+      "Regulated transcripts found in data set: ", length(genes_kept), "/",
+      length(all_regulated_transcripts), " (",
+      round(length(genes_kept) / length(all_regulated_transcripts), 3) * 100, "%)"
     )
   }
+  
+  # Remove list of genes used to build proxy's of ICB response
+  if (remove_sig_genes_immune_response) {
+    if (verbose) message("Removing signature genes of hallmarks of immune response \n")
+    idy <- stats::na.exclude(match(cor_scores_genes, rownames(E)))
+    E <- E[-idy, ]
+  }
+
+  # Remove genes with all NA/Inf values
+  E <- E[!is.na(apply(E, 1, sum)), ]
+  E <- E[!is.infinite(apply(E, 1, sum)), ]
+  
+  # Pathway activity: run wmean
+  pathway_activity_df <- decoupleR::run_wmean(mat = E,
+                                              net = net,
+                                              .source='source',
+                                              .target='target',
+                                              .mor='weight', 
+                                              minsize = 5)
+  # To matrix
+  pathway_activity <- pathway_activity_df %>%
+    dplyr::filter(statistic == "wmean") %>%
+    tidyr::pivot_wider(id_cols = condition,
+                       names_from = source,
+                       values_from = score) %>%
+    tibble::column_to_rownames("condition")
 
   if (verbose) message("\nPathway activity scores computed! \n")
   return(as.data.frame(pathway_activity))
diff --git a/man/compute_TF_activity.Rd b/man/compute_TF_activity.Rd
index 3001208..814aef6 100644
--- a/man/compute_TF_activity.Rd
+++ b/man/compute_TF_activity.Rd
@@ -5,12 +5,18 @@
 \title{Compute transcription factor activity from gene
 expression using DoRothEA}
 \usage{
-compute_TF_activity(RNA_tpm = NULL, verbose = TRUE)
+compute_TF_activity(
+  RNA_tpm = NULL,
+  regulon_net = c("collectri", "dorothea"),
+  verbose = TRUE
+)
 }
 \arguments{
 \item{RNA_tpm}{data.frame containing TPM values with HGNC symbols
 in rows and samples in columns.}
 
+\item{regulon_net}{string indicating the regulon network to be used.}
+
 \item{verbose}{logical value indicating whether to display messages
 about the number of regulated
 genes found in the gene expression data provided.}
@@ -44,7 +50,8 @@ RNA_tpm <- RNA_tpm[, colnames(RNA_tpm) \%in\% pat_subset]
 
 # Computation of TF activity (Garcia-Alonso et al., Genome Res, 2019)
 tf_activity <- compute_TF_activity(
-  RNA_tpm = RNA_tpm
+  RNA_tpm = RNA_tpm,
+  regulon_net = "collectri"
 )
 }
 \references{
diff --git a/man/compute_pathway_activity.Rd b/man/compute_pathway_activity.Rd
index b8edc23..862fdf1 100644
--- a/man/compute_pathway_activity.Rd
+++ b/man/compute_pathway_activity.Rd
@@ -5,13 +5,13 @@
 \title{Compute pathway activity from gene expression using PROGENy}
 \usage{
 compute_pathway_activity(
-  RNA_counts = NULL,
+  RNA_tpm = NULL,
   remove_sig_genes_immune_response = TRUE,
   verbose = TRUE
 )
 }
 \arguments{
-\item{RNA_counts}{data.frame containing raw counts values with HGNC
+\item{RNA_tpm}{data.frame containing TPM values with HGNC
 gene symbols as row names and samples identifiers as column names.}
 
 \item{remove_sig_genes_immune_response}{logical value indicating
diff --git a/vignettes/easier_user_manual.Rmd b/vignettes/easier_user_manual.Rmd
index e13f7ae..3cbfdda 100644
--- a/vignettes/easier_user_manual.Rmd
+++ b/vignettes/easier_user_manual.Rmd
@@ -251,15 +251,15 @@ head(cell_fractions)
 
 By applying PROGENy [@HOLLAND2020194431; @Schubert2018] method to count data from RNA-seq, the activity of 14 signaling pathways can be inferred as in the chunk below. 
 ```{r, eval=TRUE}
-pathway_activities <- compute_pathway_activity(RNA_counts = RNA_counts,
-                                               remove_sig_genes_immune_response = TRUE)
+pathway_activities <- compute_pathway_activity(RNA_tpm = RNA_tpm,
+                                               remove_sig_genes_immune_response = FALSE)
 head(pathway_activities)
 ```
 The call above infers pathway activity as a linear transformation of gene expression data. Since some pathway signature genes were also used to compute scores of immune response (output variable in EaSIeR model). With `remove_sig_genes_immune_response` set to `TRUE` (default), the overlapping genes are removed from the pathway signature genes used to infer pathway activities.  
 
 By applying DoRothEA [@Garcia-Alonso01082019] method to TPM data from RNA-seq, the activity of 118 TFs can be inferred as follows:
 ```{r, eval=TRUE}
-tf_activities <- compute_TF_activity(RNA_tpm = RNA_tpm)
+tf_activities <- compute_TF_activity(RNA_tpm = RNA_tpm, regulon_net = "dorothea")
 head(tf_activities[,1:5])
 ```
 
@@ -293,13 +293,13 @@ The `cancer_type` provided should match one of the cancer-specific models availa
 
 The optimized models can be retrieved from `easierData` package using the function `get_opt_models()`.
 
-```{r, eval=TRUE}
+```{r, eval=FALSE}
 predictions <- predict_immune_response(pathways = pathway_activities,
                                        immunecells = cell_fractions,
                                        tfs = tf_activities,
                                        lrpairs = lrpair_weights,
                                        ccpairs = ccpair_scores,
-                                       cancer_type = cancer_type, 
+                                       cancer_type = cancer_type,
                                        verbose = TRUE)
 ```
 
@@ -325,7 +325,7 @@ Since both immune response and TMB are essential for an effective immunotherapy
 
 These two strategies to combine immune response and TMB require the definition of a certain weight or penalty beforehand (`weight_penalty`). The default weight or penalty is 0.5.
 
-```{r, eval=TRUE}
+```{r, eval=FALSE}
 output_eval_with_resp <- assess_immune_response(predictions_immune_response = predictions,
                                                 patient_response = patient_ICBresponse,
                                                 RNA_tpm = RNA_tpm,
@@ -353,7 +353,7 @@ This is a usual case where we might have a cancer dataset with bulk RNA-seq data
 
 In this likely scenario, an score of likelihood of immune response can be assigned to each patient by omitting the argument `patient_response` within the function `assess_immune_response`.
 
-```{r, eval=TRUE}
+```{r, eval=FALSE}
 output_eval_no_resp <- assess_immune_response(predictions_immune_response = predictions,
                                               TMB_values = TMB,
                                               easier_with_TMB = "weighted_average",
@@ -376,7 +376,7 @@ output_eval_no_resp[[2]]
 
 We can further retrieve the easier score and also, the refined scores obtained by integrating easier score and TMB via `retrieve_easier_score`.
 
-```{r, eval=TRUE}
+```{r, eval=FALSE}
 easier_derived_scores <- retrieve_easier_score(predictions_immune_response = predictions,
                                                TMB_values = TMB,
                                                easier_with_TMB = c("weighted_average", 
@@ -394,7 +394,7 @@ The option `patient_label` allows to make a two-level comparison by providing a
 
 In order to leverage the cancer-specific biomarker weights inferred from model training, you need to specify again which `cancer_type` the bulk RNA-seq data belongs to. As before, the selected `cancer_type` should be included in the list described above.
 
-```{r, eval=TRUE}
+```{r, eval=FALSE}
 output_biomarkers <- explore_biomarkers(pathways = pathway_activities,
                                         immunecells = cell_fractions,
                                         tfs = tf_activities,