Skip to content

Commit

Permalink
TPM (no-scaling) and wmean method
Browse files Browse the repository at this point in the history
  • Loading branch information
olapuentesantana committed Mar 4, 2024
1 parent 4b40e76 commit a47431d
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 110 deletions.
8 changes: 2 additions & 6 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,16 @@ import(magrittr)
importFrom(BiocParallel,MulticoreParam)
importFrom(BiocParallel,bplapply)
importFrom(BiocParallel,register)
importFrom(DESeq2,DESeqDataSetFromMatrix)
importFrom(DESeq2,estimateDispersions)
importFrom(DESeq2,estimateSizeFactors)
importFrom(DESeq2,getVarianceStabilizedData)
importFrom(ROCR,performance)
importFrom(ROCR,plot)
importFrom(ROCR,prediction)
importFrom(coin,pvalue)
importFrom(decoupleR,get_collectri)
importFrom(decoupleR,get_dorothea)
importFrom(decoupleR,get_progeny)
importFrom(decoupleR,run_wmean)
importFrom(dplyr,filter)
importFrom(easierData,get_HGNC_annotation)
importFrom(easierData,get_TCGA_mean_pancancer)
importFrom(easierData,get_TCGA_sd_pancancer)
importFrom(easierData,get_cor_scores_genes)
importFrom(easierData,get_group_lrpairs)
importFrom(easierData,get_intercell_networks)
Expand Down
37 changes: 20 additions & 17 deletions R/compute_TF_activity.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
#' @import magrittr
#' @importFrom stats na.exclude
#' @importFrom dplyr filter
#' @importFrom decoupleR get_collectri run_wmean
#' @importFrom decoupleR get_collectri get_dorothea run_wmean
#' @importFrom tidyr pivot_wider
#' @importFrom tibble column_to_rownames
#' @importFrom easierData get_TCGA_mean_pancancer get_TCGA_sd_pancancer
#'
#' @param RNA_tpm data.frame containing TPM values with HGNC symbols
#' in rows and samples in columns.
#' @param regulon_net string indicating the regulon network to be used.
#' @param verbose logical value indicating whether to display messages
#' about the number of regulated
#' genes found in the gene expression data provided.
Expand Down Expand Up @@ -49,16 +49,15 @@
#'
#' # Computation of TF activity (Garcia-Alonso et al., Genome Res, 2019)
#' tf_activity <- compute_TF_activity(
#' RNA_tpm = RNA_tpm
#' RNA_tpm = RNA_tpm,
#' regulon_net = "collectri"
#' )
compute_TF_activity <- function(RNA_tpm = NULL,
regulon_net = c("collectri", "dorothea"),
verbose = TRUE) {
# Some checks
if (is.null(RNA_tpm)) stop("TPM gene expression data not found")

# Retrieve internal data
TCGA_mean_pancancer <- suppressMessages(easierData::get_TCGA_mean_pancancer())
TCGA_sd_pancancer <- suppressMessages(easierData::get_TCGA_sd_pancancer())
if (length(regulon_net) > 1) stop("Please select just one regulon network")

# Gene expression data
tpm <- RNA_tpm
Expand All @@ -67,23 +66,27 @@ compute_TF_activity <- function(RNA_tpm = NULL,
# HGNC symbols are required
if (any(grep("ENSG00000", genes))) stop("hgnc gene symbols are required", call. = FALSE)

# Log transformed expression matrix (log2[tpm+1]):
# expression matrix scaled and recentered.
#gene_expr <- calc_z_score(t(tpm),
# mean = TCGA_mean_pancancer,
# sd = TCGA_sd_pancancer
#)

gene_expr <- t(tpm)
# redefine gene names to match TF-target network
E <- t(gene_expr)
newNames <- gsub(".", "-", rownames(E), fixed = TRUE)
rownames(E) <- newNames

# collectTRI network
net <- decoupleR::get_collectri(organism='human', split_complexes=FALSE)
if(regulon_net == "collectri"){

net <- decoupleR::get_collectri(organism='human', split_complexes=FALSE)

# dorothea network
}else if(regulon_net == "dorothea"){

net <- decoupleR::get_dorothea(organism='human',
levels = c("A", "B", "C"),
weight_dict = list(A = 1, B = 1, C = 1, D = 1))

}

all_regulated_transcripts <- unique(net$target)
all_tfs <- unique(net$source)

# check what is the percentage of genes we have in our data
genes_kept <- intersect(rownames(E), all_regulated_transcripts)
Expand All @@ -102,7 +105,7 @@ compute_TF_activity <- function(RNA_tpm = NULL,
E <- E[!is.na(apply(E, 1, sum)), ]
E <- E[!is.infinite(apply(E, 1, sum)), ]

# TF activity: run viper
# TF activity: run wmean
tf_activity_df <- decoupleR::run_wmean(mat = E,
net = net,
.source='source',
Expand Down
129 changes: 55 additions & 74 deletions R/compute_pathway_activity.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@
#' Biophysica Acta (BBA) - Gene Regulatory Mechanisms. 2019.
#' DOI: 10.1016/j.bbagrm.2019.194431.
#'
#' @importFrom DESeq2 DESeqDataSetFromMatrix estimateSizeFactors estimateDispersions
#' getVarianceStabilizedData
#' @importFrom stats na.exclude
#' @importFrom dplyr filter
#' @importFrom progeny progeny getModel
#' @importFrom decoupleR get_progeny run_wmean
#' @importFrom tidyr pivot_wider
#' @importFrom tibble column_to_rownames
#' @importFrom easierData get_cor_scores_genes
#'
#' @param RNA_counts data.frame containing raw counts values with HGNC
#' @param RNA_tpm data.frame containing TPM values with HGNC
#' gene symbols as row names and samples identifiers as column names.
#' @param remove_sig_genes_immune_response logical value indicating
#' whether to remove signature genes involved in the derivation of
Expand Down Expand Up @@ -59,92 +61,71 @@
#' RNA_counts = RNA_counts,
#' remove_sig_genes_immune_response = TRUE
#' )
compute_pathway_activity <- function(RNA_counts = NULL,
compute_pathway_activity <- function(RNA_tpm = NULL,
remove_sig_genes_immune_response = TRUE,
verbose = TRUE) {
# Some checks
if (is.null(RNA_counts)) stop("Counts gene expression data not found")

if (is.null(RNA_tpm)) stop("TPM gene expression data not found")
# Retrieve internal data
cor_scores_genes <- suppressMessages(easierData::get_cor_scores_genes())

# Gene expression data
raw_counts <- RNA_counts
genes <- rownames(raw_counts)

tpm <- RNA_tpm
genes <- rownames(tpm)
# HGNC symbols are required
if (any(grep("ENSG00000", genes))) {
stop("Hgnc gene symbols are required",
call. = FALSE
)
}

# Remove list of genes used to build proxy's of ICB response
if (remove_sig_genes_immune_response) {
if (verbose) message("Removing signature genes of hallmarks of immune response \n")
idy <- stats::na.exclude(match(cor_scores_genes, rownames(raw_counts)))
raw_counts <- raw_counts[-idy, ]
}

# Integers are required for "DESeq2"
if (is.integer(raw_counts) == FALSE) {
raw_counts_integer <- apply(raw_counts, 2, as.integer)
rownames(raw_counts_integer) <- rownames(raw_counts)
} else {
raw_counts_integer <- raw_counts
}

# Variance stabilizing transformation (DESeq2 package)
# Integer count matrix, a data frame with the sample info,
# design =~1 to consider all samples as part of the same group.

# Column data:
colData <- data.frame(id = colnames(raw_counts_integer))

if (verbose) message("Gene counts normalization with DESeq2:")
# Construction a DESeqDataSet: (Forced all to be data.frames($ operator))
dset <- DESeq2::DESeqDataSetFromMatrix(
countData = raw_counts_integer,
colData = colData,
design = ~1
)

# Variance stabilization transformation
dset <- DESeq2::estimateSizeFactors(dset)
dset <- DESeq2::estimateDispersions(dset)
gene_expr <- DESeq2::getVarianceStabilizedData(dset)
rownames(gene_expr) <- rownames(raw_counts_integer)
if (any(grep("ENSG00000", genes))) stop("hgnc gene symbols are required", call. = FALSE)

gene_expr <- t(tpm)
# redefine gene names to match TF-target network
E <- t(gene_expr)
newNames <- gsub(".", "-", rownames(E), fixed = TRUE)
rownames(E) <- newNames

## progeny network
net <- decoupleR::get_progeny(organism = 'human', top = 100)

# Pathways activity
pathway_activity <- progeny::progeny(gene_expr,
scale = FALSE,
organism = "Human", verbose = verbose
)
all_regulated_transcripts <- unique(net$target)

# check what is the percentage of genes we have in our data
model_pathways <- progeny::getModel(organism = "Human", top = 100)
full_pathway_sig <- unique(unlist(lapply(
colnames(model_pathways),
function(pathway) {
top_genes_pathway <- rownames(model_pathways)[apply(
model_pathways, 2,
function(X) X != 0
)[, pathway]]
return(top_genes_pathway)
}
)))
genes_kept <- intersect(rownames(gene_expr), full_pathway_sig)
genes_left <- setdiff(full_pathway_sig, rownames(gene_expr))
total_genes <- length(genes_left) + length(genes_kept)
genes_kept <- intersect(rownames(E), all_regulated_transcripts)
genes_left <- setdiff(all_regulated_transcripts, rownames(E))

# check what is the percentage of regulated transcripts that we have in our data
if (verbose) {
message(
"Pathway signature genes found in data set: ",
length(genes_kept), "/",
total_genes, " (",
round(length(genes_kept) / total_genes, 3) * 100,
"%)"
"Regulated transcripts found in data set: ", length(genes_kept), "/",
length(all_regulated_transcripts), " (",
round(length(genes_kept) / length(all_regulated_transcripts), 3) * 100, "%)"
)
}

# Remove list of genes used to build proxy's of ICB response
if (remove_sig_genes_immune_response) {
if (verbose) message("Removing signature genes of hallmarks of immune response \n")
idy <- stats::na.exclude(match(cor_scores_genes, rownames(E)))
E <- E[-idy, ]
}

# Remove genes with all NA/Inf values
E <- E[!is.na(apply(E, 1, sum)), ]
E <- E[!is.infinite(apply(E, 1, sum)), ]

# Pathway activity: run wmean
pathway_activity_df <- decoupleR::run_wmean(mat = E,
net = net,
.source='source',
.target='target',
.mor='weight',
minsize = 5)
# To matrix
pathway_activity <- pathway_activity_df %>%
dplyr::filter(statistic == "wmean") %>%
tidyr::pivot_wider(id_cols = condition,
names_from = source,
values_from = score) %>%
tibble::column_to_rownames("condition")

if (verbose) message("\nPathway activity scores computed! \n")
return(as.data.frame(pathway_activity))
Expand Down
11 changes: 9 additions & 2 deletions man/compute_TF_activity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/compute_pathway_activity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 9 additions & 9 deletions vignettes/easier_user_manual.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -251,15 +251,15 @@ head(cell_fractions)

By applying PROGENy [@HOLLAND2020194431; @Schubert2018] method to count data from RNA-seq, the activity of 14 signaling pathways can be inferred as in the chunk below.
```{r, eval=TRUE}
pathway_activities <- compute_pathway_activity(RNA_counts = RNA_counts,
remove_sig_genes_immune_response = TRUE)
pathway_activities <- compute_pathway_activity(RNA_tpm = RNA_tpm,
remove_sig_genes_immune_response = FALSE)
head(pathway_activities)
```
The call above infers pathway activity as a linear transformation of gene expression data. Since some pathway signature genes were also used to compute scores of immune response (output variable in EaSIeR model). With `remove_sig_genes_immune_response` set to `TRUE` (default), the overlapping genes are removed from the pathway signature genes used to infer pathway activities.

By applying DoRothEA [@Garcia-Alonso01082019] method to TPM data from RNA-seq, the activity of 118 TFs can be inferred as follows:
```{r, eval=TRUE}
tf_activities <- compute_TF_activity(RNA_tpm = RNA_tpm)
tf_activities <- compute_TF_activity(RNA_tpm = RNA_tpm, regulon_net = "dorothea")
head(tf_activities[,1:5])
```

Expand Down Expand Up @@ -293,13 +293,13 @@ The `cancer_type` provided should match one of the cancer-specific models availa

The optimized models can be retrieved from `easierData` package using the function `get_opt_models()`.

```{r, eval=TRUE}
```{r, eval=FALSE}
predictions <- predict_immune_response(pathways = pathway_activities,
immunecells = cell_fractions,
tfs = tf_activities,
lrpairs = lrpair_weights,
ccpairs = ccpair_scores,
cancer_type = cancer_type,
cancer_type = cancer_type,
verbose = TRUE)
```

Expand All @@ -325,7 +325,7 @@ Since both immune response and TMB are essential for an effective immunotherapy

These two strategies to combine immune response and TMB require the definition of a certain weight or penalty beforehand (`weight_penalty`). The default weight or penalty is 0.5.

```{r, eval=TRUE}
```{r, eval=FALSE}
output_eval_with_resp <- assess_immune_response(predictions_immune_response = predictions,
patient_response = patient_ICBresponse,
RNA_tpm = RNA_tpm,
Expand Down Expand Up @@ -353,7 +353,7 @@ This is a usual case where we might have a cancer dataset with bulk RNA-seq data

In this likely scenario, an score of likelihood of immune response can be assigned to each patient by omitting the argument `patient_response` within the function `assess_immune_response`.

```{r, eval=TRUE}
```{r, eval=FALSE}
output_eval_no_resp <- assess_immune_response(predictions_immune_response = predictions,
TMB_values = TMB,
easier_with_TMB = "weighted_average",
Expand All @@ -376,7 +376,7 @@ output_eval_no_resp[[2]]

We can further retrieve the easier score and also, the refined scores obtained by integrating easier score and TMB via `retrieve_easier_score`.

```{r, eval=TRUE}
```{r, eval=FALSE}
easier_derived_scores <- retrieve_easier_score(predictions_immune_response = predictions,
TMB_values = TMB,
easier_with_TMB = c("weighted_average",
Expand All @@ -394,7 +394,7 @@ The option `patient_label` allows to make a two-level comparison by providing a

In order to leverage the cancer-specific biomarker weights inferred from model training, you need to specify again which `cancer_type` the bulk RNA-seq data belongs to. As before, the selected `cancer_type` should be included in the list described above.

```{r, eval=TRUE}
```{r, eval=FALSE}
output_biomarkers <- explore_biomarkers(pathways = pathway_activities,
immunecells = cell_fractions,
tfs = tf_activities,
Expand Down

0 comments on commit a47431d

Please sign in to comment.