From 7b6aa86fc89488eb46711edf0e7b919c8705f4e2 Mon Sep 17 00:00:00 2001 From: marouenbg Date: Mon, 5 Dec 2022 08:18:26 -0500 Subject: [PATCH 1/4] add puma --- R/PUMA.R | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 R/PUMA.R diff --git a/R/PUMA.R b/R/PUMA.R new file mode 100644 index 00000000..976a32fe --- /dev/null +++ b/R/PUMA.R @@ -0,0 +1,204 @@ +#' Run Python implementation PANDA in R +#' +#' \strong{PANDA}(Passing Attributes between Networks for Data Assimilation) is a message-passing model +#' to reconstruct gene regulatory network, which integrates multiple sources of biological data-including protein-protein interaction data, +#' gene expression data, and transcription factor binding motifs data to reconstruct genome-wide, condition-specific regulatory networks. +#' \href{http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0064832}{[(Glass et al. 2013)])} +#' This function is designed to run the a derived PANDA implementation in Python Library "netZooPy" \href{https://github.com/netZoo/netZooPy}{netZooPy}. +#' +#' @param expr_file Character string indicating the file path of expression values file, with each gene(in rows) across samples(in columns). +#' @param motif_file An optional character string indicating the file path of a prior transcription factor binding motifs dataset. +#' When this argument is not provided, analysis will continue with Pearson correlation matrix. +#' @param ppi_file An optional character string indicating the file path of protein-protein interaction edge dataset. +#' Also, this can be generated with a list of proteins of interest by \code{\link{sourcePPI}}. +#' +#' @param computing 'cpu' uses Central Processing Unit (CPU) to run PANDA; 'gpu' use the Graphical Processing Unit (GPU) to run PANDA. The default value is "cpu". +#' +#' @param precision 'double' computes the regulatory network in double precision (15 decimal digits); 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate. The default value is 'double'. +#' @param save_memory 'TRUE' removes temporary results from memory. The result network is weighted adjacency matrix of size (nTFs, nGenes); 'FALSE' keeps the temporary files in memory. The result network has 4 columns in the form gene - TF - weight in motif prior - PANDA edge. PANDA indegree/outdegree of panda network, only if save_memory = FALSE. The default value is 'FALSE'. +#' @param save_tmp 'TRUE' saves middle data like expression matrix and normalized networks; 'FALSE' deletes the middle data. The default value is 'TURE'. +#' @param keep_expression_matrix 'TRUE' keeps the input expression matrix as an attribute in the result Panda object.'FALSE' deletes the expression matrix attribute in the Panda object. The default value is 'FALSE'. +#' @param modeProcess 'legacy' refers to the processing mode in netZooPy<=0.5, 'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros; 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes. Default values is 'union'. +#' @param remove_missing Only when modeProcess='legacy': remove_missing='TRUE' removes all unmatched TF and genes; remove_missing='FALSE' keeps all tf and genes. The default value is 'FALSE'. +#' +#' @return When save_memory=FALSE(default), this function will return a list of three items: +#' Use \code{$panda} to access the standard output of PANDA as data frame, which consists of four columns: +#' "TF", "Gene", "Motif" using 0 or 1 to indicate if this edge belongs to prior motif dataset, and "Score". +#' +#' Use \code{$indegree} to access the indegree of PANDA network as data frame, which consists of two columns: "Gene", "Score". +#' +#' Use \code{$outdegree} to access the outdegree of PANDA network as data frame, which consists of two columns: "TF", "Score". +#' +#' When save_memory=TRUE, this function will return a weigheted adjacency matirx of size (nTFs, nGenes), use \code{$WAMpanda} to access. +#' +#' @examples + +#' # take the treated TB dataset as example here. +#' # refer to the datasets files path in inst/extdat +#' +#' treated_expression_file_path <- system.file("extdata", "expr4_matched.txt", +#' package = "netZooR", mustWork = TRUE) +#' treated_expression_file_path <- system.file("extdata", "expr4_matched.txt", +#' package = "netZooR", mustWork = TRUE) +#' motif_file_path <- system.file("extdata", "chip_matched.txt", package = "netZooR", mustWork = TRUE) +#' ppi_file_path <- system.file("extdata", "ppi_matched.txt", package = "netZooR", mustWork = TRUE) +#' +#' +#' # Run PANDA for treated and control network +#' \donttest{ +#' treated_all_panda_result <- pandaPy(expr_file = treated_expression_file_path, +#' motif_file = motif_file_path, ppi_file = ppi_file_path, +#' modeProcess="legacy", remove_missing = TRUE ) +#' +#' # access PANDA regulatory network +#' treated_net <- treated_all_panda_result$panda +#' +#' # access PANDA regulatory indegree network. +#' indegree_net <- treated_all_panda_result$indegree +#' +#' # access PANDA regulatory outdegree networks +#' outdegree_net <- treated_all_panda_result$outdegree +#' } +#' +#' @import reticulate +#' @export +#' + + +pandaPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu", precision="double",save_memory=FALSE, save_tmp=TRUE, keep_expression_matrix=FALSE, modeProcess="union", remove_missing=FALSE, with_header=FALSE){ + + if(missing(expr_file)){ + stop("Please provide the path of gene expression data file to 'expr_file' variable") } + else{ expr.str <- paste("\'", expr_file, "\'", sep = '') } + + if(is.null(motif_file)){ + motif.str <- 'None' + message("The prior motif network is not provided, so analysis continues with Pearson correlation matrix and gene coexpression mastrix is returned as a result network") + } else{ motif.str <- paste("\'", motif_file,"\'", sep = '') } + + if(is.null(ppi_file)){ + ppi.str <- 'None' + message("No protein-protein interaction network provided.") + } else{ ppi.str <- paste("\'", ppi_file, "\'", sep = '') } + + # computing variable + if(computing=="gpu"){ + computing.str <- "computing='gpu'" + } else{ computing.str <- "computing='cpu'"} + + # precision variable + if(precision == "single" ){ + precision.str <- "precision='single'" + } else{ precision.str <- "precision='double'"} + + # save_memory variable + if(save_memory==TRUE){ + savememory.str <- "save_memory= True" + } else{ savememory.str <- "save_memory=False" } + + # save_tmp variable + if(save_tmp==FALSE){ + savetmp.str <- "save_tmp= False" + } else{ savetmp.str <- "save_tmp=True" } + + # keep_expression_matrix variable + if(keep_expression_matrix==TRUE){ + keepexpression.str <- "keep_expression_matrix=True" + } else{ keepexpression.str <- "keep_expression_matrix=False" } + + # with header option + if(with_header==FALSE){ + withheader.str <- "with_header=False" + }else if (with_header==TRUE){ + withheader.str <- "with_header=True" + } + + # when pre-processing mode is legacy + if(modeProcess == "legacy"){ + + if(remove_missing == TRUE){ + message("Use the legacy mode to pre-process the input dataset and keep only the matched TFs or Genes") + mode.str <- "modeProcess ='legacy', remove_missing = True" + } else{ message("Use the legacy mode (netZooPy version <= 0.5) to pre-process the input dataset and keep all the unmatched TFs or Genes") + mode.str <- "modeProcess ='legacy', remove_missing = False"} + } + + # when pre-processing mode is union + else if(modeProcess == "union"){ + mode.str <- "modeProcess ='union'" + } + + else if(modeProcess == "intersection"){ + mode.str <- "modeProcess ='intersection'" + } + + # source the pypanda from github raw website. + pandapath <- system.file("extdata", "panda.py", package = "netZooR", mustWork = TRUE) + reticulate::source_python(pandapath,convert = TRUE) + + # invoke Python script to create a Panda object + obj.str <- paste("panda_obj=Panda(", expr.str, ",", motif.str,",", ppi.str, ",", + computing.str, ",", precision.str, ",", savememory.str, ",", savetmp.str, "," , + keepexpression.str, ",", mode.str, "," , withheader.str, ")", sep ='') + + # run Python code + py_run_string(obj.str) + # run PAMDA + if(save_memory == FALSE){ + + py_run_string("panda_network=panda_obj.export_panda_results",local = FALSE, convert = TRUE) + # convert python object to R vector + panda_net <- py$panda_network + + # re-assign data type + panda_net$tf <- as.character(panda_net$tf) + panda_net$gene <- as.character(panda_net$gene) + if("motif" %in% names(panda_net)){ + panda_net$motif <- as.numeric(panda_net$motif) + } + panda_net$force <- as.numeric(panda_net$force) + if("motif" %in% names(panda_net)){ + # adjust column order + panda_net <- panda_net[,c("tf","gene","motif","force")] + # rename the PANDA output colnames + colnames(panda_net) <- c("TF","Gene","Motif","Score") + }else{ + # adjust column order + panda_net <- panda_net[,c("tf","gene","force")] + # rename the PANDA output colnames + colnames(panda_net) <- c("TF","Gene","Score") + } + + + # in-degree of panda network + py_run_string(paste("indegree=panda_obj.return_panda_indegree()")) + indegree_net <- py$indegree + indegree_net <- as.data.frame(cbind(Target = rownames(indegree_net), Target_Score = indegree_net$force), stringsAsFactors =FALSE) + indegree_net$`Target_Score` <- as.numeric(indegree_net$`Target_Score`) + + # out-degree of panda netwook + py_run_string(paste("outdegree=panda_obj.return_panda_outdegree()")) + outdegree_net <- py$outdegree + outdegree_net <- as.data.frame(cbind(Regulator = rownames(outdegree_net), Regulator_Score = outdegree_net$force), stringsAsFactors =FALSE) + outdegree_net$`Regulator_Score` <- as.numeric(outdegree_net$`Regulator_Score`) + + if( length(intersect(panda_net$Gene, panda_net$TF))>0){ + panda_net$TF <- paste('reg_', panda_net$TF, sep='') + panda_net$Gene <- paste('tar_', panda_net$Gene, sep='') + message("Rename the content of first two columns with prefix 'reg_' and 'tar_' as there are some duplicate node names between the first two columns" ) + } + + output <- list("panda" = panda_net, "indegree" = indegree_net, "outdegree" = outdegree_net) + + + } else{ py_run_string("panda_network=panda_obj.panda_network",local = FALSE, convert = TRUE) + panda_net <- py$panda_network + # weighted adjacency matrix of PANDA network + output <- list("WAMpanda" = panda_net) + } + + message ("...Finish PANDA...") + return(output) +} + + From b314dd65f86d8a3116dcd2f0d4dc7342bcaa73d3 Mon Sep 17 00:00:00 2001 From: marouenbg Date: Sun, 11 Dec 2022 17:29:38 -0500 Subject: [PATCH 2/4] update PUMA --- R/PUMA.R | 441 ++++++++++++++++++++++++++++++++---------------------- README.md | 4 + 2 files changed, 266 insertions(+), 179 deletions(-) diff --git a/R/PUMA.R b/R/PUMA.R index 976a32fe..aed97231 100644 --- a/R/PUMA.R +++ b/R/PUMA.R @@ -1,204 +1,287 @@ -#' Run Python implementation PANDA in R -#' -#' \strong{PANDA}(Passing Attributes between Networks for Data Assimilation) is a message-passing model -#' to reconstruct gene regulatory network, which integrates multiple sources of biological data-including protein-protein interaction data, -#' gene expression data, and transcription factor binding motifs data to reconstruct genome-wide, condition-specific regulatory networks. -#' \href{http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0064832}{[(Glass et al. 2013)])} -#' This function is designed to run the a derived PANDA implementation in Python Library "netZooPy" \href{https://github.com/netZoo/netZooPy}{netZooPy}. +#' PANDA using microRNA associations #' -#' @param expr_file Character string indicating the file path of expression values file, with each gene(in rows) across samples(in columns). -#' @param motif_file An optional character string indicating the file path of a prior transcription factor binding motifs dataset. -#' When this argument is not provided, analysis will continue with Pearson correlation matrix. -#' @param ppi_file An optional character string indicating the file path of protein-protein interaction edge dataset. -#' Also, this can be generated with a list of proteins of interest by \code{\link{sourcePPI}}. -#' -#' @param computing 'cpu' uses Central Processing Unit (CPU) to run PANDA; 'gpu' use the Graphical Processing Unit (GPU) to run PANDA. The default value is "cpu". -#' -#' @param precision 'double' computes the regulatory network in double precision (15 decimal digits); 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate. The default value is 'double'. -#' @param save_memory 'TRUE' removes temporary results from memory. The result network is weighted adjacency matrix of size (nTFs, nGenes); 'FALSE' keeps the temporary files in memory. The result network has 4 columns in the form gene - TF - weight in motif prior - PANDA edge. PANDA indegree/outdegree of panda network, only if save_memory = FALSE. The default value is 'FALSE'. -#' @param save_tmp 'TRUE' saves middle data like expression matrix and normalized networks; 'FALSE' deletes the middle data. The default value is 'TURE'. -#' @param keep_expression_matrix 'TRUE' keeps the input expression matrix as an attribute in the result Panda object.'FALSE' deletes the expression matrix attribute in the Panda object. The default value is 'FALSE'. -#' @param modeProcess 'legacy' refers to the processing mode in netZooPy<=0.5, 'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros; 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes. Default values is 'union'. -#' @param remove_missing Only when modeProcess='legacy': remove_missing='TRUE' removes all unmatched TF and genes; remove_missing='FALSE' keeps all tf and genes. The default value is 'FALSE'. -#' -#' @return When save_memory=FALSE(default), this function will return a list of three items: -#' Use \code{$panda} to access the standard output of PANDA as data frame, which consists of four columns: -#' "TF", "Gene", "Motif" using 0 or 1 to indicate if this edge belongs to prior motif dataset, and "Score". -#' -#' Use \code{$indegree} to access the indegree of PANDA network as data frame, which consists of two columns: "Gene", "Score". -#' -#' Use \code{$outdegree} to access the outdegree of PANDA network as data frame, which consists of two columns: "TF", "Score". -#' -#' When save_memory=TRUE, this function will return a weigheted adjacency matirx of size (nTFs, nGenes), use \code{$WAMpanda} to access. -#' -#' @examples - -#' # take the treated TB dataset as example here. -#' # refer to the datasets files path in inst/extdat -#' -#' treated_expression_file_path <- system.file("extdata", "expr4_matched.txt", -#' package = "netZooR", mustWork = TRUE) -#' treated_expression_file_path <- system.file("extdata", "expr4_matched.txt", -#' package = "netZooR", mustWork = TRUE) -#' motif_file_path <- system.file("extdata", "chip_matched.txt", package = "netZooR", mustWork = TRUE) -#' ppi_file_path <- system.file("extdata", "ppi_matched.txt", package = "netZooR", mustWork = TRUE) -#' -#' -#' # Run PANDA for treated and control network -#' \donttest{ -#' treated_all_panda_result <- pandaPy(expr_file = treated_expression_file_path, -#' motif_file = motif_file_path, ppi_file = ppi_file_path, -#' modeProcess="legacy", remove_missing = TRUE ) -#' -#' # access PANDA regulatory network -#' treated_net <- treated_all_panda_result$panda -#' -#' # access PANDA regulatory indegree network. -#' indegree_net <- treated_all_panda_result$indegree -#' -#' # access PANDA regulatory outdegree networks -#' outdegree_net <- treated_all_panda_result$outdegree -#' } -#' -#' @import reticulate -#' @export +#' This function runs the PUMA algorithm #' - - -pandaPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu", precision="double",save_memory=FALSE, save_tmp=TRUE, keep_expression_matrix=FALSE, modeProcess="union", remove_missing=FALSE, with_header=FALSE){ - - if(missing(expr_file)){ - stop("Please provide the path of gene expression data file to 'expr_file' variable") } - else{ expr.str <- paste("\'", expr_file, "\'", sep = '') } - - if(is.null(motif_file)){ - motif.str <- 'None' - message("The prior motif network is not provided, so analysis continues with Pearson correlation matrix and gene coexpression mastrix is returned as a result network") - } else{ motif.str <- paste("\'", motif_file,"\'", sep = '') } - - if(is.null(ppi_file)){ - ppi.str <- 'None' - message("No protein-protein interaction network provided.") - } else{ ppi.str <- paste("\'", ppi_file, "\'", sep = '') } - - # computing variable - if(computing=="gpu"){ - computing.str <- "computing='gpu'" - } else{ computing.str <- "computing='cpu'"} +#' @param motif A motif dataset, a data.frame, matrix or exprSet containing 3 columns. +#' Each row describes an motif associated with a transcription factor (column 1) a +#' gene (column 2) and a score (column 3) for the motif. +#' @param expr An expression dataset, as a genes (rows) by samples (columns) data.frame +#' @param ppi A Protein-Protein interaction dataset, a data.frame containing 3 columns. +#' Each row describes a protein-protein interaction between transcription factor 1(column 1), +#' transcription factor 2 (column 2) and a score (column 3) for the interaction. +#' @param alpha value to be used for update variable, alpha (default=0.1) +#' @param hamming value at which to terminate the process based on hamming distance (default 10^-3) +#' @param iter sets the maximum number of iterations PUMA can run before exiting. +#' @param progress Boolean to indicate printing of output for algorithm progress. +#' @param output a vector containing which networks to return. Options include "regulatory", +#' "coregulatory", "cooperative". +#' @param zScale Boolean to indicate use of z-scores in output. False will use [0,1] scale. +#' @param randomize method by which to randomize gene expression matrix. Default "None". Must +#' be one of "None", "within.gene", "by.genes". "within.gene" randomization scrambles each row +#' of the gene expression matrix, "by.gene" scrambles gene labels. +#' @param cor.method Correlation method, default is "pearson". +#' @param scale.by.present Boolean to indicate scaling of correlations by percentage of positive samples. +#' @param remove.missing.ppi Boolean to indicate whether TFs in the PPI but not in the motif data should be +#' removed. Only when mode=='legacy'. +#' @param remove.missing.motif Boolean to indicate whether genes targeted in the motif data but not the +#' expression data should be removed. Only when mode=='legacy'. +#' @param remove.missing.genes Boolean to indicate whether genes in the expression data but lacking +#' information from the motif prior should be removed. Only when mode=='legacy'. +#' @param edgelist Boolean to indicate if edge lists instead of matrices should be returned. +#' @param mode The data alignment mode. The mode 'union' takes the union of the genes in the expression matrix and the motif +#' and the union of TFs in the ppi and motif and fills the matrics with zeros for nonintersecting TFs and gens, 'intersection' +#' takes the intersection of genes and TFs and removes nonintersecting sets, 'legacy' is the old behavior with version 1.19.3. +#' #' Parameters remove.missing.ppi, remove.missingmotif, remove.missing.genes work only with mode=='legacy'. +#' @keywords keywords +#' @importFrom matrixStats rowSds +#' @importFrom matrixStats colSds +#' @importFrom Biobase assayData +#' @importFrom reshape melt.array +#' @export +#' @return An object of class "panda" containing matrices describing networks achieved by convergence +#' with PUMA algorithm.\cr +#' "regNet" is the regulatory network\cr +#' "coregNet" is the coregulatory network\cr +#' "coopNet" is the cooperative network +#' @examples +#' data(pandaToyData) +#' pumaRes <- puma(pandaToyData$motif, +#' pandaToyData$expression,pandaToyData$ppi,hamming=.1,progress=TRUE) +#' @references +#' Kuijjer, Marieke L., et al. "PUMA: PANDA using microRNA associations." Bioinformatics 36.18 (2020): 4765-4773. +puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,hamming=0.001, + iter=NA,output=c('regulatory','coexpression','cooperative'), + zScale=TRUE,progress=FALSE,randomize=c("None", "within.gene", "by.gene"),cor.method="pearson", + scale.by.present=FALSE,edgelist=FALSE,remove.missing.ppi=FALSE, + remove.missing.motif=FALSE,remove.missing.genes=FALSE,mode="union"){ - # precision variable - if(precision == "single" ){ - precision.str <- "precision='single'" - } else{ precision.str <- "precision='double'"} + randomize <- match.arg(randomize) + if(progress) + print('Initializing and validating') - # save_memory variable - if(save_memory==TRUE){ - savememory.str <- "save_memory= True" - } else{ savememory.str <- "save_memory=False" } + if(class(expr)=="ExpressionSet") + expr <- assayData(expr)[["exprs"]] - # save_tmp variable - if(save_tmp==FALSE){ - savetmp.str <- "save_tmp= False" - } else{ savetmp.str <- "save_tmp=True" } + if (is.null(expr)){ + # Use only the motif data here for the gene list + num.conditions <- 0 + if (randomize!="None"){ + warning("Randomization ignored because gene expression is not used.") + randomize <- "None" + } + } else { + if(mode=='legacy'){ + if(remove.missing.genes){ + # remove genes from expression data that are not in the motif data + n <- nrow(expr) + expr <- expr[which(rownames(expr)%in%motif[,2]),] + message(sprintf("%s genes removed that were not present in motif", n-nrow(expr))) + } + if(remove.missing.motif){ + # remove genes from motif data that are not in the expression data + n <- nrow(motif) + motif <- motif[which(motif[,2]%in%rownames(expr)),] + message(sprintf("%s motif edges removed that targeted genes missing in expression data", n-nrow(motif))) + } + # Use the motif data AND the expr data (if provided) for the gene list + # Keep everything sorted alphabetically + expr <- expr[order(rownames(expr)),] + }else if(mode=='union'){ + gene.names=unique(union(rownames(expr),unique(motif[,2]))) + tf.names =unique(union(unique(ppi[,1]),unique(motif[,1]))) + num.TFs <- length(tf.names) + num.genes <- length(gene.names) + # gene expression matrix + expr1=as.data.frame(matrix(0,num.genes,ncol(expr))) + rownames(expr1)=gene.names + expr1[which(gene.names%in%rownames(expr)),]=expr[] + expr=expr1 + #PPI matrix + tfCoopNetwork <- matrix(0,num.TFs,num.TFs) + colnames(tfCoopNetwork)=tf.names + rownames(tfCoopNetwork)=tf.names + Idx1 <- match(ppi[,1], tf.names); + Idx2 <- match(ppi[,2], tf.names); + Idx <- (Idx2-1)*num.TFs+Idx1; + tfCoopNetwork[Idx] <- ppi[,3]; + Idx <- (Idx1-1)*num.TFs+Idx2; + tfCoopNetwork[Idx] <- ppi[,3]; + #Motif matrix + regulatoryNetwork=matrix(0,num.TFs,num.genes) + colnames(regulatoryNetwork)=gene.names + rownames(regulatoryNetwork)=tf.names + Idx1=match(motif[,1], tf.names); + Idx2=match(motif[,2], gene.names); + Idx=(Idx2-1)*num.TFs+Idx1; + regulatoryNetwork[Idx]=motif[,3] + }else if(mode=='intersection'){ + gene.names=unique(intersect(rownames(expr),unique(motif[,2]))) + tf.names =unique(intersect(unique(ppi[,1]),unique(motif[,1]))) + num.TFs <- length(tf.names) + num.genes <- length(gene.names) + # gene expression matrix + expr1=as.data.frame(matrix(0,num.genes,ncol(expr))) + rownames(expr1)=gene.names + interGeneNames=gene.names[which(gene.names%in%rownames(expr))] + expr1[interGeneNames,]=expr[interGeneNames,] + expr=expr1 + #PPI matrix + tfCoopNetwork <- matrix(0,num.TFs,num.TFs) + colnames(tfCoopNetwork)=tf.names + rownames(tfCoopNetwork)=tf.names + Idx1 <- match(ppi[,1], tf.names); + Idx2 <- match(ppi[,2], tf.names); + Idx <- (Idx2-1)*num.TFs+Idx1; + indIdx=!is.na(Idx) + Idx=Idx[indIdx] #remove missing TFs + tfCoopNetwork[Idx] <- ppi[indIdx,3]; + Idx <- (Idx1-1)*num.TFs+Idx2; + indIdx=!is.na(Idx) + Idx=Idx[indIdx] #remove missing TFs + tfCoopNetwork[Idx] <- ppi[indIdx,3]; + #Motif matrix + regulatoryNetwork=matrix(0,num.TFs,num.genes) + colnames(regulatoryNetwork)=gene.names + rownames(regulatoryNetwork)=tf.names + Idx1=match(motif[,1], tf.names); + Idx2=match(motif[,2], gene.names); + Idx=(Idx2-1)*num.TFs+Idx1; + indIdx=!is.na(Idx) + Idx=Idx[indIdx] #remove missing genes + regulatoryNetwork[Idx]=motif[indIdx,3]; + } + num.conditions <- ncol(expr) + if (randomize=='within.gene'){ + expr <- t(apply(expr, 1, sample)) + if(progress) + print("Randomizing by reordering each gene's expression") + } else if (randomize=='by.gene'){ + rownames(expr) <- sample(rownames(expr)) + expr <- expr[order(rownames(expr)),] + if(progress) + print("Randomizing by reordering each gene labels") + } + } - # keep_expression_matrix variable - if(keep_expression_matrix==TRUE){ - keepexpression.str <- "keep_expression_matrix=True" - } else{ keepexpression.str <- "keep_expression_matrix=False" } + if (mode=='legacy'){ + # Create vectors for TF names and Gene names from motif dataset + tf.names <- sort(unique(motif[,1])) + gene.names <- sort(unique(rownames(expr))) + num.TFs <- length(tf.names) + num.genes <- length(gene.names) + } - # with header option - if(with_header==FALSE){ - withheader.str <- "with_header=False" - }else if (with_header==TRUE){ - withheader.str <- "with_header=True" + # Bad data checking + if (num.genes==0){ + stop("Error validating data. No matched genes.\n Please ensure that gene names in expression data match gene names in motif data") } - # when pre-processing mode is legacy - if(modeProcess == "legacy"){ + if(num.conditions==0) { + warning('No expression data given. PUMA will run based on an identity co-regulation matrix') + geneCoreg <- diag(num.genes) + } else if(num.conditions<3) { + warning('Not enough expression conditions detected to calculate correlation. Co-regulation network will be initialized to an identity matrix.') + geneCoreg <- diag(num.genes) + } else { - if(remove_missing == TRUE){ - message("Use the legacy mode to pre-process the input dataset and keep only the matched TFs or Genes") - mode.str <- "modeProcess ='legacy', remove_missing = True" - } else{ message("Use the legacy mode (netZooPy version <= 0.5) to pre-process the input dataset and keep all the unmatched TFs or Genes") - mode.str <- "modeProcess ='legacy', remove_missing = False"} + if(scale.by.present){ + num.positive=(expr>0)%*%t((expr>0)) + geneCoreg <- cor(t(expr), method=cor.method, use="pairwise.complete.obs")*(num.positive/num.conditions) + } else { + geneCoreg <- cor(t(expr), method=cor.method, use="pairwise.complete.obs") + } + if(progress) + print('Verified sufficient samples') + } + if (any(is.na(geneCoreg))){ #check for NA and replace them by zero + diag(geneCoreg)=1 + geneCoreg[is.na(geneCoreg)]=0 } - # when pre-processing mode is union - else if(modeProcess == "union"){ - mode.str <- "modeProcess ='union'" + if (any(duplicated(motif))) { + warning("Duplicate edges have been found in the motif data. Weights will be summed.") + motif <- aggregate(motif[,3], by=list(motif[,1], motif[,2]), FUN=sum) } - else if(modeProcess == "intersection"){ - mode.str <- "modeProcess ='intersection'" + # Prior Regulatory Network + if(mode=='legacy'){ + Idx1=match(motif[,1], tf.names); + Idx2=match(motif[,2], gene.names); + Idx=(Idx2-1)*num.TFs+Idx1; + regulatoryNetwork=matrix(data=0, num.TFs, num.genes); + regulatoryNetwork[Idx]=motif[,3] + colnames(regulatoryNetwork) <- gene.names + rownames(regulatoryNetwork) <- tf.names + # PPI data + # If no ppi data is given, we use the identity matrix + tfCoopNetwork <- diag(num.TFs) + # Else we convert our two-column data.frame to a matrix + if (!is.null(ppi)){ + if(any(duplicated(ppi))){ + warning("Duplicate edges have been found in the PPI data. Weights will be summed.") + ppi <- aggregate(ppi[,3], by=list(ppi[,1], ppi[,2]), FUN=sum) + } + if(remove.missing.ppi){ + # remove edges in the PPI data that target TFs not in the motif + n <- nrow(ppi) + ppi <- ppi[which(ppi[,1]%in%tf.names & ppi[,2]%in%tf.names),] + message(sprintf("%s PPI edges removed that were not present in motif", n-nrow(ppi))) + } + Idx1 <- match(ppi[,1], tf.names); + Idx2 <- match(ppi[,2], tf.names); + Idx <- (Idx2-1)*num.TFs+Idx1; + tfCoopNetwork[Idx] <- ppi[,3]; + Idx <- (Idx1-1)*num.TFs+Idx2; + tfCoopNetwork[Idx] <- ppi[,3]; + } + colnames(tfCoopNetwork) <- tf.names + rownames(tfCoopNetwork) <- tf.names } - # source the pypanda from github raw website. - pandapath <- system.file("extdata", "panda.py", package = "netZooR", mustWork = TRUE) - reticulate::source_python(pandapath,convert = TRUE) + ## Run PUMA ## + tic=proc.time()[3] - # invoke Python script to create a Panda object - obj.str <- paste("panda_obj=Panda(", expr.str, ",", motif.str,",", ppi.str, ",", - computing.str, ",", precision.str, ",", savememory.str, ",", savetmp.str, "," , - keepexpression.str, ",", mode.str, "," , withheader.str, ")", sep ='') + if(progress) + print('Normalizing networks...') + regulatoryNetwork = normalizeNetwork(regulatoryNetwork) + tfCoopNetwork = normalizeNetwork(tfCoopNetwork) + geneCoreg = normalizeNetwork(geneCoreg) - # run Python code - py_run_string(obj.str) - # run PAMDA - if(save_memory == FALSE){ - - py_run_string("panda_network=panda_obj.export_panda_results",local = FALSE, convert = TRUE) - # convert python object to R vector - panda_net <- py$panda_network - - # re-assign data type - panda_net$tf <- as.character(panda_net$tf) - panda_net$gene <- as.character(panda_net$gene) - if("motif" %in% names(panda_net)){ - panda_net$motif <- as.numeric(panda_net$motif) - } - panda_net$force <- as.numeric(panda_net$force) - if("motif" %in% names(panda_net)){ - # adjust column order - panda_net <- panda_net[,c("tf","gene","motif","force")] - # rename the PANDA output colnames - colnames(panda_net) <- c("TF","Gene","Motif","Score") - }else{ - # adjust column order - panda_net <- panda_net[,c("tf","gene","force")] - # rename the PANDA output colnames - colnames(panda_net) <- c("TF","Gene","Score") + if(progress) + print('Learning Network...') + + minusAlpha = 1-alpha + step=0 + hamming_cur=1 + if(progress) + print("Using tanimoto similarity") + while(hamming_cur>hamming){ + if ((!is.na(iter))&&step>=iter){ + print(paste("Reached maximum iterations, iter =",iter),sep="") + break } + Responsibility=tanimoto(tfCoopNetwork, regulatoryNetwork) + Availability=tanimoto(regulatoryNetwork, geneCoreg) + RA = 0.5*(Responsibility+Availability) + hamming_cur=sum(abs(regulatoryNetwork-RA))/(num.TFs*num.genes) + regulatoryNetwork=minusAlpha*regulatoryNetwork + alpha*RA - # in-degree of panda network - py_run_string(paste("indegree=panda_obj.return_panda_indegree()")) - indegree_net <- py$indegree - indegree_net <- as.data.frame(cbind(Target = rownames(indegree_net), Target_Score = indegree_net$force), stringsAsFactors =FALSE) - indegree_net$`Target_Score` <- as.numeric(indegree_net$`Target_Score`) - - # out-degree of panda netwook - py_run_string(paste("outdegree=panda_obj.return_panda_outdegree()")) - outdegree_net <- py$outdegree - outdegree_net <- as.data.frame(cbind(Regulator = rownames(outdegree_net), Regulator_Score = outdegree_net$force), stringsAsFactors =FALSE) - outdegree_net$`Regulator_Score` <- as.numeric(outdegree_net$`Regulator_Score`) - - if( length(intersect(panda_net$Gene, panda_net$TF))>0){ - panda_net$TF <- paste('reg_', panda_net$TF, sep='') - panda_net$Gene <- paste('tar_', panda_net$Gene, sep='') - message("Rename the content of first two columns with prefix 'reg_' and 'tar_' as there are some duplicate node names between the first two columns" ) - } - - output <- list("panda" = panda_net, "indegree" = indegree_net, "outdegree" = outdegree_net) + ppi=tanimoto(regulatoryNetwork, t(regulatoryNetwork)) + ppi=update.diagonal(ppi, num.TFs, alpha, step) + tfCoopNetwork=minusAlpha*tfCoopNetwork + alpha*ppi + CoReg2=tanimoto(t(regulatoryNetwork), regulatoryNetwork) + CoReg2=update.diagonal(CoReg2, num.genes, alpha, step) + geneCoreg=minusAlpha*geneCoreg + alpha*CoReg2 - } else{ py_run_string("panda_network=panda_obj.panda_network",local = FALSE, convert = TRUE) - panda_net <- py$panda_network - # weighted adjacency matrix of PANDA network - output <- list("WAMpanda" = panda_net) + if(progress) + message("Iteration", step,": hamming distance =", round(hamming_cur,5)) + step=step+1 } - message ("...Finish PANDA...") - return(output) -} - - + toc=proc.time()[3] - tic + if(progress) + message("Successfully ran PUMA on ", num.genes, " Genes and ", num.TFs, " TFs.\nTime elapsed:", round(toc,2), "seconds.") + prepResult(zScale, output, regulatoryNetwork, geneCoreg, tfCoopNetwork, edgelist, motif) +} \ No newline at end of file diff --git a/README.md b/README.md index c141a136..bb443224 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,11 @@ netZooR currently integrates: * **YARN** (Yet Another RNa-seq package) [[Paulsson et al.]](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-017-1847-x): YARN is a package that combines quality control, gene filtering, and normalization steps to streamline the preprocessing of large-scale, multi-tissue gene expression data from resources such as the Genotype-Tissue Expression (GTEx) project. Among other steps, YARN uses principal coordinate analysis (PCoA) to determine if samples collected from different sites on the same tissue (for example, transverse and sigmoid colon) can be treated as "transcriptionally indistinguishable" and grouped together to increase power for downstream analyses. Paulsson et al. (2017) demonstrate the use of YARN to develop a pan-cancer RNA-seq dataset for 30,333 genes from 9435 samples across 38 tissues from the GTEx dataset. +
+PUMA +* **PUMA** (PANDA using microRNA associations) [[Kuijjer et al.]](https://academic.oup.com/bioinformatics/article/36/18/4765/5858977?login=false) netZooR also integrates additional functions to: +
* Source protein-protein interaction network from [STRINGdb](https://string-db.org/) based on a list of protein of interest. From ccbb5915557f6d57794e8ae5428734db59d02126 Mon Sep 17 00:00:00 2001 From: marouenbg Date: Mon, 19 Dec 2022 11:29:16 -0500 Subject: [PATCH 3/4] updating mirna interactions --- R/PUMA.R | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/R/PUMA.R b/R/PUMA.R index aed97231..53d6cfb6 100644 --- a/R/PUMA.R +++ b/R/PUMA.R @@ -1,15 +1,15 @@ #' PANDA using microRNA associations #' -#' This function runs the PUMA algorithm +#' This function runs the PUMA algorithm to predict a miRNA-gene regulatory network #' -#' @param motif A motif dataset, a data.frame, matrix or exprSet containing 3 columns. -#' Each row describes an motif associated with a transcription factor (column 1) a -#' gene (column 2) and a score (column 3) for the motif. +#' @param motif A miRNA target dataset, a data.frame, matrix or exprSet containing 3 columns. +#' Each row describes the association between a miRNA (column 1) its target +#' gene (column 2) and a score (column 3) for the association from TargetScan or miRanda #' @param expr An expression dataset, as a genes (rows) by samples (columns) data.frame -#' @param ppi A Protein-Protein interaction dataset, a data.frame containing 3 columns. -#' Each row describes a protein-protein interaction between transcription factor 1(column 1), -#' transcription factor 2 (column 2) and a score (column 3) for the interaction. +#' @param ppi This can be set to 1) NULL which will be encoded as an identity matrix between miRNAs in PUMA for now. +#' Or 2) it can include a set of TF interactions, or 3) a mix of TFs and miRNAs. #' @param alpha value to be used for update variable, alpha (default=0.1) +#' @param mir_file list of miRNA to filter the PPI matrix and prevent update of miRNA edges. #' @param hamming value at which to terminate the process based on hamming distance (default 10^-3) #' @param iter sets the maximum number of iterations PUMA can run before exiting. #' @param progress Boolean to indicate printing of output for algorithm progress. @@ -21,7 +21,7 @@ #' of the gene expression matrix, "by.gene" scrambles gene labels. #' @param cor.method Correlation method, default is "pearson". #' @param scale.by.present Boolean to indicate scaling of correlations by percentage of positive samples. -#' @param remove.missing.ppi Boolean to indicate whether TFs in the PPI but not in the motif data should be +#' @param remove.missing.ppi Boolean to indicate whether miRNAs in the PPI but not in the motif data should be #' removed. Only when mode=='legacy'. #' @param remove.missing.motif Boolean to indicate whether genes targeted in the motif data but not the #' expression data should be removed. Only when mode=='legacy'. @@ -42,14 +42,14 @@ #' with PUMA algorithm.\cr #' "regNet" is the regulatory network\cr #' "coregNet" is the coregulatory network\cr -#' "coopNet" is the cooperative network +#' "coopNet" is the cooperative network which is not updated for miRNAs #' @examples #' data(pandaToyData) #' pumaRes <- puma(pandaToyData$motif, -#' pandaToyData$expression,pandaToyData$ppi,hamming=.1,progress=TRUE) +#' pandaToyData$expression,NULL,hamming=.1,progress=TRUE) #' @references #' Kuijjer, Marieke L., et al. "PUMA: PANDA using microRNA associations." Bioinformatics 36.18 (2020): 4765-4773. -puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,hamming=0.001, +puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,mir_file,hamming=0.001, iter=NA,output=c('regulatory','coexpression','cooperative'), zScale=TRUE,progress=FALSE,randomize=c("None", "within.gene", "by.gene"),cor.method="pearson", scale.by.present=FALSE,edgelist=FALSE,remove.missing.ppi=FALSE, @@ -133,11 +133,11 @@ puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,hamming=0.001, Idx2 <- match(ppi[,2], tf.names); Idx <- (Idx2-1)*num.TFs+Idx1; indIdx=!is.na(Idx) - Idx=Idx[indIdx] #remove missing TFs + Idx=Idx[indIdx] #remove missing miRNAs tfCoopNetwork[Idx] <- ppi[indIdx,3]; Idx <- (Idx1-1)*num.TFs+Idx2; indIdx=!is.na(Idx) - Idx=Idx[indIdx] #remove missing TFs + Idx=Idx[indIdx] #remove missing miRNAs tfCoopNetwork[Idx] <- ppi[indIdx,3]; #Motif matrix regulatoryNetwork=matrix(0,num.TFs,num.genes) @@ -177,7 +177,7 @@ puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,hamming=0.001, } if(num.conditions==0) { - warning('No expression data given. PUMA will run based on an identity co-regulation matrix') + warning('No expression data given. PUMA will run based on an identity co-regulation matrix') geneCoreg <- diag(num.genes) } else if(num.conditions<3) { warning('Not enough expression conditions detected to calculate correlation. Co-regulation network will be initialized to an identity matrix.') @@ -238,6 +238,15 @@ puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,hamming=0.001, rownames(tfCoopNetwork) <- tf.names } + if(mir_file != NULL){ + mirIndex = match(mir_file,tf.names) + tfCoopNetwork[mirIndex,] = 0 + tfCoopNetwork[,mirIndex] = 0 + seqs = seq(1, num.tfs*num.tfs, num.tfs+1) + tfCoopNetwork[seqs] <- 1 + # tfCoopNetwork has now a diagonal of 1 and all entries are zeros + # for miRNA-miRNA interactions and TF-miRNA interactions + } ## Run PUMA ## tic=proc.time()[3] @@ -255,6 +264,9 @@ puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,hamming=0.001, hamming_cur=1 if(progress) print("Using tanimoto similarity") + + TFCoopInit = tfCoopNetwork # Save normalized cooperativity network + while(hamming_cur>hamming){ if ((!is.na(iter))&&step>=iter){ print(paste("Reached maximum iterations, iter =",iter),sep="") @@ -275,6 +287,13 @@ puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,hamming=0.001, CoReg2=update.diagonal(CoReg2, num.genes, alpha, step) geneCoreg=minusAlpha*geneCoreg + alpha*CoReg2 + #PUMA step to skip update of PPI matrix for miRNA interactions + seqs = seq(1, num.tfs*num.tfs, num.tfs+1) + savediag = tfCoopNetwork[seqs] # save diagonal + tfCoopNetwork[mirIndex,] <- TFCoopInit[mirIndex,] + tfCoopNetwork[,mirIndex] <- TFCoopInit[,mirIndex] + tfCoopNetwork[seqs] <- savediag # put back saved diagonal + if(progress) message("Iteration", step,": hamming distance =", round(hamming_cur,5)) step=step+1 @@ -282,6 +301,6 @@ puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,hamming=0.001, toc=proc.time()[3] - tic if(progress) - message("Successfully ran PUMA on ", num.genes, " Genes and ", num.TFs, " TFs.\nTime elapsed:", round(toc,2), "seconds.") + message("Successfully ran PUMA on ", num.genes, " Genes and ", num.TFs, " miRNAs.\nTime elapsed:", round(toc,2), "seconds.") prepResult(zScale, output, regulatoryNetwork, geneCoreg, tfCoopNetwork, edgelist, motif) } \ No newline at end of file From 02d3ddc17bdf9949b2a2ed20b0f765e36d70c473 Mon Sep 17 00:00:00 2001 From: marouenbg Date: Tue, 20 Dec 2022 17:51:41 -0500 Subject: [PATCH 4/4] add PUMA in readme --- R/PANDA.R | 1 + README.md | 33 +++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/R/PANDA.R b/R/PANDA.R index 976a32fe..f46c2286 100644 --- a/R/PANDA.R +++ b/R/PANDA.R @@ -20,6 +20,7 @@ #' @param keep_expression_matrix 'TRUE' keeps the input expression matrix as an attribute in the result Panda object.'FALSE' deletes the expression matrix attribute in the Panda object. The default value is 'FALSE'. #' @param modeProcess 'legacy' refers to the processing mode in netZooPy<=0.5, 'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros; 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes. Default values is 'union'. #' @param remove_missing Only when modeProcess='legacy': remove_missing='TRUE' removes all unmatched TF and genes; remove_missing='FALSE' keeps all tf and genes. The default value is 'FALSE'. +#' @param with_header Boolean to read gene expression file with a header for sample names #' #' @return When save_memory=FALSE(default), this function will return a list of three items: #' Use \code{$panda} to access the standard output of PANDA as data frame, which consists of four columns: diff --git a/README.md b/README.md index bb443224..4e496ca1 100644 --- a/README.md +++ b/README.md @@ -19,60 +19,69 @@ netZooR currently integrates:
PANDA -* **PANDA** (Passing Attributes between Networks for Data Assimilation) [[Glass et al. 2013]](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0064832): PANDA is a method for estimating bipartite gene regulatory networks (GRNs) consisting of two types of nodes: transcription factors (TFs) and genes. An edge between TF $i$ and gene $j$ indicates that gene $j$ is regulated by TF $i$. The edge weight represents the strength of evidence for this regulatory relationship obtained by integrating three types of biological data: gene expression data, protein-protein interaction (PPI) data, and transcription factor binding motif (TFBM) data. PANDA is an iterative approach that begins with a seed GRN estimated from TFBMs and uses message passing between data types to refine the seed network to a final GRN that is consistent with the information contained in gene expression, PPI, and TFBM data. +PANDA (Passing Attributes between Networks for Data Assimilation) Glass et al. 2013: PANDA is a method for estimating bipartite gene regulatory networks (GRNs) consisting of two types of nodes: transcription factors (TFs) and genes. An edge between TF $i$ and gene $j$ indicates that gene $j$ is regulated by TF $i$. The edge weight represents the strength of evidence for this regulatory relationship obtained by integrating three types of biological data: gene expression data, protein-protein interaction (PPI) data, and transcription factor binding motif (TFBM) data. PANDA is an iterative approach that begins with a seed GRN estimated from TFBMs and uses message passing between data types to refine the seed network to a final GRN that is consistent with the information contained in gene expression, PPI, and TFBM data.
CONDOR -* **CONDOR** (COmplex Network Description Of Regulators) [[Platig et al. 2016]](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005033): CONDOR is a tool for community detection in bipartite networks. Many community detection methods for unipartite networks are based on the concept of maximizing a modularity metric that compares the weight of edges within communities to the weight of edges between communities, prioritizing community assignments with higher values of the former relative to the latter. CONDOR extends this concept to bipartite networks by optimizing a bipartite version of modularity defined by [[Barber (2007)]](https://pubmed.ncbi.nlm.nih.gov/18233893/). To enable bipartite community detection on large networks such gene regulatory networks, CONDOR uses a fast unipartite modularity maximization method on one of the two unipartite projections of the bipartite network. In Platig et al. (2016), CONDOR is applied to bipartite networks of single nucleotide polymorphisms (SNPs) and gene expression, where a network edge from a SNP node to a gene node is indicative of an association between the SNP and the gene expression level, commonly known as an expression quantitative trait locus (eQTL). Communities detected with CONDOR contained local hub nodes ("core SNPs") enriched for association with disease, suggesting that functional eQTL relationships are encoded at the community level. +CONDOR (COmplex Network Description Of Regulators) Platig et al. 2016: CONDOR is a tool for community detection in bipartite networks. Many community detection methods for unipartite networks are based on the concept of maximizing a modularity metric that compares the weight of edges within communities to the weight of edges between communities, prioritizing community assignments with higher values of the former relative to the latter. CONDOR extends this concept to bipartite networks by optimizing a bipartite version of modularity defined by Barber (2007). To enable bipartite community detection on large networks such gene regulatory networks, CONDOR uses a fast unipartite modularity maximization method on one of the two unipartite projections of the bipartite network. In Platig et al. (2016), CONDOR is applied to bipartite networks of single nucleotide polymorphisms (SNPs) and gene expression, where a network edge from a SNP node to a gene node is indicative of an association between the SNP and the gene expression level, commonly known as an expression quantitative trait locus (eQTL). Communities detected with CONDOR contained local hub nodes ("core SNPs") enriched for association with disease, suggesting that functional eQTL relationships are encoded at the community level.
LIONESS -* **LIONESS** (Linear Interpolation to Obtain Network Estimates for Single Samples) [[Kuijjer et al. 2019]](https://doi.org/10.1016/j.isci.2019.03.021): LIONESS is a flexible method for single-sample network integration. The machinery behind LIONESS is a leave-one-out approach. To construct a single-sample network for sample $i$, a first network is estimated on the full dataset and a second network is estimated on the dataset with sample $i$ withheld. The single-sample network is then estimated based on the difference between these two networks. Any method that can be used to estimate a network can be used with LIONESS to estimate single-sample networks. Two common use cases are the use of LIONESS to generate single-sample GRNs based on PANDA and the use of LIONESS to generate single-sample Pearson correlation networks. +LIONESS (Linear Interpolation to Obtain Network Estimates for Single Samples) Kuijjer et al. 2019: LIONESS is a flexible method for single-sample network integration. The machinery behind LIONESS is a leave-one-out approach. To construct a single-sample network for sample $i$, a first network is estimated on the full dataset and a second network is estimated on the dataset with sample $i$ withheld. The single-sample network is then estimated based on the difference between these two networks. Any method that can be used to estimate a network can be used with LIONESS to estimate single-sample networks. Two common use cases are the use of LIONESS to generate single-sample GRNs based on PANDA and the use of LIONESS to generate single-sample Pearson correlation networks.
ALPACA -* **ALPACA** (ALtered Partitions Across Community Architectures) [[Padi and Quackenbush 2018]](https://www.nature.com/articles/s41540-018-0052-5): ALPACA is a method for differential network analysis that is based on a novel approach to comparison of network community structures. Comparisons of community structure have typically been accomplished by assessing which nodes switch community membership between networks ("community comparison") or by computing the edge weight differences by subtracting the adjacency matrices of two networks and then performing community detection on the resulting differential network ("edge subtraction"). Both these approaches have important limitations. Community comparison is subject to a resolution limit and cannot detect differences smaller than the average community size in a network. Edge subtraction transfers noise from both of the original networks to the differential network, leading to an imprecise estimator. Moreover, positive and negative edge differences cannot be distinguished in the subsequent community detection performed on the differential network. +ALPACA (ALtered Partitions Across Community Architectures) Padi and Quackenbush 2018: ALPACA is a method for differential network analysis that is based on a novel approach to comparison of network community structures. Comparisons of community structure have typically been accomplished by assessing which nodes switch community membership between networks ("community comparison") or by computing the edge weight differences by subtracting the adjacency matrices of two networks and then performing community detection on the resulting differential network ("edge subtraction"). Both these approaches have important limitations. Community comparison is subject to a resolution limit and cannot detect differences smaller than the average community size in a network. Edge subtraction transfers noise from both of the original networks to the differential network, leading to an imprecise estimator. Moreover, positive and negative edge differences cannot be distinguished in the subsequent community detection performed on the differential network. In contrast to community comparison and edge subtraction, ALPACA compares the community structure of two networks by optimizing a new metric: "differential modularity". In the ALPACA algorithm, one network is defined as the reference network and the second is defined as the perturbed network. The differential modularity metric measures the extent to which edges in a community in the perturbed network differ from those that would be expected by random chance according to a null distribution based on the reference network. Community structure of the perturbed network is determined by maximizing this differential modularity. The resulting communities are "differential modules" that show how the perturbed network differs from the reference network at the community level.
SAMBAR -* **SAMBAR** (Subtyping Agglomerated Mutations By Annotation Relations) [[Kuijjer et al.]](https://www.nature.com/articles/s41416-018-0109-7): SAMBAR is a tool for studying cancer subtypes based on patterns of somatic mutations in curated biological pathways. Rather than characterize cancer according to mutations at the gene level, SAMBAR agglomerates mutations within pathways to define a pathway mutation score. To avoid bias based on pathway representation, these pathway mutation scores correct for the number of genes in each pathway as well as the number of times each gene is represented in the universe of pathways. By taking a pathway rather than gene-by-gene lens, SAMBAR both de-sparsifies somatic mutation data and incorporates important prior biological knowledge. Kuijjer et al. (2018) demonstrate that SAMBAR is capable of outperforming other methods for cancer subtyping, producing subtypes with greater between-subtype distances; the authors use SAMBAR for a pan-cancer subtyping analysis that identifies four diverse pan-cancer subtypes linked to distinct molecular processes. +SAMBAR (Subtyping Agglomerated Mutations By Annotation Relations) Kuijjer et al.: SAMBAR is a tool for studying cancer subtypes based on patterns of somatic mutations in curated biological pathways. Rather than characterize cancer according to mutations at the gene level, SAMBAR agglomerates mutations within pathways to define a pathway mutation score. To avoid bias based on pathway representation, these pathway mutation scores correct for the number of genes in each pathway as well as the number of times each gene is represented in the universe of pathways. By taking a pathway rather than gene-by-gene lens, SAMBAR both de-sparsifies somatic mutation data and incorporates important prior biological knowledge. Kuijjer et al. (2018) demonstrate that SAMBAR is capable of outperforming other methods for cancer subtyping, producing subtypes with greater between-subtype distances; the authors use SAMBAR for a pan-cancer subtyping analysis that identifies four diverse pan-cancer subtypes linked to distinct molecular processes.
MONSTER -* **MONSTER** (Modeling Network State Transitions from Expression and Regulatory data) [[Schlauch et al.]](https://doi.org/10.1186/s12918-017-0517-y): MONSTER is a method for estimating transitions between network states by modeling the adjacency matrix of one state as a linear transformation of the adjacency matrix of another. Like LIONESS, MONSTER is a flexible method that does not require a particular type of network structure. MONSTER models the perturbation of an initial network A into a perturbed network B according to a matrix product B = AT. T is a transition matrix encoding the changes that map A to B. When A and B are gene regulatory networks, i.e., bipartite networks between TFs and genes, the MONSTER framework leads naturally to the definition of TF involvement as the sum of the off-diagonal weights for a transcription factor $i$ in the transition matrix T. This perspective enables MONSTER to identify differentially involved TFs that contribute to network transitions differently between different conditions. This dimension cannot be captured from a traditional differential expression analysis of TFs, which will not detect TFs that have the same concentration between conditions. +MONSTER (Modeling Network State Transitions from Expression and Regulatory data) Schlauch et al.: MONSTER is a method for estimating transitions between network states by modeling the adjacency matrix of one state as a linear transformation of the adjacency matrix of another. Like LIONESS, MONSTER is a flexible method that does not require a particular type of network structure. MONSTER models the perturbation of an initial network A into a perturbed network B according to a matrix product B = AT. T is a transition matrix encoding the changes that map A to B. When A and B are gene regulatory networks, i.e., bipartite networks between TFs and genes, the MONSTER framework leads naturally to the definition of TF involvement as the sum of the off-diagonal weights for a transcription factor $i$ in the transition matrix T. This perspective enables MONSTER to identify differentially involved TFs that contribute to network transitions differently between different conditions. This dimension cannot be captured from a traditional differential expression analysis of TFs, which will not detect TFs that have the same concentration between conditions.
OTTER -* **OTTER** (Optimization to Estimate Regulation) [[Weighill et al.]](https://www.biorxiv.org/content/10.1101/2020.06.23.167999v2.abstract): OTTER is a GRN inference method based on the idea that observed biological data (PPI data and gene co-expression data) are projections of a bipartite GRN between TFs and genes. Specifically, PPI data represent the projection of the GRN onto the TF-TF space and gene co-expression data represent the projection of the GRN onto the gene-gene space. OTTER reframes the problem of GRN inference as a problem of relaxed graph matching and finds a GRN that has optimal agreement with the observed PPI and coexpression data. The OTTER objective function is tunable in two ways: first, one can prioritize matching the PPI data or the coexpression data more heavily depending on one's confidence in the data source; second, there is a regularization parameter that can be applied to induce sparsity on the estimated GRN. The OTTER objective function can be solved using spectral decomposition techniques and gradient descent; the latter is shown to be closely related to the PANDA message-passing approach (Glass et al. 2013). +OTTER (Optimization to Estimate Regulation) Weighill et al.: OTTER is a GRN inference method based on the idea that observed biological data (PPI data and gene co-expression data) are projections of a bipartite GRN between TFs and genes. Specifically, PPI data represent the projection of the GRN onto the TF-TF space and gene co-expression data represent the projection of the GRN onto the gene-gene space. OTTER reframes the problem of GRN inference as a problem of relaxed graph matching and finds a GRN that has optimal agreement with the observed PPI and coexpression data. The OTTER objective function is tunable in two ways: first, one can prioritize matching the PPI data or the coexpression data more heavily depending on one's confidence in the data source; second, there is a regularization parameter that can be applied to induce sparsity on the estimated GRN. The OTTER objective function can be solved using spectral decomposition techniques and gradient descent; the latter is shown to be closely related to the PANDA message-passing approach (Glass et al. 2013).
CRANE -* **CRANE** (Constrained Random Alteration of Network Edges) [[Lim et al.]](https://doi.org/10.3389/fgene.2020.603264): CRANE is a method for determining statistical significance of structural differences between networks. Analysis with CRANE is a four-phase process. The first step of CRANE is to estimate two networks: a reference network and a perturbed network. In the same spirit as LIONESS, CRANE is flexible: any network inference method (e.g., correlation, partial correlation, PANDA) can be used at this stage. In the second step, differential features are determined by comparing the reference and perturbed networks. Here, CRANE is again flexible: such differential features could arise from simple measures such as a comparison of node degree or centrality, or from more nuanced techniques such as differential module detection with ALPACA. Third, a large number of constrained random networks are developed based on the network structure of the reference network. By comparing each random network with the original reference network, a set of null differential measures is obtained. Fourth, the observed differential features from step two can be compared with the null distribution from step three to generate empirical p-values. A typical workflow for applying CRANE in NetZooR would involve fitting PANDA networks in step one and using ALPACA to estimate differential modules in step two. +CRANE (Constrained Random Alteration of Network Edges) Lim et al.: CRANE is a method for determining statistical significance of structural differences between networks. Analysis with CRANE is a four-phase process. The first step of CRANE is to estimate two networks: a reference network and a perturbed network. In the same spirit as LIONESS, CRANE is flexible: any network inference method (e.g., correlation, partial correlation, PANDA) can be used at this stage. In the second step, differential features are determined by comparing the reference and perturbed networks. Here, CRANE is again flexible: such differential features could arise from simple measures such as a comparison of node degree or centrality, or from more nuanced techniques such as differential module detection with ALPACA. Third, a large number of constrained random networks are developed based on the network structure of the reference network. By comparing each random network with the original reference network, a set of null differential measures is obtained. Fourth, the observed differential features from step two can be compared with the null distribution from step three to generate empirical p-values. A typical workflow for applying CRANE in NetZooR would involve fitting PANDA networks in step one and using ALPACA to estimate differential modules in step two.
EGRET -* **EGRET** (Estimating the Genetic Regulatory effects on TFs) [[Weighill et al.]](https://www.genome.org/cgi/doi/10.1101/gr.275107.120): EGRET incorporates genetic variants as a fourth data type in the PANDA message-passing framework, enabling the estimation of genotype-specific GRNs. Genetic variants can alter transcription factor binding by affecting the composition of motif sites on the DNA. Not every genetic variant has such an affect; EGRET incorporates only genetic variants which have (1) been shown to be associated with gene expression (expression quantitative trait loci, or eQTL), and (2) are predicted to affect transcription factor binding based on a tool called QBiC (Martin et al. 2019). This information is used in combination with TFBM predictions as input to the PANDA message-passing framework. The resulting EGRET network is a genotype-specific bipartite GRN that is similar to a PANDA network but incorporates the information contained by individual genetic variation. +EGRET (Estimating the Genetic Regulatory effects on TFs) Weighill et al.: EGRET incorporates genetic variants as a fourth data type in the PANDA message-passing framework, enabling the estimation of genotype-specific GRNs. Genetic variants can alter transcription factor binding by affecting the composition of motif sites on the DNA. Not every genetic variant has such an affect; EGRET incorporates only genetic variants which have (1) been shown to be associated with gene expression (expression quantitative trait loci, or eQTL), and (2) are predicted to affect transcription factor binding based on a tool called QBiC (Martin et al. 2019). This information is used in combination with TFBM predictions as input to the PANDA message-passing framework. The resulting EGRET network is a genotype-specific bipartite GRN that is similar to a PANDA network but incorporates the information contained by individual genetic variation.
YARN -* **YARN** (Yet Another RNa-seq package) [[Paulsson et al.]](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-017-1847-x): YARN is a package that combines quality control, gene filtering, and normalization steps to streamline the preprocessing of large-scale, multi-tissue gene expression data from resources such as the Genotype-Tissue Expression (GTEx) project. Among other steps, YARN uses principal coordinate analysis (PCoA) to determine if samples collected from different sites on the same tissue (for example, transverse and sigmoid colon) can be treated as "transcriptionally indistinguishable" and grouped together to increase power for downstream analyses. Paulsson et al. (2017) demonstrate the use of YARN to develop a pan-cancer RNA-seq dataset for 30,333 genes from 9435 samples across 38 tissues from the GTEx dataset. +YARN (Yet Another RNa-seq package) Paulsson et al.: YARN is a package that combines quality control, gene filtering, and normalization steps to streamline the preprocessing of large-scale, multi-tissue gene expression data from resources such as the Genotype-Tissue Expression (GTEx) project. Among other steps, YARN uses principal coordinate analysis (PCoA) to determine if samples collected from different sites on the same tissue (for example, transverse and sigmoid colon) can be treated as "transcriptionally indistinguishable" and grouped together to increase power for downstream analyses. Paulsson et al. (2017) demonstrate the use of YARN to develop a pan-cancer RNA-seq dataset for 30,333 genes from 9435 samples across 38 tissues from the GTEx dataset.
PUMA -* **PUMA** (PANDA using microRNA associations) [[Kuijjer et al.]](https://academic.oup.com/bioinformatics/article/36/18/4765/5858977?login=false) -netZooR also integrates additional functions to: +PUMA (PANDA Using MicroRNA Associations) Kuijjer et al. extends the PANDA framework to model how microRNAs (miRNAs) participate in gene regulatory networks. PUMA networks are bipartite networks that consist of a regulatory layer and a layer of genes being regulated, similar to PANDA networks. While the regulatory layer of PANDA networks consists only of transcription factors (TFs), the regulatory layer of PUMA networks consists of both TFs and miRNAs. A PUMA network is seeded using a combination of input data sources such as motif scans or ChIP-seq data (for TF-gene edges) and an miRNA target prediction tool such as TargetScan or miRanda (for miRNA-gene edges). PUMA uses a message passing framework similar to PANDA to integrate this prior information with gene-gene coexpression and protein-protein interactions to estimate a final regulatory network incorporating miRNAs. Kuijjer and colleagues [7] apply PUMA to 38 GTEx tissues and demonstrate that PUMA can identify important patterns in tissue-specific regulation of genes by miRNA. +
+ +
+SPIDER +SPIDER (Seeding PANDA Interactions to Derive Epigenetic Regulation) Sonawane et al. extends the PANDA framework by incorporating DNase-Seq data to account for chromatin state for the prediction of TF binding sites. The method consists of processing DNase-Seq data to find open chromatin regions and build a “mask” matrix that is then overlaid on the TF-gene motif network to select binding sites that are available fro TF binding. This method can be applied for various biological contexts such as cell lines and tissues. Sonawane and colleagues have employed their method to model cell- type specific GRNs using DNase-Seq data from ENCODE and showed that integrating epigenetic data in SPIDER networks allows building more accurate networks. +
+ +
+DRAGON +DRAGON (Determining Regulatory Associations using Graphical models on Omics Networks) Shutta et al. is a method for estimating multiomic Gaussian graphical models (GGMs, also known as partial correlation networks) that incorporate two different omics data types. DRAGON builds off of the popular covariance shrinkage method of Ledoit and Wolf with an optimization approach that explicitly accounts for the differences in two separate omics "layers" in the shrinkage estimator. The resulting sparse covariance matrix is then inverted to obtain a precision matrix estimate and a corresponding GGM. Although GGMs assume normally distributed data, DRAGON can be used on any type of continuous data by transforming data to approximate normality prior to network estimation. Currently, DRAGON can be applied to estimate networks with two different types of omics data. Investigators interested in applying DRAGON to more than two types of omics data can consider estimating pairwise networks and "chaining" them together.
* Source protein-protein interaction network from [STRINGdb](https://string-db.org/) based on a list of protein of interest.