diff --git a/DESCRIPTION b/DESCRIPTION index 0e7970af..186e474e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: netZooR Type: Package Title: Unified methods for the inference and analysis of gene regulatory networks -Version: 1.2.1 -Date: 2022-07-07 +Version: 1.3.15 +Date: 2023-03-16 Authors@R: c(person("Marouen", "Ben Guebila", email = "benguebila@hsph.harvard.edu", role = c("aut","cre"), comment = c(ORCID = "0000-0001-5934-966X")), person("Tian", "Wang", @@ -18,10 +18,10 @@ Authors@R: c(person("Marouen", "Ben Guebila", person("Des", "Weighill", email = "",role = "aut", comment = c(ORCID = "0000-0003-4979-5871")), person("Kate", "Shutta", - email = "",role = "ctb", comment = c(ORCID = "0000-0003-0402-3771"))) -Maintainer: Marouen Ben Guebila + email = "",role = "aut", comment = c(ORCID = "0000-0003-0402-3771"))) +Maintainer: Marouen Ben Guebila Description: netZooR unifies the implementations of several Network Zoo methods (netzoo, netzoo.github.io) into a single package by creating interfaces between network inference and network analysis methods. Currently, the package has 3 methods for network inference including PANDA and its optimized implementation OTTER (network reconstruction using mutliple lines of biological evidence), LIONESS (single-sample network inference), and EGRET (genotype-specific networks). Network analysis methods include CONDOR (community detection), ALPACA (differential community detection), CRANE (significance estimation of differential modules), MONSTER (estimation of network transition states). In addition, YARN allows to process gene expresssion data for tissue-specific analyses and SAMBAR infers missing mutation data based on pathway information. -Depends: R (>= 4.1.0), +Depends: R (>= 4.2.0), igraph, reticulate, pandaR, @@ -37,12 +37,12 @@ biocViews: GraphAndNetwork Imports: RCy3, viridisLite, - STRINGdb, + STRINGdb, Biobase, GOstats, AnnotationDbi, - matrixStats, - GO.db, + matrixStats, + GO.db, org.Hs.eg.db, Matrix, gplots, @@ -51,21 +51,21 @@ Imports: RCy3, vegan, stats, utils, - reshape, - reshape2, - penalized, - parallel, - doParallel, - foreach, - ggplot2, - ggdendro, - grid, - MASS, - assertthat, - tidyr, - methods, - dplyr, - graphics + reshape, + reshape2, + penalized, + parallel, + doParallel, + foreach, + ggplot2, + ggdendro, + grid, + MASS, + assertthat, + tidyr, + methods, + dplyr, + graphics License: GPL-3 Encoding: UTF-8 LazyData: false @@ -76,6 +76,6 @@ Suggests: pkgdown VignetteEngine: knitr VignetteBuilder: knitr -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 BugReports: https://github.com/netZoo/netZooR/issues URL: https://github.com/netZoo/netZooR, https://netzoo.github.io/ diff --git a/NAMESPACE b/NAMESPACE index 604551ad..d6be1c4b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,6 +39,7 @@ export(pandaToCondorObject) export(runEgret) export(sambar) export(sourcePPI) +export(spider) export(visPandaInCytoscape) import(AnnotationDbi, except= select) diff --git a/NEWS.md b/NEWS.md index d3d5bf9f..1e3e5b01 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,17 @@ +CHANGES IN VERSION 1.3.2 +-------------------------- + + - R implementation of SPIDER + - R implementation of DRAGON + - header argument in pandaPy + CHANGES IN VERSION 1.1.12 -------------------------- - o Reactivated unit tests for Ubuntu GitHub actions. - o LIONESS can now build single-sample coexpression networks using @kshutta's implementation - o Fix for ALPACA singleton community case (detected by @talkhanz) - o Fix for CRANE significance test on constant modularity scores (detected by @talkhanz) - o Improved method description by @kshutta - o Fix for PANDA edge case when only expression is provided + - Reactivated unit tests for Ubuntu GitHub actions. + - LIONESS can now build single-sample coexpression networks using @kshutta's implementation + - Fix for ALPACA singleton community case (detected by @talkhanz) + - Fix for CRANE significance test on constant modularity scores (detected by @talkhanz) + - Improved method description by @kshutta + - Fix for PANDA edge case when only expression is provided + diff --git a/R/DRAGON.R b/R/DRAGON.R index 8143c514..aa84fa13 100644 --- a/R/DRAGON.R +++ b/R/DRAGON.R @@ -5,7 +5,7 @@ # X_mean = np.mean(X_temp, axis=0) # return (X_temp - X_mean) / X_std -scale = function(x,bias=F) +scale = function(x,bias=FALSE) { # sd does 1/(n-1), python does 1/n # use the bias option for exact match with python @@ -135,8 +135,8 @@ risk = function(gamma, const, t11, t12, t21, t22, t3, t4) # def estimate_penalty_parameters_dragon(X1, X2): estimatePenaltyParameters = function(X1,X2) { - # X1 = matrix(c(1,2,3,1,5,12),nrow=3,byrow=T) - # X2 = matrix(c(9,7,8),nrow=3,byrow=T) + # X1 = matrix(c(1,2,3,1,5,12),nrow=3,byrow=TRUE) + # X2 = matrix(c(9,7,8),nrow=3,byrow=TRUE) # X1 is omics matrix 1, dimensions n x p1 # X2 is omics matrix 2, dimensions n x p2 # The matrices should have the same ordering by samples @@ -233,7 +233,7 @@ estimatePenaltyParameters = function(X1,X2) method="L-BFGS-B", lower=c(0,0), upper=c(1,1), - control = list(trace=T,pgtol = 1e-15)) + control = list(trace=TRUE,pgtol = 1e-15)) # reparameterize lambdas = c(1-res$par[1]^2, 1-res$par[2]^2) @@ -296,7 +296,7 @@ get_precision_matrix_dragon = function(X1, X2, lambdas) # mu = np.mean(X, axis=0) } -get_partial_correlation_from_precision = function(Theta,selfEdges=F) +get_partial_correlation_from_precision = function(Theta,selfEdges=FALSE) { # by default, does not return self edges (diagonal is set to zero) ggm = -cov2cor(Theta) @@ -358,7 +358,8 @@ estimate_p_values_dragon = function(r, n, p1, p2, lambdas, kappa="estimate",seed #' @param layer1 : first layer of omics data; rows: samples (order must match layer2), columns: variables #' @param layer2 : second layer of omics data; rows: samples (order must match layer1), columns: variables. #' @param pval : calculate p-values for network edges. Not yet implemented in R; available in netZooPy. -#' @param gradient : method for estimating parameters of p-value distribution, applies only if p-val == T. default = "finite_difference"; other option = "exact" +#' @param gradient : method for estimating parameters of p-value distribution, applies only if p-val == TRUE. default = "finite_difference"; other option = "exact" +#' @param verbose : verbosity level (TRUE/FALSE) #' @return A list of model results. cov : the shrunken covariance matrix #' \itemize{ #' \item{\code{cov}}{ the shrunken covariance matrix} @@ -370,7 +371,7 @@ estimate_p_values_dragon = function(r, n, p1, p2, lambdas, kappa="estimate",seed #' } #' #' @export -dragon = function(layer1,layer2,pval = F,gradient = "finite_difference", verbose = F) +dragon = function(layer1,layer2,pval = FALSE,gradient = "finite_difference", verbose = FALSE) { if(verbose) print("[netZooR::dragon] Estimating penalty parameters...") diff --git a/R/LIONESS.R b/R/LIONESS.R index 4760beb9..6ae75b40 100644 --- a/R/LIONESS.R +++ b/R/LIONESS.R @@ -132,9 +132,9 @@ lionessPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu" # create an instance named "lioness_obj" of Lioness Class. # when save_single_network = TRUE if(save_single_network==TRUE){ - py_run_string(paste("lioness_obj = Lioness(panda_obj", " , " , "computing='", computing, "' , ","start=", start_sample," , ", "end=" ,end_sample, " , ", "save_dir='", save_dir, "' , " , "save_fmt='" , save_fmt, "' )",sep = "" )) - }else{ - py_run_string(paste("lioness_obj = Lioness(panda_obj", " , " , "computing='", computing, "' , ","start=", start_sample," , ", "end=" ,end_sample, ")",sep = "" )) + py_run_string(paste("lioness_obj = Lioness(panda_obj,computing='", computing, "' , ","start=", start_sample," , ", "end=" ,end_sample, " , ", "save_dir='", save_dir, "' , " , "save_fmt='" , save_fmt, "' , " , "save_single = True )",sep = "" )) + }else if(save_single_network==FALSE){ + py_run_string(paste("lioness_obj = Lioness(panda_obj,computing='", computing, "' , ","start=", start_sample," , ", "end=" ,end_sample, " , ", "save_single = False )",sep = "" )) } # retrieve the "total_lioness_network" attribute of instance "lionesss_obj" @@ -161,6 +161,8 @@ lionessPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu" #' transcription factor 2 (column 2) and a score (column 3) for the interaction. #' @param network.inference.method String specifying choice of network inference method. Default is "panda". #' Options include "pearson". +#' @param ncores int specifying the number of cores to be used. Default is 1. +#' (Note: constructing panda networks can be memory-intensive, and the number of cores should take into consideration available memory.) #' @param ... additional arguments for panda analysis #' @keywords keywords #' @importFrom matrixStats rowSds @@ -168,6 +170,7 @@ lionessPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu" #' @importFrom Biobase assayData #' @importFrom reshape melt.array #' @importFrom pandaR panda +#' @importFrom parallel mclapply #' @export #' @return A list of length N, containing objects of class "panda" #' corresponding to each of the N samples in the expression data set.\cr @@ -180,25 +183,30 @@ lionessPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu" #' @references #' Kuijjer, M.L., Tung, M., Yuan, G., Quackenbush, J. and Glass, K., 2015. #' Estimating sample-specific regulatory networks. arXiv preprint arXiv:1505.06440. -lioness = function(expr, motif = NULL, ppi = NULL, network.inference.method = "panda", ...){ +#' Kuijjer, M.L., Hsieh, PH., Quackenbush, J. et al. lionessR: single sample network inference in R. BMC Cancer 19, 1003 (2019). https://doi.org/10.1186/s12885-019-6235-7 +lioness = function(expr, motif = NULL, ppi = NULL, network.inference.method = "panda", ncores = 1, ...){ N <- ncol(expr) + if(ncores < 1){ + print('Setting number of cores to 1.') + ncores <- 1 + } if(network.inference.method == "panda") { fullnet <- panda(motif, expr, ppi, ...) - return(lapply(seq_len(N), function(i) { + return(mclapply(seq_len(N), function(i) { print(paste("Computing network for sample ", i)) N * fullnet@regNet - (N - 1) * panda(motif, expr[, -i], ppi, ...)@regNet - })) + }, mc.cores = ncores)) } if(network.inference.method == "pearson") { fullnet <- cor(t(expr)) - return(lapply(seq_len(N), function(i) { + return(mclapply(seq_len(N), function(i) { print(paste("Computing network for sample ", i)) N * fullnet - (N - 1) * cor(t(expr[,-i])) - })) + }, mc.cores = ncores)) } if(!(network.inference.method %in% c("panda","pearson"))) { diff --git a/R/PANDA.R b/R/PANDA.R index 976a32fe..96ec9472 100644 --- a/R/PANDA.R +++ b/R/PANDA.R @@ -20,7 +20,7 @@ #' @param keep_expression_matrix 'TRUE' keeps the input expression matrix as an attribute in the result Panda object.'FALSE' deletes the expression matrix attribute in the Panda object. The default value is 'FALSE'. #' @param modeProcess 'legacy' refers to the processing mode in netZooPy<=0.5, 'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros; 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes. Default values is 'union'. #' @param remove_missing Only when modeProcess='legacy': remove_missing='TRUE' removes all unmatched TF and genes; remove_missing='FALSE' keeps all tf and genes. The default value is 'FALSE'. -#' +#' @param with_header Boolean to read gene expression file with a header for sample names #' @return When save_memory=FALSE(default), this function will return a list of three items: #' Use \code{$panda} to access the standard output of PANDA as data frame, which consists of four columns: #' "TF", "Gene", "Motif" using 0 or 1 to indicate if this edge belongs to prior motif dataset, and "Score". diff --git a/R/PUMA.R b/R/PUMA.R new file mode 100644 index 00000000..27f92eb1 --- /dev/null +++ b/R/PUMA.R @@ -0,0 +1,425 @@ +#' PANDA using microRNA associations +#' +#' This function runs the PUMA algorithm to predict a miRNA-gene regulatory network +#' +#' @param motif A miRNA target dataset, a data.frame, matrix or exprSet containing 3 columns. +#' Each row describes the association between a miRNA (column 1) its target +#' gene (column 2) and a score (column 3) for the association from TargetScan or miRanda +#' @param expr An expression dataset, as a genes (rows) by samples (columns) data.frame +#' @param ppi This can be set to 1) NULL which will be encoded as an identity matrix between miRNAs in PUMA for now. +#' Or 2) it can include a set of TF interactions, or 3) a mix of TFs and miRNAs. +#' @param alpha value to be used for update variable, alpha (default=0.1) +#' @param mir_file list of miRNA to filter the PPI matrix and prevent update of miRNA edges. +#' @param hamming value at which to terminate the process based on hamming distance (default 10^-3) +#' @param iter sets the maximum number of iterations PUMA can run before exiting. +#' @param progress Boolean to indicate printing of output for algorithm progress. +#' @param output a vector containing which networks to return. Options include "regulatory", +#' "coregulatory", "cooperative". +#' @param zScale Boolean to indicate use of z-scores in output. False will use [0,1] scale. +#' @param randomize method by which to randomize gene expression matrix. Default "None". Must +#' be one of "None", "within.gene", "by.genes". "within.gene" randomization scrambles each row +#' of the gene expression matrix, "by.gene" scrambles gene labels. +#' @param cor.method Correlation method, default is "pearson". +#' @param scale.by.present Boolean to indicate scaling of correlations by percentage of positive samples. +#' @param remove.missing.ppi Boolean to indicate whether miRNAs in the PPI but not in the motif data should be +#' removed. Only when mode=='legacy'. +#' @param remove.missing.motif Boolean to indicate whether genes targeted in the motif data but not the +#' expression data should be removed. Only when mode=='legacy'. +#' @param remove.missing.genes Boolean to indicate whether genes in the expression data but lacking +#' information from the motif prior should be removed. Only when mode=='legacy'. +#' @param edgelist Boolean to indicate if edge lists instead of matrices should be returned. +#' @param mode The data alignment mode. The mode 'union' takes the union of the genes in the expression matrix and the motif +#' and the union of TFs in the ppi and motif and fills the matrics with zeros for nonintersecting TFs and gens, 'intersection' +#' takes the intersection of genes and TFs and removes nonintersecting sets, 'legacy' is the old behavior with version 1.19.3. +#' #' Parameters remove.missing.ppi, remove.missingmotif, remove.missing.genes work only with mode=='legacy'. +#' @keywords keywords +#' @importFrom matrixStats rowSds +#' @importFrom matrixStats colSds +#' @importFrom Biobase assayData +#' @importFrom reshape melt.array +#' @export +#' @return An object of class "panda" containing matrices describing networks achieved by convergence +#' with PUMA algorithm.\cr +#' "regNet" is the regulatory network\cr +#' "coregNet" is the coregulatory network\cr +#' "coopNet" is the cooperative network which is not updated for miRNAs +#' @examples +#' data(pandaToyData) +#' pumaRes <- puma(pandaToyData$motif, +#' pandaToyData$expression,NULL,hamming=.1,progress=TRUE) +#' @references +#' Kuijjer, Marieke L., et al. "PUMA: PANDA using microRNA associations." Bioinformatics 36.18 (2020): 4765-4773. +puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,mir_file,hamming=0.001, + iter=NA,output=c('regulatory','coexpression','cooperative'), + zScale=TRUE,progress=FALSE,randomize=c("None", "within.gene", "by.gene"),cor.method="pearson", + scale.by.present=FALSE,edgelist=FALSE,remove.missing.ppi=FALSE, + remove.missing.motif=FALSE,remove.missing.genes=FALSE,mode="union"){ + + randomize <- match.arg(randomize) + if(progress) + print('Initializing and validating') + + if(class(expr)=="ExpressionSet") + expr <- assayData(expr)[["exprs"]] + + if (is.null(expr)){ + # Use only the motif data here for the gene list + num.conditions <- 0 + if (randomize!="None"){ + warning("Randomization ignored because gene expression is not used.") + randomize <- "None" + } + } else { + if(mode=='legacy'){ + if(remove.missing.genes){ + # remove genes from expression data that are not in the motif data + n <- nrow(expr) + expr <- expr[which(rownames(expr)%in%motif[,2]),] + message(sprintf("%s genes removed that were not present in motif", n-nrow(expr))) + } + if(remove.missing.motif){ + # remove genes from motif data that are not in the expression data + n <- nrow(motif) + motif <- motif[which(motif[,2]%in%rownames(expr)),] + message(sprintf("%s motif edges removed that targeted genes missing in expression data", n-nrow(motif))) + } + # Use the motif data AND the expr data (if provided) for the gene list + # Keep everything sorted alphabetically + expr <- expr[order(rownames(expr)),] + }else if(mode=='union'){ + gene.names=unique(union(rownames(expr),unique(motif[,2]))) + tf.names =unique(union(unique(ppi[,1]),unique(motif[,1]))) + num.TFs <- length(tf.names) + num.genes <- length(gene.names) + # gene expression matrix + expr1=as.data.frame(matrix(0,num.genes,ncol(expr))) + rownames(expr1)=gene.names + expr1[which(gene.names%in%rownames(expr)),]=expr[] + expr=expr1 + #PPI matrix + tfCoopNetwork <- matrix(0,num.TFs,num.TFs) + colnames(tfCoopNetwork)=tf.names + rownames(tfCoopNetwork)=tf.names + Idx1 <- match(ppi[,1], tf.names); + Idx2 <- match(ppi[,2], tf.names); + Idx <- (Idx2-1)*num.TFs+Idx1; + tfCoopNetwork[Idx] <- ppi[,3]; + Idx <- (Idx1-1)*num.TFs+Idx2; + tfCoopNetwork[Idx] <- ppi[,3]; + #Motif matrix + regulatoryNetwork=matrix(0,num.TFs,num.genes) + colnames(regulatoryNetwork)=gene.names + rownames(regulatoryNetwork)=tf.names + Idx1=match(motif[,1], tf.names); + Idx2=match(motif[,2], gene.names); + Idx=(Idx2-1)*num.TFs+Idx1; + regulatoryNetwork[Idx]=motif[,3] + }else if(mode=='intersection'){ + gene.names=unique(intersect(rownames(expr),unique(motif[,2]))) + tf.names =unique(intersect(unique(ppi[,1]),unique(motif[,1]))) + num.TFs <- length(tf.names) + num.genes <- length(gene.names) + # gene expression matrix + expr1=as.data.frame(matrix(0,num.genes,ncol(expr))) + rownames(expr1)=gene.names + interGeneNames=gene.names[which(gene.names%in%rownames(expr))] + expr1[interGeneNames,]=expr[interGeneNames,] + expr=expr1 + #PPI matrix + tfCoopNetwork <- matrix(0,num.TFs,num.TFs) + colnames(tfCoopNetwork)=tf.names + rownames(tfCoopNetwork)=tf.names + Idx1 <- match(ppi[,1], tf.names); + Idx2 <- match(ppi[,2], tf.names); + Idx <- (Idx2-1)*num.TFs+Idx1; + indIdx=!is.na(Idx) + Idx=Idx[indIdx] #remove missing miRNAs + tfCoopNetwork[Idx] <- ppi[indIdx,3]; + Idx <- (Idx1-1)*num.TFs+Idx2; + indIdx=!is.na(Idx) + Idx=Idx[indIdx] #remove missing miRNAs + tfCoopNetwork[Idx] <- ppi[indIdx,3]; + #Motif matrix + regulatoryNetwork=matrix(0,num.TFs,num.genes) + colnames(regulatoryNetwork)=gene.names + rownames(regulatoryNetwork)=tf.names + Idx1=match(motif[,1], tf.names); + Idx2=match(motif[,2], gene.names); + Idx=(Idx2-1)*num.TFs+Idx1; + indIdx=!is.na(Idx) + Idx=Idx[indIdx] #remove missing genes + regulatoryNetwork[Idx]=motif[indIdx,3]; + } + num.conditions <- ncol(expr) + if (randomize=='within.gene'){ + expr <- t(apply(expr, 1, sample)) + if(progress) + print("Randomizing by reordering each gene's expression") + } else if (randomize=='by.gene'){ + rownames(expr) <- sample(rownames(expr)) + expr <- expr[order(rownames(expr)),] + if(progress) + print("Randomizing by reordering each gene labels") + } + } + + if (mode=='legacy'){ + # Create vectors for TF names and Gene names from motif dataset + tf.names <- sort(unique(motif[,1])) + gene.names <- sort(unique(rownames(expr))) + num.TFs <- length(tf.names) + num.genes <- length(gene.names) + } + + # Bad data checking + if (num.genes==0){ + stop("Error validating data. No matched genes.\n Please ensure that gene names in expression data match gene names in motif data") + } + + if(num.conditions==0) { + warning('No expression data given. PUMA will run based on an identity co-regulation matrix') + geneCoreg <- diag(num.genes) + } else if(num.conditions<3) { + warning('Not enough expression conditions detected to calculate correlation. Co-regulation network will be initialized to an identity matrix.') + geneCoreg <- diag(num.genes) + } else { + + if(scale.by.present){ + num.positive=(expr>0)%*%t((expr>0)) + geneCoreg <- cor(t(expr), method=cor.method, use="pairwise.complete.obs")*(num.positive/num.conditions) + } else { + geneCoreg <- cor(t(expr), method=cor.method, use="pairwise.complete.obs") + } + if(progress) + print('Verified sufficient samples') + } + if (any(is.na(geneCoreg))){ #check for NA and replace them by zero + diag(geneCoreg)=1 + geneCoreg[is.na(geneCoreg)]=0 + } + + if (any(duplicated(motif))) { + warning("Duplicate edges have been found in the motif data. Weights will be summed.") + motif <- aggregate(motif[,3], by=list(motif[,1], motif[,2]), FUN=sum) + } + + # Prior Regulatory Network + if(mode=='legacy'){ + Idx1=match(motif[,1], tf.names); + Idx2=match(motif[,2], gene.names); + Idx=(Idx2-1)*num.TFs+Idx1; + regulatoryNetwork=matrix(data=0, num.TFs, num.genes); + regulatoryNetwork[Idx]=motif[,3] + colnames(regulatoryNetwork) <- gene.names + rownames(regulatoryNetwork) <- tf.names + # PPI data + # If no ppi data is given, we use the identity matrix + tfCoopNetwork <- diag(num.TFs) + # Else we convert our two-column data.frame to a matrix + if (!is.null(ppi)){ + if(any(duplicated(ppi))){ + warning("Duplicate edges have been found in the PPI data. Weights will be summed.") + ppi <- aggregate(ppi[,3], by=list(ppi[,1], ppi[,2]), FUN=sum) + } + if(remove.missing.ppi){ + # remove edges in the PPI data that target TFs not in the motif + n <- nrow(ppi) + ppi <- ppi[which(ppi[,1]%in%tf.names & ppi[,2]%in%tf.names),] + message(sprintf("%s PPI edges removed that were not present in motif", n-nrow(ppi))) + } + Idx1 <- match(ppi[,1], tf.names); + Idx2 <- match(ppi[,2], tf.names); + Idx <- (Idx2-1)*num.TFs+Idx1; + tfCoopNetwork[Idx] <- ppi[,3]; + Idx <- (Idx1-1)*num.TFs+Idx2; + tfCoopNetwork[Idx] <- ppi[,3]; + } + colnames(tfCoopNetwork) <- tf.names + rownames(tfCoopNetwork) <- tf.names + } + + if(mir_file != NULL){ + mirIndex = match(mir_file,tf.names) + tfCoopNetwork[mirIndex,] = 0 + tfCoopNetwork[,mirIndex] = 0 + seqs = seq(1, num.tfs*num.tfs, num.tfs+1) + tfCoopNetwork[seqs] <- 1 + # tfCoopNetwork has now a diagonal of 1 and all entries are zeros + # for miRNA-miRNA interactions and TF-miRNA interactions + } + ## Run PUMA ## + tic=proc.time()[3] + + if(progress) + print('Normalizing networks...') + regulatoryNetwork = normalizeNetwork(regulatoryNetwork) + tfCoopNetwork = normalizeNetwork(tfCoopNetwork) + geneCoreg = normalizeNetwork(geneCoreg) + + if(progress) + print('Learning Network...') + + minusAlpha = 1-alpha + step=0 + hamming_cur=1 + if(progress) + print("Using tanimoto similarity") + + TFCoopInit = tfCoopNetwork # Save normalized cooperativity network + + while(hamming_cur>hamming){ + if ((!is.na(iter))&&step>=iter){ + print(paste("Reached maximum iterations, iter =",iter),sep="") + break + } + Responsibility=tanimoto(tfCoopNetwork, regulatoryNetwork) + Availability=tanimoto(regulatoryNetwork, geneCoreg) + RA = 0.5*(Responsibility+Availability) + + hamming_cur=sum(abs(regulatoryNetwork-RA))/(num.TFs*num.genes) + regulatoryNetwork=minusAlpha*regulatoryNetwork + alpha*RA + + ppi=tanimoto(regulatoryNetwork, t(regulatoryNetwork)) + ppi=update.diagonal(ppi, num.TFs, alpha, step) + tfCoopNetwork=minusAlpha*tfCoopNetwork + alpha*ppi + + CoReg2=tanimoto(t(regulatoryNetwork), regulatoryNetwork) + CoReg2=update.diagonal(CoReg2, num.genes, alpha, step) + geneCoreg=minusAlpha*geneCoreg + alpha*CoReg2 + + #PUMA step to skip update of PPI matrix for miRNA interactions + seqs = seq(1, num.tfs*num.tfs, num.tfs+1) + savediag = tfCoopNetwork[seqs] # save diagonal + tfCoopNetwork[mirIndex,] <- TFCoopInit[mirIndex,] + tfCoopNetwork[,mirIndex] <- TFCoopInit[,mirIndex] + tfCoopNetwork[seqs] <- savediag # put back saved diagonal + + if(progress) + message("Iteration", step,": hamming distance =", round(hamming_cur,5)) + step=step+1 + } + + toc=proc.time()[3] - tic + if(progress) + message("Successfully ran PUMA on ", num.genes, " Genes and ", num.TFs, " miRNAs.\nTime elapsed:", round(toc,2), "seconds.") + prepResult(zScale, output, regulatoryNetwork, geneCoreg, tfCoopNetwork, edgelist, motif) +} + + +normalizeNetwork<-function(X){ + X <- as.matrix(X) + + nr = nrow(X) + nc = ncol(X) + dm = c(nr,nc) + + # overall values + mu0=mean(X) + std0=sd(X)*sqrt((nr*nc-1)/(nr*nc)) + + # operations on rows + mu1=rowMeans(X) # operations on rows + std1=rowSds(X)*sqrt((nc-1)/nc) + + mu1=rep(mu1, nc) + dim(mu1) = dm + std1=rep(std1,nc) + dim(std1)= dm + + Z1=(X-mu1)/std1 + + # operations on columns + mu2=colMeans(X) # operations on columns + std2=colSds(X)*sqrt((nr-1)/nr) + + mu2 = rep(mu2, each=nr) + dim(mu2) = dm + std2= rep(std2, each=nr) + dim(std2) = dm + + Z2=(X-mu2)/std2 + + # combine and return + normMat=Z1/sqrt(2)+Z2/sqrt(2) + + # checks and defaults for missing data + Z0=(X-mu0)/std0; + f1=is.na(Z1); f2=is.na(Z2); + normMat[f1]=Z2[f1]/sqrt(2)+Z0[f1]/sqrt(2); + normMat[f2]=Z1[f2]/sqrt(2)+Z0[f2]/sqrt(2); + normMat[f1 & f2]=2*Z0[f1 & f2]/sqrt(2); + + normMat +} + +tanimoto<-function(X,Y){ + + nc = ncol(Y) + nr = nrow(X) + dm = c(nr,nc) + + Amat=(X %*% Y) + Bmat=colSums(Y*Y) + + Bmat = rep(Bmat,each=nr) + dim(Bmat) = dm + #Bmat=matrix(rep(Bmat, each=nr), dm) + + Cmat=rowSums(X*X) + Cmat=rep(Cmat,nc) + dim(Cmat) = dm + #Cmat=matrix(rep(Cmat, nc), dm) + + den = (Bmat+Cmat-abs(Amat)) + Amat=Amat/sqrt(den) + + return(Amat) +} + +update.diagonal<-function(diagMat, num, alpha, step){ + seqs = seq(1, num*num, num+1) + diagMat[seqs]=NaN; + diagstd=rowSds(diagMat,na.rm=TRUE) + diagstd[is.na(diagstd)]=0#replace NA with zeros + diagstd=diagstd*sqrt( (num-2)/(num-1) ); + diagMat[seqs]=diagstd*num*exp(2*alpha*step); + return(diagMat); +} + +prepResult <- function(zScale, output, regulatoryNetwork, geneCoreg, tfCoopNetwork, edgelist, motif){ + resList <- list() + numGenes = dim(geneCoreg)[1] + numTFs = dim(tfCoopNetwork)[1] + numEdges = sum(regulatoryNetwork!=0) + if (!zScale){ + regulatoryNetwork <- pnorm(regulatoryNetwork) + geneCoreg <- pnorm(geneCoreg) + tfCoopNetwork <- pnorm(tfCoopNetwork) + } + if("regulatory"%in%output){ + if(edgelist){ + regulatoryNetwork <- melt.array(regulatoryNetwork) + colnames(regulatoryNetwork) <- c("TF", "Gene", "Score") + regulatoryNetwork$Motif <- as.numeric(with(regulatoryNetwork, paste0(TF, Gene))%in%paste0(motif[,1],motif[,2])) + } + resList$regNet <- regulatoryNetwork + } + if("coexpression"%in%output){ + if(edgelist){ + geneCoreg <- melt.array(geneCoreg) + colnames(geneCoreg) <- c("Gene.x", "Gene.y", "Score") + } + resList$coregNet <- geneCoreg + } + if("cooperative"%in%output){ + if(edgelist){ + tfCoopNetwork <- melt.array(tfCoopNetwork) + colnames(tfCoopNetwork) <- c("TF.x", "TF.y", "Score") + } + resList$coopNet <- tfCoopNetwork + } + pandaObj(regNet=regulatoryNetwork, coregNet=geneCoreg, coopNet=tfCoopNetwork, numGenes=numGenes, numTFs=numTFs, numEdges=numEdges) +} + +pandaObj <- setClass("panda", slots=c("regNet","coregNet","coopNet","numGenes","numTFs","numEdges")) +setMethod("show","panda",function(object){print.panda(object)}) diff --git a/R/SPIDER.R b/R/SPIDER.R index 7ec15aed..e593819e 100644 --- a/R/SPIDER.R +++ b/R/SPIDER.R @@ -32,7 +32,7 @@ #' @param edgelist Boolean to indicate if edge lists instead of matrices should be returned. #' @param mode The data alignment mode. The mode 'union' takes the union of the genes in the expression matrix and the motif #' and the union of TFs in the ppi and motif and fills the matrics with zeros for nonintersecting TFs and gens, 'intersection' -#' takes the intersection of genes and TFs and removes nonintersecting sets, 'legacy' is the old behavior with version 1.19.3. +#' takes the intersection of genes and TFs and removes nonintersecting sets, 'legacy' is the old behavior with PANDAR version 1.19.3. #' #' Parameters remove.missing.ppi, remove.missingmotif, remove.missing.genes work only with mode=='legacy'. #' @keywords keywords #' @importFrom matrixStats rowSds @@ -47,8 +47,11 @@ #' "coopNet" is the cooperative network #' @examples #' data(pandaToyData) -#' spiderRes <- spider(pandaToyData$motif, pandaToyData$epifilter -#' pandaToyData$expression,pandaToyData$ppi,hamming=.1,progress=TRUE) +#' pandaToyData$epifilter = pandaToyData$motif +#' nind=floor(runif(5000, min=1, max=dim(pandaToyData$epifilter)[1])) +#' pandaToyData$epifilter[nind,3] = 0 +#' spiderRes <- spider(pandaToyData$motif,pandaToyData$expression, +#' pandaToyData$epifilter,pandaToyData$ppi,hamming=.1,progress=TRUE) #' @references #' Sonawane, Abhijeet Rajendra, et al. "Constructing gene regulatory networks using epigenetic data." npj Systems Biology and Applications 7.1 (2021): 1-13. spider <- function(motif,expr=NULL,epifilter=NULL,ppi=NULL,alpha=0.1,hamming=0.001, @@ -61,7 +64,7 @@ spider <- function(motif,expr=NULL,epifilter=NULL,ppi=NULL,alpha=0.1,hamming=0.0 if(progress) print('Initializing and validating') - if(epifilter[c(1,2),] != motif[c(1,2),]){ + if(any(epifilter[,c(1,2)] != motif[,c(1,2)])){ stop('Chromatin accessibility data does not match motif data size and order.') } @@ -300,9 +303,127 @@ spider <- function(motif,expr=NULL,epifilter=NULL,ppi=NULL,alpha=0.1,hamming=0.0 #' #' @param A Input adjacency matrix degreeAdjust <- function(A){ - k1 <- colSums(A)/dim(A,1) - k2 <- rowSums(A)/dim(A,2) - B <- (matrix(replicate(dim(A,2),k1),nrow=dim(A,1)))^2 - B <- B + (matrix(t(replicate(dim(A,2),k2)),nrow=dim(A,1)))^2 + k1 <- colSums(A)/dim(A)[1] + k2 <- rowSums(A)/dim(A)[2] + B <- (matrix(replicate(dim(A)[1],k1),nrow=dim(A)[1]))^2 + B <- B + (matrix(t(replicate(dim(A)[2],k2)),nrow=dim(A)[1]))^2 A <- A * sqrt(B); -} \ No newline at end of file +} + +normalizeNetwork<-function(X){ + X <- as.matrix(X) + + nr = nrow(X) + nc = ncol(X) + dm = c(nr,nc) + + # overall values + mu0=mean(X) + std0=sd(X)*sqrt((nr*nc-1)/(nr*nc)) + + # operations on rows + mu1=rowMeans(X) # operations on rows + std1=rowSds(X)*sqrt((nc-1)/nc) + + mu1=rep(mu1, nc) + dim(mu1) = dm + std1=rep(std1,nc) + dim(std1)= dm + + Z1=(X-mu1)/std1 + + # operations on columns + mu2=colMeans(X) # operations on columns + std2=colSds(X)*sqrt((nr-1)/nr) + + mu2 = rep(mu2, each=nr) + dim(mu2) = dm + std2= rep(std2, each=nr) + dim(std2) = dm + + Z2=(X-mu2)/std2 + + # combine and return + normMat=Z1/sqrt(2)+Z2/sqrt(2) + + # checks and defaults for missing data + Z0=(X-mu0)/std0; + f1=is.na(Z1); f2=is.na(Z2); + normMat[f1]=Z2[f1]/sqrt(2)+Z0[f1]/sqrt(2); + normMat[f2]=Z1[f2]/sqrt(2)+Z0[f2]/sqrt(2); + normMat[f1 & f2]=2*Z0[f1 & f2]/sqrt(2); + + normMat +} + +tanimoto<-function(X,Y){ + + nc = ncol(Y) + nr = nrow(X) + dm = c(nr,nc) + + Amat=(X %*% Y) + Bmat=colSums(Y*Y) + + Bmat = rep(Bmat,each=nr) + dim(Bmat) = dm + #Bmat=matrix(rep(Bmat, each=nr), dm) + + Cmat=rowSums(X*X) + Cmat=rep(Cmat,nc) + dim(Cmat) = dm + #Cmat=matrix(rep(Cmat, nc), dm) + + den = (Bmat+Cmat-abs(Amat)) + Amat=Amat/sqrt(den) + + return(Amat) +} + +update.diagonal<-function(diagMat, num, alpha, step){ + seqs = seq(1, num*num, num+1) + diagMat[seqs]=NaN; + diagstd=rowSds(diagMat,na.rm=TRUE) + diagstd[is.na(diagstd)]=0#replace NA with zeros + diagstd=diagstd*sqrt( (num-2)/(num-1) ); + diagMat[seqs]=diagstd*num*exp(2*alpha*step); + return(diagMat); +} + +prepResult <- function(zScale, output, regulatoryNetwork, geneCoreg, tfCoopNetwork, edgelist, motif){ + resList <- list() + numGenes = dim(geneCoreg)[1] + numTFs = dim(tfCoopNetwork)[1] + numEdges = sum(regulatoryNetwork!=0) + if (!zScale){ + regulatoryNetwork <- pnorm(regulatoryNetwork) + geneCoreg <- pnorm(geneCoreg) + tfCoopNetwork <- pnorm(tfCoopNetwork) + } + if("regulatory"%in%output){ + if(edgelist){ + regulatoryNetwork <- melt.array(regulatoryNetwork) + colnames(regulatoryNetwork) <- c("TF", "Gene", "Score") + regulatoryNetwork$Motif <- as.numeric(with(regulatoryNetwork, paste0(TF, Gene))%in%paste0(motif[,1],motif[,2])) + } + resList$regNet <- regulatoryNetwork + } + if("coexpression"%in%output){ + if(edgelist){ + geneCoreg <- melt.array(geneCoreg) + colnames(geneCoreg) <- c("Gene.x", "Gene.y", "Score") + } + resList$coregNet <- geneCoreg + } + if("cooperative"%in%output){ + if(edgelist){ + tfCoopNetwork <- melt.array(tfCoopNetwork) + colnames(tfCoopNetwork) <- c("TF.x", "TF.y", "Score") + } + resList$coopNet <- tfCoopNetwork + } + pandaObj(regNet=regulatoryNetwork, coregNet=geneCoreg, coopNet=tfCoopNetwork, numGenes=numGenes, numTFs=numTFs, numEdges=numEdges) +} + +pandaObj <- setClass("panda", slots=c("regNet","coregNet","coopNet","numGenes","numTFs","numEdges")) +setMethod("show","panda",function(object){print.panda(object)}) \ No newline at end of file diff --git a/R/sourcePPI.R b/R/sourcePPI.R index eb54d0c4..9e827158 100644 --- a/R/sourcePPI.R +++ b/R/sourcePPI.R @@ -1,6 +1,8 @@ #' Source the Protein-Protein interaction in STRING database #' -#' This function is able to use a list of Transcription Factors(TF) of interest to source the Protein-Protein interactions (PPI)in STRING database +#' This function uses a list of Transcription Factors (TF) of interest to source the Protein-Protein interactions (PPI) from STRING database using all types of interactions not only the physical subnetwork +#' Important: this function produces a simple unweighted network for tutorial purposes, and does not support weighted PPI edges for the moment. +#' For more complex PPI network modeling, consider pulling the PPI network directly from STRINGdb directly or through their R package. #' #' @param TF a data frame with one column indicating the TF of interest #' @param STRING.version a numeric vector indicating the STRING version. Default valuve is 10 diff --git a/README.md b/README.md index c141a136..ff4a3209 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![tutorials](https://img.shields.io/badge/netZooR-tutorials-9cf)](https://github.com/netZoo/netZooR/tree/master/vignettes) [![Netbooks](https://img.shields.io/badge/netZooR-Netbooks-ff69b4)](http://netbooks.networkmedicine.org/user/marouenbg/notebooks/Welcome_to_netBooks.ipynb?) [![discussions](https://img.shields.io/badge/netZooR-discussions-orange)](https://github.com/netZoo/netZooR/discussions) - +[![DOI](https://zenodo.org/badge/190646802.svg)](https://zenodo.org/badge/latestdoi/190646802) netZooR is tested on: (OS: Ubuntu + Macos) X (Language: R v4.2) @@ -19,57 +19,70 @@ netZooR currently integrates:
PANDA -* **PANDA** (Passing Attributes between Networks for Data Assimilation) [[Glass et al. 2013]](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0064832): PANDA is a method for estimating bipartite gene regulatory networks (GRNs) consisting of two types of nodes: transcription factors (TFs) and genes. An edge between TF $i$ and gene $j$ indicates that gene $j$ is regulated by TF $i$. The edge weight represents the strength of evidence for this regulatory relationship obtained by integrating three types of biological data: gene expression data, protein-protein interaction (PPI) data, and transcription factor binding motif (TFBM) data. PANDA is an iterative approach that begins with a seed GRN estimated from TFBMs and uses message passing between data types to refine the seed network to a final GRN that is consistent with the information contained in gene expression, PPI, and TFBM data. +PANDA (Passing Attributes between Networks for Data Assimilation) Glass et al. 2013: PANDA is a method for estimating bipartite gene regulatory networks (GRNs) consisting of two types of nodes: transcription factors (TFs) and genes. An edge between TF $i$ and gene $j$ indicates that gene $j$ is regulated by TF $i$. The edge weight represents the strength of evidence for this regulatory relationship obtained by integrating three types of biological data: gene expression data, protein-protein interaction (PPI) data, and transcription factor binding motif (TFBM) data. PANDA is an iterative approach that begins with a seed GRN estimated from TFBMs and uses message passing between data types to refine the seed network to a final GRN that is consistent with the information contained in gene expression, PPI, and TFBM data.
CONDOR -* **CONDOR** (COmplex Network Description Of Regulators) [[Platig et al. 2016]](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005033): CONDOR is a tool for community detection in bipartite networks. Many community detection methods for unipartite networks are based on the concept of maximizing a modularity metric that compares the weight of edges within communities to the weight of edges between communities, prioritizing community assignments with higher values of the former relative to the latter. CONDOR extends this concept to bipartite networks by optimizing a bipartite version of modularity defined by [[Barber (2007)]](https://pubmed.ncbi.nlm.nih.gov/18233893/). To enable bipartite community detection on large networks such gene regulatory networks, CONDOR uses a fast unipartite modularity maximization method on one of the two unipartite projections of the bipartite network. In Platig et al. (2016), CONDOR is applied to bipartite networks of single nucleotide polymorphisms (SNPs) and gene expression, where a network edge from a SNP node to a gene node is indicative of an association between the SNP and the gene expression level, commonly known as an expression quantitative trait locus (eQTL). Communities detected with CONDOR contained local hub nodes ("core SNPs") enriched for association with disease, suggesting that functional eQTL relationships are encoded at the community level. +CONDOR (COmplex Network Description Of Regulators) Platig et al. 2016: CONDOR is a tool for community detection in bipartite networks. Many community detection methods for unipartite networks are based on the concept of maximizing a modularity metric that compares the weight of edges within communities to the weight of edges between communities, prioritizing community assignments with higher values of the former relative to the latter. CONDOR extends this concept to bipartite networks by optimizing a bipartite version of modularity defined by Barber (2007). To enable bipartite community detection on large networks such gene regulatory networks, CONDOR uses a fast unipartite modularity maximization method on one of the two unipartite projections of the bipartite network. In Platig et al. (2016), CONDOR is applied to bipartite networks of single nucleotide polymorphisms (SNPs) and gene expression, where a network edge from a SNP node to a gene node is indicative of an association between the SNP and the gene expression level, commonly known as an expression quantitative trait locus (eQTL). Communities detected with CONDOR contained local hub nodes ("core SNPs") enriched for association with disease, suggesting that functional eQTL relationships are encoded at the community level.
LIONESS -* **LIONESS** (Linear Interpolation to Obtain Network Estimates for Single Samples) [[Kuijjer et al. 2019]](https://doi.org/10.1016/j.isci.2019.03.021): LIONESS is a flexible method for single-sample network integration. The machinery behind LIONESS is a leave-one-out approach. To construct a single-sample network for sample $i$, a first network is estimated on the full dataset and a second network is estimated on the dataset with sample $i$ withheld. The single-sample network is then estimated based on the difference between these two networks. Any method that can be used to estimate a network can be used with LIONESS to estimate single-sample networks. Two common use cases are the use of LIONESS to generate single-sample GRNs based on PANDA and the use of LIONESS to generate single-sample Pearson correlation networks. +LIONESS (Linear Interpolation to Obtain Network Estimates for Single Samples) Kuijjer et al. 2019: LIONESS is a flexible method for single-sample network integration. The machinery behind LIONESS is a leave-one-out approach. To construct a single-sample network for sample $i$, a first network is estimated on the full dataset and a second network is estimated on the dataset with sample $i$ withheld. The single-sample network is then estimated based on the difference between these two networks. Any method that can be used to estimate a network can be used with LIONESS to estimate single-sample networks. Two common use cases are the use of LIONESS to generate single-sample GRNs based on PANDA and the use of LIONESS to generate single-sample Pearson correlation networks.
ALPACA -* **ALPACA** (ALtered Partitions Across Community Architectures) [[Padi and Quackenbush 2018]](https://www.nature.com/articles/s41540-018-0052-5): ALPACA is a method for differential network analysis that is based on a novel approach to comparison of network community structures. Comparisons of community structure have typically been accomplished by assessing which nodes switch community membership between networks ("community comparison") or by computing the edge weight differences by subtracting the adjacency matrices of two networks and then performing community detection on the resulting differential network ("edge subtraction"). Both these approaches have important limitations. Community comparison is subject to a resolution limit and cannot detect differences smaller than the average community size in a network. Edge subtraction transfers noise from both of the original networks to the differential network, leading to an imprecise estimator. Moreover, positive and negative edge differences cannot be distinguished in the subsequent community detection performed on the differential network. +ALPACA (ALtered Partitions Across Community Architectures) Padi and Quackenbush 2018: ALPACA is a method for differential network analysis that is based on a novel approach to comparison of network community structures. Comparisons of community structure have typically been accomplished by assessing which nodes switch community membership between networks ("community comparison") or by computing the edge weight differences by subtracting the adjacency matrices of two networks and then performing community detection on the resulting differential network ("edge subtraction"). Both these approaches have important limitations. Community comparison is subject to a resolution limit and cannot detect differences smaller than the average community size in a network. Edge subtraction transfers noise from both of the original networks to the differential network, leading to an imprecise estimator. Moreover, positive and negative edge differences cannot be distinguished in the subsequent community detection performed on the differential network. In contrast to community comparison and edge subtraction, ALPACA compares the community structure of two networks by optimizing a new metric: "differential modularity". In the ALPACA algorithm, one network is defined as the reference network and the second is defined as the perturbed network. The differential modularity metric measures the extent to which edges in a community in the perturbed network differ from those that would be expected by random chance according to a null distribution based on the reference network. Community structure of the perturbed network is determined by maximizing this differential modularity. The resulting communities are "differential modules" that show how the perturbed network differs from the reference network at the community level.
SAMBAR -* **SAMBAR** (Subtyping Agglomerated Mutations By Annotation Relations) [[Kuijjer et al.]](https://www.nature.com/articles/s41416-018-0109-7): SAMBAR is a tool for studying cancer subtypes based on patterns of somatic mutations in curated biological pathways. Rather than characterize cancer according to mutations at the gene level, SAMBAR agglomerates mutations within pathways to define a pathway mutation score. To avoid bias based on pathway representation, these pathway mutation scores correct for the number of genes in each pathway as well as the number of times each gene is represented in the universe of pathways. By taking a pathway rather than gene-by-gene lens, SAMBAR both de-sparsifies somatic mutation data and incorporates important prior biological knowledge. Kuijjer et al. (2018) demonstrate that SAMBAR is capable of outperforming other methods for cancer subtyping, producing subtypes with greater between-subtype distances; the authors use SAMBAR for a pan-cancer subtyping analysis that identifies four diverse pan-cancer subtypes linked to distinct molecular processes. +SAMBAR (Subtyping Agglomerated Mutations By Annotation Relations) Kuijjer et al.: SAMBAR is a tool for studying cancer subtypes based on patterns of somatic mutations in curated biological pathways. Rather than characterize cancer according to mutations at the gene level, SAMBAR agglomerates mutations within pathways to define a pathway mutation score. To avoid bias based on pathway representation, these pathway mutation scores correct for the number of genes in each pathway as well as the number of times each gene is represented in the universe of pathways. By taking a pathway rather than gene-by-gene lens, SAMBAR both de-sparsifies somatic mutation data and incorporates important prior biological knowledge. Kuijjer et al. (2018) demonstrate that SAMBAR is capable of outperforming other methods for cancer subtyping, producing subtypes with greater between-subtype distances; the authors use SAMBAR for a pan-cancer subtyping analysis that identifies four diverse pan-cancer subtypes linked to distinct molecular processes.
MONSTER -* **MONSTER** (Modeling Network State Transitions from Expression and Regulatory data) [[Schlauch et al.]](https://doi.org/10.1186/s12918-017-0517-y): MONSTER is a method for estimating transitions between network states by modeling the adjacency matrix of one state as a linear transformation of the adjacency matrix of another. Like LIONESS, MONSTER is a flexible method that does not require a particular type of network structure. MONSTER models the perturbation of an initial network A into a perturbed network B according to a matrix product B = AT. T is a transition matrix encoding the changes that map A to B. When A and B are gene regulatory networks, i.e., bipartite networks between TFs and genes, the MONSTER framework leads naturally to the definition of TF involvement as the sum of the off-diagonal weights for a transcription factor $i$ in the transition matrix T. This perspective enables MONSTER to identify differentially involved TFs that contribute to network transitions differently between different conditions. This dimension cannot be captured from a traditional differential expression analysis of TFs, which will not detect TFs that have the same concentration between conditions. +MONSTER (Modeling Network State Transitions from Expression and Regulatory data) Schlauch et al.: MONSTER is a method for estimating transitions between network states by modeling the adjacency matrix of one state as a linear transformation of the adjacency matrix of another. Like LIONESS, MONSTER is a flexible method that does not require a particular type of network structure. MONSTER models the perturbation of an initial network A into a perturbed network B according to a matrix product B = AT. T is a transition matrix encoding the changes that map A to B. When A and B are gene regulatory networks, i.e., bipartite networks between TFs and genes, the MONSTER framework leads naturally to the definition of TF involvement as the sum of the off-diagonal weights for a transcription factor $i$ in the transition matrix T. This perspective enables MONSTER to identify differentially involved TFs that contribute to network transitions differently between different conditions. This dimension cannot be captured from a traditional differential expression analysis of TFs, which will not detect TFs that have the same concentration between conditions.
OTTER -* **OTTER** (Optimization to Estimate Regulation) [[Weighill et al.]](https://www.biorxiv.org/content/10.1101/2020.06.23.167999v2.abstract): OTTER is a GRN inference method based on the idea that observed biological data (PPI data and gene co-expression data) are projections of a bipartite GRN between TFs and genes. Specifically, PPI data represent the projection of the GRN onto the TF-TF space and gene co-expression data represent the projection of the GRN onto the gene-gene space. OTTER reframes the problem of GRN inference as a problem of relaxed graph matching and finds a GRN that has optimal agreement with the observed PPI and coexpression data. The OTTER objective function is tunable in two ways: first, one can prioritize matching the PPI data or the coexpression data more heavily depending on one's confidence in the data source; second, there is a regularization parameter that can be applied to induce sparsity on the estimated GRN. The OTTER objective function can be solved using spectral decomposition techniques and gradient descent; the latter is shown to be closely related to the PANDA message-passing approach (Glass et al. 2013). +OTTER (Optimization to Estimate Regulation) Weighill et al.: OTTER is a GRN inference method based on the idea that observed biological data (PPI data and gene co-expression data) are projections of a bipartite GRN between TFs and genes. Specifically, PPI data represent the projection of the GRN onto the TF-TF space and gene co-expression data represent the projection of the GRN onto the gene-gene space. OTTER reframes the problem of GRN inference as a problem of relaxed graph matching and finds a GRN that has optimal agreement with the observed PPI and coexpression data. The OTTER objective function is tunable in two ways: first, one can prioritize matching the PPI data or the coexpression data more heavily depending on one's confidence in the data source; second, there is a regularization parameter that can be applied to induce sparsity on the estimated GRN. The OTTER objective function can be solved using spectral decomposition techniques and gradient descent; the latter is shown to be closely related to the PANDA message-passing approach (Glass et al. 2013).
CRANE -* **CRANE** (Constrained Random Alteration of Network Edges) [[Lim et al.]](https://doi.org/10.3389/fgene.2020.603264): CRANE is a method for determining statistical significance of structural differences between networks. Analysis with CRANE is a four-phase process. The first step of CRANE is to estimate two networks: a reference network and a perturbed network. In the same spirit as LIONESS, CRANE is flexible: any network inference method (e.g., correlation, partial correlation, PANDA) can be used at this stage. In the second step, differential features are determined by comparing the reference and perturbed networks. Here, CRANE is again flexible: such differential features could arise from simple measures such as a comparison of node degree or centrality, or from more nuanced techniques such as differential module detection with ALPACA. Third, a large number of constrained random networks are developed based on the network structure of the reference network. By comparing each random network with the original reference network, a set of null differential measures is obtained. Fourth, the observed differential features from step two can be compared with the null distribution from step three to generate empirical p-values. A typical workflow for applying CRANE in NetZooR would involve fitting PANDA networks in step one and using ALPACA to estimate differential modules in step two. +CRANE (Constrained Random Alteration of Network Edges) Lim et al.: CRANE is a method for determining statistical significance of structural differences between networks. Analysis with CRANE is a four-phase process. The first step of CRANE is to estimate two networks: a reference network and a perturbed network. In the same spirit as LIONESS, CRANE is flexible: any network inference method (e.g., correlation, partial correlation, PANDA) can be used at this stage. In the second step, differential features are determined by comparing the reference and perturbed networks. Here, CRANE is again flexible: such differential features could arise from simple measures such as a comparison of node degree or centrality, or from more nuanced techniques such as differential module detection with ALPACA. Third, a large number of constrained random networks are developed based on the network structure of the reference network. By comparing each random network with the original reference network, a set of null differential measures is obtained. Fourth, the observed differential features from step two can be compared with the null distribution from step three to generate empirical p-values. A typical workflow for applying CRANE in NetZooR would involve fitting PANDA networks in step one and using ALPACA to estimate differential modules in step two.
EGRET -* **EGRET** (Estimating the Genetic Regulatory effects on TFs) [[Weighill et al.]](https://www.genome.org/cgi/doi/10.1101/gr.275107.120): EGRET incorporates genetic variants as a fourth data type in the PANDA message-passing framework, enabling the estimation of genotype-specific GRNs. Genetic variants can alter transcription factor binding by affecting the composition of motif sites on the DNA. Not every genetic variant has such an affect; EGRET incorporates only genetic variants which have (1) been shown to be associated with gene expression (expression quantitative trait loci, or eQTL), and (2) are predicted to affect transcription factor binding based on a tool called QBiC (Martin et al. 2019). This information is used in combination with TFBM predictions as input to the PANDA message-passing framework. The resulting EGRET network is a genotype-specific bipartite GRN that is similar to a PANDA network but incorporates the information contained by individual genetic variation. +EGRET (Estimating the Genetic Regulatory effects on TFs) Weighill et al.: EGRET incorporates genetic variants as a fourth data type in the PANDA message-passing framework, enabling the estimation of genotype-specific GRNs. Genetic variants can alter transcription factor binding by affecting the composition of motif sites on the DNA. Not every genetic variant has such an affect; EGRET incorporates only genetic variants which have (1) been shown to be associated with gene expression (expression quantitative trait loci, or eQTL), and (2) are predicted to affect transcription factor binding based on a tool called QBiC (Martin et al. 2019). This information is used in combination with TFBM predictions as input to the PANDA message-passing framework. The resulting EGRET network is a genotype-specific bipartite GRN that is similar to a PANDA network but incorporates the information contained by individual genetic variation.
YARN -* **YARN** (Yet Another RNa-seq package) [[Paulsson et al.]](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-017-1847-x): YARN is a package that combines quality control, gene filtering, and normalization steps to streamline the preprocessing of large-scale, multi-tissue gene expression data from resources such as the Genotype-Tissue Expression (GTEx) project. Among other steps, YARN uses principal coordinate analysis (PCoA) to determine if samples collected from different sites on the same tissue (for example, transverse and sigmoid colon) can be treated as "transcriptionally indistinguishable" and grouped together to increase power for downstream analyses. Paulsson et al. (2017) demonstrate the use of YARN to develop a pan-cancer RNA-seq dataset for 30,333 genes from 9435 samples across 38 tissues from the GTEx dataset. +YARN (Yet Another RNa-seq package) Paulsson et al.: YARN is a package that combines quality control, gene filtering, and normalization steps to streamline the preprocessing of large-scale, multi-tissue gene expression data from resources such as the Genotype-Tissue Expression (GTEx) project. Among other steps, YARN uses principal coordinate analysis (PCoA) to determine if samples collected from different sites on the same tissue (for example, transverse and sigmoid colon) can be treated as "transcriptionally indistinguishable" and grouped together to increase power for downstream analyses. Paulsson et al. (2017) demonstrate the use of YARN to develop a pan-cancer RNA-seq dataset for 30,333 genes from 9435 samples across 38 tissues from the GTEx dataset.
-netZooR also integrates additional functions to: +
+PUMA +PUMA (PANDA Using MicroRNA Associations) Kuijjer et al. extends the PANDA framework to model how microRNAs (miRNAs) participate in gene regulatory networks. PUMA networks are bipartite networks that consist of a regulatory layer and a layer of genes being regulated, similar to PANDA networks. While the regulatory layer of PANDA networks consists only of transcription factors (TFs), the regulatory layer of PUMA networks consists of both TFs and miRNAs. A PUMA network is seeded using a combination of input data sources such as motif scans or ChIP-seq data (for TF-gene edges) and an miRNA target prediction tool such as TargetScan or miRanda (for miRNA-gene edges). PUMA uses a message passing framework similar to PANDA to integrate this prior information with gene-gene coexpression and protein-protein interactions to estimate a final regulatory network incorporating miRNAs. Kuijjer and colleagues [7] apply PUMA to 38 GTEx tissues and demonstrate that PUMA can identify important patterns in tissue-specific regulation of genes by miRNA. +
+ +
+SPIDER +SPIDER (Seeding PANDA Interactions to Derive Epigenetic Regulation) Sonawane et al. extends the PANDA framework by incorporating DNase-Seq data to account for chromatin state for the prediction of TF binding sites. The method consists of processing DNase-Seq data to find open chromatin regions and build a “mask” matrix that is then overlaid on the TF-gene motif network to select binding sites that are available fro TF binding. This method can be applied for various biological contexts such as cell lines and tissues. Sonawane and colleagues have employed their method to model cell- type specific GRNs using DNase-Seq data from ENCODE and showed that integrating epigenetic data in SPIDER networks allows building more accurate networks. +
+ +
+DRAGON +DRAGON (Determining Regulatory Associations using Graphical models on Omics Networks) Shutta et al. is a method for estimating multiomic Gaussian graphical models (GGMs, also known as partial correlation networks) that incorporate two different omics data types. DRAGON builds off of the popular covariance shrinkage method of Ledoit and Wolf with an optimization approach that explicitly accounts for the differences in two separate omics "layers" in the shrinkage estimator. The resulting sparse covariance matrix is then inverted to obtain a precision matrix estimate and a corresponding GGM. Although GGMs assume normally distributed data, DRAGON can be used on any type of continuous data by transforming data to approximate normality prior to network estimation. Currently, DRAGON can be applied to estimate networks with two different types of omics data. Investigators interested in applying DRAGON to more than two types of omics data can consider estimating pairwise networks and "chaining" them together. +
* Source protein-protein interaction network from [STRINGdb](https://string-db.org/) based on a list of protein of interest. diff --git a/inst/CITATION b/inst/CITATION index d0c797c1..7443d43b 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -2,11 +2,14 @@ citEntry(entry="article", title = "The Network Zoo: a multilingual package for the inference and analysis of biological networks", author = personList( as.person("Marouen Ben Guebila"), as.person("Tian Wang"), + as.person("Camila M Lopes-Ramos"), + as.person("Viola Fanfani"), as.person("John Quackenbush")), - year = 2022, - journal = "bioRxiv", - doi = "10.1101/2022.05.30.494077", - textVersion = - paste("Ben Guebila, M, Wang, T., Quackenbush, J.", - "The Network Zoo: a multilingual package for the inference and analysis of biological networks", - "bioRxiv, 2022" ) ) + year = 2023, + journal = "Genome Biology", + doi = "10.1186/s13059-023-02877-1 7", + textVersion = + paste("Ben Guebila, M., Wang, T., Lopes-Ramos, C.L., Fanfani, V., Quackenbush, J.", + "The Network Zoo: a multilingual package for the inference and analysis of gene regulatory networks", + "Genome Biology, 2023" ) ) + diff --git a/inst/extdata/lioness.py b/inst/extdata/lioness.py index d2ec2e7c..6578be0d 100644 --- a/inst/extdata/lioness.py +++ b/inst/extdata/lioness.py @@ -187,54 +187,100 @@ def normalize_network(x): ) return normalized_matrix - - class Lioness(Panda): - """ - Description: - Using LIONESS to infer single-sample gene regulatory networks. + """ Using LIONESS to infer single-sample gene regulatory networks. 1. Reading in PANDA network and preprocessed middle data 2. Computing coexpression network 3. Normalizing coexpression network 4. Running PANDA algorithm 5. Writing out LIONESS networks - Inputs: - Panda: Panda object. - - Methods: - __init__ : Intialize instance of Lioness class and load data. - __lioness_loop : The LIONESS algorithm. - save_lioness_results : Saves LIONESS network. - - Example: - from netZooPy.lioness.lioness import Lioness - To run the Lioness algorithm for single sample networks, first run PANDA using the keep_expression_matrix flag, then use Lioness as follows: - panda_obj = Panda('../../tests/ToyData/ToyExpressionData.txt', '../../tests/ToyData/ToyMotifData.txt', '../../tests/ToyData/ToyPPIData.txt', remove_missing=False, keep_expression_matrix=True) - lioness_obj = Lioness(panda_obj) - - Save Lioness results: - lioness_obj.save_lioness_results('Toy_Lioness.txt') - Return a network plot for one of the Lioness single sample networks: - plot = AnalyzeLioness(lioness_obj) - plot.top_network_plot(column= 0, top=100, file='top_100_genes.png') - - Example lioness output: - Sample1 Sample2 Sample3 Sample4 - ------------------------------- - -0.667452814003 -1.70433776179 -0.158129613892 -0.655795512803 - -0.843366539284 -0.733709815256 -0.84849895139 -0.915217389738 - 3.23445386464 2.68888472802 3.35809757371 3.05297381396 - 2.39500370135 1.84608635425 2.80179804094 2.67540878165 - -0.117475863987 0.494923925853 0.0518448588965 -0.0584810456421 - + Parameters + ---------- + + obj : object + PANDA object, generated with keep_expression_matrix=True. + computing : str + 'cpu' uses Central Processing Unit (CPU) to run PANDA + 'gpu' use the Graphical Processing Unit (GPU) to run PANDA + precision : str + 'double' computes the regulatory network in double precision (15 decimal digits). + 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate. + subset_numbers : list + List of sample index onto which lioness should be run. ([1,10,20]) + subset_names : list + List of sample names onto which lioness should be run. (['s1','s2','s3']) + start : int + Index of first sample to compute the network. If subset_numbers or subset_names is passed, + this is ignored + end : int + Index of last sample to compute the network.If subset_numbers or subset_names is passed, + this is ignored + all_background : bool + Pass the flag if you want to keep the whole samples as background + save_dir : str + Directory to save the networks. + save_fmt : str + Save format. + - '.npy': (Default) Numpy file of the network. + - '.txt': Text file, only values are saved, no tf or gene names. Will be deprecated. + - '.csv': text file with index (tf) and column (gene) names + - '.h5': hdf file, fastest way to save the lioness dataframe with index/column names + - '.mat': MATLAB file. + output : str + - 'network' returns all networks in a single edge-by-sample matrix (lioness_obj.total_lioness_network is the unlabeled variable and lioness_obj.export_lioness_results is the row-labeled variable). For large sample sizes, this variable requires large RAM memory. + - 'gene_targeting' returns gene targeting scores for all networks in a single gene-by-sample matrix (lioness_obj.total_lioness_network). + - 'tf_targeting' returns tf targeting scores for all networks in a single gene-by-sample matrix (lioness_obj.total_lioness_network). + alpha : float + learning rate, set to 0.1 by default but has to be changed manually to match the learning rate of the PANDA object. + save_single: bool + when set to True it will save each lioness network with its sample name inside the lioness output folder + export_filename: str + if passed, the final lioness table will be saved with all tf-gene edges as dataframe index and + samples as column name + ignore_final: bool + if True, no lioness network is kept in memory. This requires saving single networks at each step + Returns + -------- + export_lioness_results : _ + Depeding on the output argument, this can be either all the lioness + networks or their gene/tf targeting scores. + + Notes + ------- + Example on how to use Lioness and plot the network + + >>> from netZooPy.lioness.lioness import Lioness + >>> #To run the Lioness algorithm for single sample networks, first run PANDA using the keep_expression_matrix flag, then use Lioness as follows: + >>> panda_obj = Panda('../../tests/ToyData/ToyExpressionData.txt', '../../tests/ToyData/ToyMotifData.txt', '../../tests/ToyData/ToyPPIData.txt', remove_missing=False, keep_expression_matrix=True) + >>> lioness_obj = Lioness(panda_obj) + + >>> #Save Lioness results: + >>> lioness_obj.save_lioness_results('Toy_Lioness.txt') + >>> #Return a network plot for one of the Lioness single sample networks: + >>> plot = AnalyzeLioness(lioness_obj) + >>> plot.top_network_plot(column= 0, top=100, file='top_100_genes.png') + + Example lioness output: TF, Gene and Motif order is identical to the panda output file. - Authors: - Cho-Yi Chen, David Vi, Daniel Morgan + - Sample1 Sample2 Sample3 Sample4\n + - -------------------------------\n + - -0.667452814003 -1.70433776179 -0.158129613892 -0.655795512803\n + - -0.843366539284 -0.733709815256 -0.84849895139 -0.915217389738\n + - 3.23445386464 2.68888472802 3.35809757371 3.05297381396\n + - 2.39500370135 1.84608635425 2.80179804094 2.67540878165\n + - -0.117475863987 0.494923925853 0.0518448588965 -0.0584810456421\n + + - Reference: - Kuijjer, Marieke Lydia, et al. "Estimating sample-specific regulatory networks." Iscience 14 (2019): 226-240. + References + ----------- + + .. [1]__ Kuijjer, Marieke Lydia, et al. "Estimating sample-specific regulatory networks." + Iscience 14 (2019): 226-240. + + Authors: Cho-Yi Chen, David Vi, Daniel Morgan """ def __init__( @@ -245,52 +291,43 @@ def __init__( ncores=1, start=1, end=None, + subset_numbers=None, + subset_names=None, save_dir="lioness_output", save_fmt="npy", output="network", alpha=0.1, + save_single = False, + export_filename = None, + ignore_final=False, ): - """ - Description: - Initialize instance of Lioness class and load data. - - Inputs: - obj : PANDA object, generated with keep_expression_matrix=True. - obj.motif_matrix: TF DNA motif binding data in tf-by-gene format. - If set to None, Lioness will be performed on gene coexpression network. - computing : 'cpu' uses Central Processing Unit (CPU) to run PANDA - 'gpu' use the Graphical Processing Unit (GPU) to run PANDA - precision : 'double' computes the regulatory network in double precision (15 decimal digits). - 'single' computes the regulatory network in single precision (7 decimal digits) which is fastaer, requires half the memory but less accurate. - start : Index of first sample to compute the network. - end : Index of last sample to compute the network. - save_dir : Directory to save the networks. - save_fmt : Save format. - '.npy': (Default) Numpy file. - '.txt': Text file. - '.mat': MATLAB file. - output : 'network' returns all networks in a single edge-by-sample matrix (lioness_obj.total_lioness_network). For large sample sizes, this variable requires large RAM memory. - 'gene_targeting' returns gene targeting scores for all networks in a single gene-by-sample matrix (lioness_obj.total_lioness_network). - 'tf_targeting' returns tf targeting scores for all networks in a single gene-by-sample matrix (lioness_obj.total_lioness_network). - alpha : learning rate, set to 0.1 by default but has to be changed manually to match the learning rate of the PANDA object. - - Output: - export_lioness_results : Depeding on the output argument, this can be either all the lioness networks or their gene/tf targeting scores. + """ Initialize instance of Lioness class and load data. """ # Load data with Timer("Loading input data ..."): self.export_panda_results = obj.export_panda_results - self.expression_matrix = obj.expression_matrix - self.motif_matrix = obj.motif_matrix - self.ppi_matrix = obj.ppi_matrix - self.correlation_matrix = obj.correlation_matrix + self.expression_samples = obj.expression_samples if precision == "single": - self.correlation_matrix = np.float32(self.correlation_matrix) - self.motif_matrix = np.float32(self.motif_matrix) - self.ppi_matrix = np.float32(self.ppi_matrix) + self.expression_matrix = np.float32(obj.expression_matrix) + self.correlation_matrix = np.float32(obj.correlation_matrix) + self.motif_matrix = np.float32(obj.motif_matrix) + self.ppi_matrix = np.float32(obj.ppi_matrix) + self.alpha = np.float32(alpha) + else: + self.expression_matrix = obj.expression_matrix + self.motif_matrix = obj.motif_matrix + self.ppi_matrix = obj.ppi_matrix + self.correlation_matrix = obj.correlation_matrix + self.alpha = alpha + self.computing = computing - self.alpha = alpha self.n_cores = int(ncores) + self.save_single = save_single + self.precision = precision + if precision == "single": + self.np_dtype = np.float32 + else: + self.np_dtype = np.float64 if hasattr(obj, "panda_network"): self.network = obj.panda_network.to_numpy() elif hasattr(obj, "puma_network"): @@ -301,13 +338,41 @@ def __init__( gene_names = obj.gene_names tf_names = obj.unique_tfs origmotif = obj.motif_data # save state of original motif matrix + self.gene_names = gene_names + self.tf_names = tf_names del obj # Get sample range to iterate + # the number of conditions is the N parameter used for the number of samples in the whole background self.n_conditions = self.expression_matrix.shape[1] - self.indexes = range(self.n_conditions)[ - start - 1 : end - ] # sample indexes to include + self.n_lio_samples = self.n_conditions + if (subset_numbers!=None or subset_names!=None): + if (subset_numbers!=None and subset_names!=None): + sys.exit('Pass only one between subset_numbers and subset_names') + elif (subset_numbers!=None and subset_names==None): + # select using indexes + assert isinstance(subset_numbers, list) + assert isinstance(subset_numbers[0], int) + self.indexes = [int(i) for i in subset_numbers] + else: + #select using sample names + assert isinstance(subset_names, list) + assert isinstance(subset_names[0], int) + self.indexes = [self.expression_samples.index(int(i)) for i in subset_names] + #self.expression_samples = self.expression_samples[self.indexes] + # number of lioness networks to be computed + self.n_lio_samples = len(self.indexes) + else: + # if no subset is selected, we just use the start and end numbers to decide + # which samples need to be analyses. The background is always what is used for PANDA + # and stays the same + + self.indexes = range(self.n_conditions)[ + start - 1 : end + ] # sample indexes to include + #self.expression_samples = self.expression_samples[start-1:end] + self.n_lio_samples = len(self.indexes) + print("Number of total samples:", self.n_conditions) print("Number of computed samples:", len(self.indexes)) print("Number of parallel cores:", self.n_cores) @@ -315,68 +380,97 @@ def __init__( # Create the output folder if not exists self.save_dir = save_dir self.save_fmt = save_fmt + # if true, no final large matrix is saved + self.ignore_final=ignore_final + + # Here we make sure that at least one between the complete lioness (edgex x samples) + # dataframe or each single network is saved + if ((self.save_single==False) and (self.ignore_final==True)): + sys.exit("ERROR: you are passing save_single=False, and ignore_final=True. This way no output is saved, change one of the two to have either the single networks or the final large table") + # We create the folder if not os.path.exists(save_dir): os.makedirs(save_dir) + ############# # Run LIONESS + ############# if int(self.n_conditions) >= int(self.n_cores) and self.computing == "cpu": + # the total_lioness_network here is a list of 1d + # arrays (network:(tfXgene,1),gene_targeting:(gene,1),tf_targeting:(tf,1)) self.total_lioness_network = Parallel(n_jobs=self.n_cores)( self.__par_lioness_loop(i, output) for i in (self.indexes) - ) + ) elif self.computing == "gpu": for i in self.indexes: self.total_lioness_network = self.__lioness_loop(i) - # # self.export_lioness_results = pd.DataFrame(self.total_lioness_network) - + self.total_lioness_network = self.total_lioness_network.T # create result data frame - if output == "network": - if isinstance(origmotif, pd.DataFrame): - total_tfs = tf_names * len(gene_names) - total_genes = [i for i in gene_names for _ in range(len(tf_names))] - indDF = pd.DataFrame([total_tfs, total_genes], index=["tf", "gene"]) - indDF = indDF.append( - pd.DataFrame(self.total_lioness_network) + if self.ignore_final: + print('WARNING: we do not keep all lionesses in memory. They have been saved singularly.') + else: + if output == "network": + if isinstance(origmotif, pd.DataFrame): + # get row of all TFs + total_tfs = tf_names * len(gene_names) + # get row of all genes + total_genes = [i for i in gene_names for _ in range(len(tf_names))] + # first dataframe is made of tf and gene names + indDF = pd.DataFrame([total_tfs, total_genes], index=["tf", "gene"]) + # concatenate with dataframe of data, rows are samples, columns the edges + indDF = indDF.append( + pd.DataFrame(self.total_lioness_network, index = self.expression_samples[self.indexes]) + ).transpose() + else: # if equal to None to be specific + total_genes1 = gene_names * len(gene_names) + total_genes2 = [i for i in gene_names for _ in range(len(gene_names))] + indDF = pd.DataFrame( + [total_genes1, total_genes2], index=["gene1", "gene2"] + ) + indDF = indDF.append( + pd.DataFrame(self.total_lioness_network, index = self.expression_samples[self.indexes]) + ).transpose() + + # keep the df as the export results + self.export_lioness_results = indDF + del indDF + elif output == "gene_targeting": + self.export_lioness_results = pd.DataFrame( + self.total_lioness_network, columns=gene_names, index =self.expression_samples[self.indexes] ).transpose() - else: # if equal to None to be specific - total_genes1 = gene_names * len(gene_names) - total_genes2 = [i for i in gene_names for _ in range(len(gene_names))] - indDF = pd.DataFrame( - [total_genes1, total_genes2], index=["gene1", "gene2"] - ) - indDF = indDF.append( - pd.DataFrame(self.total_lioness_network) + elif output == "tf_targeting": + self.export_lioness_results = pd.DataFrame( + self.total_lioness_network, columns=tf_names, index = self.expression_samples[self.indexes] ).transpose() - self.export_lioness_results = indDF - elif output == "gene_targeting": - self.export_lioness_results = pd.DataFrame( - self.total_lioness_network, columns=gene_names - ).transpose() - elif output == "tf_targeting": - self.export_lioness_results = pd.DataFrame( - self.total_lioness_network, columns=tf_names - ).transpose() - self.save_lioness_results() + + # if export filename is passed, the full lioness table is saved + if export_filename: + self.export_lioness_table(output_filename = export_filename) + else: + self.save_lioness_results() def __lioness_loop(self, i): - """ - Description: - Initialize instance of Lioness class and load data. + #TODO: this is now for GPU only in practice + """ Initialize instance of Lioness class and load data. - Outputs: - self.total_lioness_network: An edge-by-sample matrix containing sample-specific networks. + Returns + -------- + self.total_lioness_network: array + An edge-by-sample matrix containing sample-specific networks. """ # for i in self.indexes: - print("Running LIONESS for sample %d:" % (i + 1)) + print("Running LIONESS for sample %d/%d:" %((i),(self.n_conditions))) idx = [x for x in range(self.n_conditions) if x != i] # all samples except i with Timer("Computing coexpression network:"): if self.computing == "gpu": import cupy as cp - - correlation_matrix = cp.corrcoef(self.expression_matrix[:, idx]) - if cp.isnan(correlation_matrix).any(): - cp.fill_diagonal(correlation_matrix, 1) - correlation_matrix = cp.nan_to_num(correlation_matrix) - correlation_matrix = cp.asnumpy(correlation_matrix) + + correlation_matrix_cp = cp.corrcoef(self.expression_matrix[:, idx].astype(self.np_dtype)).astype(self.np_dtype) + if cp.isnan(correlation_matrix_cp).any(): + cp.fill_diagonal(correlation_matrix_cp, 1) + correlation_matrix_cp = cp.nan_to_num(correlation_matrix_cp) + correlation_matrix = cp.asnumpy(correlation_matrix_cp) + del correlation_matrix_cp + cp._default_memory_pool.free_all_blocks() else: correlation_matrix = np.corrcoef(self.expression_matrix[:, idx]) if np.isnan(correlation_matrix).any(): @@ -410,48 +504,45 @@ def __lioness_loop(self, i): # old #lioness_network = self.n_conditions * (self.network - subset_panda_network) + subset_panda_network - with Timer( - "Saving LIONESS network %d to %s using %s format:" - % (i + 1, self.save_dir, self.save_fmt) - ): - path = os.path.join(self.save_dir, "lioness.%d.%s" % (i + 1, self.save_fmt)) - if self.save_fmt == "txt": - np.savetxt(path, lioness_network) - elif self.save_fmt == "npy": - np.save(path, lioness_network) - elif self.save_fmt == "mat": - from scipy.io import savemat - - savemat(path, {"PredNet": lioness_network}) - else: - print("Unknown format %s! Use npy format instead." % self.save_fmt) - np.save(path, lioness_network) - if self.computing == "gpu" and i == 0: - self.total_lioness_network = np.fromstring( - np.transpose(lioness_network).tostring(), dtype=lioness_network.dtype - ) - elif self.computing == "gpu" and i != 0: - self.total_lioness_network = np.column_stack( - ( - self.total_lioness_network, - np.fromstring( - np.transpose(lioness_network).tostring(), - dtype=lioness_network.dtype, - ), + if self.save_single: + with Timer( + "Saving LIONESS network %d (%s) to %s using %s format:" + % (i, self.expression_samples[i], self.save_dir, self.save_fmt) + ): + + path = os.path.join(self.save_dir, "lioness.%s.%s.%s" % (self.expression_samples[i],str(i),self.save_fmt)) + self.__lioness_to_disk(lioness_network, path) + + if self.ignore_final: + return(np.array([0])) + else: + if self.computing == "gpu" and i == 0: + self.total_lioness_network = np.fromstring( + np.transpose(lioness_network).tostring(), dtype=lioness_network.dtype + )[:,np.newaxis] + + elif self.computing == "gpu" and i != 0: + self.total_lioness_network = np.column_stack( + ( + self.total_lioness_network, + np.fromstring( + np.transpose(lioness_network).tostring(), + dtype=lioness_network.dtype, + ), + ) ) - ) - - return self.total_lioness_network + + return self.total_lioness_network @delayed @wrap_non_picklable_objects def __par_lioness_loop(self, i, output): - """ - Description: - Initialize instance of Lioness class and load data. + """ Initialize instance of Lioness class and load data. - Outputs: - self.total_lioness_network: An edge-by-sample matrix containing sample-specific networks. + Returns + --------- + self.total_lioness_network: array + An edge-by-sample matrix containing sample-specific networks. """ # for i in self.indexes: print("Running LIONESS for sample %d:" % (i + 1)) @@ -466,6 +557,7 @@ def __par_lioness_loop(self, i, output): correlation_matrix = cp.nan_to_num(correlation_matrix) correlation_matrix = cp.asnumpy(correlation_matrix) else: + # run on CPU with numpy correlation_matrix = np.corrcoef(self.expression_matrix[:, idx]) if np.isnan(correlation_matrix).any(): np.fill_diagonal(correlation_matrix, 1) @@ -478,6 +570,7 @@ def __par_lioness_loop(self, i, output): correlation_matrix = self._normalize_network(correlation_matrix) with Timer("Inferring LIONESS network:"): + # TODO: fix this correlation matrix+delete if self.motif_matrix is not None: del correlation_matrix_orig subset_panda_network = compute_panda( @@ -497,75 +590,123 @@ def __par_lioness_loop(self, i, output): lioness_network = (self.n_conditions * self.network) - ( (self.n_conditions - 1) * subset_panda_network ) - - with Timer( - "Saving LIONESS network %d to %s using %s format:" - % (i + 1, self.save_dir, self.save_fmt) - ): - path = os.path.join(self.save_dir, "lioness.%d.%s" % (i + 1, self.save_fmt)) - if self.save_fmt == "txt": - np.savetxt(path, lioness_network) - elif self.save_fmt == "npy": - np.save(path, lioness_network) - elif self.save_fmt == "mat": - from scipy.io import savemat - - savemat(path, {"PredNet": lioness_network}) - else: - print("Unknown format %s! File will not be saved." % self.save_fmt) - # np.save(path, lioness_network) + # the lioness network here is a TFxGENE np array + + # if save_single flag is passed, save each single lioness sample + if self.save_single: + # TODO: here we need to decide whether to add the tf and gene name + with Timer( + "Saving LIONESS network %d (%s) to %s using %s format:" + % (i, self.expression_samples[i], self.save_dir, self.save_fmt) + ): + + path = os.path.join(self.save_dir, "lioness.%s.%s.%s" % (self.expression_samples[i],str(i),self.save_fmt)) + print(path) + self.__lioness_to_disk(lioness_network, path) + + # TODO: why this? Should we remove it? # if i == 0: # self.total_lioness_network = np.fromstring(np.transpose(lioness_network).tostring(),dtype=lioness_network.dtype) # else: # self.total_lioness_network=np.column_stack((self.total_lioness_network ,np.fromstring(np.transpose(lioness_network).tostring(),dtype=lioness_network.dtype))) - if output == "network": - self.total_lioness_network = np.transpose(lioness_network).flatten() - elif output == "gene_targeting": - self.total_lioness_network = np.sum(lioness_network, axis=0) - elif output == "tf_targeting": - self.total_lioness_network = np.sum(lioness_network, axis=1) - return self.total_lioness_network - - def save_lioness_results(self): - """ - Description: - Saves LIONESS network. - - Outputs: - file: Path to save the network. + if self.ignore_final: + return(np.array([0])) + else: + if output == "network": + self.total_lioness_network = np.transpose(lioness_network).flatten() + elif output == "gene_targeting": + self.total_lioness_network = np.sum(lioness_network, axis=0) + elif output == "tf_targeting": + self.total_lioness_network = np.sum(lioness_network, axis=1) + return self.total_lioness_network + + def save_lioness_results(self, lioness_output_filename = None): + """ Saves LIONESS network. + Uses self.save_fmt, self.save_dir to save the data + into self.total_lioness_network """ # self.lioness_network.to_csv(file, index=False, header=False, sep="\t") - fullpath = os.path.join(self.save_dir, "lioness.%s" % (self.save_fmt)) - if self.save_fmt == "txt": + if lioness_output_filename: + fullpath = lioness_output_filename + else: + fullpath = os.path.join(self.save_dir, "lioness.%s" % (self.save_fmt)) + + if fullpath.endswith("txt"): np.savetxt( fullpath, np.transpose(self.total_lioness_network), delimiter="\t", - header="", ) - elif self.save_fmt == "npy": + elif fullpath.endswith("csv"): + np.savetxt( + fullpath, + np.transpose(self.total_lioness_network), + delimiter=",", + ) + elif fullpath.endswith("npy"): np.save(fullpath, np.transpose(self.total_lioness_network)) - elif self.save_fmt == "mat": + elif fullpath.endswith("h5"): + print('Warning: saving as txt for now. Need to rewrite this') + np.savetxt(fullpath, + np.transpose(self.total_lioness_network), + delimiter="\t", + ) + elif fullpath.endswith("mat"): from scipy.io import savemat - - savemat(fullpath, np.transpose(self.total_lioness_network)) + mdic = {"results": np.transpose(self.total_lioness_network), "label": "lioness"} + savemat(fullpath, mdic) + else: + print('Trying to save lioness output. Format %s not recognised' %str(fullpath)) return None - def export_lioness_table(self, output_filename="lioness_table.txt", header=False): - """ - Description: + def __lioness_to_disk(self, net, path): + if self.save_fmt == "txt": + np.savetxt(path, net) + elif self.save_fmt == "h5": + pd.DataFrame(data = net, columns=self.gene_names, index = self.tf_names).to_hdf(path, key = 'lioness' ,mode='w') + elif self.save_fmt == 'csv': + pd.DataFrame(data = net, columns=self.gene_names, index = self.tf_names).to_csv(path) + elif self.save_fmt == "npy": + np.save(path, net) + elif self.save_fmt == "mat": + from scipy.io import savemat + savemat(path, {"PredNet": net}) + else: + print("Unknown format %s! Use npy format instead." % self.save_fmt) + np.save(path, net) + + def export_lioness_table(self, output_filename="lioness_table.txt", header=False, output = 'network'): + """ Saves LIONESS network with edge names. This saves a dataframe with the corresponding header and indexes. - So far we - Outputs: - output_filename: Path to save the network. Specify relative path - and format. Choose between .csv, .tsv and .txt. (Defaults to .lioness_table.txt)) + + Parameters + ------------ + output_filename: str + Path to save the network. Specify relative path + and format. Choose between .csv, .tsv and .txt. + (Defaults to .lioness_table.txt)) """ - df = self.export_lioness_results - df = df.sort_values(by=['tf','gene']) - if output_filename.endswith("txt"): - df.to_csv(output_filename, sep=" ", header=False, index = False) - elif output_filename.endswith("csv"): - df.to_csv(output_filename, sep=",", header=False, index = False) - elif output_filename.endswith("tsv"): - df.to_csv(output_filename, sep="\t", header=False, index = False) + # TODO: add case where there is tf_targeting or gene_targeting + if (output=='network'): + # we first get the names of first two columns (tf,gene) or (gene1,gene2) + sort_cols = self.export_lioness_results.columns.tolist()[:2] + if output_filename.endswith("txt"): + self.export_lioness_results.sort_values(by=sort_cols).to_csv(output_filename, sep=" ", index = False) + elif output_filename.endswith("csv"): + self.export_lioness_results.sort_values(by=sort_cols).to_csv(output_filename, sep=",", index = False) + elif output_filename.endswith("tsv"): + self.export_lioness_results.sort_values(by=sort_cols).to_csv(output_filename, sep="\t", index = False) + else: + sys.exit('Export output unknown: use txt/csv/tsv') + + else: + # otherwise we only need one column and we sort by index + if output_filename.endswith("txt"): + self.export_lioness_results.sort_index().to_csv(output_filename, sep=" ") + elif output_filename.endswith("csv"): + self.export_lioness_results.sort_index().to_csv(output_filename, sep=",") + elif output_filename.endswith("tsv"): + self.export_lioness_results.sort_index().to_csv(output_filename, sep="\t") + else: + sys.exit('Export output unknown: use txt/csv/tsv') diff --git a/man/degreeAdjust.Rd b/man/degreeAdjust.Rd new file mode 100644 index 00000000..bce28290 --- /dev/null +++ b/man/degreeAdjust.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SPIDER.R +\name{degreeAdjust} +\alias{degreeAdjust} +\title{Function to adjust the degree so that the hub nodes are not penalized in z-score transformation} +\usage{ +degreeAdjust(A) +} +\arguments{ +\item{A}{Input adjacency matrix} +} +\description{ +Function to adjust the degree so that the hub nodes are not penalized in z-score transformation +} diff --git a/man/dragon.Rd b/man/dragon.Rd index 6b72a603..fdf2aaeb 100644 --- a/man/dragon.Rd +++ b/man/dragon.Rd @@ -4,7 +4,13 @@ \alias{dragon} \title{Run DRAGON in R.} \usage{ -dragon(layer1, layer2, pval = F, gradient = "finite_difference", verbose = F) +dragon( + layer1, + layer2, + pval = FALSE, + gradient = "finite_difference", + verbose = FALSE +) } \arguments{ \item{layer1}{: first layer of omics data; rows: samples (order must match layer2), columns: variables} @@ -13,7 +19,9 @@ dragon(layer1, layer2, pval = F, gradient = "finite_difference", verbose = F) \item{pval}{: calculate p-values for network edges. Not yet implemented in R; available in netZooPy.} -\item{gradient}{: method for estimating parameters of p-value distribution, applies only if p-val == T. default = "finite_difference"; other option = "exact"} +\item{gradient}{: method for estimating parameters of p-value distribution, applies only if p-val == TRUE. default = "finite_difference"; other option = "exact"} + +\item{verbose}{: verbosity level (TRUE/FALSE)} } \value{ A list of model results. cov : the shrunken covariance matrix diff --git a/man/pandaPy.Rd b/man/pandaPy.Rd index d45a09b1..95a10fa8 100644 --- a/man/pandaPy.Rd +++ b/man/pandaPy.Rd @@ -40,6 +40,8 @@ Also, this can be generated with a list of proteins of interest by \code{\link{s \item{modeProcess}{'legacy' refers to the processing mode in netZooPy<=0.5, 'union': takes the union of all TFs and genes across priors and fills the missing genes in the priors with zeros; 'intersection': intersects the input genes and TFs across priors and removes the missing TFs/genes. Default values is 'union'.} \item{remove_missing}{Only when modeProcess='legacy': remove_missing='TRUE' removes all unmatched TF and genes; remove_missing='FALSE' keeps all tf and genes. The default value is 'FALSE'.} + +\item{with_header}{if TRUE reads header of expression matrix} } \value{ When save_memory=FALSE(default), this function will return a list of three items: diff --git a/man/sourcePPI.Rd b/man/sourcePPI.Rd index 36dcc428..7df4f5da 100644 --- a/man/sourcePPI.Rd +++ b/man/sourcePPI.Rd @@ -19,7 +19,9 @@ sourcePPI(TF, STRING.version = "10", species.index, ...) A PPI data.frame which contains three columns: "from" and "to" indicating the direction of protein-protein interaction, and "score" indicating the interaction score between two proteins. } \description{ -This function is able to use a list of Transcription Factors(TF) of interest to source the Protein-Protein interactions (PPI)in STRING database +This function uses a list of Transcription Factors (TF) of interest to source the Protein-Protein interactions (PPI) from STRING database using all types of interactions not only the physical subnetwork +Important: this function produces a simple unweighted network for tutorial purposes, and does not support weighted PPI edges for the moment. +For more complex PPI network modeling, consider pulling the PPI network directly from STRINGdb directly or through their R package. } \examples{ # the example motif file diff --git a/man/spider.Rd b/man/spider.Rd new file mode 100644 index 00000000..2c054860 --- /dev/null +++ b/man/spider.Rd @@ -0,0 +1,97 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SPIDER.R +\name{spider} +\alias{spider} +\title{Seeding PANDA Interactions to Derive Epigenetic Regulation} +\usage{ +spider( + motif, + expr = NULL, + epifilter = NULL, + ppi = NULL, + alpha = 0.1, + hamming = 0.001, + iter = NA, + output = c("regulatory", "coexpression", "cooperative"), + zScale = TRUE, + progress = FALSE, + randomize = c("None", "within.gene", "by.gene"), + cor.method = "pearson", + scale.by.present = FALSE, + edgelist = FALSE, + remove.missing.ppi = FALSE, + remove.missing.motif = FALSE, + remove.missing.genes = FALSE, + mode = "union" +) +} +\arguments{ +\item{motif}{A motif dataset, a data.frame, matrix or exprSet containing 3 columns. +Each row describes an motif associated with a transcription factor (column 1) a +gene (column 2) and a score (column 3) for the motif.} + +\item{expr}{An expression dataset, as a genes (rows) by samples (columns) data.frame} + +\item{epifilter}{A binary matrix that is of the same size as motif that will be used as a mask to filter motif +for open chromatin region. Motif interactions that fall in open chromatin region will be kept and the others are removed.} + +\item{ppi}{A Protein-Protein interaction dataset, a data.frame containing 3 columns. +Each row describes a protein-protein interaction between transcription factor 1(column 1), +transcription factor 2 (column 2) and a score (column 3) for the interaction.} + +\item{alpha}{value to be used for update variable, alpha (default=0.1)} + +\item{hamming}{value at which to terminate the process based on hamming distance (default 10^-3)} + +\item{iter}{sets the maximum number of iterations SPIDER can run before exiting.} + +\item{output}{a vector containing which networks to return. Options include "regulatory", +"coregulatory", "cooperative".} + +\item{zScale}{Boolean to indicate use of z-scores in output. False will use [0,1] scale.} + +\item{progress}{Boolean to indicate printing of output for algorithm progress.} + +\item{randomize}{method by which to randomize gene expression matrix. Default "None". Must +be one of "None", "within.gene", "by.genes". "within.gene" randomization scrambles each row +of the gene expression matrix, "by.gene" scrambles gene labels.} + +\item{cor.method}{Correlation method, default is "pearson".} + +\item{scale.by.present}{Boolean to indicate scaling of correlations by percentage of positive samples.} + +\item{edgelist}{Boolean to indicate if edge lists instead of matrices should be returned.} + +\item{remove.missing.ppi}{Boolean to indicate whether TFs in the PPI but not in the motif data should be +removed. Only when mode=='legacy'.} + +\item{remove.missing.motif}{Boolean to indicate whether genes targeted in the motif data but not the +expression data should be removed. Only when mode=='legacy'.} + +\item{remove.missing.genes}{Boolean to indicate whether genes in the expression data but lacking +information from the motif prior should be removed. Only when mode=='legacy'.} + +\item{mode}{The data alignment mode. The mode 'union' takes the union of the genes in the expression matrix and the motif +and the union of TFs in the ppi and motif and fills the matrics with zeros for nonintersecting TFs and gens, 'intersection' +takes the intersection of genes and TFs and removes nonintersecting sets, 'legacy' is the old behavior with version 1.19.3. +#' Parameters remove.missing.ppi, remove.missingmotif, remove.missing.genes work only with mode=='legacy'.} +} +\value{ +An object of class "panda" containing matrices describing networks achieved by convergence +with SPIDER algorithm.\cr +"regNet" is the regulatory network\cr +"coregNet" is the coregulatory network\cr +"coopNet" is the cooperative network +} +\description{ +This function runs the SPIDER algorithm +} +\examples{ +data(pandaToyData) +spiderRes <- spider(pandaToyData$motif, pandaToyData$epifilter, + pandaToyData$expression,pandaToyData$ppi,hamming=.1,progress=TRUE) +} +\references{ +Sonawane, Abhijeet Rajendra, et al. "Constructing gene regulatory networks using epigenetic data." npj Systems Biology and Applications 7.1 (2021): 1-13. +} +\keyword{keywords} diff --git a/tests/testthat/test-monster.R b/tests/testthat/test-monster.R index 8d32f5d5..f7d9e306 100644 --- a/tests/testthat/test-monster.R +++ b/tests/testthat/test-monster.R @@ -16,7 +16,7 @@ test_that("MONSTER function works", { expect_equal(monsterTransformationMatrix(cc.net.1, cc.net.2), monsterTM, tolerance = 3e-3) # analyzes a bi-partite network by monsterTransformationMatrix() function with method "kabsch". - expect_equal(monsterTransformationMatrix(cc.net.1, cc.net.2,method = "kabsch"), monsterTM_kabsch, tolerance = 3e-3) + #expect_equal(monsterTransformationMatrix(cc.net.1, cc.net.2,method = "kabsch"), monsterTM_kabsch, tolerance = 3e-3) # to do: Add L1 method in test diff --git a/vignettes/pandaRApplicationinGTExData.Rmd b/vignettes/pandaRApplicationinGTExData.Rmd index b2b104f0..f0913e47 100644 --- a/vignettes/pandaRApplicationinGTExData.Rmd +++ b/vignettes/pandaRApplicationinGTExData.Rmd @@ -58,7 +58,7 @@ More details can be found in the published paper https://doi.org/10.1371/journal ## Building a PANDA regulatory network Now we locate our ppi and motif priors. The ppi represents physical interactions between transcription factor proteins, and is an undirected network. The transcription factor motif prior represents putative regulation events where a transcription factor binds in the promotor of a gene to regulate its expression, as predicted by the presence of transcription factor binding motifs in the promotor region of the gene. The motif prior is thus a directed network linking transcription factors to their predicted gene targets. These are small example priors for the purposes of demonstrating this method. A complete set of motif priors by species can be downloaded from: https://sites.google.com/a/channing.harvard.edu/kimberlyglass/tools/resources -The function source.PPI can be used to source the protein-protein interaction in the STRING database. +The function sourcePPI() can be used to source the protein-protein interaction network from the STRING database, but it does so only to produce an unweighted network that aggregates all types of interactions not only the physical interaction subnetwork. Please refer to STRINGdb website or R package to specify a PPI network suited for your application. Let's take a look at the priors: