1.2.1 (#271)

* add bioc information * update website * add crane and docs * update crane * update worflow * update crane * update workflows * update workflows * update workfolw * update workfolw * update workfolw * update workfolw * Update main.yml * Update main.yml * Update main.yml * fix lioness py * udpate lioness in R * include calculations python * update panda rm missing * update panda * update lioness joblib * revert commits * ALPACA fix to NAs in community assignement * version bump * revert lioness * update crane with alpaca changes * update lioness * update lioness python * fix test * update workflows * put back github actions * Fixed typo * Added LIONESS-pearson option + unit test * changed param labels in LIONESS example * updated docs for lioness() function * Update main.yml * Update main.yml * Update DESCRIPTION * fix crane issue #240 * expanded PANDA and LIONESS description * update crane fix * added ALPACA, EGRET, and OTTER descriptions * Update README.md * typo in na.rm * update workflow * update workflow2 * fix-actions-3 * add r lib v2 * add r lib v2 * update condor test * update condor test * update condor test * update condor test * show testthat output * add pandoc and use cache * use built in check * use built in check * update actions with built in check * bump version * use built in coverage * use built in coverage * remove sudo * update actions * update actions * removed sudo from actions * ugrade to R42 * remove virtualenv * remove virtualenv * remove virtualenv * modify condor test * update condor test * update condor test * adding retiucualte to covr * adding retiucualte to covr * add panda test to coverignore * update actions * remove lioness tests from coverage * remove lioness tests from coverage * added MONSTER and SAMBAR * remove lionessR test * Added CONDOR and YARN * Update README.md * put back otter in rbuild * remove condor test * remove condor test * remove lioness docs * add lioness to buildingore * donttest lionesss * remove coverage from ubuntu test * remove rbuilignore * update rbuildignore * update rbuildignore * update rbuildignore * update rbuildignore * update rbuildignore * add back buildignore * add back buildignore * add back buildignore * add back buildignore * add back buildignore * add back buildignore * add back buildignore * add back buildignore * add back buildignore * add back buildignore * put back tests * add padanedge diff test * add back egeret test * add monster test * fix monster test * tolerance in expect equal * tolerance in expect equal * tolerance in expect equal * tolerance in expect equal * add sourceppi * fixed PANDA tests * modify panda test * update lioness test * fix expect message * update tests * update tests * update tests * update tests * update tests * update tests * reduced lioness test data * reduced lioness test data * reduced lioness test data * reduced lioness test data * reduce test size * updating bioc (#250) * updating bioc * remove empty lines * remove empty lines * update citation * update citation * Update README.md * Update README.md * Update main.yml * Update main.yml * Update .Rbuildignore * Update .Rbuildignore * update lioness test * removed two last lioness tests * removed two last lioness tests * removed two last lioness tests * no saving of lioness networks * no saving of lioness networks * no saving of lioness networks * no saving of lioness networks * update lioness test * lower sizeof test * update test data * update test data * update test data * reduce panda tests * remove old test data * remove old test data * Update main.yml * 1.1.16 update pandapy with header argument (#260) * trying collapse readme (#261) * trying collapse readme * collapse animal description * Update README.md * update biconda (#265) * add spider (#269) * add spider * udpate spider * udpate spider * update spuder with degreeadjust * fix spider bug Co-authored-by: Kate Hoff Shutta <kshutta@umass.edu> Co-authored-by: katehoffshutta <43797774+katehoffshutta@users.noreply.github.com>
netZoo · Dec 19, 2022 · 7c5b781 · 7c5b781
1 parent e36310c
commit 7c5b781
Show file tree

Hide file tree

Showing 119 changed files with 19,564 additions and 889 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: netZooR
 Type: Package
 Title: Unified methods for the inference and analysis of gene regulatory networks
-Version: 1.1.15
+Version: 1.2.1
 Date: 2022-07-07
 Authors@R: c(person("Marouen", "Ben Guebila",
 			email = "benguebila@hsph.harvard.edu", role = c("aut","cre"), comment = c(ORCID = "0000-0001-5934-966X")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -31,6 +31,7 @@ export(monsterTransitionNetworkPlot)
 export(monsterTransitionPCAPlot)
 export(monsterdTFIPlot)
 export(otter)
+export(pandaDiffEdges)
 export(pandaPy)
 export(pandaToAlpaca)
 export(pandaToCondorObject)

diff --git a/R/PANDA.R b/R/PANDA.R
@@ -65,7 +65,7 @@
 #'
 
 
-pandaPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu", precision="double",save_memory=FALSE, save_tmp=TRUE, keep_expression_matrix=FALSE, modeProcess="union", remove_missing=FALSE){
+pandaPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu", precision="double",save_memory=FALSE, save_tmp=TRUE, keep_expression_matrix=FALSE, modeProcess="union", remove_missing=FALSE, with_header=FALSE){
 
   if(missing(expr_file)){
     stop("Please provide the path of gene expression data file to 'expr_file' variable") }
@@ -106,6 +106,13 @@ pandaPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu",
     keepexpression.str <- "keep_expression_matrix=True"
   } else{ keepexpression.str <- "keep_expression_matrix=False" }
 
+  # with header option
+  if(with_header==FALSE){
+    withheader.str <- "with_header=False"
+  }else if (with_header==TRUE){
+    withheader.str <- "with_header=True"   
+  }
+
   # when pre-processing mode is legacy
   if(modeProcess == "legacy"){
 
@@ -130,7 +137,9 @@ pandaPy <- function(expr_file, motif_file=NULL, ppi_file=NULL, computing="cpu",
   reticulate::source_python(pandapath,convert = TRUE)
 
   # invoke Python script to create a Panda object
-  obj.str <-  paste("panda_obj=Panda(", expr.str, ",", motif.str,",", ppi.str, ",", computing.str, ",", precision.str, ",", savememory.str, ",", savetmp.str, "," , keepexpression.str, ",",  mode.str, ")", sep ='')
+  obj.str <-  paste("panda_obj=Panda(", expr.str, ",", motif.str,",", ppi.str, ",", 
+                    computing.str, ",", precision.str, ",", savememory.str, ",", savetmp.str, "," , 
+                    keepexpression.str, ",",  mode.str, "," , withheader.str, ")", sep ='')
 
   # run Python code
   py_run_string(obj.str)

diff --git a/R/SPIDER.R b/R/SPIDER.R
@@ -0,0 +1,308 @@
+#' Seeding PANDA Interactions to Derive Epigenetic Regulation
+#'
+#' This function runs the SPIDER algorithm
+#'
+#' @param motif A motif dataset, a data.frame, matrix or exprSet containing 3 columns.
+#' Each row describes an motif associated with a transcription factor (column 1) a
+#' gene (column 2) and a score (column 3) for the motif.
+#' @param epifilter A binary matrix that is of the same size as motif that will be used as a mask to filter motif
+#' for open chromatin region. Motif interactions that fall in open chromatin region will be kept and the others are removed.
+#' @param expr An expression dataset, as a genes (rows) by samples (columns) data.frame
+#' @param ppi A Protein-Protein interaction dataset, a data.frame containing 3 columns.
+#' Each row describes a protein-protein interaction between transcription factor 1(column 1),
+#' transcription factor 2 (column 2) and a score (column 3) for the interaction.
+#' @param alpha value to be used for update variable, alpha (default=0.1)
+#' @param hamming value at which to terminate the process based on hamming distance (default 10^-3)
+#' @param iter sets the maximum number of iterations SPIDER can run before exiting.
+#' @param progress Boolean to indicate printing of output for algorithm progress.
+#' @param output a vector containing which networks to return.  Options include "regulatory",
+#' "coregulatory", "cooperative".
+#' @param zScale Boolean to indicate use of z-scores in output.  False will use [0,1] scale.
+#' @param randomize method by which to randomize gene expression matrix.  Default "None".  Must
+#' be one of "None", "within.gene", "by.genes".  "within.gene" randomization scrambles each row
+#' of the gene expression matrix, "by.gene" scrambles gene labels.
+#' @param cor.method Correlation method, default is "pearson".
+#' @param scale.by.present Boolean to indicate scaling of correlations by percentage of positive samples.
+#' @param remove.missing.ppi Boolean to indicate whether TFs in the PPI but not in the motif data should be
+#' removed. Only when mode=='legacy'.
+#' @param remove.missing.motif Boolean to indicate whether genes targeted in the motif data but not the
+#' expression data should be removed. Only when mode=='legacy'.
+#' @param remove.missing.genes Boolean to indicate whether genes in the expression data but lacking
+#' information from the motif prior should be removed. Only when mode=='legacy'.
+#' @param edgelist Boolean to indicate if edge lists instead of matrices should be returned. 
+#' @param mode The data alignment mode. The mode 'union' takes the union of the genes in the expression matrix and the motif
+#' and the union of TFs in the ppi and motif and fills the matrics with zeros for nonintersecting TFs and gens, 'intersection' 
+#' takes the intersection of genes and TFs and removes nonintersecting sets, 'legacy' is the old behavior with version 1.19.3.
+#' #' Parameters remove.missing.ppi, remove.missingmotif, remove.missing.genes work only with mode=='legacy'.
+#' @keywords keywords
+#' @importFrom matrixStats rowSds
+#' @importFrom matrixStats colSds
+#' @importFrom Biobase assayData
+#' @importFrom reshape melt.array
+#' @export
+#' @return An object of class "panda" containing matrices describing networks achieved by convergence
+#' with SPIDER algorithm.\cr
+#' "regNet" is the regulatory network\cr
+#' "coregNet" is the coregulatory network\cr
+#' "coopNet" is the cooperative network
+#' @examples
+#' data(pandaToyData)
+#' spiderRes <- spider(pandaToyData$motif, pandaToyData$epifilter
+#'            pandaToyData$expression,pandaToyData$ppi,hamming=.1,progress=TRUE)
+#' @references
+#' Sonawane, Abhijeet Rajendra, et al. "Constructing gene regulatory networks using epigenetic data." npj Systems Biology and Applications 7.1 (2021): 1-13.
+spider <- function(motif,expr=NULL,epifilter=NULL,ppi=NULL,alpha=0.1,hamming=0.001,
+                  iter=NA,output=c('regulatory','coexpression','cooperative'),
+                  zScale=TRUE,progress=FALSE,randomize=c("None", "within.gene", "by.gene"),cor.method="pearson",
+                  scale.by.present=FALSE,edgelist=FALSE,remove.missing.ppi=FALSE,
+                  remove.missing.motif=FALSE,remove.missing.genes=FALSE,mode="union"){
+
+  randomize <- match.arg(randomize)  
+  if(progress)
+    print('Initializing and validating')
+
+  if(epifilter[c(1,2),] != motif[c(1,2),]){
+    stop('Chromatin accessibility data does not match motif data size and order.')
+  }
+
+  if(class(expr)=="ExpressionSet")
+    expr <- assayData(expr)[["exprs"]]
+
+  if (is.null(expr)){
+    # Use only the motif data here for the gene list
+    num.conditions <- 0
+    if (randomize!="None"){
+      warning("Randomization ignored because gene expression is not used.")
+      randomize <- "None"
+    }
+  } else {
+    if(mode=='legacy'){
+      if(remove.missing.genes){
+        # remove genes from expression data that are not in the motif data
+        n <- nrow(expr)
+        expr <- expr[which(rownames(expr)%in%motif[,2]),]
+        message(sprintf("%s genes removed that were not present in motif", n-nrow(expr)))
+      }
+      if(remove.missing.motif){
+        # remove genes from motif data that are not in the expression data
+        n <- nrow(motif)
+        motif <- motif[which(motif[,2]%in%rownames(expr)),]
+        epifilter <- epifilter[which(motif[,2]%in%rownames(expr)),]
+        message(sprintf("%s motif edges removed that targeted genes missing in expression data", n-nrow(motif)))
+      }
+      # Use the motif data AND the expr data (if provided) for the gene list
+      # Keep everything sorted alphabetically
+      expr <- expr[order(rownames(expr)),]
+    }else if(mode=='union'){
+      gene.names=unique(union(rownames(expr),unique(motif[,2])))
+      tf.names  =unique(union(unique(ppi[,1]),unique(motif[,1])))
+      num.TFs    <- length(tf.names)
+      num.genes  <- length(gene.names)
+      # gene expression matrix
+      expr1=as.data.frame(matrix(0,num.genes,ncol(expr)))
+      rownames(expr1)=gene.names
+      expr1[which(gene.names%in%rownames(expr)),]=expr[]
+      expr=expr1
+      #PPI matrix
+      tfCoopNetwork <- matrix(0,num.TFs,num.TFs)
+      colnames(tfCoopNetwork)=tf.names
+      rownames(tfCoopNetwork)=tf.names
+      Idx1 <- match(ppi[,1], tf.names);
+      Idx2 <- match(ppi[,2], tf.names);
+      Idx <- (Idx2-1)*num.TFs+Idx1;
+      tfCoopNetwork[Idx] <- ppi[,3];
+      Idx <- (Idx1-1)*num.TFs+Idx2;
+      tfCoopNetwork[Idx] <- ppi[,3];
+      #Motif matrix
+      regulatoryNetwork=matrix(0,num.TFs,num.genes)
+      colnames(regulatoryNetwork)=gene.names
+      rownames(regulatoryNetwork)=tf.names
+      Idx1=match(motif[,1], tf.names);
+      Idx2=match(motif[,2], gene.names);
+      Idx=(Idx2-1)*num.TFs+Idx1;
+      regulatoryNetwork[Idx]=motif[,3]*epifilter[,3]
+    }else if(mode=='intersection'){
+      gene.names=unique(intersect(rownames(expr),unique(motif[,2])))
+      tf.names  =unique(intersect(unique(ppi[,1]),unique(motif[,1])))
+      num.TFs    <- length(tf.names)
+      num.genes  <- length(gene.names)
+      # gene expression matrix
+      expr1=as.data.frame(matrix(0,num.genes,ncol(expr)))
+      rownames(expr1)=gene.names
+      interGeneNames=gene.names[which(gene.names%in%rownames(expr))]
+      expr1[interGeneNames,]=expr[interGeneNames,]
+      expr=expr1
+      #PPI matrix
+      tfCoopNetwork <- matrix(0,num.TFs,num.TFs)
+      colnames(tfCoopNetwork)=tf.names
+      rownames(tfCoopNetwork)=tf.names
+      Idx1 <- match(ppi[,1], tf.names);
+      Idx2 <- match(ppi[,2], tf.names);
+      Idx <- (Idx2-1)*num.TFs+Idx1;
+      indIdx=!is.na(Idx)
+      Idx=Idx[indIdx] #remove missing TFs
+      tfCoopNetwork[Idx] <- ppi[indIdx,3];
+      Idx <- (Idx1-1)*num.TFs+Idx2;
+      indIdx=!is.na(Idx)
+      Idx=Idx[indIdx] #remove missing TFs
+      tfCoopNetwork[Idx] <- ppi[indIdx,3];
+      #Motif matrix
+      regulatoryNetwork=matrix(0,num.TFs,num.genes)
+      colnames(regulatoryNetwork)=gene.names
+      rownames(regulatoryNetwork)=tf.names
+      Idx1=match(motif[,1], tf.names);
+      Idx2=match(motif[,2], gene.names);
+      Idx=(Idx2-1)*num.TFs+Idx1;
+      indIdx=!is.na(Idx)
+      Idx=Idx[indIdx] #remove missing genes
+      regulatoryNetwork[Idx]=motif[indIdx,3]*epifilter[indIdx,3];          
+    }
+    num.conditions <- ncol(expr)
+    if (randomize=='within.gene'){
+      expr <- t(apply(expr, 1, sample))
+      if(progress)
+        print("Randomizing by reordering each gene's expression")
+    } else if (randomize=='by.gene'){
+      rownames(expr) <- sample(rownames(expr))
+      expr           <- expr[order(rownames(expr)),]
+      if(progress)
+        print("Randomizing by reordering each gene labels")
+    }
+  }
+
+  if (mode=='legacy'){
+    # Create vectors for TF names and Gene names from motif dataset
+    tf.names   <- sort(unique(motif[,1]))
+    gene.names <- sort(unique(rownames(expr)))
+    num.TFs    <- length(tf.names)
+    num.genes  <- length(gene.names)
+  }
+
+  # Bad data checking
+  if (num.genes==0){
+    stop("Error validating data.  No matched genes.\n  Please ensure that gene names in expression data match gene names in motif data")
+  }
+
+  if(num.conditions==0) {
+    warning('No expression data given.  SPIDER will run based on an identity co-regulation matrix')
+    geneCoreg <- diag(num.genes)
+  } else if(num.conditions<3) {
+    warning('Not enough expression conditions detected to calculate correlation. Co-regulation network will be initialized to an identity matrix.')
+    geneCoreg <- diag(num.genes)
+  } else {
+
+    if(scale.by.present){
+      num.positive=(expr>0)%*%t((expr>0))
+      geneCoreg <- cor(t(expr), method=cor.method, use="pairwise.complete.obs")*(num.positive/num.conditions)
+    } else {
+      geneCoreg <- cor(t(expr), method=cor.method, use="pairwise.complete.obs")
+    }
+    if(progress)
+      print('Verified sufficient samples')
+  }
+  if (any(is.na(geneCoreg))){ #check for NA and replace them by zero
+    diag(geneCoreg)=1
+    geneCoreg[is.na(geneCoreg)]=0
+  }
+
+  if (any(duplicated(motif))) {
+    warning("Duplicate edges have been found in the motif data. Weights will be summed.")
+    motif <- aggregate(motif[,3], by=list(motif[,1], motif[,2]), FUN=sum)
+  }
+
+  # Prior Regulatory Network
+  if(mode=='legacy'){
+    Idx1=match(motif[,1], tf.names);
+    Idx2=match(motif[,2], gene.names);
+    Idx=(Idx2-1)*num.TFs+Idx1;
+    regulatoryNetwork=matrix(data=0, num.TFs, num.genes);
+    regulatoryNetwork[Idx]=motif[,3]
+    colnames(regulatoryNetwork) <- gene.names
+    rownames(regulatoryNetwork) <- tf.names
+    # PPI data
+    # If no ppi data is given, we use the identity matrix
+    tfCoopNetwork <- diag(num.TFs)
+    # Else we convert our two-column data.frame to a matrix
+    if (!is.null(ppi)){
+      if(any(duplicated(ppi))){
+        warning("Duplicate edges have been found in the PPI data. Weights will be summed.")
+        ppi <- aggregate(ppi[,3], by=list(ppi[,1], ppi[,2]), FUN=sum)
+      }
+      if(remove.missing.ppi){
+        # remove edges in the PPI data that target TFs not in the motif
+        n <- nrow(ppi)
+        ppi <- ppi[which(ppi[,1]%in%tf.names & ppi[,2]%in%tf.names),]
+        message(sprintf("%s PPI edges removed that were not present in motif", n-nrow(ppi)))
+      }
+      Idx1 <- match(ppi[,1], tf.names);
+      Idx2 <- match(ppi[,2], tf.names);
+      Idx <- (Idx2-1)*num.TFs+Idx1;
+      tfCoopNetwork[Idx] <- ppi[,3];
+      Idx <- (Idx1-1)*num.TFs+Idx2;
+      tfCoopNetwork[Idx] <- ppi[,3];
+    }
+    colnames(tfCoopNetwork) <- tf.names
+    rownames(tfCoopNetwork) <- tf.names
+  }
+
+  ## Run SPIDER ##
+  tic=proc.time()[3]
+
+  # adjusting degree distribution
+  regulatoryNetwork = degreeAdjust(regulatoryNetwork)
+
+  if(progress)
+    print('Normalizing networks...')
+  regulatoryNetwork = normalizeNetwork(regulatoryNetwork)
+  tfCoopNetwork     = normalizeNetwork(tfCoopNetwork)
+  geneCoreg         = normalizeNetwork(geneCoreg)
+
+  if(progress)
+    print('Learning Network...')
+
+  minusAlpha = 1-alpha
+  step=0
+  hamming_cur=1
+  if(progress)
+    print("Using tanimoto similarity")
+  while(hamming_cur>hamming){
+    if ((!is.na(iter))&&step>=iter){
+      print(paste("Reached maximum iterations, iter =",iter),sep="")
+      break
+    }
+    Responsibility=tanimoto(tfCoopNetwork, regulatoryNetwork)
+    Availability=tanimoto(regulatoryNetwork, geneCoreg)
+    RA = 0.5*(Responsibility+Availability)
+
+    hamming_cur=sum(abs(regulatoryNetwork-RA))/(num.TFs*num.genes)
+    regulatoryNetwork=minusAlpha*regulatoryNetwork + alpha*RA
+
+    ppi=tanimoto(regulatoryNetwork, t(regulatoryNetwork))
+    ppi=update.diagonal(ppi, num.TFs, alpha, step)
+    tfCoopNetwork=minusAlpha*tfCoopNetwork + alpha*ppi
+
+    CoReg2=tanimoto(t(regulatoryNetwork), regulatoryNetwork)
+    CoReg2=update.diagonal(CoReg2, num.genes, alpha, step)
+    geneCoreg=minusAlpha*geneCoreg + alpha*CoReg2
+
+    if(progress)
+      message("Iteration", step,": hamming distance =", round(hamming_cur,5))
+    step=step+1
+  }
+
+  toc=proc.time()[3] - tic
+  if(progress)
+    message("Successfully ran SPIDER on ", num.genes, " Genes and ", num.TFs, " TFs.\nTime elapsed:", round(toc,2), "seconds.")
+  prepResult(zScale, output, regulatoryNetwork, geneCoreg, tfCoopNetwork, edgelist, motif)
+}
+
+#' Function to adjust the degree so that the hub nodes are not penalized in z-score transformation
+#'
+#' @param A Input adjacency matrix
+degreeAdjust <- function(A){
+  k1 <- colSums(A)/dim(A,1)
+  k2 <- rowSums(A)/dim(A,2)
+  B <- (matrix(replicate(dim(A,2),k1),nrow=dim(A,1)))^2
+  B <- B + (matrix(t(replicate(dim(A,2),k2)),nrow=dim(A,1)))^2
+  A <- A * sqrt(B);
+}