Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add spider #269

Merged
merged 5 commits into from
Dec 15, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
308 changes: 308 additions & 0 deletions R/SPIDER.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
#' Seeding PANDA Interactions to Derive Epigenetic Regulation
#'
#' This function runs the SPIDER algorithm
#'
#' @param motif A motif dataset, a data.frame, matrix or exprSet containing 3 columns.
#' Each row describes an motif associated with a transcription factor (column 1) a
#' gene (column 2) and a score (column 3) for the motif.
#' @param epifilter A binary matrix that is of the same size as motif that will be used as a mask to filter motif
#' for open chromatin region. Motif interactions that fall in open chromatin region will be kept and the others are removed.
#' @param expr An expression dataset, as a genes (rows) by samples (columns) data.frame
#' @param ppi A Protein-Protein interaction dataset, a data.frame containing 3 columns.
#' Each row describes a protein-protein interaction between transcription factor 1(column 1),
#' transcription factor 2 (column 2) and a score (column 3) for the interaction.
#' @param alpha value to be used for update variable, alpha (default=0.1)
#' @param hamming value at which to terminate the process based on hamming distance (default 10^-3)
#' @param iter sets the maximum number of iterations SPIDER can run before exiting.
#' @param progress Boolean to indicate printing of output for algorithm progress.
#' @param output a vector containing which networks to return. Options include "regulatory",
#' "coregulatory", "cooperative".
#' @param zScale Boolean to indicate use of z-scores in output. False will use [0,1] scale.
#' @param randomize method by which to randomize gene expression matrix. Default "None". Must
#' be one of "None", "within.gene", "by.genes". "within.gene" randomization scrambles each row
#' of the gene expression matrix, "by.gene" scrambles gene labels.
#' @param cor.method Correlation method, default is "pearson".
#' @param scale.by.present Boolean to indicate scaling of correlations by percentage of positive samples.
#' @param remove.missing.ppi Boolean to indicate whether TFs in the PPI but not in the motif data should be
#' removed. Only when mode=='legacy'.
#' @param remove.missing.motif Boolean to indicate whether genes targeted in the motif data but not the
#' expression data should be removed. Only when mode=='legacy'.
#' @param remove.missing.genes Boolean to indicate whether genes in the expression data but lacking
#' information from the motif prior should be removed. Only when mode=='legacy'.
#' @param edgelist Boolean to indicate if edge lists instead of matrices should be returned.
#' @param mode The data alignment mode. The mode 'union' takes the union of the genes in the expression matrix and the motif
#' and the union of TFs in the ppi and motif and fills the matrics with zeros for nonintersecting TFs and gens, 'intersection'
#' takes the intersection of genes and TFs and removes nonintersecting sets, 'legacy' is the old behavior with version 1.19.3.
#' #' Parameters remove.missing.ppi, remove.missingmotif, remove.missing.genes work only with mode=='legacy'.
#' @keywords keywords
#' @importFrom matrixStats rowSds
#' @importFrom matrixStats colSds
#' @importFrom Biobase assayData
#' @importFrom reshape melt.array
#' @export
#' @return An object of class "panda" containing matrices describing networks achieved by convergence
#' with SPIDER algorithm.\cr
#' "regNet" is the regulatory network\cr
#' "coregNet" is the coregulatory network\cr
#' "coopNet" is the cooperative network
#' @examples
#' data(pandaToyData)
#' spiderRes <- spider(pandaToyData$motif, pandaToyData$epifilter
#' pandaToyData$expression,pandaToyData$ppi,hamming=.1,progress=TRUE)
#' @references
#' Sonawane, Abhijeet Rajendra, et al. "Constructing gene regulatory networks using epigenetic data." npj Systems Biology and Applications 7.1 (2021): 1-13.
spider <- function(motif,expr=NULL,epifilter=NULL,ppi=NULL,alpha=0.1,hamming=0.001,
iter=NA,output=c('regulatory','coexpression','cooperative'),
zScale=TRUE,progress=FALSE,randomize=c("None", "within.gene", "by.gene"),cor.method="pearson",
scale.by.present=FALSE,edgelist=FALSE,remove.missing.ppi=FALSE,
remove.missing.motif=FALSE,remove.missing.genes=FALSE,mode="union"){

randomize <- match.arg(randomize)
if(progress)
print('Initializing and validating')

if(epifilter[c(1,2),] != motif[c(1,2),]){
stop('Chromatin accessibility data does not match motif data size and order.')
}

if(class(expr)=="ExpressionSet")
expr <- assayData(expr)[["exprs"]]

if (is.null(expr)){
# Use only the motif data here for the gene list
num.conditions <- 0
if (randomize!="None"){
warning("Randomization ignored because gene expression is not used.")
randomize <- "None"
}
} else {
if(mode=='legacy'){
if(remove.missing.genes){
# remove genes from expression data that are not in the motif data
n <- nrow(expr)
expr <- expr[which(rownames(expr)%in%motif[,2]),]
message(sprintf("%s genes removed that were not present in motif", n-nrow(expr)))
}
if(remove.missing.motif){
# remove genes from motif data that are not in the expression data
n <- nrow(motif)
motif <- motif[which(motif[,2]%in%rownames(expr)),]
epifilter <- epifilter[which(motif[,2]%in%rownames(expr)),]
message(sprintf("%s motif edges removed that targeted genes missing in expression data", n-nrow(motif)))
}
# Use the motif data AND the expr data (if provided) for the gene list
# Keep everything sorted alphabetically
expr <- expr[order(rownames(expr)),]
}else if(mode=='union'){
gene.names=unique(union(rownames(expr),unique(motif[,2])))
tf.names =unique(union(unique(ppi[,1]),unique(motif[,1])))
num.TFs <- length(tf.names)
num.genes <- length(gene.names)
# gene expression matrix
expr1=as.data.frame(matrix(0,num.genes,ncol(expr)))
rownames(expr1)=gene.names
expr1[which(gene.names%in%rownames(expr)),]=expr[]
expr=expr1
#PPI matrix
tfCoopNetwork <- matrix(0,num.TFs,num.TFs)
colnames(tfCoopNetwork)=tf.names
rownames(tfCoopNetwork)=tf.names
Idx1 <- match(ppi[,1], tf.names);
Idx2 <- match(ppi[,2], tf.names);
Idx <- (Idx2-1)*num.TFs+Idx1;
tfCoopNetwork[Idx] <- ppi[,3];
Idx <- (Idx1-1)*num.TFs+Idx2;
tfCoopNetwork[Idx] <- ppi[,3];
#Motif matrix
regulatoryNetwork=matrix(0,num.TFs,num.genes)
colnames(regulatoryNetwork)=gene.names
rownames(regulatoryNetwork)=tf.names
Idx1=match(motif[,1], tf.names);
Idx2=match(motif[,2], gene.names);
Idx=(Idx2-1)*num.TFs+Idx1;
regulatoryNetwork[Idx]=motif[,3]*epifilter[,3]
}else if(mode=='intersection'){
gene.names=unique(intersect(rownames(expr),unique(motif[,2])))
tf.names =unique(intersect(unique(ppi[,1]),unique(motif[,1])))
num.TFs <- length(tf.names)
num.genes <- length(gene.names)
# gene expression matrix
expr1=as.data.frame(matrix(0,num.genes,ncol(expr)))
rownames(expr1)=gene.names
interGeneNames=gene.names[which(gene.names%in%rownames(expr))]
expr1[interGeneNames,]=expr[interGeneNames,]
expr=expr1
#PPI matrix
tfCoopNetwork <- matrix(0,num.TFs,num.TFs)
colnames(tfCoopNetwork)=tf.names
rownames(tfCoopNetwork)=tf.names
Idx1 <- match(ppi[,1], tf.names);
Idx2 <- match(ppi[,2], tf.names);
Idx <- (Idx2-1)*num.TFs+Idx1;
indIdx=!is.na(Idx)
Idx=Idx[indIdx] #remove missing TFs
tfCoopNetwork[Idx] <- ppi[indIdx,3];
Idx <- (Idx1-1)*num.TFs+Idx2;
indIdx=!is.na(Idx)
Idx=Idx[indIdx] #remove missing TFs
tfCoopNetwork[Idx] <- ppi[indIdx,3];
#Motif matrix
regulatoryNetwork=matrix(0,num.TFs,num.genes)
colnames(regulatoryNetwork)=gene.names
rownames(regulatoryNetwork)=tf.names
Idx1=match(motif[,1], tf.names);
Idx2=match(motif[,2], gene.names);
Idx=(Idx2-1)*num.TFs+Idx1;
indIdx=!is.na(Idx)
Idx=Idx[indIdx] #remove missing genes
regulatoryNetwork[Idx]=motif[indIdx,3]*epifilter[indIdx,3];
}
num.conditions <- ncol(expr)
if (randomize=='within.gene'){
expr <- t(apply(expr, 1, sample))
if(progress)
print("Randomizing by reordering each gene's expression")
} else if (randomize=='by.gene'){
rownames(expr) <- sample(rownames(expr))
expr <- expr[order(rownames(expr)),]
if(progress)
print("Randomizing by reordering each gene labels")
}
}

if (mode=='legacy'){
# Create vectors for TF names and Gene names from motif dataset
tf.names <- sort(unique(motif[,1]))
gene.names <- sort(unique(rownames(expr)))
num.TFs <- length(tf.names)
num.genes <- length(gene.names)
}

# Bad data checking
if (num.genes==0){
stop("Error validating data. No matched genes.\n Please ensure that gene names in expression data match gene names in motif data")
}

if(num.conditions==0) {
warning('No expression data given. SPIDER will run based on an identity co-regulation matrix')
geneCoreg <- diag(num.genes)
} else if(num.conditions<3) {
warning('Not enough expression conditions detected to calculate correlation. Co-regulation network will be initialized to an identity matrix.')
geneCoreg <- diag(num.genes)
} else {

if(scale.by.present){
num.positive=(expr>0)%*%t((expr>0))
geneCoreg <- cor(t(expr), method=cor.method, use="pairwise.complete.obs")*(num.positive/num.conditions)
} else {
geneCoreg <- cor(t(expr), method=cor.method, use="pairwise.complete.obs")
}
if(progress)
print('Verified sufficient samples')
}
if (any(is.na(geneCoreg))){ #check for NA and replace them by zero
diag(geneCoreg)=1
geneCoreg[is.na(geneCoreg)]=0
}

if (any(duplicated(motif))) {
warning("Duplicate edges have been found in the motif data. Weights will be summed.")
motif <- aggregate(motif[,3], by=list(motif[,1], motif[,2]), FUN=sum)
}

# Prior Regulatory Network
if(mode=='legacy'){
Idx1=match(motif[,1], tf.names);
Idx2=match(motif[,2], gene.names);
Idx=(Idx2-1)*num.TFs+Idx1;
regulatoryNetwork=matrix(data=0, num.TFs, num.genes);
regulatoryNetwork[Idx]=motif[,3]
colnames(regulatoryNetwork) <- gene.names
rownames(regulatoryNetwork) <- tf.names
# PPI data
# If no ppi data is given, we use the identity matrix
tfCoopNetwork <- diag(num.TFs)
# Else we convert our two-column data.frame to a matrix
if (!is.null(ppi)){
if(any(duplicated(ppi))){
warning("Duplicate edges have been found in the PPI data. Weights will be summed.")
ppi <- aggregate(ppi[,3], by=list(ppi[,1], ppi[,2]), FUN=sum)
}
if(remove.missing.ppi){
# remove edges in the PPI data that target TFs not in the motif
n <- nrow(ppi)
ppi <- ppi[which(ppi[,1]%in%tf.names & ppi[,2]%in%tf.names),]
message(sprintf("%s PPI edges removed that were not present in motif", n-nrow(ppi)))
}
Idx1 <- match(ppi[,1], tf.names);
Idx2 <- match(ppi[,2], tf.names);
Idx <- (Idx2-1)*num.TFs+Idx1;
tfCoopNetwork[Idx] <- ppi[,3];
Idx <- (Idx1-1)*num.TFs+Idx2;
tfCoopNetwork[Idx] <- ppi[,3];
}
colnames(tfCoopNetwork) <- tf.names
rownames(tfCoopNetwork) <- tf.names
}

## Run SPIDER ##
tic=proc.time()[3]

# adjusting degree distribution
regulatoryNetwork = degreeAdjust(regulatoryNetwork)

if(progress)
print('Normalizing networks...')
regulatoryNetwork = normalizeNetwork(regulatoryNetwork)
tfCoopNetwork = normalizeNetwork(tfCoopNetwork)
geneCoreg = normalizeNetwork(geneCoreg)

if(progress)
print('Learning Network...')

minusAlpha = 1-alpha
step=0
hamming_cur=1
if(progress)
print("Using tanimoto similarity")
while(hamming_cur>hamming){
if ((!is.na(iter))&&step>=iter){
print(paste("Reached maximum iterations, iter =",iter),sep="")
break
}
Responsibility=tanimoto(tfCoopNetwork, regulatoryNetwork)
Availability=tanimoto(regulatoryNetwork, geneCoreg)
RA = 0.5*(Responsibility+Availability)

hamming_cur=sum(abs(regulatoryNetwork-RA))/(num.TFs*num.genes)
regulatoryNetwork=minusAlpha*regulatoryNetwork + alpha*RA

ppi=tanimoto(regulatoryNetwork, t(regulatoryNetwork))
ppi=update.diagonal(ppi, num.TFs, alpha, step)
tfCoopNetwork=minusAlpha*tfCoopNetwork + alpha*ppi

CoReg2=tanimoto(t(regulatoryNetwork), regulatoryNetwork)
CoReg2=update.diagonal(CoReg2, num.genes, alpha, step)
geneCoreg=minusAlpha*geneCoreg + alpha*CoReg2

if(progress)
message("Iteration", step,": hamming distance =", round(hamming_cur,5))
step=step+1
}

toc=proc.time()[3] - tic
if(progress)
message("Successfully ran SPIDER on ", num.genes, " Genes and ", num.TFs, " TFs.\nTime elapsed:", round(toc,2), "seconds.")
prepResult(zScale, output, regulatoryNetwork, geneCoreg, tfCoopNetwork, edgelist, motif)
}

#' Function to adjust the degree so that the hub nodes are not penalized in z-score transformation
#'
#' @param A Input adjacency matrix
degreeAdjust <- function(A){
k1 <- colSums(A)/dim(A,1)
k2 <- rowSums(A)/dim(A,2)
B <- (matrix(replicate(dim(A,2),k1),nrow=dim(A,1)))^2
B <- B + (matrix(t(replicate(dim(A,2),k2)),nrow=dim(A,1)))^2
A <- A * sqrt(B);
}