# RNA + MS data integration

In [None]:
library(RColorBrewer)
library(VennDiagram)
library(igraph)
library(ggplot2)
library(parallel) # 2.5 fold improvement in speed for networks overlap

In [None]:
# Fix the RNG seed so that the random sampling done later in this notebook starts from a known state
# NOTE(review): worker processes started through the parallel package may not inherit this seed reproducibly - confirm
set.seed(20)

In [None]:
# Load transformed MS data with t-test results.
# The file is tab-separated with a header row; quoting is disabled so that quote
# characters inside fields are read literally.
protFiltered <- read.table("Perseus_run/ProteinsTTestFiltered.txt", sep = "\t", header = TRUE, quote = "")

In [None]:
# Give the second column a fixed name
names(protFiltered)[2] <- "LogPvalue"
# Color for significant abundance changes
protFiltered$color <- ifelse(test = protFiltered$Difference > 0, "#4A91C4", "#F09F4E")
# Grey out every protein that is not flagged as significant. The column is
# addressed by name: a hard-coded index of 99 only points at `color` when the
# input table has exactly 98 columns.
protFiltered[protFiltered$Significant != "+", "color"] <- "grey"
protFiltered$Gene.names <- as.character(protFiltered$Gene.names)
# Keep only the first gene name when several are provided
# NOTE(review): column 7 is assumed to be Gene.names - confirm against the input file
protFiltered[, 7] <- sub(";.*", "", protFiltered[, 7])

In [None]:
# Column 7 values of the rows colored "#4A91C4" (the color assigned above when Difference > 0)
# NOTE(review): the "Down" list is built from positive Difference values - confirm the sign convention of the t-test
listDownMS = protFiltered[protFiltered$color == "#4A91C4",7]
# Column 7 values of the rows colored "#F09F4E" (the color assigned above when Difference is not > 0)
listUpMS = protFiltered[protFiltered$color == "#F09F4E",7]

In [None]:
# Restore the R objects saved in "RNAlists.diff" into the workspace
# (presumably listDownRNA and listUpRNA, which are used below - confirm)
load("RNAlists.diff")

In [None]:
# Draw a 2-, 3- or 4-set Venn diagram directly from vectors of identifiers.
#
# a, b : the first two sets (required).
# z, d : optional third and fourth sets; leave them NULL (the default) to omit.
#        As before, `d` is only used when `z` is also given.
# ...  : forwarded unchanged to draw.pairwise.venn / draw.triple.venn /
#        draw.quad.venn (e.g. category, fill, fontfamily).
# Returns whatever the underlying VennDiagram function returns.
#
# Every input is de-duplicated first so that the area sizes count distinct
# identifiers, consistent with intersect(), which already discards duplicates.
drawVennFromList = function(a, b, z = c(), d = c(), ...){
    a <- unique(a)
    b <- unique(b)
    # Number of identifiers shared by all the sets passed in
    nShared <- function(...) length(Reduce(intersect, list(...)))

    if (is.null(z)){
        draw.pairwise.venn(length(a), length(b), nShared(a, b), ...)
    }
    else if (is.null(d)){
        z <- unique(z)
        # Argument order expected by draw.triple.venn: n12, n23, n13, n123
        draw.triple.venn(length(a), length(b), length(z),
            nShared(a, b), nShared(b, z), nShared(a, z),
            nShared(a, b, z), ...)
    }
    else{
        z <- unique(z)
        d <- unique(d)
        # Argument order expected by draw.quad.venn:
        # n12, n13, n14, n23, n24, n34, n123, n124, n134, n234, n1234
        draw.quad.venn(length(a), length(b), length(z), length(d),
            nShared(a, b), nShared(a, z), nShared(a, d),
            nShared(b, z), nShared(b, d), nShared(z, d),
            nShared(a, b, z), nShared(a, b, d),
            nShared(a, z, d), nShared(b, z, d),
            nShared(a, b, z, d), ...)
    }
}

In [None]:
# Draw the four-set Venn diagram (MS down/up vs RNA-seq down/up) into a PDF file
pdf("RNA_MS_overlap.pdf")
drawVennFromList(listDownMS, listDownRNA, listUpMS, listUpRNA, category = c("MS - down", "RNA-seq - down", "MS - up", "RNA-seq - up"),
                fill = brewer.pal(4, "Paired"), fontfamily = "sans", cat.fontfamily = "sans")
# Close the PDF device so the file is written out
dev.off()

In [None]:
# Print, one per line, the identifiers found in both the MS and RNA-seq "down" lists
writeLines(as.character(intersect(listDownMS, listDownRNA)))

In [None]:
# Print, one per line, every identifier present in either MS list
writeLines(as.character(union(listUpMS, listDownMS)))

In [None]:
# Print, one per line, the identifiers found in both the MS and RNA-seq "up" lists
writeLines(as.character(intersect(listUpMS, listUpRNA)))

In [None]:
# Print, one per line, every entry of the RNA-seq "up" list
writeLines(as.character(listUpRNA))

## Prepare PPI for Omics Integrator
The PPI was obtained from BioGRID Release 3.4.160 as [all the interactions available for mice in the TAB2 format](https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.4.160/BIOGRID-ORGANISM-3.4.160.tab2.zip).

In [None]:
# Read the BioGRID TAB2 export: tab-separated with a header row, no quoting,
# "-" treated as missing, and "#" not treated as a comment character.
# stringsAsFactors = TRUE is requested explicitly because later cells call
# levels() on Experimental.System and index a lookup table by its factor codes;
# read.table() no longer creates factors by default since R 4.0.0.
PPI <- read.table("OmicsIntegratorRun/BIOGRID-ORGANISM-Mus_musculus-3.4.160.tab2.txt", sep = "\t", header = TRUE,
                  quote = '', na.strings = "-", comment.char = "", stringsAsFactors = TRUE)
# Preview columns 8 and 9, the two columns used later as edge endpoints
head(PPI[, c(8, 9)])

In [None]:
# NB: BioGrid provides interactions between mouse proteins and proteins from other organisms
# Count the interactions per organism identifier on each side of the interaction
table(PPI$Organism.Interactor.A)
table(PPI$Organism.Interactor.B)

In [None]:
# Keep only the interactions whose two partners both carry organism identifier 10090;
# which() discards rows where the comparison is NA
fromMouseA <- PPI$Organism.Interactor.A == 10090
fromMouseB <- PPI$Organism.Interactor.B == 10090
PPI <- PPI[which(fromMouseA & fromMouseB), ]

In [None]:
# Map an experimental system to a confidence score for direct PPI
# The 27 scores are paired with levels(PPI$Experimental.System) purely by position,
# so there must be exactly one value per level, in the order returned by levels().
# NOTE(review): this requires Experimental.System to be a factor - confirm how the column was read
expScore = data.frame(levels(PPI$Experimental.System), c(0.2,0.2,0.2,0.2,0.99,0.5,0.2,0.2,0.2,
  0,0,0.99,0.99,0,0.99,0,0,0,
  0.99,0.2,0.2,0.99,0,0,0,0,0.99))
# Display the lookup table so the level/score pairing can be checked by eye
expScore

In [None]:
# Attach a score to every interaction by using Experimental.System as a row index into the lookup table
# NOTE(review): this row lookup assumes Experimental.System is a factor, so that its integer codes select the rows - confirm
PPI$expScore = expScore[PPI$Experimental.System,2] # Levels = row index in the lookup table

In [None]:
# Export the edge list for Omics Integrator: interactor A, interactor B, score.
# No header, no row names, tab-separated, unquoted.
# Columns are selected by name rather than by position (8, 9, 25) so the export
# does not depend on the exact column layout of the table.
# NOTE(review): assumes columns 8 and 9 are the two Official.Symbol.Interactor columns, as in the BioGRID TAB2 layout - confirm
ppiExportColumns <- c("Official.Symbol.Interactor.A", "Official.Symbol.Interactor.B", "expScore")
write.table(PPI[, ppiExportColumns], file = "OmicsIntegratorRun/mousePPI.tsv", col.names = FALSE,
            row.names = FALSE, sep = "\t", quote = FALSE, dec = ".")

## Combine Omics Integrator networks
We first combine the optimal forests for the three comparisons and export an annotated version of the union of these nodes and their direct neighbors, with all known edges between these nodes.

In [None]:
# Node sets of the two optimal forests: every distinct value found in the
# first and third columns of each .sif file
recForestSif <- read.table("OmicsIntegratorRun/recArg1_output/result_optimalForest.sif")
recNodes <- unique(unlist(recForestSif[, c(1, 3)]))
ranklForestSif <- read.table("OmicsIntegratorRun/ranklDiff_output/result_optimalForest.sif")
ranklNodes <- unique(unlist(ranklForestSif[, c(1, 3)]))

In [None]:
# Two-set Venn diagram of the node sets of the recArg1 and RANKL forests
drawVennFromList(recNodes, ranklNodes, category = c("recArg1", "RANKL"),
                fill = c("tomato2", "darkgreen"), fontfamily = "sans", cat.fontfamily = "sans")

In [None]:
# forestNodes = union(union(recNodes, ranklNodes), deplNodes)
forestNodes <- union(recNodes, ranklNodes)
# Store the interactor symbols as plain character vectors
PPI$Official.Symbol.Interactor.A <- as.character(PPI$Official.Symbol.Interactor.A)
PPI$Official.Symbol.Interactor.B <- as.character(PPI$Official.Symbol.Interactor.B)
# Interaction partners of forest nodes, looking at each side of the interaction in turn
partnersViaA <- PPI$Official.Symbol.Interactor.B[PPI$Official.Symbol.Interactor.A %in% forestNodes]
partnersViaB <- PPI$Official.Symbol.Interactor.A[PPI$Official.Symbol.Interactor.B %in% forestNodes]
neighbors <- union(partnersViaA, partnersViaB)
# Forest nodes together with all their direct interaction partners
allNodes <- union(forestNodes, neighbors)
length(allNodes)

In [None]:
# Select the interactions that touch at least one node of allNodes, excluding self-interactions
interactionsWithNeighbors = ((PPI$Official.Symbol.Interactor.A %in% allNodes) | (PPI$Official.Symbol.Interactor.B %in% allNodes)) & 
    (PPI$Official.Symbol.Interactor.A != PPI$Official.Symbol.Interactor.B)
# Undirected graph built from columns 8 and 9 of the selected interactions
neighborsPPI = graph_from_edgelist(as.matrix(PPI[interactionsWithNeighbors, c(8,9)]), directed = F)
# Dense adjacency matrix of that graph, used in the next cell to count links to forest nodes
# NOTE(review): a dense matrix grows with the square of the node count - confirm this fits in memory
A = as.matrix(as_adjacency_matrix(neighborsPPI))

In [None]:
# Keep the forest nodes plus every node whose adjacency-matrix column sums to at
# least 2 over the rows that belong to forest nodes.
# drop = FALSE keeps the row selection a matrix even when a single row matches,
# so colSums() does not fail on a plain vector.
# NOTE(review): the adjacency matrix presumably counts parallel edges, so two
# recorded interactions with one forest node may also reach the threshold - confirm this is intended
forestRows <- rownames(A) %in% forestNodes
linksToForest <- colSums(A[forestRows, , drop = FALSE])
allNodes <- union(names(which(linksToForest >= 2)), forestNodes)
length(allNodes)

In [None]:
# Select the interactions whose two partners are both in allNodes, excluding self-interactions
interactionsWithForests = ((PPI$Official.Symbol.Interactor.A %in% allNodes) & (PPI$Official.Symbol.Interactor.B %in% allNodes)) & 
    (PPI$Official.Symbol.Interactor.A != PPI$Official.Symbol.Interactor.B)
# Undirected graph built from columns 8 and 9 of the selected interactions
expandedForest = graph_from_edgelist(as.matrix(PPI[interactionsWithForests, c(8,9)]), directed = F)
# Sanity check: every vertex of the graph belongs to allNodes
all(names(V(expandedForest)) %in% allNodes)

In [None]:
# Encode forest membership of each vertex as a small bit mask:
# 0 = in neither forest, 1 = recArg1 only, 2 = RANKL only, 3 = both.
# Vectorised %in% replaces the per-vertex sapply()/ifelse() calls; the vector
# stays named by vertex, as sapply() over the vertex names produced.
vertexNames <- names(V(expandedForest))
nodeType <- 1 * (vertexNames %in% recNodes) + 2 * (vertexNames %in% ranklNodes)
names(nodeType) <- vertexNames
table(nodeType)
expandedForest <- set_vertex_attr(expandedForest, name = "nodeType", value = nodeType)

## Add edge types to optimal forests graph

In [None]:
# Keep only interaction between forest nodes
# (both interactors must be in forestNodes; row names of PPI are carried over to the subset)
forestSubPPI = PPI[(PPI$Official.Symbol.Interactor.A %in% forestNodes)&(PPI$Official.Symbol.Interactor.B %in% forestNodes),]

# Build an order-independent key for an edge: the elements of `line` are sorted
# and then joined with "_", so (A, B) and (B, A) give the same string.
pasteSorted <- function(line){
    inOrder <- sort(line)
    paste(inOrder, collapse = "_")
}

# Keep list of edges in sub-PPI as strings
# (one order-independent key per row, built from columns 8 and 9 with pasteSorted)
forestEdges = apply(forestSubPPI[,c(8,9)], 1, pasteSorted)

In [None]:
# Keep list of edges in each analysis as strings
# (same order-independent keys as for the PPI edges, built from columns 1 and 3 of each .sif file)
recSifTable <- read.table("OmicsIntegratorRun/recArg1_output/result_optimalForest.sif")
recEdges <- apply(recSifTable[, c(1, 3)], 1, pasteSorted)
ranklSifTable <- read.table("OmicsIntegratorRun/ranklDiff_output/result_optimalForest.sif")
ranklEdges <- apply(ranklSifTable[, c(1, 3)], 1, pasteSorted)

In [None]:
# Check that all edges in the analysis are actual edges in the PPI
# (TRUE only if every recArg1 edge key and every RANKL edge key is found among the sub-PPI edge keys)
# all(all(recEdges %in% forestEdges), all(ranklEdges %in% forestEdges), all(deplEdges %in% forestEdges))
all(all(recEdges %in% forestEdges), all(ranklEdges %in% forestEdges))

In [None]:
# Undirected graph of the interactions between forest nodes. forestSubPPI holds
# exactly the PPI rows that the row-name lookup selected, in the same order.
forestEdgeList <- as.matrix(forestSubPPI[, c(8, 9)])
forest <- graph_from_edgelist(forestEdgeList, directed = FALSE)

In [None]:
# Color nodes and edges based on the analyses they are found in.
# Both types are bit masks: 1 = recArg1, 2 = RANKL, 3 = both, 0 = neither.
# Vectorised %in% replaces the per-element sapply()/ifelse() calls.

# One key per edge is required; fail loudly instead of silently recycling
stopifnot(length(forestEdges) == ecount(forest))
edgeType <- 1 * (forestEdges %in% recEdges) + 2 * (forestEdges %in% ranklEdges)
# edgeType = edgeType + 4 * (forestEdges %in% deplEdges)
names(edgeType) <- names(forestEdges)  # keep the names sapply() used to carry over

forestVertexNames <- names(V(forest))
nodeType <- 1 * (forestVertexNames %in% recNodes) + 2 * (forestVertexNames %in% ranklNodes)
# nodeType = nodeType + 4 * (forestVertexNames %in% deplNodes)
names(nodeType) <- forestVertexNames

In [None]:
# Edge weight: column 25 of the PPI rows that make up forestSubPPI (looked up by row name)
# NOTE(review): column 25 is presumably expScore - confirm
forest = set_edge_attr(graph = forest, name = "weight", value = PPI[as.character(rownames(forestSubPPI)),25])
# Attach the edge and vertex type codes computed in the previous cell
forest = set_edge_attr(graph = forest, name = "type", value = edgeType)
forest = set_vertex_attr(graph = forest, name = "type", value = nodeType)

In [None]:
library(jsonlite)
library(stringr)
# Import updated list of kegg pathways
# (downloaded as JSON and parsed without flattening nested data frames)
keggHtextUrl <- "https://www.kegg.jp/kegg-bin/download_htext?htext=mmu00001&format=json"
keggPath <- fromJSON(keggHtextUrl, flatten = FALSE)

In [None]:
# Extract all symbols for metabolism genes
# This cell locates the '09100 Metabolism' entry and collects the pathway identifiers
# (pattern "mmu" followed by five digits) found in the names of its sub-entries
indexPathMetab = which(keggPath$children$name == '09100 Metabolism')
pathMetab = unlist(sapply(keggPath$children$children[[indexPathMetab]]$children, function(x) str_match(x$name, "mmu\\d{5}")))
# Drop names that did not match the pattern (NA) and duplicated identifiers
pathMetab = unique(na.exclude(pathMetab))

In [None]:
# Extract a symbol from the `name` field of `x`: take the first run of characters
# that follows a space and ends with ";", then strip that leading space and the ";".
getSymbol <- function(x){
    rawHit <- str_match(x$name, ' [^ ;]*;')
    withoutSpace <- str_remove(string = rawHit, pattern = ' ')
    str_remove(string = withoutSpace, pattern = ';')
}
# Apply getSymbol to the children of every sub-entry under the Metabolism entry
# and keep the distinct symbols
symbolMetab = unique(unlist(sapply(keggPath$children$children[[indexPathMetab]]$children, 
                    function(x) sapply(x$children, getSymbol))))

In [None]:
# 9 out of 179 genes in the final graph are linked to metabolism
# Count the forest vertices that are / are not in the metabolism symbol list, then list those that are
table(names(V(forest)) %in% symbolMetab)
names(V(forest))[which(names(V(forest)) %in% symbolMetab)]

In [None]:
# Flag each vertex with TRUE/FALSE depending on whether its name is in the metabolism symbol list
forest = set_vertex_attr(graph = forest, name = "metabo", value = (names(V(forest)) %in% symbolMetab))
forest = simplify(forest, edge.attr.comb = "max") # Remove self-loops and multiple edges
# In case of multiple annotations, take the highest values (= best proof of interaction)

In [None]:
# Save the annotated forest graph in GML format
write_graph(forest, "OmicsIntegratorRun/recAndRanklForestsColored.gml", format = "gml")

## Simulated network
We want to generate a synthetic network with 2 independent connected components of sizes matched to the actual network, so that we can see whether the overlap is similar or not.

In [None]:
# Length of the original sets of nodes
# (lSet1 is the size of the RANKL forest, lSet2 the size of the recArg1 forest)
lSet1 = length(ranklNodes)
lSet2 = length(recNodes)
# Store full PPI
# (undirected graph built from columns 8 and 9 of every row of PPI)
igraphPPI = graph_from_edgelist(as.matrix(PPI[,c(8,9)]), directed = F)

In [None]:
# We want two subnetworks of size lSet1 and lSet2

# https://stackoverflow.com/questions/33084860/sampling-subgraphs-from-different-sizes-using-igraph
#' Grow a random connected subnetwork of a given size
#'
#' Picks a random seed node inside a connected component that holds at least
#' `subSize` nodes, then repeatedly adds a random, not yet selected node that
#' is adjacent to the current selection.
#'
#' @param subSize Number of nodes to select (at least 1).
#' @param graph Graph to sample from; defaults to the global `igraphPPI`.
#' @return Logical vector with one entry per vertex of `graph`, TRUE for the
#'   selected nodes.
getSubnetwork <- function(subSize, graph = igraphPPI){
    stopifnot(length(subSize) == 1, subSize >= 1)
    n <- vcount(graph)

    # Determine which nodes fall in sufficiently large connected components
    comp <- components(graph)
    valid <- which(comp$csize[comp$membership] >= subSize)
    if (length(valid) == 0) {
        stop("no connected component holds at least ", subSize, " nodes", call. = FALSE)
    }

    # sample(x, 1) draws from 1:x when x is a single number, which could return
    # a node that is not a candidate at all; always sample a position instead.
    pickOne <- function(candidates) candidates[sample.int(length(candidates), 1)]

    # Seed node from which the subnetwork should be expanded
    first.node <- pickOne(valid)
    used <- seq_len(n) == first.node  # Is this node selected?
    neigh <- seq_len(n) %in% neighbors(graph, first.node)  # Does each node neighbor our selections?
    # seq_len() performs subSize - 1 additions and none when subSize is 1
    # (2:subSize would count down and add two extra nodes in that case)
    for (i in seq_len(subSize - 1)) {
      new.node <- pickOne(which(neigh & !used))
      used[new.node] <- TRUE
      neigh[neighbors(graph, new.node)] <- TRUE
    }
    used
}

In [None]:
# One simulation round: draw two random subnetworks of sizes lSet1 and lSet2
# (in that order) and return how many nodes are selected in both.
# `x` is ignored; it only lets the function be used with lapply-style callers.
singleRunOvlp <- function(x){
    firstDraw <- getSubnetwork(lSet1)
    secondDraw <- getSubnetwork(lSet2)
    inBoth <- firstDraw & secondDraw
    sum(inBoth)
}

# Number of simulated pairs of subnetworks
N <- 2000
# mclapply() forks worker processes. With the default RNG their random streams
# are not derived reproducibly from set.seed(); selecting L'Ecuyer-CMRG and
# seeding it gives every worker a reproducible stream (for a fixed mc.cores).
RNGkind("L'Ecuyer-CMRG")
set.seed(20)
# NOTE(review): mc.cores > 1 relies on forking, which is not available on Windows
ovlp <- unlist(mclapply(seq_len(N), singleRunOvlp, mc.cores = 4))

In [None]:
# Empirical p-value label: fraction of simulated overlaps that are at least 8
# NOTE(review): 8 is presumably the observed overlap between the two forests - confirm it matches length(intersect(recNodes, ranklNodes))
ovlp.pval = paste("p = ", sum(ovlp >= 8)/length(ovlp))

In [None]:
# Histogram of the simulated overlaps; the bar at an overlap of exactly 8 is
# highlighted in blue and the empirical p-value is printed on the plot.
# guide = "none" hides the fill legend; it replaces guide = F, which ggplot2
# has deprecated.
ovlpData <- data.frame(x = ovlp, c = (ovlp == 8))
gp <- ggplot(ovlpData, aes(x, fill = c)) +
    geom_histogram(binwidth = 1) +
    theme_light() +
    xlab("Overlapping genes") +
    scale_fill_manual(values = c("#AAAAAA", "#4A91C4"), guide = "none") +
    annotate("text", x = 10, y = 425, label = ovlp.pval, size = 7) +
    ylab("Frequency") +
    theme(panel.grid.minor = element_blank(), text = element_text(size = 26, family = "sans"))
ggsave(filename = "network_overlap.svg", plot = gp)