# Differential expression analysis

## Install and import dependencies

In [None]:
# biocLite.R has been retired by Bioconductor (release 3.8 and later);
# BiocManager is the supported installer. Bootstrap it from CRAN when it is
# not available yet, then install/update the core Bioconductor packages.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install()

In [None]:
# Bioconductor packages needed by the analysis below.
BiocManager::install(c("limma", "Glimma", "edgeR", "clusterProfiler", "Mus.musculus"))

In [None]:
library(limma)
library(Glimma)
library(edgeR)
library(RColorBrewer)
library(scales)
library(heatmaply)
library(clusterProfiler)
library(grDevices)
library(ggplot2)
library(ggrepel)
library(Mus.musculus)
library(jsonlite)
library(stringr)

In [None]:
# Display the current working directory (it is changed with setwd() further down).
getwd()

## Example analysis
https://www.bioconductor.org/help/workflows/RNAseq123/  
Used as basis for the following analysis

In [None]:
# The count files are read from this directory with list.files(), and all
# output files are written relative to it ("../...").
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
setwd("/Users/lvulliard/OneShotProject/RecArg1/counts")

## RecArg1 - Counts with multi-mapping reads

In [None]:
# Assemble the gene x sample count matrix (26301 genes, 28 samples).
# Every file of the working directory is loaded in turn; each one is expected
# to provide an object `fcMulti` whose `counts` element fills one column.
countMat <- matrix(nrow = 26301, ncol = 28)
colnames(countMat) <- list.files()
for (i in seq_len(28)) {
  load(colnames(countMat)[i])
  countMat[, i] <- fcMulti$counts
}
# Gene identifiers are taken from the last file that was loaded.
rownames(countMat) <- rownames(fcMulti$counts)

# Condition (1 to 7) of each sample: four replicates of seven different conditions
conditionList <- c(
  5, 6, 7, 1, 1, 2, 2, 3, 3, 4, 4, 5, 1, 7,
  5, 2, 3, 4, 6, 7, 7, 1, 2, 4, 5, 6, 3, 6
)
# If s06 is Control RANKL rescue and s08 is Control rescue
# conditionList = c(5,6,7,1,1,2,2,3,3,4,4,5,1,7,5,2,3,4,6,7,7,1,2,4,5,6,6,3)

In [None]:
# Wrap the count matrix in an edgeR DGEList: gene identifiers are stored as
# gene annotation and the condition numbers as the sample grouping.
dgeData <- DGEList(
  counts = countMat,
  genes = rownames(countMat),
  group = conditionList
)

In [None]:
# Dimensions of the DGEList
dim(dgeData)

In [None]:
# Components stored in the DGEList
names(dgeData)

In [None]:
# Genes with a zero count in at least 25 of the 28 samples (detected in 3 at most)
table(rowSums(dgeData$counts == 0) >= 25)
# Genes with a zero count in every sample
table(rowSums(dgeData$counts == 0) == 28)

In [None]:
# Log-CPM values before any filtering
dgeDataLogCPMUnfiltered <- cpm(dgeData, log = TRUE)

In [None]:
# How many genes have a log-CPM above 0 in more than 3 samples?
table(rowSums(dgeDataLogCPMUnfiltered > 0) > 3)

In [None]:
head(dgeData$counts)

In [None]:
# Keep only the genes passing the filter; keep.lib.sizes = FALSE so that the
# library sizes are not carried over from the unfiltered object.
dgeData <- dgeData[rowSums(dgeDataLogCPMUnfiltered > 0) > 3, , keep.lib.sizes = FALSE]

In [None]:
head(dgeData$counts)

In [None]:
# Number of genes retained
length(rownames(dgeData))

3862 genes are never expressed, and 6090 are expressed in at most 3 samples, i.e. not even in all four replicates of a single condition.  
We keep the 12952 genes with a log-CPM value above zero (roughly one count per million or more) in 4 or more samples.

In [None]:
# Gene annotation: look up the symbol and chromosome of every retained
# Entrez identifier in the Mus.musculus annotation package.
genes <- select(
  Mus.musculus,
  keys = rownames(dgeData),
  columns = c("SYMBOL", "TXCHROM"),
  keytype = "ENTREZID"
)
# A gene can be returned several times (one row per chromosome position):
# only the first row of each Entrez identifier is kept.
genes <- genes[!duplicated(genes$ENTREZID), ]
head(genes)

dgeData$genes <- genes

In [None]:
# Sample annotation: one logical flag per treatment, derived from the
# condition number (1 to 7) of each sample.
dgeData$samples$aMEM <- conditionList %in% c(1, 4, 7)
dgeData$samples$aMEMwoLArg <- conditionList %in% c(2, 3, 5, 6)
dgeData$samples$LArg <- conditionList %in% c(3, 6)
dgeData$samples$recArg <- conditionList %in% 7
dgeData$samples$RANKL <- conditionList %in% c(4, 5, 6, 7)
# dgeData$samples$mouse = c()
dgeData$samples

In [None]:
# Log-CPM values of the filtered data
dgeDataLogCPM <- cpm(dgeData, log = TRUE)
# NB: prior count of 0.25 (author's note; the default depends on the edgeR version)

nsamples <- ncol(dgeDataLogCPM)
col <- rainbow(nsamples)

# Two panels side by side: before and after filtering
par(mfrow = c(1, 2), bg = "white")

# Panel A: one density curve per sample, unfiltered data
plot(
  density(dgeDataLogCPMUnfiltered[, 1]),
  col = col[1], lwd = 2, ylim = c(0, 0.21), las = 2, main = "", xlab = ""
)
title(main = "A. Raw data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
for (i in 2:nsamples) {
  den <- density(dgeDataLogCPMUnfiltered[, i])
  lines(den, col = col[i], lwd = 2)
}
legend("topright", rownames(dgeData$samples), text.col = col, bty = "n")

# Panel B: same curves for the filtered data
plot(
  density(dgeDataLogCPM[, 1]),
  col = col[1], lwd = 2, ylim = c(0, 0.21), las = 2, main = "", xlab = ""
)
title(main = "B. Filtered data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
for (i in 2:nsamples) {
  den <- density(dgeDataLogCPM[, i])
  lines(den, col = col[i], lwd = 2)
}
legend("topright", rownames(dgeData$samples), text.col = col, bty = "n")

In [None]:
# Same two density panels as displayed in the notebook, written to a PDF
# file and without the sample legends.
pdf("../filteringRNA.pdf")

par(family = "sans", mfrow = c(1, 2), bg = "white")

# Left panel: unfiltered data
plot(
  density(dgeDataLogCPMUnfiltered[, 1]),
  col = col[1], lwd = 2, ylim = c(0, 0.21), las = 2, main = "", xlab = ""
)
title(main = "Raw data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
for (i in 2:nsamples) {
  den <- density(dgeDataLogCPMUnfiltered[, i])
  lines(den, col = col[i], lwd = 2)
}

# Right panel: filtered data
plot(
  density(dgeDataLogCPM[, 1]),
  col = col[1], lwd = 2, ylim = c(0, 0.21), las = 2, main = "", xlab = ""
)
title(main = "Filtered data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
for (i in 2:nsamples) {
  den <- density(dgeDataLogCPM[, i])
  lines(den, col = col[i], lwd = 2)
}

dev.off()

In [None]:
# TMM normalisation factors, stored in dgeData$samples
dgeData <- calcNormFactors(dgeData, method = "TMM")
dgeData$samples

In [None]:
# Sample labels of the form s<index>_<condition>, then an interactive
# heatmap of the sample-to-sample correlation of the counts.
sampleLabels <- paste0("s", 1:28, "_", conditionList)
heatmaply(
  cor(dgeData$counts),
  symm = TRUE,
  labRow = sampleLabels,
  labCol = sampleLabels,
  branches_lwd = 0.25,
  margins = c(50, 50, 5, 0),
  cexRow = 0.6
)

In [None]:
# MDS plot of the samples, one colour per condition
par(bg = "white", cex = 1.3)
colpal <- brewer.pal(7, "Set2")
plotMDS(
  dgeData,
  labels = sampleLabels,
  col = colpal[as.numeric(dgeData$samples$group)]
)

In [None]:
# Same MDS plot written to a PDF file
pdf("../mdsMultiRNA.pdf")
par(family = "sans")
plotMDS(
  dgeData,
  labels = sampleLabels,
  col = colpal[as.numeric(dgeData$samples$group)]
)
dev.off()

In [None]:
# Mouse of origin (1 to 4) of each of the 28 samples, in sample order,
# followed by the number of samples per mouse.
mouse <- c(
  1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 4, 3,
  4, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3, 3, 2, 3
)
table(mouse)

In [None]:
# Design matrix without intercept: one coefficient per level of `group`.
# Alternative parameterisation by treatment, kept for reference:
#    model.matrix(~0+aMEM+LArg+recArg+RANKL))
design <- with(
  data.frame(dgeData$samples),
  model.matrix(~ 0 + group)
)
design

In [None]:
# Contrasts between conditions. Columns 1 to 10 are pairwise comparisons;
# columns 11 to 13 (rankl, arg, rescue) combine several conditions.
# The column order matters: later cells address the contrasts by index.
contr.matrix <- makeContrasts(
  g1v2   = group1 - group2,
  g1v3   = group1 - group3,
  g1v4   = group1 - group4,
  g2v3   = group2 - group3,
  g2v5   = group2 - group5,
  g3v6   = group3 - group6,
  g4v5   = group4 - group5,
  g4v6   = group4 - group6,
  g4v7   = group4 - group7,
  g5v7   = group5 - group7,
  rankl  = group6 + group5 + group4 - group3 - group2 - group1,
  arg    = group1 + group4 - group5 - group2,
  rescue = group1 + group4 - group6 - group3,
  levels = colnames(design)
)
contr.matrix

In [None]:
# Two diagnostic panels: voom mean-variance trend and final model
par(mfrow = c(1, 2), bg = "white")
# voom transformation (corrects for heteroscedasticity)
v <- voom(dgeData, design, plot = TRUE)
# Linear model fitted per gene, then re-expressed in terms of the contrasts
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
# Empirical Bayes moderation of the standard errors towards a common value:
# moderated t-statistics, moderated F-statistic and log-odds of differential
# expression
efit <- eBayes(vfit)
plotSA(efit, main = "Final model")

In [None]:
# How many genes are differentially expressed overall, per contrast.
# The log2 fold-change threshold is an argument of decideTests(); passed to
# summary() (as before) it was silently ignored.
summary(decideTests(efit, lfc = 1))

In [None]:
# Like eBayes, but testing against a minimal log2 fold-change of 1
tfit <- treat(vfit, lfc = 1)
# Per-gene, per-contrast decision (-1 / 0 / 1); FDR = 5%
dt <- decideTests(tfit)
summary(dt)

In [None]:
# Venn diagram of contrasts 7 (g4v5), 9 (g4v7) and 10 (g5v7), written to PDF
pdf("../vennMultiRNA.pdf")
par(family = "sans")
vennDiagram(
  dt[, c(7, 9, 10)],
  circle.col = c("turquoise", "salmon", "forestgreen")
)
dev.off()

In [None]:
# Same diagram displayed in the notebook
par(bg = "white")
vennDiagram(
  dt[, c(7, 9, 10)],
  circle.col = c("turquoise", "salmon", "forestgreen")
)

In [None]:
# Contrasts 7 and 9 only, with explicit set names
pdf("../DeplOrRecVenn_multimap.pdf")
par(family = "sans", cex = 1.15)
vennDiagram(
  dt[, c(7, 9)],
  circle.col = c("turquoise", "salmon"),
  names = c("Arginine depletion", "recArg1")
)
dev.off()

Adding recArg1 changes the expression of more genes than the lack of arginine does.  
Most of the genes perturbed by the lack of arginine are also perturbed by recArg1.  
Looking at the difference between arg$^-$ and recArg1$^+$ samples doesn't show additional information.

### Effect of RANKL

In [None]:
# Venn diagrams of the RANKL contrasts: 3 (g1v4), 5 (g2v5), 6 (g3v6) and,
# where included, the combined contrast 11 (rankl).
pdf("../vennMultiRanklRNA.pdf")
par(family = "sans")
vennDiagram(
  dt[, c(3, 5, 6, 11)],
  circle.col = c("turquoise", "salmon", "forestgreen", "tomato2")
)
dev.off()

In [None]:
pdf("../vennMultiRanklRNA2.pdf")
par(family = "sans")
vennDiagram(
  dt[, c(3, 5, 6)],
  circle.col = c("turquoise", "salmon", "forestgreen")
)
dev.off()

In [None]:
par(bg = "white")
vennDiagram(
  dt[, c(3, 5, 6, 11)],
  circle.col = c("turquoise", "salmon", "forestgreen", "tomato2")
)

In [None]:
par(bg = "white")
# One colour per set, matching the PDF version of this diagram (only two
# colours were given for three sets before).
vennDiagram(
  dt[, c(3, 5, 6)],
  circle.col = c("turquoise", "salmon", "forestgreen")
)

In [None]:
# Volcano plot of contrast 3 (g1v4 = group1 - group4), written to PDF.
pdf("../VolcanoRanklMultiRNA.pdf")
top1v4 <- topTreat(tfit, coef = 3, n = Inf)
# The y axis is labelled -log10(P-value): use log10 (the natural log was
# used before, which did not match the label).
top1v4$logpval <- -log10(top1v4$P.Value)
# The x axis shows -logFC, so a negative contrast (dt == -1) is reported as
# "Upregulated" and a positive one as "Downregulated".
top1v4$State <- "No significant change"
top1v4$State[top1v4$ENTREZID %in% names(which(dt[, 3] == -1))] <- "Upregulated"
top1v4$State[top1v4$ENTREZID %in% names(which(dt[, 3] == 1))] <- "Downregulated"
# Label the most significant genes only; 32 / log(10) on the log10 scale is
# the former threshold of 32 on the natural-log scale.
top1v4$Labs <- ifelse(top1v4$logpval > 32 / log(10), top1v4$SYMBOL, NA)
n_genes <- paste("n = ", paste(table(top1v4$State), collapse = "/"))
ggplot(top1v4, aes(x = -logFC, y = logpval, color = State, label = Labs)) +
  theme_light() +
  theme(
    text = element_text(size = 26, family = "sans"),
    plot.margin = margin(5, 15, 5, 5),
    panel.grid.minor = element_blank(),
    legend.position = c(0.275, 0.9),
    legend.spacing = unit(10, "cm"),
    legend.title = element_blank(),
    legend.background = element_rect(size = 0.3, color = "black"),
    legend.margin = margin(5, 5, 10, 5)
  ) +
  # Named values keep each colour attached to its category even when one
  # category is absent from the data.
  scale_color_manual(values = c(
    "Downregulated" = "#4A91C4",
    "No significant change" = "#AAAAAA",
    "Upregulated" = "#F09F4E"
  )) +
  ggtitle("DE induced by RANKL") +
  ylab("-log10(P-value)") +
  xlab("-log2(FC)") +
  annotate("text", x = 6.3, y = 0, label = n_genes, size = 7) +
  geom_point() +
  geom_text_repel(size = 7, point.padding = 0.2, force = 0.4, segment.alpha = 0.5, show.legend = FALSE)
dev.off()

In [None]:
# Volcano plot of contrast 3 (g1v4) restricted to the genes whose symbol
# starts with "Slc", written to PDF.
pdf("../VolcanoRanklSlcMultiRNA.pdf")
indSlc <- grep("^Slc", dgeData$genes$SYMBOL)
indSlcTfit <- which(rownames(tfit$coefficients) %in% dgeData$genes$ENTREZID[indSlc])
slcfit <- tfit[indSlcTfit, ]

top1v4slc <- topTreat(slcfit, coef = 3, n = Inf)
# The y axis is labelled -log10(P-value): use log10 (the natural log was
# used before, which did not match the label).
top1v4slc$logpval <- -log10(top1v4slc$P.Value)
# The x axis shows -logFC, so a negative contrast (dt == -1) is reported as
# "Upregulated" and a positive one as "Downregulated".
top1v4slc$State <- "No significant change"
top1v4slc$State[top1v4slc$ENTREZID %in% names(which(dt[, 3] == -1))] <- "Upregulated"
top1v4slc$State[top1v4slc$ENTREZID %in% names(which(dt[, 3] == 1))] <- "Downregulated"
# Every significant gene gets a label
top1v4slc$Labs <- ifelse(top1v4slc$State != "No significant change", top1v4slc$SYMBOL, NA)
n_genes <- paste("n = ", paste(table(top1v4slc$State), collapse = "/"))
ggplot(top1v4slc, aes(x = -logFC, y = logpval, color = State, label = Labs)) +
  theme_light() +
  theme(
    text = element_text(size = 26, family = "sans"),
    panel.grid.minor = element_blank(),
    legend.position = c(0.73, 0.9),
    legend.spacing = unit(10, "cm"),
    legend.title = element_blank(),
    legend.background = element_rect(size = 0.3, color = "black"),
    legend.margin = margin(5, 5, 10, 5)
  ) +
  # Named values keep each colour attached to its category even when one
  # category is absent from the data.
  scale_color_manual(values = c(
    "Downregulated" = "#4A91C4",
    "No significant change" = "#AAAAAA",
    "Upregulated" = "#F09F4E"
  )) +
  ggtitle("DE induced by RANKL") +
  ylab("-log10(P-value)") +
  xlab("-log2(FC)") +
  annotate("text", x = -5, y = 0, label = n_genes, size = 7) +
  geom_point() +
  geom_text_repel(size = 7, point.padding = 0.3, force = 8, segment.alpha = 0.5, show.legend = FALSE)
dev.off()

### Effect of arginine depletion

In [None]:
# Venn diagram of contrasts 1 (g1v2), 7 (g4v5) and 12 (arg).
par(bg = "white")
# One colour per set (only two colours were given for three sets before).
vennDiagram(
  dt[, c(1, 7, 12)],
  circle.col = c("turquoise", "salmon", "forestgreen")
)

In [None]:
# Pairwise contrasts 1 and 7 only
par(bg = "white")
vennDiagram(dt[, c(1, 7)], circle.col = c("turquoise", "salmon"))

### Effect of arginine rescue

In [None]:
# Venn diagram of contrasts 2 (g1v3), 8 (g4v6) and 13 (rescue).
par(bg = "white")
# One colour per set (only two colours were given for three sets before).
vennDiagram(
  dt[, c(2, 8, 13)],
  circle.col = c("turquoise", "salmon", "forestgreen")
)

In [None]:
# Pairwise contrasts 2 and 8 only
par(bg = "white")
vennDiagram(dt[, c(2, 8)], circle.col = c("turquoise", "salmon"))

In [None]:
# Full ranked tables for contrasts 7 (g4v5) and 9 (g4v7)
top4v5 <- topTreat(tfit, coef = 7, n = Inf)
top4v7 <- topTreat(tfit, coef = 9, n = Inf)
head(top4v7)

In [None]:
# Mean-difference plot of contrast 9, coloured by the decideTests status
par(bg = "white")
plotMD(
  tfit,
  column = 9,
  status = dt[, 9],
  main = colnames(tfit)[9],
  col = c("#4A91C4", "#F09F4E"),
  bg.col = "grey",
  legend = FALSE
)

In [None]:
# Volcano plot of contrast 9 (g4v7 = group4 - group7), written to PDF.
# Columns are appended to top4v7 in the order logpval, State, Labs.
pdf("../VolcanoRecArg1MultiRNA.pdf")
# The y axis is labelled -log10(P-value): use log10 (the natural log was
# used before, which did not match the label).
top4v7$logpval <- -log10(top4v7$P.Value)
# The x axis shows -logFC, so a negative contrast (dt == -1) is reported as
# "Upregulated" and a positive one as "Downregulated".
top4v7$State <- "No significant change"
top4v7$State[top4v7$ENTREZID %in% names(which(dt[, 9] == -1))] <- "Upregulated"
top4v7$State[top4v7$ENTREZID %in% names(which(dt[, 9] == 1))] <- "Downregulated"
# Label the most significant genes only; 26 / log(10) on the log10 scale is
# the former threshold of 26 on the natural-log scale.
top4v7$Labs <- ifelse(top4v7$logpval > 26 / log(10), top4v7$SYMBOL, NA)
n_genes <- paste("n = ", paste(table(top4v7$State), collapse = "/"))
ggplot(top4v7, aes(x = -logFC, y = logpval, color = State, label = Labs)) +
  theme_light() +
  theme(
    text = element_text(size = 26, family = "sans"),
    plot.margin = margin(5, 15, 5, 5),
    panel.grid.minor = element_blank(),
    legend.position = c(0.55, 0.9),
    legend.spacing = unit(10, "cm"),
    legend.title = element_blank(),
    legend.background = element_rect(size = 0.3, color = "black"),
    legend.margin = margin(5, 5, 10, 5)
  ) +
  # Named values keep each colour attached to its category even when one
  # category is absent from the data.
  scale_color_manual(values = c(
    "Downregulated" = "#4A91C4",
    "No significant change" = "#AAAAAA",
    "Upregulated" = "#F09F4E"
  )) +
  ggtitle("DE induced by recArg1") +
  ylab("-log10(P-value)") +
  xlab("-log2(FC)") +
  annotate("text", x = 4.2, y = 0, label = n_genes, size = 7) +
  geom_point() +
  geom_text_repel(size = 7, point.padding = 0.2, force = 0.3, segment.alpha = 0.5, show.legend = FALSE)
dev.off()

In [None]:
# Volcano plot of contrast 9 (g4v7) for the subset of the fit stored in
# `slcfit`, written to PDF.
pdf("../VolcanoRecArg1SlcMultiRNA.pdf")

top4v7slc <- topTreat(slcfit, coef = 9, n = Inf)
# The y axis is labelled -log10(P-value): use log10 (the natural log was
# used before, which did not match the label).
top4v7slc$logpval <- -log10(top4v7slc$P.Value)
# The x axis shows -logFC, so a negative contrast (dt == -1) is reported as
# "Upregulated" and a positive one as "Downregulated".
top4v7slc$State <- "No significant change"
top4v7slc$State[top4v7slc$ENTREZID %in% names(which(dt[, 9] == -1))] <- "Upregulated"
top4v7slc$State[top4v7slc$ENTREZID %in% names(which(dt[, 9] == 1))] <- "Downregulated"
# Every significant gene gets a label
top4v7slc$Labs <- ifelse(top4v7slc$State != "No significant change", top4v7slc$SYMBOL, NA)
n_genes <- paste("n = ", paste(table(top4v7slc$State), collapse = "/"))
ggplot(top4v7slc, aes(x = -logFC, y = logpval, color = State, label = Labs)) +
  theme_light() +
  theme(
    text = element_text(size = 26, family = "sans"),
    panel.grid.minor = element_blank(),
    legend.position = c(0.73, 0.9),
    legend.spacing = unit(10, "cm"),
    legend.title = element_blank(),
    legend.background = element_rect(size = 0.3, color = "black"),
    legend.margin = margin(5, 5, 10, 5)
  ) +
  # Named values keep each colour attached to its category even when one
  # category is absent from the data.
  scale_color_manual(values = c(
    "Downregulated" = "#4A91C4",
    "No significant change" = "#AAAAAA",
    "Upregulated" = "#F09F4E"
  )) +
  ggtitle("DE induced by recArg1") +
  ylab("-log10(P-value)") +
  xlab("-log2(FC)") +
  annotate("text", x = -5, y = 0, label = n_genes, size = 7) +
  geom_point() +
  geom_text_repel(size = 7, point.padding = 0.3, force = 8, segment.alpha = 0.5, show.legend = FALSE)
dev.off()

In [None]:
# Mean-difference plot of contrast 9 written to a PDF file
pdf("../MDMultiRNA.pdf")
par(family = "sans")
plotMD(
  tfit,
  column = 9,
  status = dt[, 9],
  main = colnames(tfit)[9],
  col = c("#4A91C4", "#F09F4E"),
  bg.col = "grey",
  legend = FALSE
)
dev.off()

In [None]:
# Symbols of the first 40 rows of the contrast 9 table, one per line
writeLines(head(top4v7$SYMBOL, 40))

In [None]:
# Heatmap of the voom expression values for the first 40 genes of the
# contrast 9 table; written to a PDF file, columns annotated by condition.
top4v7.topgenes <- top4v7$ENTREZID[1:40]
i <- which(v$genes$ENTREZID %in% top4v7.topgenes)
colPal <- colorRampPalette(c("#4A91C4", "white", "#F09F4E"))
heatmaply(
  v$E[i, ],
  labCol = sampleLabels,
  labRow = v$genes$SYMBOL[i],
  branches_lwd = 0.25,
  col = colPal,
  file = "../heatmapMultiRNA.pdf",
  cexCol = 0.8,
  cexRow = 0.1,
  margins = c(80, 0, 5, 0),
  ColSideColors = dgeData$samples$group,
  plot_method = "plotly"
)

In [None]:
# Same heatmap restricted to the first 4 genes, followed by their table rows
i <- which(v$genes$ENTREZID %in% top4v7$ENTREZID[1:4])
heatmaply(
  v$E[i, ],
  labCol = sampleLabels,
  labRow = v$genes$SYMBOL[i],
  branches_lwd = 0.25,
  col = colPal,
  cexCol = 0.8,
  cexRow = 0.1,
  margins = c(80, 0, 5, 0)
)

top4v7[1:4, ]

In [None]:
# Keep the decisions of the multi-mapping analysis: `dt` is overwritten by
# the analysis without multi-mapping reads.
dt.multi <- dt

### Differences between recArg1 and arginine depletion in presence of RANKL
Conditions 7 and 5

In [None]:
# Genes called differentially expressed (non-zero decision) in all of the
# contrasts 7, 9 and 10. Testing `!= 0` explicitly replaces the implicit
# numeric-to-logical coercion that had to be wrapped in suppressWarnings().
which(apply(dt.multi[, c(7, 9, 10)] != 0, 1, all))
dt.multi[which(apply(dt.multi[, c(7, 9, 10)] != 0, 1, all)), c(7, 9, 10)]

Prostate transmembrane protein, androgen induced 1 (ENTREZ gene 65112) is overexpressed in arginine depletion compared to osteoclasts and recArg1, whereas it's underexpressed when comparing recArg1 to osteoclasts.

In [None]:
# Full ranked table for contrast 10 (g5v7)
top5v7 <- topTreat(tfit, coef = 10, n = Inf)
head(top5v7)

In [None]:
# Mean-difference plot of contrast 10, coloured by the decideTests status
par(bg = "white")
plotMD(
  tfit,
  column = 10,
  status = dt[, 10],
  main = colnames(tfit)[10],
  col = c("#4A91C4", "#F09F4E"),
  bg.col = "grey",
  legend = FALSE
)

In [None]:
# Same plot written to a PDF file
pdf("../MDMultiRecArgVsDepl.pdf")
par(family = "sans")
plotMD(
  tfit,
  column = 10,
  status = dt[, 10],
  main = colnames(tfit)[10],
  col = c("#4A91C4", "#F09F4E"),
  bg.col = "grey",
  legend = FALSE
)
dev.off()

In [None]:
# Symbols of the first 40 rows of the contrast 10 table, one per line
writeLines(head(top5v7$SYMBOL, 40))

In [None]:
# Heatmap of the voom expression values for the first 40 genes of the
# contrast 10 table; written to a PDF file, columns annotated by condition.
top5v7.topgenes <- top5v7$ENTREZID[1:40]
i <- which(v$genes$ENTREZID %in% top5v7.topgenes)
colPal <- colorRampPalette(c("#4A91C4", "white", "#F09F4E"))
heatmaply(
  v$E[i, ],
  labCol = sampleLabels,
  labRow = v$genes$SYMBOL[i],
  branches_lwd = 0.25,
  col = colPal,
  file = "../heatmapMultiRecArgVsDepl.pdf",
  cexCol = 0.8,
  cexRow = 0.1,
  margins = c(80, 0, 5, 0),
  ColSideColors = dgeData$samples$group,
  plot_method = "plotly"
)

In [None]:
# Scatter plot of the mean log-CPM per gene in condition 5 (x axis)
# against condition 7 (y axis), written to a PDF file.
pdf("../deplStarComparison.pdf")
deplInd <- (dgeData$samples$group == 5)
starInd <- (dgeData$samples$group == 7)
compDeplStar <- data.frame(
  x = rowMeans(dgeDataLogCPM[, deplInd]),
  y = rowMeans(dgeDataLogCPM[, starInd])
)
ggplot(data = compDeplStar, aes(x = x, y = y)) +
  geom_point() +
  theme_light() +
  theme(
    text = element_text(size = 26, family = "sans"),
    panel.grid.minor = element_blank()
  ) +
  xlab("Log2(CPM) for arginine depletion") +
  ylab("Log2(CPM) for recArg1")
dev.off()

### Export expression for OmicsIntegrator

In [None]:
# Export columns 2 and 4 (gene symbol and log fold-change) of the genes
# with a non-zero decision for contrast 9, as a headerless tab-separated file.
write.table(
  top4v7[(top4v7$ENTREZID %in% names(which(dt[, 9] != 0))), c(2, 4)],
  file = "../OmicsIntegratorRun/recArg1_expr.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)

In [None]:
# Same export for contrast 10
write.table(
  top5v7[(top5v7$ENTREZID %in% names(which(dt[, 10] != 0))), c(2, 4)],
  file = "../OmicsIntegratorRun/deplOrRec_expr.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)

In [None]:
# Genes whose decision for contrast 5 (g2v5) differs from the one shared by
# contrasts 3 (g1v4) and 6 (g3v6):
#  - same non-zero decision in 3 and 6 but no change in 5 ...
upDeplGenes1 <- names(which(
  (dt.multi[, 3] == 1) & (dt.multi[, 6] == 1) & (dt.multi[, 5] == 0)
))
downDeplGenes1 <- names(which(
  (dt.multi[, 3] == -1) & (dt.multi[, 6] == -1) & (dt.multi[, 5] == 0)
))
#  - ... or no change in 3 and 6 but a non-zero decision in 5
notUpDeplGenes1 <- names(which(
  (dt.multi[, 3] == 0) & (dt.multi[, 6] == 0) & (dt.multi[, 5] == 1)
))
notDownDeplGenes1 <- names(which(
  (dt.multi[, 3] == 0) & (dt.multi[, 6] == 0) & (dt.multi[, 5] == -1)
))

In [None]:
# All four sets together
deplGenes <- Reduce(
  union,
  list(upDeplGenes1, downDeplGenes1, notUpDeplGenes1, notDownDeplGenes1)
)

In [None]:
deplGenes

In [None]:
# Full ranked tables for contrasts 6 (g3v6) and 5 (g2v5)
top3v6 <- topTreat(tfit, coef = 6, n = Inf)
top2v5 <- topTreat(tfit, coef = 5, n = Inf)

In [None]:
# Difference between the log2 fold-changes of contrasts 5 (g2v5) and 6
# (g3v6) for every gene of deplGenes: log2(2^a / 2^b) is simply a - b.
lfcFc <- top2v5$logFC[match(deplGenes, top2v5$ENTREZID)] -
  top3v6$logFC[match(deplGenes, top3v6$ENTREZID)]
names(lfcFc) <- deplGenes

In [None]:
lfcFc <- abs(lfcFc)
# Export symbol and absolute difference for the genes above 0.5.
# The symbols are looked up with match() so that they follow the order of
# the selected genes; filtering v$genes with %in% returned them in the order
# of v$genes and paired symbols with values belonging to other genes.
selectedGenes <- deplGenes[lfcFc > 0.5]
write.table(
  data.frame(
    v$genes$SYMBOL[match(selectedGenes, v$genes$ENTREZID)],
    lfcFc[lfcFc > 0.5]
  ),
  file = "../OmicsIntegratorRun/ranklDiff_expr.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)

## RecArg1 - Counts without multi-mapping reads

In [None]:
# Assemble the gene x sample count matrix (26301 genes, 28 samples).
# Every file of the working directory is loaded in turn; each one is expected
# to provide an object `fcStrict` whose `counts` element fills one column.
countMat <- matrix(nrow = 26301, ncol = 28)
colnames(countMat) <- list.files()
for (i in seq_len(28)) {
  load(colnames(countMat)[i])
  countMat[, i] <- fcStrict$counts
}
# Gene identifiers are taken from the last file that was loaded.
rownames(countMat) <- rownames(fcStrict$counts)

# Condition (1 to 7) of each sample: four replicates of seven different conditions
conditionList <- c(
  5, 6, 7, 1, 1, 2, 2, 3, 3, 4, 4, 5, 1, 7,
  5, 2, 3, 4, 6, 7, 7, 1, 2, 4, 5, 6, 3, 6
)
# If s06 is Control RANKL rescue and s08 is Control rescue
# conditionList = c(5,6,7,1,1,2,2,3,3,4,4,5,1,7,5,2,3,4,6,7,7,1,2,4,5,6,6,3)

In [None]:
# Wrap the count matrix in an edgeR DGEList: gene identifiers are stored as
# gene annotation and the condition numbers as the sample grouping.
dgeData <- DGEList(
  counts = countMat,
  genes = rownames(countMat),
  group = conditionList
)

In [None]:
# Dimensions of the DGEList
dim(dgeData)

In [None]:
# Components stored in the DGEList
names(dgeData)

In [None]:
# Genes with a zero count in at least 25 of the 28 samples (detected in 3 at most)
table(rowSums(dgeData$counts == 0) >= 25)
# Genes with a zero count in every sample
table(rowSums(dgeData$counts == 0) == 28)

In [None]:
# Log-CPM values before any filtering
dgeDataLogCPMUnfiltered <- cpm(dgeData, log = TRUE)

In [None]:
# How many genes have a log-CPM above 0 in more than 3 samples?
table(rowSums(dgeDataLogCPMUnfiltered > 0) > 3)

In [None]:
# Keep only the genes passing the filter; keep.lib.sizes = FALSE so that the
# library sizes are not carried over from the unfiltered object.
dgeData <- dgeData[rowSums(dgeDataLogCPMUnfiltered > 0) > 3, , keep.lib.sizes = FALSE]
# dgeData = dgeData[rowSums(dgeData$counts==0)>=25,, keep.lib.sizes=FALSE]

5010 genes are never expressed, and 7267 are expressed in at most 3 samples, i.e. not even in all four replicates of a single condition.
We keep the 12952 genes with a log-CPM value above zero (roughly one count per million or more) in 4 or more samples.

In [None]:
# Gene annotation: look up the symbol and chromosome of every retained
# Entrez identifier in the Mus.musculus annotation package.
genes <- select(
  Mus.musculus,
  keys = rownames(dgeData),
  columns = c("SYMBOL", "TXCHROM"),
  keytype = "ENTREZID"
)
# A gene can be returned several times (one row per chromosome position):
# only the first row of each Entrez identifier is kept.
genes <- genes[!duplicated(genes$ENTREZID), ]
head(genes)

dgeData$genes <- genes

In [None]:
# Sample annotation: one logical flag per treatment, derived from the
# condition number (1 to 7) of each sample.
dgeData$samples$aMEM <- conditionList %in% c(1, 4, 7)
dgeData$samples$aMEMwoLArg <- conditionList %in% c(2, 3, 5, 6)
dgeData$samples$LArg <- conditionList %in% c(3, 6)
dgeData$samples$recArg <- conditionList %in% 7
dgeData$samples$RANKL <- conditionList %in% c(4, 5, 6, 7)
# dgeData$samples$mouse = c()
dgeData$samples

In [None]:
# Log-CPM values of the filtered data
dgeDataLogCPM <- cpm(dgeData, log = TRUE)
# NB: prior count of 0.25 (author's note; the default depends on the edgeR version)

nsamples <- ncol(dgeDataLogCPM)
col <- rainbow(nsamples)

# Two panels side by side: before and after filtering
par(mfrow = c(1, 2), bg = "white")

# Panel A: one density curve per sample, unfiltered data
plot(
  density(dgeDataLogCPMUnfiltered[, 1]),
  col = col[1], lwd = 2, ylim = c(0, 0.21), las = 2, main = "", xlab = ""
)
title(main = "A. Raw data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
for (i in 2:nsamples) {
  den <- density(dgeDataLogCPMUnfiltered[, i])
  lines(den, col = col[i], lwd = 2)
}
legend("topright", rownames(dgeData$samples), text.col = col, bty = "n")

# Panel B: same curves for the filtered data
plot(
  density(dgeDataLogCPM[, 1]),
  col = col[1], lwd = 2, ylim = c(0, 0.21), las = 2, main = "", xlab = ""
)
title(main = "B. Filtered data", xlab = "Log-cpm")
abline(v = 0, lty = 3)
for (i in 2:nsamples) {
  den <- density(dgeDataLogCPM[, i])
  lines(den, col = col[i], lwd = 2)
}
legend("topright", rownames(dgeData$samples), text.col = col, bty = "n")

In [None]:
# TMM normalisation factors, stored in dgeData$samples
dgeData <- calcNormFactors(dgeData, method = "TMM")
dgeData$samples

In [None]:
# Sample labels of the form s<index>_<condition>, then an interactive
# heatmap of the sample-to-sample correlation of the counts.
sampleLabels <- paste0("s", 1:28, "_", conditionList)
heatmaply(
  cor(dgeData$counts),
  symm = TRUE,
  labRow = sampleLabels,
  labCol = sampleLabels,
  branches_lwd = 0.25,
  margins = c(50, 50, 5, 0),
  cexRow = 0.6
)

In [None]:
# MDS plot of the samples, one colour per condition
par(bg = "white")
colpal <- brewer.pal(7, "Set2")
plotMDS(
  dgeData,
  labels = sampleLabels,
  col = colpal[as.numeric(dgeData$samples$group)]
)

In [None]:
# NOTE(review): the assignment is evaluated inside with(), so `tvr` is not
# created in the workspace and nothing is printed; `tvr` is not referenced by
# the cells visible around it. This cell looks like leftover scratch code.
with(data.frame(dgeData$samples), tvr <- aMEM)

In [None]:
# Design matrix without intercept: one coefficient per level of `group`.
# Alternative parameterisation by treatment, kept for reference:
#    model.matrix(~0+aMEM+LArg+recArg+RANKL))
design <- with(
  data.frame(dgeData$samples),
  model.matrix(~ 0 + group)
)
design

In [None]:
# Contrasts between conditions. Columns 1 to 10 are pairwise comparisons;
# columns 11 to 13 (rankl, arg, rescue) combine several conditions.
# The column order matters: later cells address the contrasts by index.
contr.matrix <- makeContrasts(
  g1v2   = group1 - group2,
  g1v3   = group1 - group3,
  g1v4   = group1 - group4,
  g2v3   = group2 - group3,
  g2v5   = group2 - group5,
  g3v6   = group3 - group6,
  g4v5   = group4 - group5,
  g4v6   = group4 - group6,
  g4v7   = group4 - group7,
  g5v7   = group5 - group7,
  rankl  = group6 + group5 + group4 - group3 - group2 - group1,
  arg    = group1 + group4 - group5 - group2,
  rescue = group1 + group4 - group6 - group3,
  levels = colnames(design)
)
contr.matrix

In [None]:
# Two diagnostic panels: voom mean-variance trend and final model
par(mfrow = c(1, 2), bg = "white")
# voom transformation, then a linear model per gene re-expressed in terms of
# the contrasts, then empirical Bayes moderation.
v <- voom(dgeData, design, plot = TRUE)
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main = "Final model")

In [None]:
# Number of genes called down / unchanged / up per contrast
summary(decideTests(efit))

In [None]:
# Same test against a minimal log2 fold-change of 1
tfit <- treat(vfit, lfc = 1)
dt <- decideTests(tfit)
summary(dt)

In [None]:
# Genes with a non-zero decision in both contrast 1 and contrast 2
de.common <- which(dt[, 1] != 0 & dt[, 2] != 0)
length(de.common)

In [None]:
# Symbols of the first 20 of them
head(tfit$genes$SYMBOL[de.common], n = 20)

In [None]:
# Venn diagram of contrasts 7 (g4v5) and 9 (g4v7)
par(bg = "white")
vennDiagram(
  dt[, c(7, 9)],
  circle.col = c("turquoise", "salmon")
)

In [None]:
# Same diagram drawn a second time
par(bg = "white")
vennDiagram(
  dt[, c(7, 9)],
  circle.col = c("turquoise", "salmon")
)

Adding recArg1 changes the expression of more genes than the lack of arginine does.  
Most of the genes perturbed by the lack of arginine are also perturbed by recArg1.  
Looking at the difference between arg$^-$ and recArg1$^+$ samples doesn't show additional information.

### Effect of RANKL

In [None]:
# Venn diagrams of the RANKL contrasts: 3 (g1v4), 5 (g2v5), 6 (g3v6) and
# the combined contrast 11 (rankl). One colour is given per set, with the
# same colours as in the multi-mapping section (only two were given before).
par(bg = "white")
vennDiagram(
  dt[, c(3, 5, 6, 11)],
  circle.col = c("turquoise", "salmon", "forestgreen", "tomato2")
)

In [None]:
par(bg = "white")
vennDiagram(
  dt[, c(3, 5, 6)],
  circle.col = c("turquoise", "salmon", "forestgreen")
)

### Effect of arginine depletion

In [None]:
# Venn diagram of contrasts 1 (g1v2), 7 (g4v5) and 12 (arg).
par(bg = "white")
# One colour per set (only two colours were given for three sets before).
vennDiagram(
  dt[, c(1, 7, 12)],
  circle.col = c("turquoise", "salmon", "forestgreen")
)

In [None]:
# Pairwise contrasts 1 and 7 only
par(bg = "white")
vennDiagram(dt[, c(1, 7)], circle.col = c("turquoise", "salmon"))

### Effect of arginine rescue

In [None]:
# Venn diagram of contrasts 2 (g1v3), 8 (g4v6) and 13 (rescue).
par(bg = "white")
# One colour per set (only two colours were given for three sets before).
vennDiagram(
  dt[, c(2, 8, 13)],
  circle.col = c("turquoise", "salmon", "forestgreen")
)

In [None]:
# Pairwise contrasts 2 and 8 only
par(bg = "white")
vennDiagram(dt[, c(2, 8)], circle.col = c("turquoise", "salmon"))

In [None]:
# Full ranked tables for contrasts 7 (g4v5) and 9 (g4v7)
top4v5 <- topTreat(tfit, coef = 7, n = Inf)
top4v7 <- topTreat(tfit, coef = 9, n = Inf)
head(top4v7)

In [None]:
# Mean-difference plot of contrast 9 with a fixed x range
par(bg = "white")
plotMD(
  tfit,
  column = 9,
  status = dt[, 9],
  main = colnames(tfit)[9],
  xlim = c(-8, 13)
)

In [None]:
# Heatmap of the voom expression values for the first 100 genes of the
# contrast 9 table.
top4v7.topgenes <- top4v7$ENTREZID[1:100]
i <- which(v$genes$ENTREZID %in% top4v7.topgenes)
colPal <- colorRampPalette(c("#4A91C4", "white", "#F09F4E"))
heatmaply(
  v$E[i, ],
  labCol = sampleLabels,
  labRow = v$genes$SYMBOL[i],
  branches_lwd = 0.25,
  col = colPal,
  cexCol = 0.8,
  cexRow = 0.1,
  margins = c(80, 0, 5, 0)
)

In [None]:
# Columns 2 and 4 (gene symbol and log fold-change) of the first 50 rows
top4v7[1:50, c(2, 4)]

In [None]:
# Keep the decisions of the analysis without multi-mapping reads under their
# own name, next to dt.multi.
dt.strict <- dt

### Differences between recArg1 and arginine depletion in presence of RANKL
Conditions 7 and 5

In [None]:
# Genes called differentially expressed (non-zero decision) in all of the
# contrasts 7, 9 and 10. Testing `!= 0` explicitly replaces the implicit
# numeric-to-logical coercion that had to be wrapped in suppressWarnings().
which(apply(dt.strict[, c(7, 9, 10)] != 0, 1, all))
dt.strict[which(apply(dt.strict[, c(7, 9, 10)] != 0, 1, all)), c(7, 9, 10)]

Prostate transmembrane protein, androgen induced 1 (ENTREZ gene 65112) is overexpressed in arginine depletion compared to osteoclasts and recArg1, whereas it's underexpressed when comparing recArg1 to osteoclasts.

In [None]:
# Full ranked table for contrast 10 (g5v7)
top5v7 <- topTreat(tfit, coef = 10, n = Inf)
head(top5v7)

In [None]:
# Mean-difference plot of contrast 10, coloured by the decideTests status
par(bg = "white")
plotMD(
  tfit,
  column = 10,
  status = dt[, 10],
  main = colnames(tfit)[10],
  col = c("#4A91C4", "#F09F4E"),
  bg.col = "grey",
  legend = FALSE
)

In [None]:
# Same plot written to a PDF file
pdf("../MDStrictRecArgVsDepl.pdf")
par(family = "sans")
plotMD(
  tfit,
  column = 10,
  status = dt[, 10],
  main = colnames(tfit)[10],
  col = c("#4A91C4", "#F09F4E"),
  bg.col = "grey",
  legend = FALSE
)
dev.off()

In [None]:
# Number of rows of the table
length(top5v7$SYMBOL)

In [None]:
# Symbols of the first 40 rows, one per line
writeLines(head(top5v7$SYMBOL, 40))

In [None]:
# Heatmap of the voom expression values for the first 40 genes of the
# contrast 10 table; written to a PDF file, columns annotated by condition.
top5v7.topgenes <- top5v7$ENTREZID[1:40]
i <- which(v$genes$ENTREZID %in% top5v7.topgenes)
colPal <- colorRampPalette(c("#4A91C4", "white", "#F09F4E"))
heatmaply(
  v$E[i, ],
  labCol = sampleLabels,
  labRow = v$genes$SYMBOL[i],
  branches_lwd = 0.25,
  col = colPal,
  file = "../heatmapStrictRecArgVsDepl.pdf",
  cexCol = 0.8,
  cexRow = 0.1,
  margins = c(80, 0, 5, 0),
  ColSideColors = dgeData$samples$group,
  plot_method = "plotly"
)

## Compare results with and without multimapping

In [None]:
# Size of the two decision matrices
dim(dt.multi)
dim(dt.strict)

In [None]:
# Total number of non-zero decisions in each analysis
sum(dt.strict != 0)
sum(dt.multi != 0)

In [None]:
# Genes tested in both analyses
commonGenes <- intersect(rownames(dt.strict), rownames(dt.multi))
length(commonGenes)

In [None]:
# Non-zero decisions for contrast 9 (g4v7) in each analysis
dt.multi.g4v7 <- dt.multi[dt.multi[, 9] != 0, 9]
dt.strict.g4v7 <- dt.strict[dt.strict[, 9] != 0, 9]
length(dt.multi.g4v7)
length(dt.strict.g4v7)

In [None]:
# Genes found by both analyses, as a count and as a fraction of the genes
# found by at least one of them
commonGenes <- intersect(names(dt.strict.g4v7), names(dt.multi.g4v7))
length(commonGenes)
length(commonGenes) / length(union(names(dt.strict.g4v7), names(dt.multi.g4v7)))

81% of the genes in the two lists are found in both.

In [None]:
# First rows of the voom expression matrix for the genes of commonGenes
i <- which(v$genes$ENTREZID %in% commonGenes)
head(v$E[i, ])

### Output lists of genes differentially expressed in both cases

In [None]:
# Genes with the same non-zero decision for contrast 9 in both analyses.
# A decision of 1 is stored in the "down" list and -1 in the "up" list.
listDownRNA <- intersect(
  names(which(dt.strict.g4v7 == 1)),
  names(which(dt.multi.g4v7 == 1))
)
listUpRNA <- intersect(
  names(which(dt.strict.g4v7 == -1)),
  names(which(dt.multi.g4v7 == -1))
)
length(listDownRNA)
length(listUpRNA)

In [None]:
# Replace the Entrez identifiers by gene symbols and save both lists
i <- which(v$genes$ENTREZID %in% listUpRNA)
listUpRNA <- v$genes$SYMBOL[i]
i <- which(v$genes$ENTREZID %in% listDownRNA)
listDownRNA <- v$genes$SYMBOL[i]
save(listUpRNA, listDownRNA, file = "../RNAlists.diff")

In [None]:
# Print the "down" list, one symbol per line
writeLines(listDownRNA)

In [None]:
# Print the "up" list, one symbol per line
writeLines(listUpRNA)

### Differences between recArg1 and arginine depletion

In [None]:
# Non-zero decisions for contrast 10 (g5v7) in each analysis
dt.multi.g5v7 <- dt.multi[dt.multi[, 10] != 0, 10]
dt.strict.g5v7 <- dt.strict[dt.strict[, 10] != 0, 10]
length(dt.multi.g5v7)
length(dt.strict.g5v7)

In [None]:
# Genes found by both analyses, as a count and as a fraction of the genes
# found by at least one of them
commonGenes <- intersect(names(dt.strict.g5v7), names(dt.multi.g5v7))
length(commonGenes)
length(commonGenes) / length(union(names(dt.strict.g5v7), names(dt.multi.g5v7)))

In [None]:
# Genes with the same non-zero decision for contrast 10 (g5v7) in both
# analyses. A decision of 1 is stored in the "down" list and -1 in the "up"
# list.
listDownRecArgVsDepletion <- intersect(
  names(which(dt.strict.g5v7 == 1)),
  names(which(dt.multi.g5v7 == 1))
)
listUpRecArgVsDepletion <- intersect(
  names(which(dt.strict.g5v7 == -1)),
  names(which(dt.multi.g5v7 == -1))
)
length(listDownRecArgVsDepletion)
length(listUpRecArgVsDepletion)

# Replace the Entrez identifiers by gene symbols
i <- which(v$genes$ENTREZID %in% listUpRecArgVsDepletion)
listUpRecArgVsDepletion <- v$genes$SYMBOL[i]
i <- which(v$genes$ENTREZID %in% listDownRecArgVsDepletion)
listDownRecArgVsDepletion <- v$genes$SYMBOL[i]
# Save the lists computed in this cell; listUpRNA and listDownRNA (the g4v7
# lists, already stored in "../RNAlists.diff") were saved here before.
save(
  listUpRecArgVsDepletion, listDownRecArgVsDepletion,
  file = "../RNAlistsRecArg1VsDepletion.diff"
)

In [None]:
# Print the "down" list, one symbol per line
writeLines(listDownRecArgVsDepletion)

In [None]:
# Expression of these genes in the samples of condition 5, then condition 7
i <- which(v$genes$SYMBOL %in% listDownRecArgVsDepletion)
head(v$E[i, which(dgeData$samples$group == 5)])
head(v$E[i, which(dgeData$samples$group == 7)])

Corresponding [Enrichr](http://amp.pharm.mssm.edu/Enrichr/enrich?dataset=3uzqw)

In [None]:
# Print the "up" list, one symbol per line
writeLines(listUpRecArgVsDepletion)

### Differences between RANKL effect in presence and absence of arginine
Intersection between comparisons 6 (3/6) and 3 (1/4) against comparison 5 (2/5)

In [None]:
# Sizes of the four gene sets computed from dt.multi (multi-mapping
# analysis): non-zero decision for contrast 5 only (notUp / notDown), or the
# same non-zero decision for contrasts 3 and 6 with no change in 5 (up / down).
length(notUpDeplGenes1)
length(notDownDeplGenes1)
length(upDeplGenes1)
length(downDeplGenes1)
# We find back the 217 genes differentially regulated in presence of arginine when adding RANKL and the 14 in absence of arginine, 
# as shown on corresponding Venn diagram

In [None]:
# Same four gene sets as for the multi-mapping analysis, computed from dt.strict
upDeplGenes2 <- names(which(
  (dt.strict[, 3] == 1) & (dt.strict[, 6] == 1) & (dt.strict[, 5] == 0)
))
downDeplGenes2 <- names(which(
  (dt.strict[, 3] == -1) & (dt.strict[, 6] == -1) & (dt.strict[, 5] == 0)
))
notUpDeplGenes2 <- names(which(
  (dt.strict[, 3] == 0) & (dt.strict[, 6] == 0) & (dt.strict[, 5] == 1)
))
notDownDeplGenes2 <- names(which(
  (dt.strict[, 3] == 0) & (dt.strict[, 6] == 0) & (dt.strict[, 5] == -1)
))

In [None]:
# Keep the genes found in both analyses
notUpDeplGenes <- intersect(notUpDeplGenes1, notUpDeplGenes2)
notDownDeplGenes <- intersect(notDownDeplGenes1, notDownDeplGenes2)
upDeplGenes <- intersect(upDeplGenes1, upDeplGenes2)
downDeplGenes <- intersect(downDeplGenes1, downDeplGenes2)

In [None]:
# Sizes of the four shared sets
length(notUpDeplGenes)
length(notDownDeplGenes)
length(upDeplGenes)
length(downDeplGenes)

In [None]:
# Symbols of downDeplGenes followed by those of notUpDeplGenes: printed one
# per line, then written to a single-column CSV file.
reducedSymbols <- c(
  v$genes$SYMBOL[which(v$genes$ENTREZID %in% downDeplGenes)],
  v$genes$SYMBOL[which(v$genes$ENTREZID %in% notUpDeplGenes)]
)
writeLines(reducedSymbols)
write.csv(x = reducedSymbols, file = "../RANKL_effect_reduced.csv", row.names = FALSE)

In [None]:
# Symbols of upDeplGenes followed by those of notDownDeplGenes: printed one
# per line, then written to a single-column CSV file.
increasedSymbols <- c(
  v$genes$SYMBOL[which(v$genes$ENTREZID %in% upDeplGenes)],
  v$genes$SYMBOL[which(v$genes$ENTREZID %in% notDownDeplGenes)]
)
writeLines(increasedSymbols)
write.csv(x = increasedSymbols, file = "../RANKL_effect_increased.csv", row.names = FALSE)

Enrichr links:  
[upregulated](http://amp.pharm.mssm.edu/Enrichr/enrich?dataset=3v95x)  
[downregulated](http://amp.pharm.mssm.edu/Enrichr/enrich?dataset=3v96c)

In [None]:
# Genes with a non-zero decision for all of the contrasts 7, 9 and 10 in the
# multi-mapping analysis, then their decisions for every contrast.
print(which(apply(dt.multi[, c(7, 9, 10)] != 0, 1, all)))
dt.multi[which(apply(dt.multi[, c(7, 9, 10)] != 0, 1, all)), ]
# Gene 65112 (Pmepa1, prostate transmembrane protein) is up by arg depletion and down by recArg1

### Differences between MCSF only and MCSF + RANKL

In [None]:
# Genes with a non-zero decision for contrast 3 (g1v4) in both analyses
ranklOnlyGenes <- intersect(
  names(which(dt.strict[, 3] != 0)),
  names(which(dt.multi[, 3] != 0))
)
length(ranklOnlyGenes)

In [None]:
# Their symbols, one per line
writeLines(v$genes$SYMBOL[which(v$genes$ENTREZID %in% ranklOnlyGenes)])

In [None]:
# Look up the KEGG organism entry for the code "mmu"
search_kegg_organism("mmu", by = "kegg_code")

In [None]:
# KEGG enrichment of the RANKL genes: odds-ratio against -log10(adjusted
# p-value) for every returned pathway, written to a PDF file.
# Columns are referenced explicitly (or through with()) instead of
# attach()/detach(), so that a workspace variable with the same name as a
# column can never be picked up by mistake.
pdf("../enrichment_kegg.pdf")
enrichRanklOnly <- as.data.frame(enrichKEGG(gene = ranklOnlyGenes, organism = "mmu", pvalueCutoff = 1, qvalueCutoff = 0.1))
# GeneRatio and BgRatio are strings of the form "a/b": keep the numerators.
# vapply() always returns a numeric vector, including for an empty table.
enrichRanklOnly$intergenes <- vapply(
  strsplit(enrichRanklOnly$GeneRatio, "/"),
  function(parts) as.numeric(parts[1]),
  numeric(1)
)
enrichRanklOnly$setgenes <- vapply(
  strsplit(enrichRanklOnly$BgRatio, "/"),
  function(parts) as.numeric(parts[1]),
  numeric(1)
)
enrichRanklOnly$p.adjust <- -log10(enrichRanklOnly$p.adjust)
# 8254 genes are part of the Kegg annotations, including 214 genes differentially expressed upon RANKL addition
# NOTE(review): in a 2x2 table the cell "neither in the pathway nor
# differentially expressed" would be 8254 - 214 - setgenes + intergenes; the
# first factor below does not subtract setgenes. Formula left unchanged --
# confirm whether this approximation is intended.
enrichRanklOnly$oddsratio <- with(
  enrichRanklOnly,
  (intergenes * (8254 - (214 - intergenes))) / ((214 - intergenes) * (setgenes - intergenes))
)
enrichRanklOnly$labs <- ifelse(enrichRanklOnly$oddsratio > 4, enrichRanklOnly$Description, "") # Remove labels of points with low odds-ratios
# Remove ~30% of the remaining labels for readability (random: the selection
# changes from one run to the next)
enrichRanklOnly$labs <- ifelse(runif(nrow(enrichRanklOnly)) > 0.3, enrichRanklOnly$labs, "")
# Colour scale from black to red, one step per shared gene
maxShared <- max(enrichRanklOnly$intergenes)
colpal <- colorRampPalette(c("#000000", "#FF0000"))(maxShared)
ggplot(enrichRanklOnly, aes(y = oddsratio, x = p.adjust, color = intergenes, label = labs)) +
  scale_colour_gradientn(
    colours = colpal,
    limits = c(0, maxShared),
    breaks = c(0, round(maxShared / 2), maxShared),
    name = "Shared genes"
  ) +
  geom_point(size = 5) +
  theme_light() +
  theme(
    text = element_text(size = 26, family = "sans"),
    panel.grid.minor = element_blank(),
    legend.position = c(0.8, 0.175),
    legend.spacing = unit(10, "cm"),
    legend.background = element_rect(size = 0.3, color = "black"),
    legend.margin = margin(5, 5, 10, 5)
  ) +
  xlab("-log10(corrected p-value)") +
  ylab("Odds-ratio") +
  ggtitle("KEGG enrichment with RANKL") +
  geom_text_repel(size = 6.5, point.padding = 0.3, force = 20, segment.alpha = 0.5)
dev.off()

In [None]:
# Download the current KEGG hierarchy for mmu (JSON export), kept as nested structures
keggPath <- fromJSON(
    txt = "https://www.kegg.jp/kegg-bin/download_htext?htext=mmu00001&format=json",
    flatten = FALSE
)

In [None]:
# Locate the top-level "09100 Metabolism" entry of the KEGG hierarchy
indexPathMetab <- which(keggPath$children$name == "09100 Metabolism")
# Pull the "mmu" + 5 digits identifier out of every entry name nested below it
metabolismGroups <- keggPath$children$children[[indexPathMetab]]$children
pathMetab <- unlist(sapply(metabolismGroups, function(grp) str_match(grp$name, "mmu\\d{5}")))
# Names without such an identifier yield NA and are dropped
pathMetab <- na.exclude(pathMetab)

In [None]:
# KEGG over-representation analysis of ranklOnlyGenes restricted to the
# pathways listed in pathMetab, drawn as a scatter plot (x: -log10 adjusted
# p-value, y: odds-ratio, colour: number of shared genes) and saved to
# ../enrichment_kegg_metabo.pdf.
enrichRanklOnly <- as.data.frame(enrichKEGG(gene = ranklOnlyGenes, organism = "mmu",
                                            pvalueCutoff = 1, qvalueCutoff = 1))
if (nrow(enrichRanklOnly) == 0) {
    stop("enrichKEGG returned no term for ranklOnlyGenes; nothing to plot")
}

# Extract the numerator (part = 1) or denominator (part = 2) of "a/b" ratio strings
splitRatio <- function(ratio, part) {
    as.numeric(vapply(strsplit(ratio, "/", fixed = TRUE), `[`, character(1), part))
}
enrichRanklOnly$intergenes <- splitRatio(enrichRanklOnly$GeneRatio, 1)
enrichRanklOnly$setgenes <- splitRatio(enrichRanklOnly$BgRatio, 1)
# The p.adjust column is overwritten with its -log10 transform (used as x axis)
enrichRanklOnly$p.adjust <- -log10(enrichRanklOnly$p.adjust)

# We filter to keep only metabolism pathways
enrichRanklOnly <- enrichRanklOnly[enrichRanklOnly$ID %in% pathMetab, ]
if (nrow(enrichRanklOnly) == 0) {
    stop("No enriched term belongs to pathMetab; nothing to plot")
}

# Totals are read from the ratio denominators instead of being hard-coded.
# In the original run, 8254 genes were part of the KEGG annotations, including
# 214 genes differentially expressed upon RANKL addition.
queryTotal <- splitRatio(enrichRanklOnly$GeneRatio, 2)
backgroundTotal <- splitRatio(enrichRanklOnly$BgRatio, 2)
sharedGenes <- enrichRanklOnly$intergenes
queryOnly <- queryTotal - sharedGenes
# NOTE(review): a textbook 2x2 odds ratio would use
# backgroundTotal - setgenes - queryOnly as the fourth cell; the formula below
# is kept as originally written -- confirm whether omitting setgenes is intended.
enrichRanklOnly$oddsratio <- (sharedGenes * (backgroundTotal - queryOnly)) /
    (queryOnly * (enrichRanklOnly$setgenes - sharedGenes))

# Remove labels of points with low odds-ratios
enrichRanklOnly$labs <- ifelse(enrichRanklOnly$oddsratio > 3, enrichRanklOnly$Description, "")

maxShared <- max(sharedGenes)
colpal <- colorRampPalette(c("#000000", "#FF0000"))(maxShared)
metaboPlot <- ggplot(enrichRanklOnly, aes(y = oddsratio, x = p.adjust, color = intergenes, label = labs)) +
  scale_colour_gradientn(colours = colpal, limits = c(0, maxShared),
                         breaks = c(0, round(maxShared / 2), maxShared), name = "Shared genes") +
  geom_point(size = 5) + theme_light() +
  theme(text = element_text(size = 26, family = "sans"),
        panel.grid.minor = element_blank(), legend.position = c(0.8, 0.175),
        legend.spacing = unit(10, "cm"),
        legend.background = element_rect(size = 0.3, color = "black"),
        legend.margin = margin(5, 5, 10, 5),
        plot.margin = margin(r = 10, b = 5)) +
  xlab("-log10(corrected p-value)") + ylab("Odds-ratio") + ggtitle("Metabolism perturbed by RANKL") +
  geom_text_repel(size = 6.5, point.padding = 0.4, force = 10, segment.alpha = 0.5)

# Open the device only once the plot is built, print explicitly, and always
# close the device so a drawing error cannot leave the PDF open.
pdf("../enrichment_kegg_metabo.pdf")
invisible(tryCatch(print(metaboPlot), finally = dev.off()))

In [None]:
# GO over-representation of ranklOnlyGenes against org.Mm.eg.db.
# The first call leaves `ont` unset and so uses enrichGO's default ontology
# (stored as the MF result; presumably the default is "MF" -- confirm).
enrichGoMF <- enrichGO(gene = ranklOnlyGenes, OrgDb = "org.Mm.eg.db")
enrichGoCC <- enrichGO(gene = ranklOnlyGenes, OrgDb = "org.Mm.eg.db", ont = "CC")
enrichGoBP <- enrichGO(gene = ranklOnlyGenes, OrgDb = "org.Mm.eg.db", ont = "BP")

In [None]:
# Display the Description of the first rows of each GO enrichment result
# (7 rows for MF, 8 for CC, 6 for BP)
head(enrichGoMF, 7)$Description
head(enrichGoCC, 8)$Description
head(enrichGoBP, 6)$Description

### Enrichment of genes differentially expressed with both recArg1 and depletion

In [None]:
# Names of the genes called in the same direction (both 1 or both -1)
# in columns 7 and 9 of a test-result table
concordantGenes <- function(dt) {
    bothUp <- (dt[, 7] == 1) & (dt[, 9] == 1)
    bothDown <- (dt[, 7] == -1) & (dt[, 9] == -1)
    names(which(bothUp | bothDown))
}
# Keep the genes that are concordant in both the multi-mapping and the strict results
sigGenes <- intersect(concordantGenes(dt.multi), concordantGenes(dt.strict))
length(sigGenes)

In [None]:
# KEGG over-representation analysis of sigGenes, drawn as a scatter plot
# (x: -log10 adjusted p-value, y: odds-ratio, colour: number of shared genes)
# and saved to ../enrichment_kegg_starv.pdf.
enrichStarv <- as.data.frame(enrichKEGG(gene = sigGenes, organism = "mmu",
                                        pvalueCutoff = 1, qvalueCutoff = 0.1))
if (nrow(enrichStarv) == 0) {
    stop("enrichKEGG returned no term for sigGenes; nothing to plot")
}

# Extract the numerator (part = 1) or denominator (part = 2) of "a/b" ratio strings
splitRatio <- function(ratio, part) {
    as.numeric(vapply(strsplit(ratio, "/", fixed = TRUE), `[`, character(1), part))
}
enrichStarv$intergenes <- splitRatio(enrichStarv$GeneRatio, 1)
enrichStarv$setgenes <- splitRatio(enrichStarv$BgRatio, 1)
# The p.adjust column is overwritten with its -log10 transform (used as x axis)
enrichStarv$p.adjust <- -log10(enrichStarv$p.adjust)

# Totals are read from the ratio denominators instead of being hard-coded.
# In the original run, 8254 genes were part of the KEGG annotations, including
# 56 genes differentially expressed with both recArg1 and depletion.
queryTotal <- splitRatio(enrichStarv$GeneRatio, 2)
backgroundTotal <- splitRatio(enrichStarv$BgRatio, 2)
sharedGenes <- enrichStarv$intergenes
queryOnly <- queryTotal - sharedGenes
# NOTE(review): a textbook 2x2 odds ratio would use
# backgroundTotal - setgenes - queryOnly as the fourth cell; the formula below
# is kept as originally written -- confirm whether omitting setgenes is intended.
enrichStarv$oddsratio <- (sharedGenes * (backgroundTotal - queryOnly)) /
    (queryOnly * (enrichStarv$setgenes - sharedGenes))

maxShared <- max(sharedGenes)
colpal <- colorRampPalette(c("#000000", "#FF0000"))(maxShared)
# Every point is labelled with its pathway description
starvKeggPlot <- ggplot(enrichStarv, aes(y = oddsratio, x = p.adjust, color = intergenes, label = Description)) +
  scale_colour_gradientn(colours = colpal, limits = c(0, maxShared),
                         breaks = c(0, round(maxShared / 2), maxShared), name = "Shared genes") +
  geom_point(size = 5) + theme_light() +
  theme(text = element_text(size = 26, family = "sans"),
        panel.grid.minor = element_blank(), legend.position = c(0.8, 0.475),
        legend.spacing = unit(10, "cm"),
        legend.background = element_rect(size = 0.3, color = "black"),
        legend.margin = margin(5, 5, 10, 5)) +
  xlab("-log10(corrected p-value)") + ylab("Odds-ratio") + ggtitle("KEGG enrichment") +
  geom_text_repel(size = 6.5, point.padding = 0.3, force = 20, segment.alpha = 0.5)

# Open the device only once the plot is built, print explicitly, and always
# close the device so a drawing error cannot leave the PDF open.
pdf("../enrichment_kegg_starv.pdf")
invisible(tryCatch(print(starvKeggPlot), finally = dev.off()))

In [None]:
# GO over-representation of sigGenes against org.Mm.eg.db, filtering on the
# q-value only (p-value cutoff fully relaxed). The first call leaves `ont`
# unset and so uses enrichGO's default ontology (stored as the MF result).
enrichGoMF <- enrichGO(gene = sigGenes, OrgDb = "org.Mm.eg.db",
                       pvalueCutoff = 1, qvalueCutoff = 0.1)
enrichGoCC <- enrichGO(gene = sigGenes, OrgDb = "org.Mm.eg.db", ont = "CC",
                       pvalueCutoff = 1, qvalueCutoff = 0.1) # Empty
enrichGoBP <- enrichGO(gene = sigGenes, OrgDb = "org.Mm.eg.db", ont = "BP",
                       pvalueCutoff = 1, qvalueCutoff = 0.1)

In [None]:
# Scatter plot of the GO enrichment stored in enrichGoMF (x: -log10 adjusted
# p-value, y: odds-ratio, colour: number of shared genes), saved to
# ../enrichment_gomf_starv.pdf.
enrichStarv <- as.data.frame(enrichGoMF)
if (nrow(enrichStarv) == 0) {
    stop("enrichGoMF contains no term; nothing to plot")
}

# Extract the numerator (part = 1) or denominator (part = 2) of "a/b" ratio strings
splitRatio <- function(ratio, part) {
    as.numeric(vapply(strsplit(ratio, "/", fixed = TRUE), `[`, character(1), part))
}
enrichStarv$intergenes <- splitRatio(enrichStarv$GeneRatio, 1)
enrichStarv$setgenes <- splitRatio(enrichStarv$BgRatio, 1)
# The p.adjust column is overwritten with its -log10 transform (used as x axis)
enrichStarv$p.adjust <- -log10(enrichStarv$p.adjust)

# Totals are read from the ratio denominators instead of being hard-coded
# (22933 background genes and 107 query genes in the original run).
queryTotal <- splitRatio(enrichStarv$GeneRatio, 2)
backgroundTotal <- splitRatio(enrichStarv$BgRatio, 2)
sharedGenes <- enrichStarv$intergenes
queryOnly <- queryTotal - sharedGenes
# NOTE(review): a textbook 2x2 odds ratio would use
# backgroundTotal - setgenes - queryOnly as the fourth cell; the formula below
# is kept as originally written -- confirm whether omitting setgenes is intended.
enrichStarv$oddsratio <- (sharedGenes * (backgroundTotal - queryOnly)) /
    (queryOnly * (enrichStarv$setgenes - sharedGenes))

# Keep labels only for points with a high odds-ratio or a high -log10(p.adjust)
enrichStarv$labs <- ifelse((enrichStarv$oddsratio > 40) | (enrichStarv$p.adjust > 2),
                           enrichStarv$Description, "")
# Shorten long term not fitting in view.
# NOTE(review): addresses the third row by position, so it depends on the row order.
enrichStarv$labs[3] <- 'oxidoreductase activity'

maxShared <- max(sharedGenes)
colpal <- colorRampPalette(c("#000000", "#FF0000"))(maxShared)
goMfPlot <- ggplot(enrichStarv, aes(y = oddsratio, x = p.adjust, color = intergenes, label = labs)) +
  scale_colour_gradientn(colours = colpal, limits = c(0, maxShared),
                         breaks = c(0, round(maxShared / 2), maxShared), name = "Shared genes") +
  geom_point(size = 5) + theme_light() +
  theme(text = element_text(size = 26, family = "sans"),
        panel.grid.minor = element_blank(), legend.position = c(0.8, 0.475),
        legend.spacing = unit(10, "cm"),
        legend.background = element_rect(size = 0.3, color = "black"),
        legend.margin = margin(5, 5, 10, 5)) +
  xlab("-log10(corrected p-value)") + ylab("Odds-ratio") + ggtitle("GO-MF enrichment") +
  geom_text_repel(size = 6.5, point.padding = 0.3, force = 20, segment.alpha = 0.5)

# Open the device only once the plot is built, print explicitly, and always
# close the device so a drawing error cannot leave the PDF open.
pdf("../enrichment_gomf_starv.pdf")
invisible(tryCatch(print(goMfPlot), finally = dev.off()))

In [None]:
# Scatter plot of the GO enrichment stored in enrichGoBP (x: -log10 adjusted
# p-value, y: odds-ratio, colour: number of shared genes), saved to
# ../enrichment_gobp_starv.pdf.
enrichStarv <- as.data.frame(enrichGoBP)
if (nrow(enrichStarv) == 0) {
    stop("enrichGoBP contains no term; nothing to plot")
}

# Extract the numerator (part = 1) or denominator (part = 2) of "a/b" ratio strings
splitRatio <- function(ratio, part) {
    as.numeric(vapply(strsplit(ratio, "/", fixed = TRUE), `[`, character(1), part))
}
enrichStarv$intergenes <- splitRatio(enrichStarv$GeneRatio, 1)
enrichStarv$setgenes <- splitRatio(enrichStarv$BgRatio, 1)
# The p.adjust column is overwritten with its -log10 transform (used as x axis)
enrichStarv$p.adjust <- -log10(enrichStarv$p.adjust)

# Totals are read from the ratio denominators instead of being hard-coded
# (23057 background genes and 106 query genes in the original run).
queryTotal <- splitRatio(enrichStarv$GeneRatio, 2)
backgroundTotal <- splitRatio(enrichStarv$BgRatio, 2)
sharedGenes <- enrichStarv$intergenes
queryOnly <- queryTotal - sharedGenes
# NOTE(review): a textbook 2x2 odds ratio would use
# backgroundTotal - setgenes - queryOnly as the fourth cell; the formula below
# is kept as originally written -- confirm whether omitting setgenes is intended.
enrichStarv$oddsratio <- (sharedGenes * (backgroundTotal - queryOnly)) /
    (queryOnly * (enrichStarv$setgenes - sharedGenes))

# Keep labels only for points with a high odds-ratio or a high -log10(p.adjust)
enrichStarv$labs <- ifelse((enrichStarv$oddsratio > 42) | (enrichStarv$p.adjust > 2),
                           enrichStarv$Description, "")

maxShared <- max(sharedGenes)
colpal <- colorRampPalette(c("#000000", "#FF0000"))(maxShared)
goBpPlot <- ggplot(enrichStarv, aes(y = oddsratio, x = p.adjust, color = intergenes, label = labs)) +
  scale_colour_gradientn(colours = colpal, limits = c(0, maxShared),
                         breaks = c(0, round(maxShared / 2), maxShared), name = "Shared genes") +
  geom_point(size = 5, alpha = 0.8) + theme_light() +
  theme(text = element_text(size = 26, family = "sans"),
        panel.grid.minor = element_blank(), legend.position = c(0.8, 0.76),
        legend.spacing = unit(10, "cm"),
        legend.background = element_rect(size = 0.3, color = "black"),
        legend.margin = margin(5, 5, 10, 5)) +
  xlab("-log10(corrected p-value)") + ylab("Odds-ratio") + ggtitle("GO-BP enrichment") +
  geom_text_repel(size = 6.5, point.padding = 0.3, force = 20, segment.alpha = 0.5)

# Open the device only once the plot is built, print explicitly, and always
# close the device so a drawing error cannot leave the PDF open.
pdf("../enrichment_gobp_starv.pdf")
invisible(tryCatch(print(goBpPlot), finally = dev.off()))

In [None]:
# Report the R session details (R version and package versions) used for this analysis
sessionInfo()