# Tissue enrichment analysis
The file *Mouse_Gene_Atlas* was downloaded from [Enrichr](http://amp.pharm.mssm.edu/Enrichr/#stats) on 2018-06-14.  

In [None]:
library(grDevices)
library(org.Mm.eg.db)
library(ggplot2)

In [None]:
# Read the two lists of differentially expressed genes from our study and
# flatten each table into a single vector
incSet <- unlist(read.csv(file = "RANKL_effect_increased.csv"))
decSet <- unlist(read.csv(file = "RANKL_effect_reduced.csv"))

In [None]:
# NB: the identifiers in this file are capitalized, outdated mouse gene names.
# Open the atlas in read mode, keep every line as one string, then release
# the connection.
fileMGA <- file(description = "Mouse_Gene_Atlas", open = "r")
rawMGA <- readLines(con = fileMGA)
close(con = fileMGA)

In [None]:
# Return the tissue name of one gene-set line.
#
# x: a single tab-separated line.
# Returns the first tab-separated field of x.
extractTissue <- function(x) {
    fields <- strsplit(x, "\t")[[1]]
    fields[1]
}
# Extract the gene identifiers from one gene-set line.
#
# x: a single tab-separated line. The first two fields are skipped; every
#    remaining field is split on "," and only its first part is kept.
# Returns a character vector of gene identifiers, or character(0) when the
#   line has no fields beyond the first two.
extractGenes <- function(x) {
    fields <- strsplit(x, "\t")[[1]][-c(1:2)]
    # vapply keeps the result a character vector even when there are no gene
    # fields; sapply would return an empty list in that case
    vapply(strsplit(fields, ","), function(parts) parts[[1]], character(1))
}

In [None]:
# One list element per line of the atlas: the element holds the genes of the
# line and is named after the tissue of that same line
MGA <- setNames(lapply(rawMGA, extractGenes), lapply(rawMGA, extractTissue))

In [None]:
# Look up the SYMBOL recorded for each gene of our two sets, treating the
# names in the sets as ALIAS keys. `columns` is spelled out in full instead of
# relying on partial argument matching of `col`.
updIncSet <- select(org.Mm.eg.db, keys = as.character(incSet),
                    keytype = "ALIAS", columns = "SYMBOL")
updDecSet <- select(org.Mm.eg.db, keys = as.character(decSet),
                    keytype = "ALIAS", columns = "SYMBOL")

In [None]:
# Sizes of the original sets next to the dimensions of their lookup tables,
# followed by the number of rows whose symbol differs from the queried alias
length(incSet)
dim(updIncSet)
length(decSet)
dim(updDecSet)
sum(updIncSet$ALIAS != updIncSet$SYMBOL, na.rm = TRUE)
sum(updDecSet$ALIAS != updDecSet$SYMBOL, na.rm = TRUE)

In [None]:
# Show every row of updIncSet whose alias occurs more than once
with(updIncSet, updIncSet[ALIAS %in% ALIAS[duplicated(ALIAS)], ])

In [None]:
# Show every row of updDecSet whose alias occurs more than once
with(updDecSet, updDecSet[ALIAS %in% ALIAS[duplicated(ALIAS)], ])

The symbols used in these lists are already up to date; where they differ from the returned symbol, the alias is ambiguous (it maps to more than one symbol).

In [None]:
# Build the table mapping every ALIAS key of org.Mm.eg.db to its SYMBOL.
# `columns` is spelled out (no partial matching) and TRUE replaces the
# reassignable shorthand T.
aliasToSymbol <- select(org.Mm.eg.db,
                        keys = keys(org.Mm.eg.db, keytype = "ALIAS"),
                        keytype = "ALIAS", columns = "SYMBOL", fuzzy = TRUE)
# Upper-case the aliases so they can be matched against the capitalized
# identifiers of the atlas
aliasToSymbol$ALIAS <- toupper(aliasToSymbol$ALIAS)

In [None]:
# For each tissue, tabulate how many of its genes are / are not known aliases
lapply(MGA, function(genes) table(is.element(genes, aliasToSymbol$ALIAS)))

In [None]:
# Genes of two tissues that are absent from the alias table
MGA$uterus[!(MGA$uterus %in% aliasToSymbol$ALIAS)]
MGA$common_myeloid_progenitor[!(MGA$common_myeloid_progenitor %in% aliasToSymbol$ALIAS)]

Some of these gene names cannot be converted back to up-to-date symbols.

In [None]:
# Translate a vector of upper-case gene names into symbols using the global
# table `aliasToSymbol` (columns ALIAS and SYMBOL).
#
# x: character vector of gene names to look up in aliasToSymbol$ALIAS.
# Returns the unique symbols of the rows that are kept. A row is kept when its
#   alias appears only once among the matches (unambiguous), or when the alias
#   equals the upper-cased symbol (the name is ambiguous but already current).
#   Names absent from the table yield nothing.
updateSymbols <- function(x) {
    updSet <- aliasToSymbol[aliasToSymbol$ALIAS %in% x, , drop = FALSE]
    ambiguous <- updSet$ALIAS %in% updSet$ALIAS[duplicated(updSet$ALIAS)]
    isCurrent <- updSet$ALIAS == toupper(updSet$SYMBOL)
    # which() drops rows where the test is NA (missing symbol) instead of
    # turning them into NA entries of the result
    keep <- which((!ambiguous) | isCurrent)
    # Select the column by name so the result does not depend on column order
    unique(updSet$SYMBOL[keep])
}

In [None]:
# Apply the symbol update to the gene list of every tissue
updMGA <- lapply(X = MGA, FUN = updateSymbols)

In [None]:
# Size of the gene universe: number of distinct symbols over all tissues.
# (A stray second argument to unique() was removed; it was being interpreted
# as `incomparables`.)
n <- length(unique(unlist(updMGA)))
# One-sided Fisher exact test for the overlap between a tissue gene list and
# a gene set.
#
# x: character vector of genes of one tissue.
# set: gene set to compare against.
# universe: total number of genes used to fill the last cell of the 2x2
#   table; defaults to the global `n`.
# Returns a named numeric vector: `pval` (p-value of fisher.test with
#   alternative = "greater"), the estimate of fisher.test under its own name,
#   and `inter` (number of genes shared by x and set).
compareTissueToInc <- function(x, set, universe = n) {
    shared <- length(intersect(x, set))
    # Column-wise fill: shared, only in x, only in set, placeholder
    contTab <- matrix(c(shared, length(setdiff(x, set)),
                        length(setdiff(set, x)), 0), ncol = 2)
    # Remaining genes of the universe that are in neither x nor set
    contTab[2, 2] <- universe - sum(contTab)
    ft <- fisher.test(contTab, alternative = "greater")
    c(pval = ft$p.value, ft$estimate, inter = shared)
}

In [None]:
# Tissue enrichment of decSet, plotted to enrichment_wi.pdf
pdf("enrichment_wi.pdf")
# One column per tissue; rows: p-value, odds ratio, number of shared genes
decEnrich <- sapply(updMGA, function(x) compareTissueToInc(x, decSet))
# Multiply each p-value by the number of tissues tested, cap at 1, and store
# it as -log10
decEnrich[1, ] <- -log10(pmin(decEnrich[1, ] * ncol(decEnrich), 1))
rownames(decEnrich)[2] <- "oddsratio"
# One colour per possible shared-gene count (0..max), hence 1 + max; a palette
# of size max alone degenerates to a single colour when max is 1
colpal <- colorRampPalette(c("#000000", "#FF0000"))(1 + max(decEnrich[3, ]))
ggplot(as.data.frame(t(decEnrich)), aes(x = oddsratio, y = pval, color = inter)) +
  scale_colour_gradientn(colours = colpal, limits = c(0, max(decEnrich[3, ])),
                         breaks = c(0, round(max(decEnrich[3, ]) / 2), max(decEnrich[3, ])),
                         name = "Shared genes") +
  geom_point(size = 5) +
  theme_light() +
  theme(text = element_text(size = 26, family = "sans"),
        panel.grid.minor = element_blank(),
        legend.position = c(0.8, 0.6),
        legend.spacing = unit(10, "cm"),
        legend.background = element_rect(size = 0.3, color = "black"),
        legend.margin = margin(5, 5, 10, 5)) +
  xlab("Odd-ratio") +
  ylab("-log10(corrected p-value)") +
  ggtitle("With arginine") +
  # Label only tissues whose -log10(corrected p-value) exceeds 1
  geom_text(aes(label = ifelse(pval > 1, colnames(decEnrich), '')),
            hjust = 1.02, vjust = 1.05, size = 10)
dev.off()

In [None]:
# Tissue enrichment of incSet, plotted to enrichment_wo.pdf
pdf("enrichment_wo.pdf")
# One column per tissue; rows: p-value, odds ratio, number of shared genes
incEnrich <- sapply(updMGA, function(x) compareTissueToInc(x, incSet))
# Multiply each p-value by the number of tissues tested, cap at 1, and store
# it as -log10
incEnrich[1, ] <- -log10(pmin(incEnrich[1, ] * ncol(incEnrich), 1))
rownames(incEnrich)[2] <- "oddsratio"
# Black-to-red palette with one colour per possible shared-gene count
colpal <- colorRampPalette(c("#000000", "#FF0000"))(1 + max(incEnrich[3, ]))
ggplot(as.data.frame(t(incEnrich)), aes(x = oddsratio, y = pval, color = inter)) +
  scale_colour_gradientn(colours = colpal, limits = c(0, max(incEnrich[3, ])),
                         breaks = c(0, round(max(incEnrich[3, ]) / 2), max(incEnrich[3, ])),
                         name = "Shared genes") +
  geom_point(size = 5) +
  theme_light() +
  theme(text = element_text(size = 26, family = "sans"),
        panel.grid.minor = element_blank(),
        legend.position = c(0.8, 0.6),
        legend.spacing = unit(10, "cm"),
        legend.background = element_rect(size = 0.3, color = "black"),
        legend.margin = margin(5, 5, 10, 5)) +
  xlab("Odd-ratio") +
  ylab("-log10(corrected p-value)") +
  ggtitle("Without arginine") +
  # Label only tissues whose -log10(corrected p-value) exceeds 1
  geom_text(aes(label = ifelse(pval > 1, colnames(incEnrich), '')),
            hjust = 1.02, vjust = 1.05, size = 10)
dev.off()