# MS proteomics data analysis

In [None]:
library(limma)
library(heatmaply)
library(RColorBrewer)
library(ggplot2)
library(ggrepel)
library(clusterProfiler)
library(Mus.musculus)
library(plyr)
library(dendextend)
library(httr)
library(stringr)

In [None]:
# Load the filtered and imputed protein-group table (tab-separated, first row is the header).
# `header = TRUE` is spelled out: `head=` relied on partial argument matching and `T` can be reassigned.
MS <- read.table(
  "20180202_OsteoclastProteomes_Protein_groups_filtered_imputed.txt",
  sep = "\t",
  header = TRUE
)

In [None]:
# MDS plot of the unfiltered data, written to mdsLogMS.svg.
# Columns 1:20 of MS are plotted; colours are assigned in consecutive runs of 4 columns,
# one run per entry of cndtType (NOTE(review): assumes the columns are ordered by condition
# with 4 samples each -- confirm against the input file).
cndtType <- c("MCSF", "MCSF_Arg", "MCSF_RANKL", "MCSF_RANKL_Arg", "MCSF_RANKL_Arg_recArg")
colpal <- brewer.pal(5, "Set2")

svg("mdsLogMS.svg")
par(bg = "white", family = "sans", cex = 1.3)
plotMDS(
  MS[, 1:20],
  pch = 16,
  col = rep(colpal, each = 4)
)
# Legend anchored at hand-tuned plot coordinates
legend(
  x = 0.16, y = -0.53,
  legend = cndtType,
  col = colpal,
  pch = 16,
  cex = 0.8
)
dev.off()

In [None]:
# Load the transformed data together with the t-test results.
# quote = "" disables quote handling so quote characters inside fields are read literally.
protFiltered <- read.table(
  "Perseus_run/ProteinsTTestFiltered.txt",
  sep = "\t",
  header = TRUE,
  quote = ""
)

In [None]:
# The second column is renamed so it can be referred to as LogPvalue below.
names(protFiltered)[2] <- "LogPvalue"

# Category used to colour significant abundance changes.
# A positive Difference is tagged "Downregulated" (the volcano plots built from this table
# draw -Difference on the x axis).
protFiltered$color <- ifelse(protFiltered$Difference > 0, "Downregulated", "Upregulated")
# Rows not flagged "+" are not significant. The column is addressed by name rather than by
# the hard-coded position 99, which was only correct for one exact column count.
protFiltered$color[protFiltered$Significant != "+"] <- "No significant change"

protFiltered$Gene.names <- as.character(protFiltered$Gene.names)
# Keep only the first gene name when several are provided (everything from the first ';' is dropped).
# NOTE(review): column 7 is presumably Gene.names -- confirm against the input file.
protFiltered[, 7] <- sub(";.*", "", protFiltered[, 7])

In [None]:
# Volcano plot for the recArg1 comparison, written to volcanoPlotFiltering.pdf.
pdf("volcanoPlotFiltering.pdf")
# Number of proteins per colour category, in the (alphabetical) order returned by table()
n_genes <- paste("n =", paste(table(protFiltered$color), collapse = "/"))
# Row indices of hand-picked proteins that are always labelled
ptsToLabel <- c(441, 147, 112, 132, 307, 184, 306, 10, 221, 3,
                222, 290, 208, 258, 490, 286, 507, 451, 282, 385)
# A point is labelled when its LogPvalue exceeds 3 or its row index was hand-picked.
# seq_along() replaces 1:length(), which misbehaves on empty input.
protFiltered$labs <- ifelse(
  (protFiltered$LogPvalue > 3) | (seq_along(protFiltered$LogPvalue) %in% ptsToLabel),
  protFiltered$Gene.names,
  ""
)
ggplot(protFiltered, aes(x = - Difference, y = LogPvalue, color = color, label = labs)) +
  theme_light() +
  theme(
    text = element_text(size = 20, family = "sans"),
    plot.margin = margin(5, 15, 5, 5),
    panel.grid.minor = element_blank(),
    legend.position = c(0.725, 0.9),
    legend.spacing = unit(10, "cm"),
    legend.title = element_blank(),
    legend.background = element_rect(size = 0.3, color = "black"),
    legend.margin = margin(5, 5, 10, 5)
  ) +
  # Colours are named so each category keeps its colour even if one category is absent
  scale_color_manual(values = c(
    "Downregulated" = "#4A91C4",
    "No significant change" = "#AAAAAA",
    "Upregulated" = "#F09F4E"
  )) +
  ylab("-log10(P-value)") +
  xlab("mean(RANKL+Arg+recArg1) - mean(RANKL+Arg)") +
  annotate("text", x = 1.7, y = 0, label = n_genes, size = 7) +
  geom_point() +
  geom_text_repel(size = 7, point.padding = 0.5, force = 0.02, segment.alpha = 0.5,
                  show.legend = FALSE)
dev.off()

In [None]:
# Export columns 7, 2 and 3 of the proteins tagged "Downregulated" / "Upregulated",
# one CSV file per direction. Column 2 is LogPvalue.
# NOTE(review): columns 7 and 3 are presumably the gene name and Difference -- confirm.
write.csv(
  protFiltered[protFiltered$color == "Downregulated", c(7, 2, 3)],
  "prot_recArg_Down.csv",
  row.names = FALSE
)
write.csv(
  protFiltered[protFiltered$color == "Upregulated", c(7, 2, 3)],
  "prot_recArg_Up.csv",
  row.names = FALSE
)

In [None]:
# First ';'-separated identifier from column 4 for every protein with a significant change,
# with missing values and duplicates removed. vapply() guarantees a character vector even
# when no protein is selected, which sapply() does not.
# NOTE(review): the identifiers are passed to enrichKEGG() as keyType = "uniprot", so column 4
# presumably holds UniProt accessions despite the "ensg" variable name -- confirm.
ensgRecArg <- unique(na.omit(vapply(
  strsplit(
    as.character(protFiltered[protFiltered$color != "No significant change", 4]),
    ";",
    fixed = TRUE
  ),
  function(ids) ids[1],
  character(1)
)))
# Show the first rows of the KEGG enrichment result for organism code "mmu"
head(enrichKEGG(ensgRecArg, "mmu", keyType = "uniprot"))

In [None]:
# Print column 7 for the 40 rows with the highest LogPvalue, one entry per line
for (i in protFiltered[rev(order(protFiltered$LogPvalue))[seq_len(40)], 7]) {
  cat(i, "\n", sep = "")
}

### Comparison between arginine depletion and recArg1

In [None]:
# Load the transformed data together with the t-test results for the
# arginine-depletion vs recArg1 comparison.
# quote = "" disables quote handling so quote characters inside fields are read literally.
protFilteredDvr <- read.table(
  "Perseus_run/ProteinsTTestFiltered_deplVsRec.txt",
  sep = "\t",
  header = TRUE,
  quote = ""
)

In [None]:
# The second column is renamed so it can be referred to as LogPvalue below.
names(protFilteredDvr)[2] <- "LogPvalue"

# Point colour for significant abundance changes: blue when Difference > 0, orange otherwise.
protFilteredDvr$color <- ifelse(protFilteredDvr$Difference > 0, "#4A91C4", "#F09F4E")
# Rows not flagged "+" are drawn in grey. The column is addressed by name rather than by
# the hard-coded position 93, which was only correct for one exact column count.
protFilteredDvr$color[protFilteredDvr$Significant != "+"] <- "grey"

protFilteredDvr$Gene.names <- as.character(protFilteredDvr$Gene.names)
# Keep only the first gene name when several are provided (everything from the first ';' is dropped).
# NOTE(review): column 7 is presumably Gene.names -- confirm against the input file.
protFilteredDvr[, 7] <- sub(";.*", "", protFilteredDvr[, 7])

In [None]:
# Threshold curve for the depletion vs recArg1 comparison; it is drawn with lines() on the
# corresponding volcano plot.
protFilterCurveDvr <- read.table(
  "Perseus_run/ProteinsTTestFiltered_deplVsRec_Curve.txt",
  sep = "\t",
  header = TRUE,
  quote = ""
)

In [None]:
# Base-graphics volcano plot for the depletion vs recArg1 comparison,
# written to volcanoPlotFiltering_deplVsRec.pdf.
pdf("volcanoPlotFiltering_deplVsRec.pdf")
par(family = "sans", bg = "white")
# Row indices of the proteins to label. The two sets use different text justification.
# Indices 32 and 173 were listed twice, which drew their labels twice; each now appears once.
ptsToLabelRight <- c(221, 32, 155, 245, 197, 112, 461, 251, 114, 168, 3, 56, 100, 259,
                     225, 110, 173, 234, 212, 231, 386, 443, 391)
ptsToLabelLeft <- c(290, 151, 286, 6, 502, 352, 180, 73, 2, 296, 94, 279)
# The y axis label now matches the "-log10(P-value)" label that the other volcano plots in
# this analysis use for the same LogPvalue column.
with(protFilteredDvr, plot(LogPvalue ~ Difference, pch = 20, col = color,
                           ylab = "-log10(p-value)",
                           xlab = "mean(RANKL) - mean(RANKL+Arg+recArg1)"))
with(protFilteredDvr[ptsToLabelLeft, ],
     text(LogPvalue ~ Difference, labels = Gene.names, cex = 0.8, adj = c(-0.2, 1)))
with(protFilteredDvr[ptsToLabelRight, ],
     text(LogPvalue ~ Difference, labels = Gene.names, cex = 0.8, adj = c(1.2, 1)))
# Dashed threshold curve
lines(protFilterCurveDvr, lty = 2)
dev.off()

### Effect of RANKL with and without arginine

In [None]:
# Load the transformed data together with the t-test results for the RANKL effect,
# one table per input file (rankl_Arg and rankl_noArg).
# quote = "" disables quote handling so quote characters inside fields are read literally.
protFilteredRanklArg <- read.table(
  "Perseus_run/ProteinsTTestFiltered_rankl_Arg.txt",
  sep = "\t",
  header = TRUE,
  quote = ""
)
protFilteredRanklDep <- read.table(
  "Perseus_run/ProteinsTTestFiltered_rankl_noArg.txt",
  sep = "\t",
  header = TRUE,
  quote = ""
)

In [None]:
# The second column of each table is renamed so it can be referred to as LogPvalue below.
names(protFilteredRanklArg)[2] <- "LogPvalue"
names(protFilteredRanklDep)[2] <- "LogPvalue"

# --- rankl_Arg table: category labels (used as a ggplot colour aesthetic) ---
# A positive Difference is tagged "Downregulated"; rows not flagged "+" are not significant.
protFilteredRanklArg$color <- ifelse(protFilteredRanklArg$Difference > 0,
                                     "Downregulated", "Upregulated")
# Addressed by name rather than by the hard-coded position 93, which was only correct for
# one exact column count.
protFilteredRanklArg$color[protFilteredRanklArg$Significant != "+"] <- "No significant change"
protFilteredRanklArg$Gene.names <- as.character(protFilteredRanklArg$Gene.names)
# Keep only the first gene name when several are provided.
# NOTE(review): column 7 is presumably Gene.names -- confirm against the input file.
protFilteredRanklArg[, 7] <- sub(";.*", "", protFilteredRanklArg[, 7])

# --- rankl_noArg table: literal point colours (used by base graphics) ---
# Blue when Difference > 0, orange otherwise, grey when not flagged "+".
protFilteredRanklDep$color <- ifelse(protFilteredRanklDep$Difference > 0, "#4A91C4", "#F09F4E")
protFilteredRanklDep$color[protFilteredRanklDep$Significant != "+"] <- "grey"
protFilteredRanklDep$Gene.names <- as.character(protFilteredRanklDep$Gene.names)
# Keep only the first gene name when several are provided.
protFilteredRanklDep[, 7] <- sub(";.*", "", protFilteredRanklDep[, 7])

In [None]:
# Threshold curve for the rankl_noArg comparison; it is drawn with lines() on the
# corresponding volcano plot.
protFilterCurveRanklDep <- read.table(
  "Perseus_run/ProteinsTTestFiltered_rankl_noArg_Curve.txt",
  sep = "\t",
  header = TRUE,
  quote = ""
)

In [None]:
# Volcano plot for the RANKL effect in the presence of arginine,
# written to volcanoPlotFiltering_ranklArg.pdf.
pdf("volcanoPlotFiltering_ranklArg.pdf")
# Number of proteins per colour category, in the (alphabetical) order returned by table()
n_genes <- paste("n =", paste(table(protFilteredRanklArg$color), collapse = "/"))
# Row indices of hand-picked proteins that are always labelled
ptsToLabel <- c(290, 6, 352, 221, 155, 151, 245, 197, 112)
# A point is labelled when its LogPvalue exceeds 4.5 or its row index was hand-picked.
# seq_along() replaces 1:length(), which misbehaves on empty input.
protFilteredRanklArg$labs <- ifelse(
  (protFilteredRanklArg$LogPvalue > 4.5) |
    (seq_along(protFilteredRanklArg$LogPvalue) %in% ptsToLabel),
  protFilteredRanklArg$Gene.names,
  ""
)
ggplot(protFilteredRanklArg, aes(x = - Difference, y = LogPvalue, color = color, label = labs)) +
  theme_light() +
  theme(
    text = element_text(size = 20, family = "sans"),
    plot.margin = margin(5, 15, 5, 5),
    panel.grid.minor = element_blank(),
    legend.position = c(0.225, 0.9),
    legend.spacing = unit(10, "cm"),
    legend.title = element_blank(),
    legend.background = element_rect(size = 0.3, color = "black"),
    legend.margin = margin(5, 5, 10, 5)
  ) +
  # Colours are named so each category keeps its colour even if one category is absent
  scale_color_manual(values = c(
    "Downregulated" = "#4A91C4",
    "No significant change" = "#AAAAAA",
    "Upregulated" = "#F09F4E"
  )) +
  ylab("-log10(P-value)") +
  xlab("mean(RANKL+Arg) - mean(Arg)") +
  annotate("text", x = 1.6, y = 0, label = n_genes, size = 7) +
  geom_point() +
  geom_text_repel(size = 7, point.padding = 0.6, force = 0.04, segment.alpha = 0.5,
                  show.legend = FALSE)
dev.off()

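Label the RANKL and recArg1 volcano plots according to whether each protein with a significant abundance change is shared between the two comparisons or specific to one of them. The labelling pairs rows of the two tables by position, so first check that the proteins are in the same order in both tables: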

In [None]:
# TRUE only when both tables have the same number of rows and the same gene name in every
# row. The row-count guard prevents `==` from silently recycling the shorter vector.
nrow(protFiltered) == nrow(protFilteredRanklArg) &&
  all(protFiltered$Gene.names == protFilteredRanklArg$Gene.names)

In [None]:
# RANKL volcano plot in which only proteins significant in BOTH comparisons are labelled,
# written to volcanoPlotFiltering_ranklArg_relabelled.svg.
svg("volcanoPlotFiltering_ranklArg_relabelled.svg")
# Number of proteins per colour category, in the (alphabetical) order returned by table()
n_genes <- paste("n =", paste(table(protFilteredRanklArg$color), collapse = "/"))
# Rows of the two tables are paired by position, so this relies on both tables listing the
# proteins in the same order.
protFilteredRanklArg$labs <- ifelse(
  (protFilteredRanklArg$color != "No significant change") &
    (protFiltered$color != "No significant change"),
  protFiltered$Gene.names,
  ""
)
ggplot(protFilteredRanklArg, aes(x = - Difference, y = LogPvalue, color = color, label = labs)) +
  theme_light() +
  theme(
    text = element_text(size = 20, family = "sans"),
    plot.margin = margin(5, 15, 5, 5),
    panel.grid.minor = element_blank(),
    legend.position = c(0.225, 0.9),
    legend.spacing = unit(10, "cm"),
    legend.title = element_blank(),
    legend.background = element_rect(size = 0.3, color = "black"),
    legend.margin = margin(5, 5, 10, 5)
  ) +
  # Colours are named so each category keeps its colour even if one category is absent
  scale_color_manual(values = c(
    "Downregulated" = "#4A91C4",
    "No significant change" = "#AAAAAA",
    "Upregulated" = "#F09F4E"
  )) +
  ylab("-log10(P-value)") +
  xlab("mean(RANKL+Arg) - mean(Arg)") +
  annotate("text", x = 1.6, y = 0, label = n_genes, size = 7) +
  geom_point() +
  geom_text_repel(size = 7, point.padding = 0.6, force = 0.04, segment.alpha = 0.5,
                  show.legend = FALSE)
dev.off()

In [None]:
# recArg1 volcano plot in which only proteins that are significant here AND have an empty
# label in protFilteredRanklArg$labs are labelled,
# written to volcanoPlotFiltering_relabelled.svg.
svg("volcanoPlotFiltering_relabelled.svg")
# Number of proteins per colour category, in the (alphabetical) order returned by table()
n_genes <- paste("n =", paste(table(protFiltered$color), collapse = "/"))
# Rows of the two tables are paired by position, so this relies on both tables listing the
# proteins in the same order, and on protFilteredRanklArg$labs having been set beforehand.
protFiltered$labs <- ifelse(
  (protFilteredRanklArg$labs == "") & (protFiltered$color != "No significant change"),
  protFiltered$Gene.names,
  ""
)
ggplot(protFiltered, aes(x = - Difference, y = LogPvalue, color = color, label = labs)) +
  theme_light() +
  theme(
    text = element_text(size = 20, family = "sans"),
    plot.margin = margin(5, 15, 5, 5),
    panel.grid.minor = element_blank(),
    legend.position = c(0.725, 0.9),
    legend.spacing = unit(10, "cm"),
    legend.title = element_blank(),
    legend.background = element_rect(size = 0.3, color = "black"),
    legend.margin = margin(5, 5, 10, 5)
  ) +
  # Colours are named so each category keeps its colour even if one category is absent
  scale_color_manual(values = c(
    "Downregulated" = "#4A91C4",
    "No significant change" = "#AAAAAA",
    "Upregulated" = "#F09F4E"
  )) +
  ylab("-log10(P-value)") +
  xlab("mean(RANKL+Arg+recArg1) - mean(RANKL+Arg)") +
  annotate("text", x = 1.7, y = 0, label = n_genes, size = 7) +
  geom_point() +
  geom_text_repel(size = 7, point.padding = 0.5, force = 0.02, segment.alpha = 0.5,
                  show.legend = FALSE)
dev.off()

In [None]:
# Export columns 7, 2 and 3 of the proteins tagged "Downregulated" / "Upregulated" in the
# rankl_Arg table, one CSV file per direction. Column 2 is LogPvalue.
# NOTE(review): columns 7 and 3 are presumably the gene name and Difference -- confirm.
write.csv(
  protFilteredRanklArg[protFilteredRanklArg$color == "Downregulated", c(7, 2, 3)],
  "prot_rankl_Down.csv",
  row.names = FALSE
)
write.csv(
  protFilteredRanklArg[protFilteredRanklArg$color == "Upregulated", c(7, 2, 3)],
  "prot_rankl_Up.csv",
  row.names = FALSE
)

In [None]:
# First ';'-separated identifier from column 4 for every protein with a significant change
# in the rankl_Arg table, with missing values and duplicates removed. vapply() guarantees a
# character vector even when no protein is selected, which sapply() does not.
# NOTE(review): the identifiers are passed to enrichKEGG() as keyType = "uniprot", so column 4
# presumably holds UniProt accessions despite the "ensg" variable name -- confirm.
ensgRankl <- unique(na.omit(vapply(
  strsplit(
    as.character(protFilteredRanklArg[protFilteredRanklArg$color != "No significant change", 4]),
    ";",
    fixed = TRUE
  ),
  function(ids) ids[1],
  character(1)
)))
# Show the first rows of the KEGG enrichment result for organism code "mmu"
head(enrichKEGG(ensgRankl, "mmu", keyType = "uniprot"))

In [None]:
# Base-graphics volcano plot for the rankl_noArg comparison,
# written to volcanoPlotFiltering_ranklDep.pdf.
pdf("volcanoPlotFiltering_ranklDep.pdf")
par(family = "sans", bg = "white")
# Row indices of the proteins to label. The two sets use different text justification.
# Earlier selections, kept for reference:
#   right: c(221,32,155,245,197,112,461,251,114,168,3,56,100,259,225,110,173, 234, 173, 212,32,231,386,443,391)
#   left:  c(290,151,286,6,502,352,180,73,2,296,94,279)
ptsToLabelRight <- c(290, 6, 352)
ptsToLabelLeft <- c(221, 155, 151, 245, 197, 112)
# The y axis label now matches the "-log10(P-value)" label that the other volcano plots in
# this analysis use for the same LogPvalue column.
with(protFilteredRanklDep, plot(LogPvalue ~ Difference, pch = 20, col = color,
                                ylab = "-log10(p-value)",
                                xlab = "mean(MCSF) - mean(RANKL+MCSF)"))
with(protFilteredRanklDep[ptsToLabelLeft, ],
     text(LogPvalue ~ Difference, labels = Gene.names, cex = 0.8, adj = c(-0.2, 1)))
with(protFilteredRanklDep[ptsToLabelRight, ],
     text(LogPvalue ~ Difference, labels = Gene.names, cex = 0.8, adj = c(1.2, 1)))
# Dashed threshold curve
lines(protFilterCurveRanklDep, lty = 2)
dev.off()

## Export for OmicsIntegrator

In [None]:
# Export the significant proteins of the recArg1 comparison:
# columns 13 and 3 of the rows flagged "+", with Difference replaced by its absolute value,
# written tab-separated without header, row names or quoting.
# NOTE(review): column 13 is presumably a gene identifier and column 3 Difference -- confirm.
pf <- protFiltered
pf[["Difference"]] <- abs(pf[["Difference"]])
write.table(
  pf[pf$Significant == "+", c(13, 3)],
  "OmicsIntegratorRun/recArg1_prot.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

In [None]:
# Export the significant proteins of the depletion vs recArg1 comparison:
# columns 13 and 3 of the rows flagged "+", with Difference replaced by its absolute value,
# written tab-separated without header, row names or quoting.
# NOTE(review): column 13 is presumably a gene identifier and column 3 Difference -- confirm.
pfd <- protFilteredDvr
pfd[["Difference"]] <- abs(pfd[["Difference"]])
write.table(
  pfd[pfd$Significant == "+", c(13, 3)],
  "OmicsIntegratorRun/deplOrRec_prot.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

In [None]:
# Side-by-side table of the two RANKL comparisons: columns 1-3 and 13 of the rankl_Arg table
# become columns 1-4, the same columns of the rankl_noArg table become columns 5-8.
# Rows are paired by position.
mergedProtRANKL <- cbind(
  protFilteredRanklArg[, c(1, 2, 3, 13)],
  protFilteredRanklDep[, c(1, 2, 3, 13)]
)

# Test whether a protein with differential abundance in one comparison behaves differently
# in the other one.
#
# Argument:
#   x  a single row of the merged table (one-row data frame), read by position:
#        column 1  significance flag of the first comparison ("+" when significant)
#        column 3  Difference of the first comparison
#        column 5  significance flag of the second comparison ("+" when significant)
#        column 7  Difference of the second comparison
#
# Value: 1, -1 or 0.
#   * both significant: 1 when the first Difference is positive and the second negative,
#     -1 for the opposite signs, 0 when both have the same sign;
#   * only the first significant: 1 / -1 when the first Difference is positive / negative
#     and exceeds the second one by more than 1 in that direction;
#   * only the second significant: -1 / 1 when the second Difference is positive / negative
#     and exceeds the first one by more than 1 in that direction;
#   * 0 in every other case.
# NOTE(review): the first table is the "with arginine" one and the second the "without
# arginine" one in this analysis; which sign means "up" depends on how Difference was
# computed upstream -- confirm before interpreting 1 / -1 biologically.
checkIfDiff <- function(x) {
  sigFirst <- x[, 1] == "+"
  sigSecond <- x[, 5] == "+"
  diffFirst <- x[, 3]
  diffSecond <- x[, 7]

  # Scalar conditions use && (the vector operator & is meant for element-wise masks)
  if (sigFirst && sigSecond) {
    if (diffFirst > 0 && diffSecond < 0) {
      return(1) # Opposite directions: positive in the first, negative in the second
    }
    if (diffFirst < 0 && diffSecond > 0) {
      return(-1) # Opposite directions: negative in the first, positive in the second
    }
  } else if (sigFirst) {
    if (diffFirst > max(0, diffSecond + 1)) {
      return(1) # Positive change specific to the first comparison
    }
    if (diffFirst < min(0, diffSecond - 1)) {
      return(-1) # Negative change specific to the first comparison
    }
  } else if (sigSecond) {
    if (diffSecond > max(0, diffFirst + 1)) {
      return(-1) # Positive change specific to the second comparison
    }
    if (diffSecond < min(0, diffFirst - 1)) {
      return(1) # Negative change specific to the second comparison
    }
  }
  0
}

# Apply checkIfDiff to every row of the merged table and keep the resulting value (column V1)
protRanklGroup <- adply(
  .data = mergedProtRANKL,
  .margins = 1,
  .fun = checkIfDiff,
  .expand = FALSE
)[["V1"]]

In [None]:
# Print the first ';'-separated entry of column 4 for the proteins in group 1,
# then for the proteins in group -1, one per line.
for (i in mergedProtRANKL[[4]][which(protRanklGroup == 1)]) {
  cat(strsplit(i, ";", fixed = TRUE)[[1]][1], "\n")
}
for (i in mergedProtRANKL[[4]][which(protRanklGroup == -1)]) {
  cat(strsplit(i, ";", fixed = TRUE)[[1]][1], "\n")
}

In [None]:
# Proportion of proteins in each group. The denominator is derived from the counts
# themselves instead of the hard-coded total 44+365+103, so it stays correct if the
# input data change.
prop.table(table(protRanklGroup))

In [None]:
# The merged table repeats the same column names twice; make them syntactic and unique.
names(mergedProtRANKL) <- make.names(names(mergedProtRANKL), unique = TRUE)
# Keep the proteins of group 1 followed by those of group -1
mpr <- mergedProtRANKL[c(which(protRanklGroup == 1), which(protRanklGroup == -1)), ]
# Gene name together with the larger absolute Difference of the two comparisons
# (columns 3 and 7). pmax() is the vectorised row-wise maximum and avoids coercing the
# data frame to a matrix through apply().
mpr <- data.frame(mpr$Gene.name, pmax(abs(mpr[[3]]), abs(mpr[[7]])))
write.table(
  mpr,
  "OmicsIntegratorRun/ranklDiff_prot.tsv",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE,
  sep = "\t"
)

## Format heatmap

In [None]:
# Load the heatmap input table (tab-separated, first row is the header).
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
filterProtTable <- read.table("Perseus_run/prot_hm_data.txt", sep = "\t", header = TRUE)

In [None]:
# Blue - white - orange gradient for the heatmap cells
colPal <- colorRampPalette(c("#4A91C4", "white", "#F09F4E"))
# First ';'-separated gene name of each row, used as row names.
# vapply() guarantees a character vector whatever the input, which sapply() does not.
genes <- vapply(
  strsplit(as.character(filterProtTable$T..Gene.names), ";", fixed = TRUE),
  function(parts) parts[1],
  character(1)
)
rownames(filterProtTable) <- genes
# Column annotation: five condition labels, each repeated for 4 consecutive columns
cndt <- data.frame(Conditions = rep(c('MCSF', 'MCSF+arg', 'MCSF+RANKL', 'MCSF+RANKL+arg', 'MCSF+RANKL+arg+recArg1'), each = 4))
# Row dendrogram computed from columns 1:20.
# NOTE(review): the previous comment said "on 2 conditions only", which does not match the
# code (all 20 columns are used) -- confirm the intent.
dist_dend <- dist(filterProtTable[, 1:20])
row_dend <- as.dendrogram(hclust(dist_dend))
# Reorder the leaves with the "OLO" seriation method
row_dend <- seriate_dendrogram(row_dend, dist_dend, method = "OLO")
# Branches are coloured white
row_dend <- row_dend %>% set("branches_col", "white")

# Heatmap of columns 1:20 with row and column labels suppressed, saved to heatmapFilteredMS.pdf
heatmaply(filterProtTable[, 1:20], labCol = rep(NA, 20), labRow = rep(NA, length(genes)), file = "heatmapFilteredMS.pdf",
    cexRow = 0.5, margins = c(50, 60, NA, 0), ColSideColors = cndt, plot_method = "plotly", col = colPal,
    col_side_palette = colorRampPalette(brewer.pal(n = 7, name = "Set1")), Rowv = row_dend,
    key.title = "Abundance (z-score)", xlab = "Samples", ylab = "Genes")

In [None]:
# Print the R version and the attached/loaded packages used for this analysis
sessionInfo()