# WGCNA of LC M001 Proteomics

***by Tomasz Wilmanski***  

In this notebook, weighted gene co-expression network analysis (WGCNA; Langfelder, P. & Horvath, S., "WGCNA: an R package for weighted correlation network analysis", BMC Bioinformatics, 2008) is applied to the Longevity Consortium (LC) M001 proteomics dataset. This notebook is written in R.  

In [None]:
# CRAN dependencies of WGCNA; uncomment if they are not installed yet.
#install.packages(c("matrixStats", "Hmisc", "splines", "foreach", "doParallel", "fastcluster", "dynamicTreeCut", "survival")) 

# BiocManager is needed for the Bioconductor installs below; fetch it first if absent
# so that BiocManager::install() does not fail on a fresh R library.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", repos = "https://cloud.r-project.org")
}

# Bioconductor dependencies of WGCNA, followed by WGCNA itself.
bioc_pkgs <- c("GO.db", "preprocessCore", "impute", "WGCNA")

# Only install what is missing, so re-running this cell does not reinstall everything.
bioc_installed <- vapply(bioc_pkgs, requireNamespace, logical(1), quietly = TRUE)
if (any(!bioc_installed)) {
  BiocManager::install(bioc_pkgs[!bioc_installed])
}

In [None]:
library(robustHD)
# from https://gist.github.com/stevenworthington/3178163
#
# Install any packages in `pkg` that are not installed yet, then attach all of them.
#
# Args:
#   pkg: character vector of package names.
#
# Returns:
#   A logical vector named by `pkg`; TRUE where the package was attached
#   successfully. Always a logical vector, also for empty input.
ipak <- function(pkg) {
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE, repos = "https://cran.r-project.org")
  }
  # require() (not library()) is intentional here: its TRUE/FALSE result is the
  # return value of this helper. vapply() keeps the result type stable.
  vapply(pkg, require, logical(1), character.only = TRUE)
}

# Plotting, reshaping and graph packages: install the missing ones and attach
# them all through ipak(), silencing the start-up messages.
packages <- c(
  "ggplot2", "gplots", "lattice", "plyr", "reshape2",
  "RColorBrewer", "grid", "gridExtra", "igraph", "igraphdata"
)
suppressMessages(ipak(packages))

In [None]:
# Attach WGCNA, which supplies the network-construction functions called in the cells below.
library(WGCNA);
# Keep strings as character vectors instead of converting them to factors.
options(stringsAsFactors = FALSE)
# Display the names of the packages that are currently attached.
(.packages())
# robustHD is also attached in an earlier cell.
# NOTE(review): no robustHD function appears to be called in the visible cells - confirm it is still needed.
library(robustHD)

In [None]:
# Read the non-imputed proteomics table and transpose it in one step.
# NOTE(review): t() on a data frame goes through as.matrix(); when any column is
# non-numeric the numeric columns are converted with format(). Confirm that this
# does not truncate the intensities.
LC_data <- t(read.csv("proteomics_not_imputed.csv"))

# Inspect the size and the first rows of the transposed table.
dim(LC_data)
head(LC_data)

# Keep the first row of the transposed table; it is used later as column names.
LC_data_names <- LC_data[1, ]

# Switch from matrix to data frame and check the size again.
LC_data <- as.data.frame(LC_data)
dim(LC_data)

# Retain rows 3 to 50, which this notebook treats as the sample rows.
LC_data <- LC_data[3:50, ]
head(LC_data)

In [None]:
# Label the columns with the names saved from the first row of the transposed table.
colnames(LC_data) <- LC_data_names

# Convert every cell to numeric in one pass. Building the matrix explicitly keeps
# the sample row names (sapply() would drop them) and always yields a matrix, even
# for a single-row input.
num_df <- matrix(
  as.numeric(as.matrix(LC_data)),
  nrow = nrow(LC_data),
  dimnames = list(rownames(LC_data), colnames(LC_data))
)

# A value of exactly 0 encodes a missing measurement: recode it as NA.
# which() skips cells that are already NA instead of indexing with NA.
num_df[which(num_df == 0)] <- NA
dim(num_df)

In [None]:
# Write the numeric, NA-recoded table to disk.
write.csv(num_df, file = "proteomics_cleaned.csv")

In [None]:
# Filter proteins (columns) and samples (rows) by missingness using the check
# provided by the WGCNA authors (50% default threshold).
gsg <- goodSamplesGenes(num_df, verbose = 1)
gsg$allOK
if (!gsg$allOK) {
  # num_df is a matrix, so protein names live in colnames(); names() is NULL here.
  if (sum(!gsg$goodGenes) > 0) {
    printFlush(paste(
      "Removing genes:",
      paste(colnames(num_df)[!gsg$goodGenes], collapse = ", ")
    ))
  }
  if (sum(!gsg$goodSamples) > 0) {
    # Fall back to row indices when the matrix carries no row names.
    sample_ids <- rownames(num_df)
    if (is.null(sample_ids)) {
      sample_ids <- seq_len(nrow(num_df))
    }
    printFlush(paste(
      "Removing samples:",
      paste(sample_ids[!gsg$goodSamples], collapse = ", ")
    ))
  }
  # Remove the offending proteins and samples; drop = FALSE keeps a matrix even
  # if only one row or column survives.
  num_df <- num_df[gsg$goodSamples, gsg$goodGenes, drop = FALSE]
}

In [None]:
# Show the dimensions of num_df at this point (rows x columns).
dim(num_df)

In [None]:
# Exclude proteins that are not mapped to mouse. The identifiers listed here
# carry HUMAN, BOVIN and PIG suffixes.
# NOTE: despite its name, `non_human_genes` therefore holds non-mouse entries;
# the name is kept unchanged so other code that refers to it keeps working.
non_human_genes <- c(
  "sp|ALBU_BOVIN|",
  "sp|K1C10_HUMAN|",
  "sp|K1C9_HUMAN|",
  "sp|K22E_HUMAN|",
  "sp|K2C1_HUMAN|",
  "sp|TRYP_PIG|"
)
dim(num_df)
# drop = FALSE keeps num_df a matrix even if a single column remains.
num_df <- num_df[, !(colnames(num_df) %in% non_human_genes), drop = FALSE]
dim(num_df)

In [None]:
# With the data cleaned, choose the soft-thresholding power (beta) that best
# approximates a scale-free topology.

# Candidate soft-thresholding powers: 1 to 15.
powers <- c(1:10, seq(from = 11, to = 15, by = 1))

# Network topology analysis. corOptions is passed as a list, consistent with the
# adjacency() and softConnectivity() calls elsewhere in this notebook.
sft <- pickSoftThreshold(
  num_df,
  powerVector = powers,
  verbose = 5,
  corOptions = list(use = "p", method = "spearman"),
  networkType = "signed hybrid"
)

# Plot the results side by side.
#sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 <- 0.8

# Left panel: scale-free topology fit index as a function of the power.
# The fit (column 2) is signed by the negative sign of the slope (column 3).
plot(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit,signed R^2",
     type = "n",
     main = paste("Scale independence"))
text(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "red")
# Reference line at an R^2 cut-off of 0.80.
abline(h = 0.80, col = "red")

# Right panel: mean connectivity (column 5) as a function of the power.
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity",
     type = "n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[, 1], sft$fitIndices[, 5], labels = powers, cex = cex1, col = "red")

In [None]:
# Soft-thresholding power chosen for this dataset.
softPower <- 7

# Signed-hybrid adjacency matrix from pairwise-complete Spearman correlations,
# raised to the chosen power.
adjacency <- adjacency(
  num_df,
  type = "signed hybrid",
  power = softPower,
  corOptions = list(use = "p", method = "spearman")
)

In [None]:
# Topological overlap matrix derived from the adjacency matrix ...
TOM <- TOMsimilarity(adjacency, TOMType = "signed")
# ... and the matching dissimilarity used for clustering.
dissTOM <- 1 - TOM

In [None]:
# Average-linkage hierarchical clustering on the TOM-based dissimilarity.
geneTree <- hclust(as.dist(dissTOM), method = "average")

# Draw the dendrogram without leaf labels.
#sizeGrWindow(12,9)
plot(
  geneTree,
  main = "Gene clustering on TOM-based dissimilarity",
  xlab = "",
  sub = "",
  labels = FALSE,
  hang = 0.04
)

In [None]:
# Minimum module size; set relatively high because large modules are preferred.
minModuleSize <- 30

# Identify modules with the dynamic tree cut and tabulate their sizes.
dynamicMods <- cutreeDynamic(
  dendro = geneTree,
  distM = dissTOM,
  minClusterSize = minModuleSize,
  deepSplit = 3,
  pamRespectsDendro = FALSE
)
table(dynamicMods)

In [None]:
# Map the numeric module labels to colour names and tabulate them.
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)

# Dendrogram with the module colours drawn underneath.
#sizeGrWindow(8,6)
plotDendroAndColors(
  geneTree,
  dynamicColors,
  "Dynamic Tree Cut",
  main = "Gene dendrogram and module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05
)

In [None]:

# Module eigengenes for the dynamic-tree-cut modules.
MEList <- moduleEigengenes(num_df, colors = dynamicColors, nPC = 2)
MEs <- MEList$eigengenes

# Eigengene dissimilarity (1 - correlation), clustered with average linkage.
MEDiss <- 1 - cor(MEs)
METree <- hclust(as.dist(MEDiss), method = "average")

# Plot the eigengene tree together with the merge threshold.
#sizeGrWindow(7, 6)
plot(
  METree,
  main = "Clustering of module eigengenes",
  xlab = "",
  sub = ""
)
MEDissThres <- .2
abline(h = MEDissThres, col = "red")

In [None]:

# Merge modules whose eigengenes lie below the dissimilarity threshold.
merge <- mergeCloseModules(
  num_df,
  dynamicColors,
  cutHeight = MEDissThres,
  verbose = 3
)
# Colours and eigengenes of the merged modules.
mergedColors <- merge$colors
mergedMEs <- merge$newMEs

# Dendrogram with the original and the merged module colours underneath.
#pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6)
plotDendroAndColors(
  geneTree,
  cbind(dynamicColors, mergedColors),
  c("Dynamic Tree Cut", "Merged dynamic"),
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05
)
#dev.off()

In [None]:
# From here on, work with the merged modules.
MEs <- mergedMEs
moduleColors <- mergedColors

# Numeric labels matching the colours ("grey" maps to 0).
colorOrder <- c("grey", standardColors(50))
moduleLabels <- match(moduleColors, colorOrder) - 1

# Optionally save the module colours and labels for later parts:
#save(MEs, moduleLabels, moduleColors, geneTree, file = "FemaleLiver-02-networkConstruction-stepByStep.RData")

In [None]:
# Save the module eigengenes, one row per sample.
# Prefer the row names carried by num_df; otherwise take the sample names from
# LC_data restricted to the samples kept by goodSamplesGenes(), so the labels
# still line up when samples were removed.
sample_ids <- rownames(num_df)
if (is.null(sample_ids)) {
  sample_ids <- rownames(LC_data)[gsg$goodSamples]
}
stopifnot(length(sample_ids) == nrow(MEs))
rownames(MEs) <- sample_ids
write.csv(MEs, "MEs.csv")

In [None]:
# Problem size: samples are rows, proteins are columns.
nSamples <- nrow(num_df)
nGenes <- ncol(num_df)

# Module membership: correlation of every protein with every module eigengene
# (pairwise-complete observations), plus the matching Student p-values.
geneModuleMembership <- as.data.frame(cor(num_df, MEs, use = "p"))
MMPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneModuleMembership), nSamples)
)

In [None]:
# Module names are the eigengene names without their first two characters.
modNames <- substring(names(MEs), 3)

# Label the membership and p-value columns by module.
names(geneModuleMembership) <- paste0("MM", modNames)
names(MMPvalue) <- paste0("p.MM", modNames)

# Write the membership table ordered by membership in the darkgreen module.
mm_order <- order(geneModuleMembership$MMdarkgreen)
write.csv(geneModuleMembership[mm_order, ], "module_membership.csv")

In [None]:
# Topological overlap, recomputed from the adjacency matrix.
TOM <- TOMsimilarity(adjacency, TOMType = "signed")

# Annotation table used to translate probe identifiers in the export.
annot <- read.csv(file = "annotated_genes.csv")

# Module of interest and the proteins assigned to it.
module <- "darkgreen"
probes <- colnames(num_df)
inModule <- (moduleColors == module)
modProbes <- probes[inModule]

# Topological overlap restricted to the module, labelled by protein.
modTOM <- TOM[inModule, inModule]
dimnames(modTOM) <- list(modProbes, modProbes)

# Write the module network as an edge list that VisANT can read.
vis <- exportNetworkToVisANT(
  modTOM,
  file = paste0("VisANTInput-", module, ".txt"),
  weighted = TRUE,
  threshold = 0,
  probeToGene = data.frame(annot$gene, annot$final_gene)
)

In [None]:
# Number of entries in modProbes, i.e. how many columns were selected for the module.
length(modProbes)

In [None]:
# Number of most-connected proteins to export.
nTop <- 30

# Connectivity within the module. The power is taken from softPower so that it
# always matches the one used to build the adjacency matrix.
# NOTE(review): confirm that softConnectivity() in the installed WGCNA version
# accepts corOptions as a list.
IMConn <- softConnectivity(
  num_df[, modProbes],
  corOptions = list(use = "p", method = "spearman"),
  type = "signed hybrid",
  power = softPower
)

# Keep the nTop proteins with the highest connectivity.
top <- (rank(-IMConn) <= nTop)

# Export the sub-network; the file name follows nTop
# ("VisANTInput-<module>-top30.txt" for nTop = 30).
vis <- exportNetworkToVisANT(
  modTOM[top, top],
  file = paste0("VisANTInput-", module, "-top", nTop, ".txt"),
  weighted = TRUE,
  threshold = 0,
  probeToGene = data.frame(annot$gene, annot$final_gene)
)

In [None]:
# Record the R version and the loaded package versions for reproducibility.
sessionInfo()

In [None]:
# One row per module protein: soft connectivity and summed topological overlap.
connectivity <- as.data.frame(modProbes)
connectivity[["connect"]] <- IMConn
connectivity[["tom_score"]] <- rowSums(modTOM)

# Display the table ordered by connectivity (the saved file keeps the original order).
connect_order <- order(connectivity$connect)
connectivity[connect_order, ]
write.csv(connectivity, "darkgreen_connectivity.csv")

In [None]:
# Module of interest, its position among the module names, and a logical mask
# marking the proteins assigned to it.
module <- "darkgreen"
moduleGenes <- (moduleColors == module)
column <- match(module, modNames)

In [None]:
# Proteins as rows, so their identifiers become row names.
genes <- t(num_df)

# Pair each module colour with the matching protein identifier and preview it.
d <- as.data.frame(moduleColors)
d[["gene"]] <- rownames(genes)
head(d)

In [None]:
# Write the protein-to-module assignment table to disk.
write.csv(d, file = "module_assignments.csv")