# Pathway selection and network-file creation
In this notebook we will select pathways from 3 databases, i.e. [KEGG](https://www.genome.jp/kegg/), [REACTOME](https://reactome.org/) and [WikiPathways](https://www.wikipathways.org/index.php/WikiPathways).
We base this selection on genes associated with inflammation, of which the retrieving process is described in a different [notebook](https://github.com/macsbio/inflammation_networks/tree/master/Jupyter%20-%20Inflammation%20genes).

#### The following step only works in RStudio. If working in another environment, please set the working directory properly and check if the working directory is correct. 

In [1]:
# set wd to where script file is saved
setwd(dirname(rstudioapi::callFun("getActiveDocumentContext")$path))

ERROR: Error: RStudio not running


In [2]:
# check wd
getwd()

In [4]:
# load library
library(limma)
library(qusage)
library(plyr)
library(dplyr)
library(tidyr)

First we will load in the .gmt files of the 3 databases, next we will change the structure so we can work more easily with it based on our needs. Next we will load in the genes associated with inflammation.

In [None]:
# load data files and clean data files
### KEGG
kegg <- read.gmt(file.path(getwd(), "databases", "c2.cp.kegg.v6.2.entrez.gmt"))
kegg <- ldply(kegg, data.frame)
colnames(kegg)[c(1,2)] <- c("pathway", "entrezgene")

In [None]:
### REACTOME
reactome <- read.gmt(file.path(getwd(), "databases", "c2.cp.reactome.v6.2.entrez.gmt"))
reactome <- ldply(reactome, data.frame)
colnames(reactome)[c(1,2)] <- c("pathway", "entrezgene")

In [None]:
### WikiPathways
wp <- read.gmt(file.path(getwd(), "databases", "wikipathways-combined_final.gmt"))
wp <- ldply(wp, data.frame)
colnames(wp)[c(1,2)] <- c("pathway", "entrezgene")

In [None]:
# load in gene list associated with inflammation
infl <- as.data.frame(read.table(file.path(getwd(), "genes", "merged_infl_genes_list.txt"), header = T, sep = "\t", stringsAsFactors = FALSE))
infl <- as.data.frame(infl[,c(-1,-3,-4)])
colnames(infl)[1] <- "entrezgene"

Now the 3 databases and the genes associated with inflammation are loaded in, we can start with filtering the pathways based on our criteria and the genes associated with inflammation. 

The criteria are; the pathways has to contain at least 10 genes, has to contain at least 5 genes associated with inflammation and at least 30% of the genes in the pathway have to be associated with inflammation.

In [None]:
### KEGG
# filter first that pathway has at least 10 genes, so the row with the pathwayname has to occur at least 10 times in the data frame
kegg <- kegg[kegg$pathway %in% names(table(kegg$pathway))[table(kegg$pathway) >= 10],]
# freq of every pathway in database
freqkegg <- plyr::count(kegg$pathway)
# pathway names of the rows (entrezgene IDs) that occur in the gene list associated with inflammation
keggpathways <- as.data.frame(kegg$pathway[kegg$entrezgene %in% infl$entrezgene])
# if these pathwaynames occur at least 5 times we can create list of pathways with at least 5 inflammation genes 
# (and which were already filtered on at least 10 gesn in total)
colnames(keggpathways)[1] <- "pathway"
keggpathways <- as.data.frame(keggpathways[keggpathways$pathway %in% names(table(keggpathways$pathway))[table(keggpathways$pathway) >= 5],])
colnames(keggpathways)[1] <- "pathway"
# freq of every pathway in keggpathways
freqkeggpathways <- plyr::count(keggpathways$pathway)
# freqkegg has total amount of genes in the pathway and freqkeggpathways has total amount of infalmmation genes in pathway
# combine the two data frames, so we have the total amount of genes per pathway which has at least 5 inflammation genes in them
# and so we also have the amount of inflammation genes per pathway, so we can calculate the percentage, which has to be at least 30%
keggpathways <- as.data.frame(merge(freqkeggpathways, freqkegg, by = "x"))
colnames(keggpathways)[c(1,2,3)] <- c("pathway", "freqInflGene", "freqGene")
# calculate percentage and add column
# select rows which have percentage of at least 30%
keggpathways$percentage <- ((keggpathways$freqInflGene/keggpathways$freqGene) * 100)
keggpathways <- keggpathways[keggpathways$percentage >= 30, ]
keggpathways$percentage <- round(keggpathways$percentage, 2)

In [None]:
### REACTOME
# filter first that pathway has at least 10 genes, so the row with the pathwayname has to occur at least 10 times in the data frame
reactome <- reactome[reactome$pathway %in% names(table(reactome$pathway))[table(reactome$pathway) >= 10],]
# freq of every pathway in database
freqreactome <- plyr::count(reactome$pathway)
# pathway names of the rows (entrezgene IDs) that occur in the gene list associated with inflammation
reactomepathways <- as.data.frame(reactome$pathway[reactome$entrezgene %in% infl$entrezgene])
# if these pathwaynames occur at least 5 times we can create list of pathways with at least 5 inflammation genes 
# (and which were already filtered on at least 10 gesn in total)
colnames(reactomepathways)[1] <- "pathway"
reactomepathways <- as.data.frame(reactomepathways[reactomepathways$pathway %in% names(table(reactomepathways$pathway))[table(reactomepathways$pathway) >= 5],])
colnames(reactomepathways)[1] <- "pathway"
# freq of every pathway in reactomepathways
freqreactomepathways <- plyr::count(reactomepathways$pathway)
# freqreactome has total amount of genes in the pathway and freqreactomepathways has total amount of infalmmation genes in pathway
# combine the two data frames, so we have the total amount of genes per pathway which has at least 5 inflammation genes in them
# and so we also have the amount of inflammation genes per pathway, so we can calculate the percentage, which has to be at least 30%
reactomepathways <- as.data.frame(merge(freqreactomepathways, freqreactome, by = "x"))
colnames(reactomepathways)[c(1,2,3)] <- c("pathway", "freqInflGene", "freqGene")
# calculate percentage and add column
# select rows which have percentage of at least 30%
reactomepathways$percentage <- ((reactomepathways$freqInflGene/reactomepathways$freqGene) * 100)
reactomepathways <- reactomepathways[reactomepathways$percentage >= 30, ]
reactomepathways$percentage <- round(reactomepathways$percentage, 2)

In [None]:
### WP
# filter first that pathway has at least 10 genes, so the row with the pathwayname has to occur at least 10 times in the data frame
wp <- wp[wp$pathway %in% names(table(wp$pathway))[table(wp$pathway) >= 10],]
# freq of every pathway in database
freqwp <- plyr::count(wp$pathway)
# pathway names of the rows (entrezgene IDs) that occur in the gene list associated with inflammation
wppathways <- as.data.frame(wp$pathway[wp$entrezgene %in% infl$entrezgene])
# if these pathwaynames occur at least 5 times we can create list of pathways with at least 5 inflammation genes 
# (and which were already filtered on at least 10 gesn in total)
colnames(wppathways)[1] <- "pathway"
wppathways <- as.data.frame(wppathways[wppathways$pathway %in% names(table(wppathways$pathway))[table(wppathways$pathway) >= 5],])
colnames(wppathways)[1] <- "pathway"
# freq of every pathway in wppathways
freqwppathways <- plyr::count(wppathways$pathway)
# freqwp has total amount of genes in the pathway and freqwppathways has total amount of infalmmation genes in pathway
# combine the two data frames, so we have the total amount of genes per pathway which has at least 5 inflammation genes in them
# and so we also have the amount of inflammation genes per pathway, so we can calculate the percentage, which has to be at least 30%
wppathways <- as.data.frame(merge(freqwppathways, freqwp, by = "x"))
colnames(wppathways)[c(1,2,3)] <- c("pathway", "freqInflGene", "freqGene")
# calculate percentage and add column
# select rows which have percentage of at least 30%
wppathways$percentage <- ((wppathways$freqInflGene/wppathways$freqGene) * 100)
wppathways <- wppathways[wppathways$percentage >= 30, ]
wppathways$percentage <- round(wppathways$percentage, 2)

Now we have for each database the selected pathways based on our criteria. Lets save those with the stats of how many genes the pathway has, how many genes associated with inflammation and the percentage of genes associated with inflammation within the pathway. 

Next, 3 pathways were manually added, while they did not meet the criteria, as they were seen as highly important for the inflammation network.

In [None]:
# combine the retrieved pathways so we have a list of the pathways that met the criteria
pathwayStats <- as.data.frame(rbind(keggpathways, reactomepathways, wppathways))

# save pathwaysStats
write.table(pathwayStats, file.path(getwd(), "results", "pathwayStats.txt"), col.names = T, row.names = F, sep = "\t", quote = F)

# get list of only selected pathways without stats
selectedpws <- pathwayStats[c(-2,-3,-4)]  

In [None]:
# Three pathways were manually added although they did not meet the criteria, lets add them to the selectedpws
added <- data.frame(c("IL17 signaling pathway%WikiPathways_20181110%WP2112%Homo sapiens", 
                      "IL-6 signaling pathway%WikiPathways_20181110%WP364%Homo sapiens",
                      "Relationship between inflammation, COX-2 and EGFR%WP4483%Homo sapiens"))
colnames(added)[1] <- "pathway"
selectedpws <- rbind(selectedpws, added)

# save selectedpws
write.table(selectedpws, file.path(getwd(), "results", "selected_pws.txt"), col.names = T, row.names = F, sep = "\t", quote = F)

Now we have a list of selected pathways, it is interesting to see in how many pathways the inflammation genes are.
First we will check in how many total pathways the genes are, then check in how many selected pathways these genes are.

In [None]:
### KEGG
# in how many pathways are the inflammation genes
nGenesTotal <- as.data.frame(kegg$entrezgene[kegg$entrezgene %in% infl$entrezgene])
colnames(nGenesTotal)[1] <- "entrezgene"

# in how many slected pathways are the inflammation genes
selectedKegg <- as.data.frame(kegg[kegg$pathway %in% keggpathways$pathway,])
nInflGenesTotal <- as.data.frame(selectedKegg$entrezgene[selectedKegg$entrezgene %in% infl$entrezgene])
colnames(nInflGenesTotal)[1] <- "entrezgene"

In [None]:
### REACTOME
# in how many pathways are the inflammation genes
nGenesTotal1 <- as.data.frame(reactome$entrezgene[reactome$entrezgene %in% infl$entrezgene])
colnames(nGenesTotal1)[1] <- "entrezgene"

# in how many slected pathways are the inflammation genes
selectedReactome <- as.data.frame(reactome[reactome$pathway %in% reactomepathways$pathway,])
nInflGenesTotal1 <- as.data.frame(selectedReactome$entrezgene[selectedReactome$entrezgene %in% infl$entrezgene])
colnames(nInflGenesTotal1)[1] <- "entrezgene"

In [None]:
### WP
# in how many pathways are the inflammation genes
nGenesTotal2 <- as.data.frame(wp$entrezgene[wp$entrezgene %in% infl$entrezgene])
colnames(nGenesTotal2)[1] <- "entrezgene"

# in how many slected pathways are the inflammation genes
selectedWp <- as.data.frame(wp[wp$pathway %in% wppathways$pathway,])
nInflGenesTotal2 <- as.data.frame(selectedWp$entrezgene[selectedWp$entrezgene %in% infl$entrezgene])
colnames(nInflGenesTotal2)[1] <- "entrezgene"

In [None]:
# combine all frequencies together
# first the inflammation genes in all pathways
nGenesTotalF <- rbind(nGenesTotal, nGenesTotal1, nGenesTotal2)
nGenesTotalF <- plyr::count(nGenesTotalF)
colnames(nGenesTotalF)[2] <- "n_All_Pathways"
# next the inflammation genes in selected pathways
nInflGenesTotalF <- rbind(nInflGenesTotal, nInflGenesTotal1, nInflGenesTotal2)
nInflGenesTotalF <- plyr::count(nInflGenesTotalF)
colnames(nInflGenesTotalF)[2] <- "n_Sel_Pathways"

In [None]:
# save tables
write.table(nGenesTotalF, file.path(getwd(), "results", "nGenes.txt"), col.names = T, row.names = F, sep = "\t", quote = F)
write.table(nInflGenesTotalF, file.path(getwd(), "results", "nInflGenes.txt"), col.names = T, row.names = F, sep = "\t", quote = F)

Now we have the selected pathways, we will create the network files which will be used in the Cytoscape automation using the RCy3 package. 

In [None]:
# edge table
allDatabases <- rbind(kegg, reactome, wp)
edge_table <- as.data.frame(allDatabases[allDatabases$pathway %in% selectedpws$pathway,])

In [None]:
# cluster pathways together into groups
# Allograft rejection
dataSet$Key[dataSet$Key == "KEGG_ALLOGRAFT_REJECTION"] <- "Allograft_rejection"
dataSet$Key[dataSet$Key == "Allograft Rejection%WikiPathways_20181110%WP2328%Homo sapiens"] <- "Allograft_rejection"
dataSet$Key[dataSet$Key == "KEGG_GRAFT_VERSUS_HOST_DISEASE"] <- "Allograft_rejection"
dataSet$Key[dataSet$Key == "Control of immune tolerance by vasoactive intestinal peptide%WP4484%Homo sapiens"] <- "Allograft_rejection"
dataSet$Key[dataSet$Key == "Inflammatory Response Pathway%WikiPathways_20181110%WP453%Homo sapiens"] <- "Allograft_rejection"

# Cytokines
dataSet$Key[dataSet$Key == "IL-6 signaling pathway%WikiPathways_20181110%WP364%Homo sapiens"] <- "Cytokines"
dataSet$Key[dataSet$Key == "IL-10 Anti-inflammatory Signaling Pathway%WP4495%Homo sapiens"] <- "Cytokines"
dataSet$Key[dataSet$Key == "Cytokines and Inflammatory Response%WikiPathways_20181110%WP530%Homo sapiens"] <- "Cytokines"
dataSet$Key[dataSet$Key == "IL17 signaling pathway%WikiPathways_20181110%WP2112%Homo sapiens"] <- "Cytokines"
dataSet$Key[dataSet$Key == "Signal transduction through IL1R%WP4496%Homo sapiens"] <- "Cytokines"
dataSet$Key[dataSet$Key == "ncRNAs involved in STAT3 signaling in hepatocellular carcinoma%WikiPathways_20181110%WP4337%Homo sapiens"] <- "Cytokines"

# Diseases
dataSet$Key[dataSet$Key == "KEGG_TYPE_I_DIABETES_MELLITUS"] <- "Diseases"
dataSet$Key[dataSet$Key == "KEGG_ASTHMA"] <- "Diseases"
dataSet$Key[dataSet$Key == "Lung fibrosis%WikiPathways_20181110%WP3624%Homo sapiens"] <- "Diseases"
dataSet$Key[dataSet$Key == "KEGG_INTESTINAL_IMMUNE_NETWORK_FOR_IGA_PRODUCTION"] <- "Diseases"

# Immune cell regulation
dataSet$Key[dataSet$Key == "Cells and Molecules involved in local acute inflammatory response%WP4493%Homo sapiens"] <- "Immune_cell_regulation"
dataSet$Key[dataSet$Key == "Selective expression of chemokine receptors during T-cell polarization%WP4494%Homo sapiens"] <- "Immune_cell_regulation"
dataSet$Key[dataSet$Key == "KEGG_LEISHMANIA_INFECTION"] <- "Immune_cell_regulation"

# NFkB
dataSet$Key[dataSet$Key == "KEGG_NOD_LIKE_RECEPTOR_SIGNALING_PATHWAY"] <- "NFkB"
dataSet$Key[dataSet$Key == "REACTOME_TAK1_ACTIVATES_NFKB_BY_PHOSPHORYLATION_AND_ACTIVATION_OF_IKKS_COMPLEX"] <- "NFkB"
dataSet$Key[dataSet$Key == "REACTOME_TRAF6_MEDIATED_NFKB_ACTIVATION"] <- "NFkB"
dataSet$Key[dataSet$Key == "REACTOME_RIP_MEDIATED_NFKB_ACTIVATION_VIA_DAI"] <- "NFkB"
dataSet$Key[dataSet$Key == "Photodynamic therapy-induced NF-kB survival signaling%WikiPathways_20181110%WP3617%Homo sapiens"] <- "NFkB"
dataSet$Key[dataSet$Key == "Simplified Depiction of MYD88 Distinct Input-Output Pathway%WikiPathways_20181110%WP3877%Homo sapiens"] <- "NFkB"
dataSet$Key[dataSet$Key == "EBV LMP1 signaling%WikiPathways_20181110%WP262%Homo sapiens"] <- "NFkB"

# Vitamin B12
dataSet$Key[dataSet$Key == "Folate Metabolism%WikiPathways_20181110%WP176%Homo sapiens"] <- "Vitamin_B12"
dataSet$Key[dataSet$Key == "Vitamin B12 Metabolism%WikiPathways_20181110%WP1533%Homo sapiens"] <- "Vitamin_B12"
dataSet$Key[dataSet$Key == "IL1 and megakaryocytes in obesity%WikiPathways_20181110%WP2865%Homo sapiens"] <- "Vitamin_B12"

# Inflammation
dataSet$Key[dataSet$Key == "Vitamin D in inflammatory diseases%WP4482%Homo sapiens"] <- "Inflammation"
dataSet$Key[dataSet$Key == "LTF danger signal response pathway%WP4478%Homo sapiens"] <- "Inflammation"
dataSet$Key[dataSet$Key == "Resistin as a regulator of inflammation%WP4481%Homo sapiens"] <- "Inflammation"
dataSet$Key[dataSet$Key == "Relationship between inflammation, COX-2 and EGFR%WP4483%Homo sapiens"] <- "Inflammation"

In [None]:
# save edge table
write.table(edge_table, file.path(getwd(), "results", "edge_table.txt"), col.names = T, row.names = F, sep = "\t", quote = F)

In [None]:
# node table
# split edge table in two seperate data frames, rbind, unique, add type.
pathwayNodes <- as.data.frame(unique(edge_table$pathway))
colnames(pathwayNodes)[1] <- "nodes"
geneNodes <- as.data.frame(unique(edge_table$entrezgene))
colnames(geneNodes)[1] <- "nodes"
node_table <- as.data.frame(rbind(pathwayNodes, geneNodes))
# add type
node_table$Type <- "Gene"
node_table$Type[node_table$nodes %in% infl$entrezgene] <- "InflGene"
node_table$Type[node_table$nodes %in% selectedpws$pathway] <- "Process"

# save node table
write.table(node_table, file.path(getwd(), "results", "node_table.txt"), col.names = T, row.names = F, sep = "\t", quote = F)