In [1]:
suppressWarnings(suppressPackageStartupMessages(library("clusterProfiler"))) # v3.14.3
suppressWarnings(suppressPackageStartupMessages(library("DOSE")))
suppressWarnings(suppressPackageStartupMessages(library("ReactomePA")))

In [2]:
# Background universe, step 1: take the row names of the expression table.
# The table is read inside local() so only the row names stay in the workspace.
bg_genes <- local({
    expression_table <- read.table(
        "data/preprocessed_v6/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv",
        sep = "\t",
        header = TRUE,
        row.names = 1
    )
    rownames(expression_table)
})
length(bg_genes)

# Background universe, step 2: translate gene symbols to Entrez IDs and keep
# only the ENTREZID column. Messages are silenced; warnings are still shown.
bg_genes <- suppressMessages(
    bitr(
        bg_genes,
        fromType = "SYMBOL",
        toType = "ENTREZID",
        OrgDb = "org.Hs.eg.db"
    )
)[, "ENTREZID"]
length(bg_genes)

“1.33% of input gene IDs are fail to map...”


# TCGA

In [3]:
# Load Table S2; the first column supplies the row names and text columns are
# kept as character vectors rather than factors.
df <- read.table(
    file = "Supplementary_tables_and_data/TableS2/TableS2_potentially_novel.tsv",
    header = TRUE,
    sep = "\t",
    row.names = 1,
    stringsAsFactors = FALSE
)
dim(df)

In [4]:
# Private helper for gsoa(): turn one enrichment result into a single string.
#
# enrichment: the object returned by enrichGO / enrichKEGG / enrichDO /
#             enrichPathway (or NULL, or a plain data frame). It must expose
#             the columns Count, p.adjust, Description, ID and GeneRatio
#             through as.data.frame().
# min_count:  terms supported by fewer than this many query genes are dropped.
#
# Returns "" when there is nothing to report; otherwise one line per term,
# "<Description> (<ID>) <GeneRatio> p.adj=<p.adjust in %.2e>\n", concatenated,
# ordered by increasing adjusted p-value and then by decreasing Count.
.gsoa_format_terms <- function(enrichment, min_count = 2) {
    if (is.null(enrichment)) {
        return("")
    }
    terms <- as.data.frame(enrichment)
    # Explicit row filter; NA counts are treated as "not enough support".
    keep <- !is.na(terms$Count) & terms$Count >= min_count
    terms <- terms[keep, , drop = FALSE]
    if (nrow(terms) == 0) {
        return("")
    }
    terms <- terms[order(terms$p.adjust, -terms$Count), , drop = FALSE]
    paste0(
        terms$Description, " (", terms$ID, ") ", terms$GeneRatio,
        " p.adj=", formatC(terms$p.adjust, format = "e", digits = 2), "\n",
        collapse = ""
    )
}

# Gene-set over-representation analysis for one gene set.
#
# gset:     character vector of gene symbols (the query set).
# bg_genes: character vector of Entrez IDs used as the background universe.
#
# Returns a named list with the entries "BP", "MF", "CC", "KEGG", "DO" and
# "Reactome" (in that order). Each entry is a single string: "" when nothing
# was found, otherwise the newline-separated term summary produced by
# .gsoa_format_terms().
#
# The symbols are first mapped to Entrez IDs with bitr(). If that mapping
# raises an error, or if fewer than two genes map, no enrichment is run and
# every entry stays "". Errors raised by the enrichment calls themselves are
# not caught.
gsoa <- function(gset, bg_genes) {
    categories <- c("BP", "MF", "CC", "KEGG", "DO", "Reactome")
    results <- setNames(as.list(rep("", length(categories))), categories)

    # Map gene names to Entrez IDs; a failed mapping is reported as a message
    # and treated as "no genes mapped" instead of aborting the whole run.
    gset_entrez <- tryCatch(
        bitr(
            gset,
            fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = "org.Hs.eg.db"
        )[, "ENTREZID"],
        error = function(e) {
            message("gsoa: SYMBOL -> ENTREZID mapping failed: ", conditionMessage(e))
            character(0)
        }
    )

    if (length(gset_entrez) <= 1) {
        return(results)
    }

    # Gene Ontology: one test per sub-ontology.
    for (ontology in c("BP", "MF", "CC")) {
        go_terms <- enrichGO(
            gene          = gset_entrez,   # query gene list
            universe      = bg_genes,      # background
            OrgDb         = "org.Hs.eg.db",
            ont           = ontology,
            pAdjustMethod = "BH",
            minGSSize     = 2,
            maxGSSize     = 500,
            pvalueCutoff  = 0.05,
            qvalueCutoff  = 0.05,
            readable      = TRUE
        )
        results[[ontology]] <- .gsoa_format_terms(go_terms)
    }

    # KEGG
    kegg_terms <- enrichKEGG(
        gene          = gset_entrez,
        universe      = bg_genes,
        organism      = "hsa",
        pAdjustMethod = "BH",
        minGSSize     = 2,
        maxGSSize     = 500,
        pvalueCutoff  = 0.05,
        qvalueCutoff  = 0.05
    )
    results[["KEGG"]] <- .gsoa_format_terms(kegg_terms)

    # Disease Ontology
    do_terms <- enrichDO(
        gene          = gset_entrez,
        universe      = bg_genes,
        ont           = "DO",
        pAdjustMethod = "BH",
        minGSSize     = 2,
        maxGSSize     = 500,
        pvalueCutoff  = 0.05,
        qvalueCutoff  = 0.05,
        readable      = TRUE
    )
    results[["DO"]] <- .gsoa_format_terms(do_terms)

    # Reactome
    reactome_terms <- enrichPathway(
        gene          = gset_entrez,
        universe      = bg_genes,
        organism      = "human",
        pvalueCutoff  = 0.05,
        pAdjustMethod = "BH",
        qvalueCutoff  = 0.05,
        minGSSize     = 2,
        maxGSSize     = 500,
        readable      = FALSE
    )
    results[["Reactome"]] <- .gsoa_format_terms(reactome_terms)

    results
}



In [5]:
# Run gsoa() once per row of `df` and stack the per-row result lists into a
# matrix with one row per row of `df` and one column per enrichment category.
target_column <- "shared"  # the original notes "union" as the alternative column
bic_ids <- rownames(df)

# Collect every result first and bind once, instead of growing the matrix with
# rbind() on each iteration.
per_bic_results <- lapply(bic_ids, function(bic) {
    # The target column holds the gene set as a space-separated string.
    gset <- unlist(strsplit(df[bic, target_column], " "))
    suppressMessages(suppressWarnings(gsoa(gset, bg_genes)))
})
all_results <- do.call(rbind, per_bic_results)
rownames(all_results) <- bic_ids
dim(all_results)
all_results

Unnamed: 0_level_0,BP,MF,CC,KEGG,DO,Reactome,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
0,lipid transport (GO:0006869) 3/9 p.adj=3.23e-02 lipid localization (GO:0010876) 3/9 p.adj=3.23e-02 long-chain fatty acid transport (GO:0015909) 2/9 p.adj=3.23e-02 fatty acid transport (GO:0015908) 2/9 p.adj=3.23e-02 triglyceride metabolic process (GO:0006641) 2/9 p.adj=3.23e-02 acylglycerol metabolic process (GO:0006639) 2/9 p.adj=3.23e-02 neutral lipid metabolic process (GO:0006638) 2/9 p.adj=3.23e-02 monocarboxylic acid transport (GO:0015718) 2/9 p.adj=4.65e-02,retinol binding (GO:0019841) 2/10 p.adj=6.82e-04 retinal binding (GO:0016918) 2/10 p.adj=6.82e-04 retinoid binding (GO:0005501) 2/10 p.adj=1.62e-03 isoprenoid binding (GO:0019840) 2/10 p.adj=1.62e-03 alcohol binding (GO:0043178) 2/10 p.adj=7.10e-03 lipid transporter activity (GO:0005319) 2/10 p.adj=9.99e-03 vitamin binding (GO:0019842) 2/10 p.adj=9.99e-03,lipid droplet (GO:0005811) 4/11 p.adj=2.52e-06,,fatty liver disease (DOID:9452) 3/5 p.adj=8.75e-04 lipid storage disease (DOID:9455) 3/5 p.adj=1.01e-03 lysosomal storage disease (DOID:3211) 3/5 p.adj=1.02e-03 polycystic ovary syndrome (DOID:11612) 3/5 p.adj=1.17e-03 lipodystrophy (DOID:811) 2/5 p.adj=1.78e-03 obesity (DOID:9970) 3/5 p.adj=3.41e-03 overnutrition (DOID:654) 3/5 p.adj=3.41e-03 inherited metabolic disorder (DOID:655) 3/5 p.adj=3.41e-03 nutrition disease (DOID:374) 3/5 p.adj=3.41e-03 retinal vascular disease (DOID:2462) 2/5 p.adj=3.41e-03 diabetic retinopathy (DOID:8947) 2/5 p.adj=3.41e-03 type 2 diabetes mellitus (DOID:9352) 2/5 p.adj=2.77e-02 pre-eclampsia (DOID:10591) 2/5 p.adj=4.31e-02,Triglyceride catabolism (R-HSA-163560) 2/7 p.adj=2.95e-03 Triglyceride metabolism (R-HSA-8979227) 2/7 p.adj=3.16e-03 Transcriptional regulation of white adipocyte differentiation (R-HSA-381340) 2/7 p.adj=1.19e-02,,,,,,,,,,,,,,,
1,,"carboxylic acid binding (GO:0031406) 3/10 p.adj=3.11e-03 organic acid binding (GO:0043177) 3/10 p.adj=3.11e-03 structural constituent of nuclear pore (GO:0017056) 2/10 p.adj=3.11e-03 aminoacyl-tRNA ligase activity (GO:0004812) 2/10 p.adj=3.11e-03 ligase activity, forming carbon-oxygen bonds (GO:0016875) 2/10 p.adj=3.11e-03 tRNA binding (GO:0000049) 2/10 p.adj=4.67e-03 amino acid binding (GO:0016597) 2/10 p.adj=4.83e-03 catalytic activity, acting on RNA (GO:0140098) 3/10 p.adj=7.39e-03 catalytic activity, acting on a tRNA (GO:0140101) 2/10 p.adj=1.04e-02 ligase activity (GO:0016874) 2/10 p.adj=1.28e-02",nuclear pore (GO:0005643) 2/11 p.adj=4.73e-02 nuclear envelope (GO:0005635) 3/11 p.adj=4.79e-02 organelle outer membrane (GO:0031968) 2/11 p.adj=4.79e-02 outer membrane (GO:0019867) 2/11 p.adj=4.79e-02,,Charcot-Marie-Tooth disease (DOID:10595) 2/4 p.adj=1.40e-03 neuromuscular disease (DOID:440) 2/4 p.adj=4.59e-03 neuropathy (DOID:870) 2/4 p.adj=1.42e-02,Cytosolic tRNA aminoacylation (R-HSA-379716) 2/8 p.adj=1.30e-02 tRNA Aminoacylation (R-HSA-379724) 2/8 p.adj=1.83e-02,,,,,,,,,,,,,,,
2,"defense response to other organism (GO:0098542) 8/9 p.adj=1.41e-10 defense response to virus (GO:0051607) 7/9 p.adj=1.41e-10 type I interferon signaling pathway (GO:0060337) 6/9 p.adj=1.41e-10 cellular response to type I interferon (GO:0071357) 6/9 p.adj=1.41e-10 response to type I interferon (GO:0034340) 6/9 p.adj=1.41e-10 response to virus (GO:0009615) 7/9 p.adj=1.00e-09 regulation of viral genome replication (GO:0045069) 5/9 p.adj=2.09e-08 viral genome replication (GO:0019079) 5/9 p.adj=6.68e-08 regulation of viral life cycle (GO:1903900) 5/9 p.adj=1.48e-07 negative regulation of viral genome replication (GO:0045071) 4/9 p.adj=3.67e-07 regulation of viral process (GO:0050792) 5/9 p.adj=6.04e-07 regulation of symbiosis, encompassing mutualism through parasitism (GO:0043903) 5/9 p.adj=8.01e-07 negative regulation of viral life cycle (GO:1903901) 4/9 p.adj=1.10e-06 negative regulation of viral process (GO:0048525) 4/9 p.adj=1.89e-06 viral life cycle (GO:0019058) 5/9 p.adj=5.13e-06 regulation of multi-organism process (GO:0043900) 5/9 p.adj=1.19e-05 negative regulation of multi-organism process (GO:0043901) 4/9 p.adj=1.31e-05 extrinsic apoptotic signaling pathway (GO:0097191) 2/9 p.adj=2.70e-02",GTP binding (GO:0005525) 2/9 p.adj=3.56e-02 purine ribonucleoside binding (GO:0032550) 2/9 p.adj=3.56e-02 purine nucleoside binding (GO:0001883) 2/9 p.adj=3.56e-02 ribonucleoside binding (GO:0032549) 2/9 p.adj=3.56e-02 nucleoside binding (GO:0001882) 2/9 p.adj=3.56e-02 guanyl nucleotide binding (GO:0019001) 2/9 p.adj=3.56e-02 guanyl ribonucleotide binding (GO:0032561) 2/9 p.adj=3.56e-02,,,influenza (DOID:8469) 2/4 p.adj=1.98e-02 multiple sclerosis (DOID:2377) 2/4 p.adj=1.98e-02 demyelinating disease (DOID:3213) 2/4 p.adj=1.98e-02 hepatitis C (DOID:1883) 2/4 p.adj=2.35e-02 hepatitis (DOID:2237) 2/4 p.adj=4.87e-02,Interferon alpha/beta signaling (R-HSA-909733) 6/6 p.adj=9.39e-13 Interferon Signaling (R-HSA-913531) 6/6 p.adj=2.88e-10 Antiviral mechanism by IFN-stimulated 
genes (R-HSA-1169410) 4/6 p.adj=2.38e-07 ISG15 antiviral mechanism (R-HSA-1169408) 3/6 p.adj=2.50e-05,,,,,,,,,,,,,,,
3,regulation of catecholamine secretion (GO:0050433) 3/8 p.adj=5.31e-04 catecholamine secretion (GO:0050432) 3/8 p.adj=5.31e-04 catecholamine transport (GO:0051937) 3/8 p.adj=6.32e-04 monoamine transport (GO:0015844) 3/8 p.adj=7.12e-04 regulation of amine transport (GO:0051952) 3/8 p.adj=7.12e-04 amine transport (GO:0015837) 3/8 p.adj=7.53e-04 negative regulation of peptide secretion (GO:0002792) 3/8 p.adj=1.32e-03 negative regulation of catecholamine secretion (GO:0033604) 2/8 p.adj=1.32e-03 adult behavior (GO:0030534) 3/8 p.adj=1.42e-03 negative regulation of secretion by cell (GO:1903531) 3/8 p.adj=3.01e-03 negative regulation of amine transport (GO:0051953) 2/8 p.adj=3.01e-03 positive regulation of amine transport (GO:0051954) 2/8 p.adj=3.51e-03 negative regulation of secretion (GO:0051048) 3/8 p.adj=3.64e-03 organic hydroxy compound transport (GO:0015850) 3/8 p.adj=4.66e-03 negative regulation of peptide hormone secretion (GO:0090278) 2/8 p.adj=4.70e-03 negative regulation of hormone secretion (GO:0046888) 2/8 p.adj=9.29e-03 regulation of protein secretion (GO:0050708) 3/8 p.adj=1.62e-02 negative regulation of transport (GO:0051051) 3/8 p.adj=1.62e-02 signal release (GO:0023061) 3/8 p.adj=1.62e-02 regulation of peptide secretion (GO:0002791) 3/8 p.adj=1.62e-02 regulation of neuron projection development (GO:0010975) 3/8 p.adj=1.62e-02 cellular response to calcium ion (GO:0071277) 2/8 p.adj=1.62e-02 negative regulation of protein secretion (GO:0050709) 2/8 p.adj=1.62e-02 response to calcium ion (GO:0051592) 2/8 p.adj=2.01e-02 negative regulation of protein transport (GO:0051224) 2/8 p.adj=2.27e-02 negative regulation of establishment of protein localization (GO:1904950) 2/8 p.adj=2.27e-02 regulation of blood pressure (GO:0008217) 2/8 p.adj=2.27e-02 regulation of insulin secretion (GO:0050796) 2/8 p.adj=2.27e-02 cellular response to metal ion (GO:0071248) 2/8 p.adj=2.48e-02 insulin secretion (GO:0030073) 2/8 p.adj=2.72e-02 regulation of peptide hormone secretion 
(GO:0090276) 2/8 p.adj=2.72e-02 cellular response to inorganic substance (GO:0071241) 2/8 p.adj=2.72e-02 peptide hormone secretion (GO:0030072) 2/8 p.adj=3.06e-02 regulation of hormone secretion (GO:0046883) 2/8 p.adj=3.21e-02 positive regulation of neuron projection development (GO:0010976) 2/8 p.adj=3.22e-02 developmental maturation (GO:0021700) 2/8 p.adj=3.33e-02 hormone secretion (GO:0046879) 2/8 p.adj=3.79e-02 hormone transport (GO:0009914) 2/8 p.adj=3.94e-02 response to metal ion (GO:0010038) 2/8 p.adj=4.31e-02 positive regulation of neuron differentiation (GO:0045666) 2/8 p.adj=4.36e-02 positive regulation of cell projection organization (GO:0031346) 2/8 p.adj=4.50e-02 positive regulation of secretion by cell (GO:1903532) 2/8 p.adj=4.57e-02 synapse organization (GO:0050808) 2/8 p.adj=4.63e-02 positive regulation of secretion (GO:0051047) 2/8 p.adj=4.87e-02,calcium-dependent phospholipid binding (GO:0005544) 2/6 p.adj=2.23e-03 hormone activity (GO:0005179) 2/6 p.adj=5.49e-03 phospholipid binding (GO:0005543) 2/6 p.adj=2.53e-02 receptor ligand activity (GO:0048018) 2/6 p.adj=2.53e-02 receptor regulator activity (GO:0030545) 2/6 p.adj=2.55e-02,transport vesicle membrane (GO:0030658) 2/8 p.adj=2.81e-02 glutamatergic synapse (GO:0098978) 2/8 p.adj=3.98e-02 transport vesicle (GO:0030133) 2/8 p.adj=4.42e-02,,Creutzfeldt-Jakob disease (DOID:11949) 2/5 p.adj=2.21e-03 prion disease (DOID:649) 2/5 p.adj=2.55e-03 Alzheimer's disease (DOID:10652) 3/5 p.adj=2.11e-02 tauopathy (DOID:680) 3/5 p.adj=2.11e-02 brain disease (DOID:936) 3/5 p.adj=2.11e-02 lateral sclerosis (DOID:230) 2/5 p.adj=2.11e-02 amyotrophic lateral sclerosis (DOID:332) 2/5 p.adj=2.75e-02 motor neuron disease (DOID:231) 2/5 p.adj=3.49e-02,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,,,,,,
7,embryonic skeletal system morphogenesis (GO:0048704) 3/3 p.adj=5.38e-06 embryonic skeletal system development (GO:0048706) 3/3 p.adj=6.16e-06 anterior/posterior pattern specification (GO:0009952) 3/3 p.adj=2.19e-05 skeletal system morphogenesis (GO:0048705) 3/3 p.adj=2.19e-05 embryonic organ morphogenesis (GO:0048562) 3/3 p.adj=3.28e-05 regionalization (GO:0003002) 3/3 p.adj=4.64e-05 embryonic organ development (GO:0048568) 3/3 p.adj=7.00e-05 pattern specification process (GO:0007389) 3/3 p.adj=7.00e-05 skeletal system development (GO:0001501) 3/3 p.adj=9.70e-05,"RNA polymerase II distal enhancer sequence-specific DNA binding (GO:0000980) 2/3 p.adj=3.32e-04 enhancer sequence-specific DNA binding (GO:0001158) 2/3 p.adj=3.32e-04 enhancer binding (GO:0035326) 2/3 p.adj=3.32e-04 DNA-binding transcription activator activity, RNA polymerase II-specific (GO:0001228) 2/3 p.adj=2.76e-03",,,,,,,,,,,,,,,,,,,
8,response to ionizing radiation (GO:0010212) 2/3 p.adj=1.19e-02 response to radiation (GO:0009314) 2/3 p.adj=2.59e-02,carboxylic acid binding (GO:0031406) 2/3 p.adj=6.05e-03 organic acid binding (GO:0043177) 2/3 p.adj=6.05e-03,,,,,,,,,,,,,,,,,,,
9,epidermis development (GO:0008544) 2/3 p.adj=2.09e-02,,,,,,,,,,,,,,,,,,,,


In [6]:
# `all_results` is a matrix whose cells are list elements, so the data frame
# built from it has list columns; convert each one to a plain character
# column (assigning into `df2[]` keeps the data-frame shape and row names).
df2 <- as.data.frame(all_results)
df2[] <- lapply(X = df2, FUN = as.character)

# Append the enrichment columns to the right of the original table.
table <- cbind(df, df2)

In [7]:
# Write the combined table as a quoted, tab-separated file whose name carries
# the gene-set column that was analysed, then show the path.
out_prefix <- "Supplementary_tables_and_data/TableS2/TableS2_potentially_novel.with_enrichment"
fname <- paste0(out_prefix, "_", target_column, ".tsv")
write.table(
    table,
    file = fname,
    sep = "\t",
    quote = TRUE,
    row.names = TRUE,
    col.names = TRUE
)
fname