In [35]:
import pandas as pd, numpy as np

In [99]:
def load_kobas_analysis(filepath):
    '''
        Read a KOBAS analysis export and return it as a cleaned-up dataframe.

        The file is parsed as tab-separated text with `header=3`, i.e. the
        column names come from the line pandas counts as row 3 (0-based).
        The 'Corrected P-Value' column is converted to numbers and every row
        whose value cannot be converted is discarded. Surviving rows keep
        their original index labels.
    '''
    # parse the tab-separated KOBAS 3.0 result file
    with open(filepath) as handle:
        table = pd.read_csv(handle, sep='\t', header=3)
    # anything that is not a number becomes NaN ...
    corrected = pd.to_numeric(table['Corrected P-Value'], errors='coerce')
    table['Corrected P-Value'] = corrected
    # ... and the rows holding NaN are filtered out
    return table[corrected.notnull()]

In [100]:
# load the data from the KOBAS genetic ontology analysis
# (the loader drops every row whose corrected p-value is not numeric)
genes_enriched = load_kobas_analysis('./gene datasets/coronavirus-kobas-analysis.txt')
# last expression of the cell: displays the loaded dataframe
genes_enriched

Unnamed: 0,Term,Database,ID,Input number,Background number,P-Value,Corrected P-Value,Input,Hyperlink
0,Immune System,Reactome,R-HSA-168256,106,2096,8.55023011727e-80,6.068098e-76,BCL2L1|PTGS2|LAT2|HSPA8|AKT1|IKBKE|EGF|MS4A1|S...,http://www.reactome.org/cgi-bin/eventbrowser_s...
1,Cytokine Signaling in Immune system,Reactome,R-HSA-1280215,74,836,1.76959142298e-70,6.279395e-67,BCL2L1|TGFB1|MX1|CD40LG|CCL4|PTGS2|HLA-C|HSPA8...,http://www.reactome.org/cgi-bin/eventbrowser_s...
2,Signaling by Interleukins,Reactome,R-HSA-449147,57,619,1.11021504614e-54,2.626399e-51,BCL2L1|TGFB1|CCL4|PTGS2|HSPA8|PSMD1|HSP90B1|SM...,http://www.reactome.org/cgi-bin/eventbrowser_s...
3,Epstein-Barr virus infection,KEGG PATHWAY,hsa05169,41,201,4.58478942438e-52,8.134563e-49,HLA-C|PSMD1|BAX|USP7|MAPK14|AKT1|IRF7|IKBKE|DD...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
4,Influenza A,KEGG PATHWAY,hsa05164,37,167,3.43465687378e-48,4.875152e-45,CCL2|CCL5|BAX|MX1|AKT1|IKBKE|IL1A|IL1B|DDX58|S...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
5,Measles,KEGG PATHWAY,hsa05162,35,138,2.04271344642e-47,2.416190e-44,BCL2L1|IL2|HSPA8|BAX|MX1|CD209|IKBKE|IL1A|IL1B...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
6,Pathways in cancer,KEGG PATHWAY,hsa05200,48,530,1.29871005927e-45,1.316706e-42,BCL2L1|TGFB1|IL13|PTGS2|SMAD4|IL2|BIRC3|HSP90B...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
7,Hepatitis B,KEGG PATHWAY,hsa05161,35,163,3.4404629999e-45,3.052121e-42,TGFB1|SMAD4|BAX|SMAD3|MAPK14|AKT1|IKBKE|DDX58|...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
8,Innate Immune System,Reactome,R-HSA-168249,58,1043,4.99274745754e-44,3.937059e-41,BCL2L1|CRP|LAT2|HLA-C|HSPA8|BIRC3|HSP90B1|PSMD...,http://www.reactome.org/cgi-bin/eventbrowser_s...
9,Toll-like receptor signaling pathway,KEGG PATHWAY,hsa04620,31,104,5.93456277408e-44,4.211759e-41,CCL3|CCL4|CCL5|MAPK14|AKT1|IKBKE|IL1B|IFNB1|ST...,http://www.genome.jp/kegg-bin/show_pathway?hsa...


In [109]:
def get_most_significant_genes(df, top=5):
    '''
        Return the set of unique genes found in the first `top` rows of the kobas data.

        Rows are taken in the order they appear in `df`; the function does not
        sort, so `df` is expected to already list the most significant terms
        (smallest corrected p-value) first.

        Parameters
        ----------
        df : pandas.DataFrame
            Needs a 'Term' column and an 'Input' column whose values are
            '|'-separated gene names.
        top : int, default 5
            Number of leading rows to collect genes from.

        Returns
        -------
        set of str
            Union of the gene names of the selected rows.

        Side effect: prints one line per selected row with its gene count and term.
    '''
    # Select rows by position. The index labels may have gaps (e.g. after rows
    # were filtered out), so looking up `Term` with a 0..n counter is not safe.
    leading = df.iloc[:top]
    significant_genes = set()
    for process, genes_str in zip(leading['Term'], leading['Input']):
        genes = genes_str.split('|')
        print(f'{len(genes)} from {process}')
        significant_genes.update(genes)
    return significant_genes

In [104]:
# get the most significant genes to form the enriched set
# (`top` is left at its default of 5, so genes come from the first 5 rows)
top_genes = get_most_significant_genes(genes_enriched)
# report how many unique genes were collected, followed by the set itself
print(f'{len(top_genes)} most significant unique genes: {top_genes}')

106 from Immune System
74 from Cytokine Signaling in Immune system
57 from Signaling by Interleukins
41 from Epstein-Barr virus infection
37 from Influenza A
119 most significant unique genes: {'RPS6KA3', 'IL2', 'IRF3', 'BCL2', 'GLB1', 'RAB3D', 'PABPN1', 'IL18', 'CCL2', 'ADAM10', 'CCL5', 'CLEC4G', 'NOS2', 'CD68', 'IL10', 'BIRC3', 'IL7R', 'MX1', 'MBL2', 'PRKCA', 'PPIA', 'IL8', 'CYCS', 'CRP', 'ANPEP', 'CD46', 'HLA-C', 'MBP', 'ARF1', 'TUBB4', 'PSMD2', 'SOD1', 'IFNA1', 'ICAM3', 'SLC25A4', 'STUB1', 'SMAD3', 'USP7', 'RB1', 'BCL2L1', 'STAT1', 'IL6', 'TNF', 'FGL2', 'TP53', 'CTSL1', 'E2F1', 'RUNX3', 'VIM', 'CDK4', 'IFNG', 'IL28A', 'FCER2', 'NFKB1', 'CDA', 'IFNA2', 'MAPK3', 'IFNB1', 'KIT', 'CD40LG', 'FAS', 'CST3', 'DDX58', 'MAVS', 'HSPA8', 'IRF1', 'CXCL10', 'RHOA', 'CD44', 'ACAA1', 'IL1B', 'CCL3', 'HSP90B1', 'CD40', 'MYC', 'IL13', 'TBK1', 'LAT2', 'IL1A', 'ITGB2', 'LTF', 'BAX', 'CD79A', 'IL5', 'CD14', 'ISG15', 'MAPK1', 'CD4', 'MAPK14', 'CCL4', 'IMPDH1', 'MS4A1', 'MAPK8', 'MYD88', 'OAS1', 'CD80', 

In [84]:
# Persist the enriched gene set, one gene symbol per line.
# The set is sorted first: set iteration order is arbitrary and can change
# between kernel sessions, which would make the saved file differ from run to run.
with open('./gene datasets/coronavirus-kobas-refined.csv', 'w') as f:
    np.savetxt(f, sorted(top_genes), delimiter=',', fmt='%s')
# report success only once the file has been flushed and closed
print('Saved successfully!')

Saved successfully!


In [106]:
# after the initial refinement we repeat the process for the most relevant pathways
# load the data from the KOBAS pathways analysis
# (the loader drops every row whose corrected p-value is not numeric)
genes_pathways = load_kobas_analysis('./gene datasets/coronavirus-kobas-pathways.txt')
# last expression of the cell: displays the loaded dataframe
genes_pathways

Unnamed: 0,Term,Database,ID,Input number,Background number,P-Value,Corrected P-Value,Input,Hyperlink
0,Immune System,Reactome,R-HSA-168256,106,2096,1.83869744403e-134,2.015212e-131,BCL2L1|PTGS2|LAT2|HSPA8|AKT1|IKBKE|EGF|MS4A1|S...,http://www.reactome.org/cgi-bin/eventbrowser_s...
1,Cytokine Signaling in Immune system,Reactome,R-HSA-1280215,74,836,7.91128500605e-102,4.335384e-99,BCL2L1|TGFB1|MX1|CD40LG|CCL4|PTGS2|HLA-C|HSPA8...,http://www.reactome.org/cgi-bin/eventbrowser_s...
2,Signaling by Interleukins,Reactome,R-HSA-449147,57,619,5.8157014156e-77,2.124670e-74,BCL2L1|TGFB1|CCL4|PTGS2|HSPA8|PSMD1|HSP90B1|PS...,http://www.reactome.org/cgi-bin/eventbrowser_s...
3,Epstein-Barr virus infection,KEGG PATHWAY,hsa05169,41,201,2.57440666593e-67,7.053874e-65,HLA-C|PSMD1|BAX|USP7|MAPK14|AKT1|IRF7|IKBKE|DD...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
4,Innate Immune System,Reactome,R-HSA-168249,58,1043,2.27180487681e-66,4.979796e-64,BCL2L1|CRP|MAVS|LAT2|HLA-C|HSPA8|BIRC3|HSP90B1...,http://www.reactome.org/cgi-bin/eventbrowser_s...
5,Influenza A,KEGG PATHWAY,hsa05164,37,167,9.21705758035e-62,1.683649e-59,CCL2|CCL5|BAX|MX1|AKT1|IKBKE|IL1A|IL1B|DDX58|S...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
6,Measles,KEGG PATHWAY,hsa05162,34,138,4.71658176374e-58,7.384819e-56,BCL2L1|IL2|HSPA8|BAX|MX1|CD209|IKBKE|IL1A|IL1B...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
7,Hepatitis B,KEGG PATHWAY,hsa05161,34,163,7.35706486878e-56,1.007918e-53,TGFB1|BAX|SMAD3|MAPK14|AKT1|IKBKE|DDX58|STAT1|...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
8,Toll-like receptor signaling pathway,KEGG PATHWAY,hsa04620,30,104,5.94732530756e-53,7.242521e-51,CCL3|CCL4|CCL5|MAPK14|AKT1|IKBKE|IL1B|CD86|STA...,http://www.genome.jp/kegg-bin/show_pathway?hsa...
9,Hepatitis C,KEGG PATHWAY,hsa05160,31,155,2.3212829674e-50,2.544126e-48,IKBKE|EGFR|BAX|MX1|AKT1|CXCL10|EGF|DDX58|STAT1...,http://www.genome.jp/kegg-bin/show_pathway?hsa...


In [113]:
# get the most significant genes to form the further enriched set
top_genes_pathways = get_most_significant_genes(genes_pathways, top=5)
# the count and the printed genes must come from the same variable:
# report the pathway-based set, not the earlier `top_genes`
print(f'{len(top_genes_pathways)} most significant unique genes: {top_genes_pathways}')

106 from Immune System
74 from Cytokine Signaling in Immune system
57 from Signaling by Interleukins
41 from Epstein-Barr virus infection
58 from Innate Immune System
116 most significant unique genes: {'RPS6KA3', 'IL2', 'IRF3', 'BCL2', 'GLB1', 'RAB3D', 'IL18', 'CCL2', 'ADAM10', 'CCL5', 'CLEC4G', 'NOS2', 'CD68', 'IL10', 'BIRC3', 'IL7R', 'MX1', 'MBL2', 'PPIA', 'IL8', 'CYCS', 'CRP', 'ANPEP', 'CD46', 'HLA-C', 'MBP', 'ARF1', 'TUBB4', 'PSMD2', 'SOD1', 'IFNA1', 'ICAM3', 'STUB1', 'SMAD3', 'USP7', 'RB1', 'BCL2L1', 'STAT1', 'IL6', 'TNF', 'FGL2', 'TP53', 'CTSL1', 'E2F1', 'RUNX3', 'VIM', 'CDK4', 'IFNG', 'IL28A', 'FCER2', 'NFKB1', 'CDA', 'IFNA2', 'MAPK3', 'IFNB1', 'KIT', 'CD40LG', 'FAS', 'CST3', 'DDX58', 'MAVS', 'HSPA8', 'IRF1', 'CXCL10', 'RHOA', 'CD44', 'ACAA1', 'IL1B', 'CCL3', 'HSP90B1', 'CD40', 'MYC', 'IL13', 'TBK1', 'LAT2', 'IL1A', 'ITGB2', 'LTF', 'BAX', 'CD79A', 'IL5', 'CD14', 'ISG15', 'MAPK1', 'CD4', 'MAPK14', 'CCL4', 'IMPDH1', 'MS4A1', 'MAPK8', 'MYD88', 'OAS1', 'CD80', 'PSMD1', 'TRAF3', 'TG

In [114]:
# Persist the pathway-refined gene set, one gene symbol per line.
# The set is sorted first: set iteration order is arbitrary and can change
# between kernel sessions, which would make the saved file differ from run to run.
with open('./gene datasets/coronavirus-kobas-pathways-refined.csv', 'w') as f:
    np.savetxt(f, sorted(top_genes_pathways), delimiter=',', fmt='%s')
# report success only once the file has been flushed and closed
print('Saved successfully!')

Saved successfully!
