# Benchmarking of pathway databases using Over Representation Analysis (ORA)



Import required modules

In [1]:
import urllib.request

from pathway_forte.pathway_enrichment.over_representation import perform_hypergeometric_test

# TODO: Filter genes by fold change to identify the significant gene set

Load the GMT files (KEGG, Reactome, WikiPathways, and the merged gene sets)

In [2]:
# All GMT (gene set) files live in the same directory of the pathway-forte
# GitHub repository; only the file name differs per database.
_GMT_FILES_BASE_URL = 'https://raw.githubusercontent.com/pathwayforte/pathway-forte/master/data/gmt_files/'

KEGG_GMT_FILE = _GMT_FILES_BASE_URL + 'kegg_geneset_final.gmt'
REACTOME_GMT_FILE = _GMT_FILES_BASE_URL + 'reactome_geneset_final.gmt'
WIKIPATHWAYS_GMT_FILE = _GMT_FILES_BASE_URL + 'wikipathways_geneset_final.gmt'
MERGE_GMT_FILE = _GMT_FILES_BASE_URL + 'concatenated_merge.gmt'

In [3]:
def gmt_to_dict(url):
    """Fetch a GMT file and parse it into a pathway-to-gene-set mapping.

    Every non-blank line is split on tab characters. The first two fields
    form the dictionary key (in the files used by this notebook they look
    like ``('hsa04110', 'kegg')``) and all remaining fields are collected
    into the gene set for that key.

    :param url: Location of the GMT file; anything accepted by
        ``urllib.request.urlopen``.
    :return: Dictionary mapping a tuple of the first two fields of each
        line to the set of gene names found on that line.
    :raises urllib.error.URLError: If the file cannot be retrieved.
    """
    with urllib.request.urlopen(url) as response:
        raw_lines = response.readlines()

    pathway_to_geneset = {}

    for raw_line in raw_lines:
        # Decode once and drop the line terminator so that it cannot leak
        # into the key when a line has no gene fields.
        line = raw_line.decode().rstrip('\r\n')

        # Blank lines (e.g. a trailing empty line) carry no pathway.
        if not line.strip():
            continue

        fields = line.split('\t')
        stripped_genes = (gene.strip() for gene in fields[2:])

        # Empty fields (trailing or doubled tabs) are not genes; keeping them
        # would inflate the gene set size used by the enrichment test.
        pathway_to_geneset[tuple(fields[:2])] = {
            gene
            for gene in stripped_genes
            if gene
        }

    return pathway_to_geneset

In [4]:
# Download and parse each GMT file into its own pathway-to-gene-set dictionary.
# Every call opens the corresponding URL, so this cell requires network access.
kegg_gene_sets = gmt_to_dict(KEGG_GMT_FILE)
reactome_gene_sets = gmt_to_dict(REACTOME_GMT_FILE)
wp_gene_sets = gmt_to_dict(WIKIPATHWAYS_GMT_FILE)
merge_gene_sets = gmt_to_dict(MERGE_GMT_FILE)

Apply the hypergeometric test to the genes found to be significant

In [13]:
# Example query: the genes to test for over-representation in KEGG pathways.
query_genes = {'AKT1', 'APP', 'MAPT', 'CDK1', 'TP53'}

# NOTE(review): apply_threshold=True presumably keeps only the pathways that
# pass a significance cut-off -- confirm against the pathway_forte API.
results = perform_hypergeometric_test(query_genes, kegg_gene_sets, apply_threshold=True)

In [14]:
# Display the mapping returned by the hypergeometric test for the KEGG gene sets.
results

{('hsa01522', 'kegg'): 0.001,
 ('hsa01524', 'kegg'): 0.0009,
 ('hsa04010', 'kegg'): 0.0005,
 ('hsa04071', 'kegg'): 0.0013,
 ('hsa04110', 'kegg'): 0.0013,
 ('hsa04115', 'kegg'): 0.0009,
 ('hsa04151', 'kegg'): 0.0056,
 ('hsa04210', 'kegg'): 0.0014,
 ('hsa04211', 'kegg'): 0.001,
 ('hsa04218', 'kegg'): 0.0002,
 ('hsa04722', 'kegg'): 0.0013,
 ('hsa04914', 'kegg'): 0.001,
 ('hsa04919', 'kegg'): 0.0013,
 ('hsa05010', 'kegg'): 0.0017,
 ('hsa05160', 'kegg'): 0.0016,
 ('hsa05161', 'kegg'): 0.0017,
 ('hsa05162', 'kegg'): 0.0014,
 ('hsa05163', 'kegg'): 0.0024,
 ('hsa05165', 'kegg'): 0.005,
 ('hsa05166', 'kegg'): 0.0023,
 ('hsa05167', 'kegg'): 0.0019,
 ('hsa05168', 'kegg'): 0.0019,
 ('hsa05169', 'kegg'): 0.0021,
 ('hsa05170', 'kegg'): 0.0022,
 ('hsa05203', 'kegg'): 0.0021,
 ('hsa05205', 'kegg'): 0.0021,
 ('hsa05210', 'kegg'): 0.001,
 ('hsa05212', 'kegg'): 0.0009,
 ('hsa05213', 'kegg'): 0.0009,
 ('hsa05214', 'kegg'): 0.0009,
 ('hsa05215', 'kegg'): 0.001,
 ('hsa05218', 'kegg'): 0.0009,
 ('hsa05220', 