In [14]:
#from .. import utils
import numpy as np
import scipy.stats as sstats

from collections import namedtuple

def set_enrichment(your_set, other_set, universe, abcd_values=False):
    """
    Perform set enrichment using either a fisher exact test or the chi2 test.
    parameters:
    -----------
    your_set:  list. Elements you want to test for enrichment
    other_set: list. Elements you want to see whether they are enriched in your_set
    universe:  list. Total universe of elements
    abcd_values: Boolean. If True, it will return the actual element values in the contingency table, rather than just counts
    
    returns:
    Named tuple with:
        * oddsratio: fisher oddsratio
        * c2statistic : chi2 test statistic
        * pvalue : pvalue of test
        * table: contingency table [ [a,b],[c,d] ]
           - a: Overlap of the two sets
           - b: What is in other_set but not in your_set
           - c: what is in your_set but not in other_set
           - d: What is in universe but not in your_set or other_set
        * method : fisher|chi2
    """

    
    resTuple = namedtuple("setEnrichmentResult", [ 'oddsratio', 'c2statistic', 'pvalue', 'table', 'method'])

    universe  = set(universe)
    your_set  = set(your_set) & universe
    other_set = set(other_set) & universe
    
    a = your_set & other_set
    b = other_set - your_set
    c = your_set - other_set
    d = universe - (your_set | other_set)
    
    table = [ [len(a), len(b)], [len(c), len(d)]]
    print(table)
    if min(min(table)) <= 5:
        method = 'fisher'
        oddsratio, p = sstats.fisher_exact(table)
        #print(oddsratio, p)
        chi2 = None
    else:
        method = 'chi2'
        chi2, p, dof, expected = sstats.chi2_contingency(table)
        oddsratio = 100
        if table[1][0] > 0 and table[0][1] > 0:
            oddsratio = table[0][0] * table[1][1] / (table[1][0] * table[0][1])
        else:
            oddsratio = np.inf
        #print(dof, p, expected, oddsratio)
        #fi
    #fi
    if abcd_values:
        return resTuple(oddsratio, chi2, p, [[a,b],[c,d]], method)
    else:
        return resTuple(oddsratio, chi2, p, table, method)
    #fi
#edef
def main():
    my_set = open("test_list_genes_your_set.txt").readlines()
    other_set2 = open("test_list_genes.txt").readlines()
    universe2 = open("test_list_genes_homo_sapien.txt").readlines()
    print(set_enrichment(my_set, other_set2, universe2, abcd_values=True)) 


main() 


[[842, 367], [7894, 30593]]
setEnrichmentResult(oddsratio=8.89141685921567, c2statistic=1645.791263061754, pvalue=0.0, table=[[{'701\n', '291\n', '2690\n', '2581\n', '4088\n', '88\n', '208\n', '4548\n', '53938\n', '1316\n', '2033\n', '2006\n', '2580\n', '5144\n', '8622\n', '1786\n', '84678\n', '607\n', '79695\n', '60489\n', '23414\n', '5521\n', '10950\n', '2180\n', '117289\n', '9138\n', '3956\n', '2153\n', '5578\n', '57534\n', '1499\n', '7134\n', '56916\n', '3777\n', '6224\n', '23178\n', '1639\n', '64122\n', '639\n', '5580\n', '6899\n', '4629\n', '7277\n', '9760\n', '3329\n', '3683\n', '30815\n', '23108\n', '11222\n', '275\n', '1837\n', '5168\n', '6310\n', '2597\n', '4330\n', '2778\n', '4739\n', '4650\n', '5424\n', '23533\n', '9046\n', '9315\n', '201633\n', '6547\n', '5352\n', '3909\n', '6389\n', '23640\n', '80331\n', '1630\n', '8482\n', '4208\n', '4092\n', '335\n', '7071\n', '25\n', '5122\n', '831\n', '1975\n', '51741\n', '9759\n', '1977\n', '79058\n', '3075\n', '3417\n', '4282\n', '9