## ncHGT.py code

In [10]:
import numpy as np
import sys
sys.path.append('../GOCAM_Project/dev')

import rpy2
from rpy2.robjects.packages import importr
BiasedUrn = importr('BiasedUrn')

import utils

def get_M_wM():
    """ returns M, the number of entities in the background, and w_M, the mean size of entities in the background"""
    setID2members = utils.csv2dict('../data/setID2members.csv')
    l = []
    for s,m in setID2members.items():
        l.append(len(m))
    l = np.array(l)
    l = np.sort(l)
    num_empty_sets = np.sum(l==0)
    
    l = l[l!=0]
    mean = np.mean(l)#l[4:-4]) 1% trimmed mean?
    num_sets = len(l)
    bg = len(utils.csv2dict('../data/ID2gocam_mouse.csv'))
    M = bg-num_empty_sets
    
    w_M = np.round(((M-num_sets)+num_sets*mean)/M,decimals=2)
    return M, w_M

def get_M_wM_XT_incorrect(num_bins = 1):
    """ returns M, the number of entities in the background, and w_M, the mean size of entities in the background"""
    setID2members = utils.csv2dict('../data/setID2members.csv')
    bg_sets = []
    for s,m in setID2members.items():
        bg_sets.append(len(m))
    bg_sets = np.array(bg_sets)
    bg_sets = np.sort(bg_sets)
    num_empty_sets = np.sum(bg_sets==0)
    
    bg_sets = bg_sets[bg_sets!=0]
    num_sets = len(bg_sets)
    len_bg_dict = len(utils.csv2dict('../data/ID2gocam_mouse.csv'))
    M_weight_1 = len_bg_dict-num_empty_sets-num_sets #background = individual genes + empty sets + sets
    bg = np.concatenate([bg_sets,np.ones(M_weight_1)]) #create a new l that also includes individual genes
    
    quantiles_ = np.quantile(bg,np.arange(num_bins+1)/num_bins)
    quantiles_[0] = -np.inf
    quantiles_[-1] = np.inf

    w_M = []
    M = []
    for i in range(num_bins):
        print('lower: ',quantiles_[i],' upper: ',quantiles_[i+1])
        mask1 = bg >= quantiles_[i]
        mask2 = bg <  quantiles_[i+1]
        bg_ = bg[np.logical_and(mask1, mask2)]
        print('bg_:',bg_)
        w_M.append(round(np.mean(bg_),2))
        M.append(len(bg_))
    
    
    return M, w_M

def get_M_wM_XT(num_bins = 1):
    """ returns M, the number of entities in the background, and w_M, the mean size of entities in the background"""
    setID2members = utils.csv2dict('../data/setID2members.csv')
    bg_sets = []
    for s,m in setID2members.items():
        bg_sets.append(len(m))
    bg_sets = np.array(bg_sets)
    num_empty_sets = np.sum(bg_sets==0)
    
    bg_sets = bg_sets[bg_sets!=0]
    num_sets = len(bg_sets)
    len_bg_dict = len(utils.csv2dict('../data/ID2gocam_mouse.csv'))
    M_weight_1 = len_bg_dict-num_empty_sets-num_sets #background = individual genes + empty sets + sets
    bg = np.concatenate([np.ones(M_weight_1),bg_sets]) #create a new l that also includes individual genes

    bg = np.sort(bg)

    w_M = []
    M = []
    idx = 0
    for i in range(num_bins):
        bg_ = bg[idx:int(idx+len(bg)/num_bins)]
        idx+=int(len(bg)/num_bins)
        w_M.append(round(np.mean(bg_),2))
        M.append(len(bg_))
    
    
    return M, w_M

def make_initial_vectors(gocam2ID,setID2members, gc, M,w_M):
    """initializes counts vector (m) and weights vector (w), where each entity gets its own element in the arrays
- values in m only take on 0 (if there is no solo proteins) or 1
- values in w correspond to the weight of each element in m (weighted by the # genes in a set or 1 for solo proteins)"""
    w_gc = [1] #initialize with 1 as the weight of single proteins (irrespective of whether there are any)
    m_gc = [0] #initialize with 0 single proteins
    num_protein = 0
    for i in gocam2ID.get(gc):
        if "sset:" in i:
            w_i = len(setID2members.get(i))
            w_gc.append(w_i)
            m_gc.append(1)
        else:
            num_protein+=1
    m_gc[0] = num_protein
    m_gc.append(M-np.sum(m_gc)) #entities not in the gocam (roughly)
    w_gc.append(w_M) #weight for entities not in the gocam (all weighted as w_M (the mean))
    return w_gc, m_gc


def make_new_vectors(w_gc,m_gc,M,w_M):
    """compress the m and w vectors by grouping elements according to their weights
- w is the ordered set of unique weights for entities of the gocam + the background bin
- m[i] is the number of entities in the pathway with the weight specified in w[i] + the background bin"""
    w_temp = w_gc[:-1]
    if w_temp[0] != 1:
        print('Possible bug: w_temp[0] != 1',w_temp)
        
    w_new, m_temp = np.unique(w_temp, return_counts=True)
    m_temp[0]=m_gc[0] #w_gc and m_gc have weight 1 as w_gc[0] and the number of single proteins as m_gc[0]
    m_new = np.append(m_temp,np.array([M-np.sum(m_temp)]))
    w_new = np.append(np.unique(w_temp),np.array([w_M]))
    return w_new, m_new




def ncHGT_sf(XT,m,N,w):
    """survival function, sums PMF for all possibilities where K >= k by calling BiasedUrn"""
    #l = len(XT)/len(m)
    if len(XT) == 0:
        print('len(XT) = 0')
        return -1
    pval = 0
    #np.seterr(under='warn')
    ### This could be optimized by setting a threshold and stopping the for loop when the sum exceeds some threshold###
    iterator_ = tqdm.tqdm(XT)
    for xt in iterator_:
        x = rpy2.robjects.IntVector(xt)
        pval = pval + BiasedUrn.dMFNCHypergeo(x,m,N,w, precision = 1e-10)[0]
    return pval

import time

def enumerate_possibilities(m_new,i,prev_array):
    """enumerate all possible counts vectors"""
    
    first = True
    for j in range(m_new[i]+1):
        xt = prev_array.copy()
        xt[0][i] = j
        
        #recursion
        if (i < len(m_new)-1):
            xt = enumerate_possibilities(m_new, i+1, xt) #will return matrix (array of arrays)
            
        #combining results into matrix
        if not first:
            XT = np.concatenate([XT,xt], axis = 0)
        else:
            XT = xt
            first = False
    return XT


def do_ncHGT(k,gc,M,N,bins =2):
    setID2members = utils.csv2dict('../data/setID2members.csv')
    gocam2ID = utils.csv2dict('../data/gocam2ID_mouse.csv')
    
    M, w_M = get_M_wM()
    
    #make weight (w) and bin size (m) vectors where each entity in the gocam gets its own entry
    w_in, m_in = make_initial_vectors(gocam2ID, setID2members, gc, M,w_M)
    
    #update m and w vectors by grouping sets of the same size
    w_new , m_new= make_new_vectors(w_in,m_in,M,w_M)

    #make XT matrix, an enumeration of all possible arangements of balls in bins based on m_new and w_new
    m_gc = m_new[:-1] #don't pass the background bin to XT
    t0 = time.time()
    XT = enumerate_possibilities(m_gc,0,np.zeros(shape=(1,len(m_gc))))
    print(f'time to enumerate: {round(time.time()-t0,3)}')
    
    #filter XT to only include the region of the sample space >= k (which is what we want to sum probabilities over)
    mask1 = (np.sum(XT, axis=1) >= k)
    XT = XT[mask1]

    #filter XT to ensure that more than N entities are not picked
    mask2 = (np.sum(XT, axis=1) <= N)
    XT = XT[mask2]

    #### MODIFIED CODE BELOW #####
    #add the remaining entities to the m+1th bin (non gocam bin)
    M_array, w_M_array = get_M_wM_XT(num_bins = bins)
    #print(M_array)
    #print(w_M_array)
    t0 = time.time()
    xt_bg = enumerate_possibilities(M_array,0,np.zeros(shape=(1,len(M_array)))) #all the ways the background could be sampled
    print(f'time to enumerate bg: {round(time.time()-t0,3)}')
    XT_complete = []
    for k_ in range(k,sum(m_gc)): #group by k
        N_bg = N - k_
        mask_bg = (np.sum(xt_bg, axis=1) == N_bg)
        xt_bg_ = xt_bg[mask_bg]
        XT_slice = XT[np.sum(XT, axis =1) == k_]
        xt_duplicated = np.tile(XT_slice,(len(xt_bg_),1))
        if len(XT_complete) == 0:
            XT_complete = np.concatenate([xt_duplicated,xt_bg_],axis = 1)
        else:
            XT_complete = np.concatenate([XT_complete, (np.concatenate([xt_duplicated,xt_bg_],axis = 1))], axis = 0)
    XT_complete = np.array(XT_complete)   
    #x_mp1_vec = N- np.sum(XT, axis = 1) #number of balls to be drawn from the last bin (the non-gocam background)
    #XT = np.concatenate((XT,x_mp1_vec.reshape(len(x_mp1_vec),1)), axis = 1)
    
    m_new = np.concatenate([m_new[:-1],M_array]) #overwrite the last entry in m_new (dedicated to the background) with  M_array
    w_new = np.concatenate([w_new[:-1],w_M_array]) #overwrite the last entry in w_new (dedicated to the background) with w_M_array
    ##### MODIFIED CODE ABOVE #####
    
    m = rpy2.robjects.IntVector(m_new)
    w = rpy2.robjects.FloatVector(w_new)
    #print(XT_complete)
    #print()
    #print()
    #print(m_new)
    #print(w_new)
    t0 = time.time()
    pval = ncHGT_sf(XT_complete,m,N,w)
    print(f'time to compute pval: {round(time.time()-t0,3)}')
    return pval

def do_ncHGT_old(k,gc,M,N,bins =2):
    setID2members = utils.csv2dict('../data/setID2members.csv')
    gocam2ID = utils.csv2dict('../data/gocam2ID_mouse.csv')
    
    M, w_M = get_M_wM()
    
    #make weight (w) and bin size (m) vectors where each entity in the gocam gets its own entry
    w_in, m_in = make_initial_vectors(gocam2ID, setID2members, gc, M,w_M)
    
    #update m and w vectors by grouping sets of the same size
    w_new , m_new= make_new_vectors(w_in,m_in,M,w_M)

    #make XT matrix, an enumeration of all possible arangements of balls in bins based on m_new and w_new
    m_gc = m_new[:-1] #don't pass the background bin to XT
    t0 = time.time()
    XT = enumerate_possibilities(m_gc,0,np.zeros(shape=(1,len(m_gc))))
    print(f'time to enumerate: {round(time.time()-t0,3)}')
    
    #filter XT to only include the region of the sample space >= k (which is what we want to sum probabilities over)
    mask1 = (np.sum(XT, axis=1) >= k)
    XT = XT[mask1]

    #filter XT to ensure that more than N entities are not picked
    mask2 = (np.sum(XT, axis=1) <= N)
    XT = XT[mask2]

    #### MODIFIED CODE BELOW #####
    #add the remaining entities to the m+1th bin (non gocam bin)
    M_array, w_M_array = get_M_wM_XT(num_bins = bins)
    #print(M_array)
    #print(w_M_array)
    t0 = time.time()
    xt_bg = enumerate_possibilities(M_array,0,np.zeros(shape=(1,len(M_array)))) #all the ways the background could be sampled
    print(f'time to enumerate bg: {round(time.time()-t0,3)}')
    XT_complete = []
    for k_ in range(k,sum(m_gc)): #group by k
        N_bg = N - k_
        mask_bg = (np.sum(xt_bg, axis=1) == N_bg)
        xt_bg_ = xt_bg[mask_bg]
        XT_slice = XT[np.sum(XT, axis =1) == k_]
        xt_duplicated = np.tile(xt,(len(xt_bg_),1))
        if len(XT_complete) == 0:
            XT_complete = np.concatenate([xt_duplicated,np.repeat(xt_bg_,len(XT_slice),axis=0)],axis = 1)
        else:
            XT_complete = np.concatenate([XT_complete, (np.concatenate([xt_duplicated,np.repeat(xt_bg_,len(XT_slice),axis=0)],axis = 1))], axis = 0)
            
    XT_complete = np.array(XT_complete)   
    #x_mp1_vec = N- np.sum(XT, axis = 1) #number of balls to be drawn from the last bin (the non-gocam background)
    #XT = np.concatenate((XT,x_mp1_vec.reshape(len(x_mp1_vec),1)), axis = 1)
    
    m_new = np.concatenate([m_new[:-1],M_array]) #overwrite the last entry in m_new (dedicated to the background) with  M_array
    w_new = np.concatenate([w_new[:-1],w_M_array]) #overwrite the last entry in w_new (dedicated to the background) with w_M_array
    ##### MODIFIED CODE ABOVE #####
    
    m = rpy2.robjects.IntVector(m_new)
    w = rpy2.robjects.FloatVector(w_new)
    #print(XT_complete)
    #print()
    #print()
    #print(m_new)
    #print(w_new)
    t0 = time.time()
    pval = ncHGT_sf(XT_complete,m,N,w)
    print(f'time to compute pval: {round(time.time()-t0,3)}')
    return pval





In [16]:
x = np.array([[1,2],[3,4]])
np.repeat(x, 2, axis = 0)

array([[1, 2],
       [1, 2],
       [3, 4],
       [3, 4]])

Instead, I could make XT the enumeration over np.concatenate([m_gc,M_array]) and mask such that the gocam portion sums to >=k and the whole vector (axis = 1) sums to N

## enrich.py code

In [2]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import hypergeom
import sys
sys.path.append('../GOCAM_Project/dev')
import os
import tqdm

import utils
pd.options.display.max_colwidth = 100

def get_sizes (data): #data= dataframe with gocam IDs and gene identifiers as columns
    """get number of entities in each gocam"""
    return data['gocam'].value_counts()
    
def get_sets (gene_list):
    """map list of genes to all sets that contain members of that list"""
    sets = []
    not_in_a_set = []
    members2setID = utils.csv2dict('../data/members2setID.csv')
    setID2members_input = {}
    for g in gene_list:
        s = members2setID.get(g)
        if s != None:
            sets = sets +s
            for i in s:
                if (i in setID2members_input) == False:
                    setID2members_input[i]={g}
                else:
                    prev = setID2members_input.get(i)
                    prev.add(g)
                    setID2members_input[i] = prev
        else:
            not_in_a_set.append(g)
    return not_in_a_set, list(set(sets)),setID2members_input #remove duplicates

def filter_gene_list(gene_list, Dict):
    """remove members of gene_list that are not in Dict.
    use function to filter a user's input list of genes based on those that appear at least 
    once in the gocam model database"""
    filtered_gene_list = []
    filtered_out = []
    for gene in gene_list:
        if gene in Dict:
            filtered_gene_list.append(gene)
        else:
            filtered_out.append(gene)
    return filtered_out, filtered_gene_list

def count_genes(gene_list, Dict):
    """ count number of genes in user's gene_list that are in each gocam"""
    gocam_counts = {} #key=gocam, value=list of genes in gocam that are also in the user's list
    for g in gene_list: 
            gocams = Dict.get(g)
            for gocam in gocams:
                if (gocam in gocam_counts) == False:
                    gocam_counts[gocam]=[g]
                else:
                    prev = gocam_counts.get(gocam)
                    prev.append(g)
                    gocam_counts[gocam] = prev
    return gocam_counts

#BENJAMINI HOCHBERG CORRECTION applied in correct_pval_and_format()
#ncHGT is either False (indicating that regular HGT should be done) or a positive integer denoting N for ncHGT
def hgt(counts, gocam_sizes, FDR, gene_list_size, background_gene_list_size, ncHGT = False, num_bins = 0):
    """ performs either the hypergeometric test or our introduced test using Fisher's noncentral hypergeometric dist.
    Whether our unweighted set enrichment or the standard HGT is performed is determined upstream based on what
    Dict of gocams->entities and filtered gene_list are passed into count_genes().
    ncHGT is either False (for set or standard methods) or corresponds to N """
    results = []
    iterator = tqdm.tqdm(counts.items())
    for gocam, gene_list in iterator:
        count = len(gene_list) 
        gocam_size = gocam_sizes[gocam]
        pvalue = None
        if ncHGT:
            if count <=1: #avoid unnecessary calls to BiasedUrn due to computation time
                pvalue = 1
            else:
                pvalue = do_ncHGT(count -1,gocam,background_gene_list_size,ncHGT,bins = num_bins)
        else: #set or standard methods
            pvalue = hypergeom.sf(count-1, background_gene_list_size,  gocam_size, gene_list_size) 
        if pvalue < 1: #FDR:
            r = (gocam, pvalue, count, gocam_size, gene_list )
            results.append(r)
    return results

#Benjamini Hochberg correction
def correct_pval_and_format(enriched_gocams, background_num_gocams,FDR):
    """performs Benjamini Hochberg correction to control the false discovery rate and formats output for display"""
    df = pd.DataFrame(enriched_gocams, columns =['url', 'pval (uncorrected)', '# entities in list','#entities in model','shared entities in gocam'])
    df.sort_values('pval (uncorrected)',inplace=True)
    df.reset_index(drop=True, inplace=True)
    df['FDR_val'] = (df.index+1)*FDR/background_num_gocams
    df['Less_than'] = (df['pval (uncorrected)'] < df['FDR_val'])
    index = df.Less_than.where(df.Less_than==True).last_valid_index()
    df_significant = df
    
    df_significant = df.loc[0:index].copy()
    if index == None:
        df_significant = pd.DataFrame(columns =['url', 'pval (uncorrected)', '# entities in list','#entities in model','shared entities in gocam'])
    df_display = df_significant[['url','pval (uncorrected)', '# entities in list', '#entities in model','shared entities in gocam']].copy()
    #modelID2title = pd.read_csv('../data/modelID2title_mouse.csv')
    temp = pd.read_csv('../data/modelID2title_mouse.csv',header = 0,names=['gocam','title'])
    modelID2title = pd.Series(temp.title.values,index=temp.gocam).to_dict()
    df_display['title'] = df_display['url'].map(modelID2title)
    cols = df_display.columns.to_list()
    cols[0]='title'
    cols[-1]='url'
    df_display = df_display[cols]
    return df_display

#Dict can only contain 1 instance of each gene per gocam (no duplicates)
def enrich(gene_list, uni_list,uniprot2input,gocam_sizes, Dict, ncHGT=False,FDR=.05, num_bins = 0):
    """uni_list is the list of uniprot IDs, because the backend dictionary, Dict, is gocam_id-> list(uniprot id's).
    uniprot2input is a dictionary keeping track of which of the user's inputs mapped to which uniprot id's so results can be 
    displayed in the user's inputted format, as the mapping is not always 1:1."""
    background_gene_list_size = len(Dict)
    if ncHGT: 
    #we consider the background size to be equal to the total # of genes 
    #(the sum of the weights of all entities would double count genes that occur in multiple sets
    #... is this the right thing to do though?
        background_gene_list_size = len(utils.csv2dict('../data/ID2gocam_mouse_ff.csv'))
        
    not_in_a_set, sets, setID2members_input_uni = get_sets(uni_list)
    
    setID2members_input = utils.map_dict_vals(uniprot2input, setID2members_input_uni)
    
    filtered_out1, set_list_filtered = filter_gene_list(sets,Dict)
    filtered_out2, gene_list_filtered = filter_gene_list(uni_list, Dict) #need to clean gene_list to only include genes in the gocam
    
    
    filtered_list = gene_list_filtered + set_list_filtered
    gene_list_size = len(filtered_list)
    
    flist2input = {**uniprot2input, **setID2members_input}
    filtered_list_as_genes = set(pd.Series(list(filtered_list)).map(flist2input).explode())
    filtered_out_genes = set(gene_list) - filtered_list_as_genes
    
    counts = count_genes(filtered_list, Dict)
    
    N_ncHGT = False
    if ncHGT == True:
        N_ncHGT = len(gene_list)-len(filtered_out_genes)
        if N_ncHGT <= 0:
            return "error no genes found in gocams"
        
    enriched_gocams = hgt(counts, gocam_sizes, FDR, gene_list_size, background_gene_list_size, ncHGT=N_ncHGT, num_bins = num_bins)
    background_num_gocams = len(gocam_sizes)
    df_display = correct_pval_and_format(enriched_gocams, background_num_gocams,FDR)
    return filtered_out_genes, filtered_list, setID2members_input_uni, setID2members_input, df_display
    
def enrich_wrapper(filename, id_type, method = 'set', return_all = False, FDR=.05,fpath= '../test_data', display_gene_symbol = True, num_bins = 1):
    """ wrapper to perform enrichment given a filename, gene ID type, enrichment method, and false discovery rate.
    other parameters:
    
    return_all: 
        if false, only returns the dataframe displaying results. 
        if true: returns (gene_list, filtered_out_genes, filtered_list, setID2members_input_uni, setID2members_input, df_display)
        return_all = True is not just for debugging. User may want to know which of their input genes were filtered out as well as how
        the IDs were mapped, as uniprot IDs can sometimes map to more than one HGNC gene symbol
    display_gene_symbol: if true, display HGNC symbols on output regardless of input ID type"""
        
    #set method files
    gcs = '../data/gocam_sizes_mouse.csv'
    id2g = '../data/ID2gocam_mouse.csv'
    
    #standard method files
    if method == 'standard':
        gcs = '../data/gocam_sizes_mouse_ff.csv'
        id2g = '../data/ID2gocam_mouse_ff.csv'
    
    gene_list = pd.read_csv(os.path.join(fpath,filename),header=None,names = ['g'])
    
    #normally not needed, but I found a bug where HSPA1A and HSPA1B are listed as synonyms, both in Simplemine and official sources like the Alliance
    gene_list.drop_duplicates(inplace = True) 
    
    gene_list_converted = []
    uniprot2input = {}
    not_converted = []
    
    #conversion to uniprot IDs not needed for a list of uniprot IDs
    if id_type == 'uniprot':
        gene_list_converted = gene_list.g
        uniprot2input = pd.Series(gene_list_converted.values,index=gene_list_converted).to_dict()
    else:
        gene_list_converted, uniprot2input, not_converted = utils.convert_IDs(gene_list,id_type)
    
    #read in dictionary and the gocam sizes
    x = pd.read_csv(gcs)
    gocam_sizes = pd.Series(x.sizes.values,index=x.gocam)
    Dict = utils.csv2dict(id2g)
    
    #call enrich()
    ncHGT = False
    if method == 'ncHGT':
        ncHGT = True
    #results: (filtered_out_genes, filtered_list, setID2members_input_uni, setID2members_input, df_display)
    results = enrich(list(gene_list.g), gene_list_converted, uniprot2input, gocam_sizes, Dict, ncHGT = ncHGT, FDR=FDR,num_bins = num_bins)
    
    if display_gene_symbol == True:
        results[4]['shared entities in gocam'] = utils.uniprot2gene(results[4]['shared entities in gocam'])
        results[4]['shared entities in gocam'] = results[4]['shared entities in gocam'].apply(lambda x: [x_.replace('sset:','set:') for x_ in x])
    if method == 'set' or method == 'ncHGT':
        print(f"Analysis run on {len(results[1])} entities from {len(gene_list)-len(results[0])} out of {len(gene_list)} input genes")
    

    if return_all:
        return (gene_list, *results)
    else:
        return results[4]
    

# testing

In [5]:
import enrich as enrich_

govaere_1 = enrich_.enrich_wrapper('Goavere_S2.csv','Gene Symbol',method='ncHGT',FDR = 0.1,fpath = '../test_data/processed/')
govaere_1

100%|███████████████████████████████████████████| 98/98 [00:01<00:00, 83.75it/s]


Analysis run on 68 entities from 48 out of 118 input genes


Unnamed: 0,title,pval (uncorrected),# entities in list,#entities in model,shared entities in gocam,url
0,Activation of Matrix Metalloproteinases - Reactome,2.712844e-07,7,18,"[set:proMMP3 initial activators, set:proMMP8 initial activators, set:proMMP9 activating protease...",http://model.geneontology.org/R-HSA-1592389
1,Cytosolic sulfonation of small molecules - Reactome,0.0001303727,5,19,"[SULT2A1, set:SULTs active on DHEA, set:SULT dimers (T3), set:SULT dimers (T2), set:SULT1E1,2A1]",http://model.geneontology.org/R-HSA-156584


In [6]:
govaere_2 = enrich_wrapper('Goavere_S2.csv','Gene Symbol',method='ncHGT',FDR = 0.1,num_bins = 2, fpath = '../test_data/processed/')
govaere_2

100%|███████████████████████████████████████████| 98/98 [10:23<00:00,  6.36s/it]


Analysis run on 68 entities from 48 out of 118 input genes


Unnamed: 0,title,pval (uncorrected),# entities in list,#entities in model,shared entities in gocam,url
0,Activation of Matrix Metalloproteinases - Reactome,2.684776e-07,7,18,"[set:proMMP3 initial activators, set:proMMP8 initial activators, set:proMMP9 activating protease...",http://model.geneontology.org/R-HSA-1592389
1,Cytosolic sulfonation of small molecules - Reactome,0.0001293896,5,19,"[SULT2A1, set:SULTs active on DHEA, set:SULT dimers (T3), set:SULT dimers (T2), set:SULT1E1,2A1]",http://model.geneontology.org/R-HSA-156584


In [7]:
platelet_up_1 = enrich_.enrich_wrapper('platelets_up.csv','Gene Symbol',method='ncHGT',FDR = 0.1,fpath = '../test_data/processed/')
platelet_up_1

100%|█████████████████████████████████████████| 482/482 [01:23<00:00,  5.77it/s]


Analysis run on 423 entities from 365 out of 1172 input genes


Unnamed: 0,title,pval (uncorrected),# entities in list,#entities in model,shared entities in gocam,url
0,Collagen biosynthesis and modifying enzymes - Reactome,4.680930e-07,10,12,"[PPIB, P3H1, PLOD3, P4HB, set:COLGALT1,COLGALT2, set:Prolyl 3-hydroxylases, set:Procollagen C-pr...",http://model.geneontology.org/R-HSA-1650814
1,Hedgehog ligand biogenesis - Reactome,4.690733e-06,13,50,"[PSMD13, PSMA5, PSMD11, PSMA7, PSME2, PSMD4, PSMB1, PSMB7, PSMD8, PSMB5, P4HB, PSMB6, SYVN1]",http://model.geneontology.org/R-HSA-5358346
2,ER-Phagosome pathway - Reactome,5.123754e-06,13,51,"[PSMD13, PSMA5, PSMD11, SEC61B, PSMA7, PSME2, PSMD4, PSMB1, PSMB7, PSMD8, PSMB5, PSMB6, set:SEC6...",http://model.geneontology.org/R-HSA-1236974
3,Regulation of APC/C activators between G1/S and early anaphase - Reactome,7.226495e-06,13,52,"[PSMD13, PSMA5, PSMD11, PSMA7, PSME2, PSMD4, PSMB1, PSMB7, CDK1, PSMD8, PSMB5, PSMB6, set:CDC25]",http://model.geneontology.org/R-HSA-176408
4,Neddylation - Reactome,9.256658e-06,14,62,"[UBE2M, PSMD13, PSMA5, PSMD11, PSMA7, PSME2, PSMD4, PSMB1, CUL9, PSMB7, PSMD8, PSMB5, PSMB6, set...",http://model.geneontology.org/R-HSA-8951664
...,...,...,...,...,...,...
62,Separation of Sister Chromatids - Reactome,8.704360e-04,11,63,"[PSMD13, PSMA5, PSMD11, PSMA7, PSME2, PSMD4, PSMB1, PSMB7, PSMD8, PSMB5, PSMB6]",http://model.geneontology.org/R-HSA-2467813
63,CDK-mediated phosphorylation and removal of Cdc6 - Reactome,1.441100e-03,11,65,"[PSMD13, PSMA5, PSMD11, PSMA7, PSME2, PSMD4, PSMB1, PSMB7, PSMD8, PSMB5, PSMB6]",http://model.geneontology.org/R-HSA-69017
64,Interleukin-1 signaling - Reactome,1.727041e-03,11,64,"[PSMD13, PSMA5, PSMD11, PSMA7, PSME2, PSMD4, PSMB1, PSMB7, PSMD8, PSMB5, PSMB6]",http://model.geneontology.org/R-HSA-9020702
65,Downstream TCR signaling - Reactome,1.813878e-03,14,74,"[PSMD13, PSMA5, PSMD11, PSMA7, PSME2, PSMD4, PSMB1, PSMB7, PSMD8, PSMB5, PSMB6, CD3D, set:Activa...",http://model.geneontology.org/R-HSA-202424


In [11]:
platelets_up_2 = enrich_wrapper('platelets_up.csv','Gene Symbol',method='ncHGT',FDR = 0.1,num_bins = 2, fpath = '../test_data/processed/')
platelets_up_2

  0%|                                                   | 0/482 [00:00<?, ?it/s]

time to enumerate: 0.0
time to enumerate bg: 14.122



  0%|                                                  | 0/2175 [00:00<?, ?it/s][A
100%|████████████████████████████████████| 2175/2175 [00:00<00:00, 12580.62it/s][A
  0%|                                         | 1/482 [00:14<1:56:25, 14.52s/it]

time to compute pval: 0.174
time to enumerate: 0.0
time to enumerate bg: 14.195



  0%|                                                  | 0/2534 [00:00<?, ?it/s][A
100%|████████████████████████████████████| 2534/2534 [00:00<00:00, 12653.62it/s][A
  0%|▏                                        | 2/482 [00:29<1:56:50, 14.61s/it]

time to compute pval: 0.202
time to enumerate: 0.0
time to enumerate bg: 14.988



  0%|                                                  | 0/2534 [00:00<?, ?it/s][A
 48%|█████████████████▍                  | 1226/2534 [00:00<00:00, 12259.16it/s][A
100%|████████████████████████████████████| 2534/2534 [00:00<00:00, 12258.56it/s][A
  1%|▎                                        | 3/482 [00:44<1:59:43, 15.00s/it]

time to compute pval: 0.208
time to enumerate: 0.0
time to enumerate bg: 14.365



  0%|                                                  | 0/2534 [00:00<?, ?it/s][A
100%|████████████████████████████████████| 2534/2534 [00:00<00:00, 12835.77it/s][A
  1%|▎                                        | 4/482 [00:59<1:58:57, 14.93s/it]

time to compute pval: 0.199
time to enumerate: 0.0
time to enumerate bg: 14.284



  0%|                                                  | 0/5768 [00:00<?, ?it/s][A
 16%|██████                                | 921/5768 [00:00<00:00, 9197.47it/s][A
 32%|███████████▊                         | 1841/5768 [00:00<00:00, 9135.47it/s][A
 48%|█████████████████▋                   | 2763/5768 [00:00<00:00, 9172.58it/s][A
 64%|███████████████████████▋             | 3687/5768 [00:00<00:00, 9198.44it/s][A
 80%|█████████████████████████████▌       | 4607/5768 [00:00<00:00, 9163.44it/s][A
100%|█████████████████████████████████████| 5768/5768 [00:00<00:00, 9157.71it/s][A
  1%|▍                                        | 5/482 [01:14<1:59:30, 15.03s/it]

time to compute pval: 0.631
time to enumerate: 0.0


  1%|▍                                        | 5/482 [01:26<2:16:56, 17.22s/it]


KeyboardInterrupt: 

In [6]:
hoang_fibrosis_set = enrich_.enrich_wrapper('Hoang_2019_fibrosis.csv','Gene Symbol',method='ncHGT',FDR = 0.1,fpath = '../test_data/processed/')
hoang_fibrosis_set

100%|█████████████████████████████████████████| 171/171 [00:02<00:00, 67.58it/s]


Analysis run on 122 entities from 103 out of 440 input genes


Unnamed: 0,title,pval (uncorrected),# entities in list,#entities in model,shared entities in gocam,url
0,Activation of Matrix Metalloproteinases - Reactome,2.4e-05,7,18,"[MMP7, MMP2, set:MMP1,7, set:proMMP9 activating proteases, set:MMP2,3,7,10,11, set:MMP1 (2, 3, 7...",http://model.geneontology.org/R-HSA-1592389


In [7]:
hoang_fibrosis_1 = enrich_wrapper('Hoang_2019_fibrosis.csv','Gene Symbol',num_bins =1, method='ncHGT',show_significant = False, FDR = 0.1,fpath = '../test_data/processed/')
hoang_fibrosis_1

100%|█████████████████████████████████████████| 171/171 [00:03<00:00, 55.71it/s]


Analysis run on 122 entities from 103 out of 440 input genes


Unnamed: 0,title,pval (uncorrected),# entities in list,#entities in model,shared entities in gocam,url
0,Activation of Matrix Metalloproteinases - Reactome,2.4e-05,7,18,"[MMP7, MMP2, set:MMP1,7, set:proMMP9 activating proteases, set:MMP2,3,7,10,11, set:MMP1 (2, 3, 7...",http://model.geneontology.org/R-HSA-1592389


In [None]:
hoang_fibrosis_2 = enrich_wrapper('Hoang_2019_fibrosis.csv','Gene Symbol',num_bins =2, method='ncHGT',show_significant = False, FDR = 0.1,fpath = '../test_data/processed/')
hoang_fibrosis_2