Modified ncHGT.py file that doesnt call BiasedUrn and instead returns the number of calls that would be made to BiasedUrn

In [28]:
import numpy as np
import sys
sys.path.append('../dev')

import rpy2
from rpy2.robjects.packages import importr
BiasedUrn = importr('BiasedUrn')

import utils


def get_M_wM():
    """ returns M, the number of entities in the background, and w_M, the mean size of entities in the background"""
    setID2members = utils.csv2dict('../data/setID2members.csv')
    l = []
    for s,m in setID2members.items():
        l.append(len(m))
    l = np.array(l)
    l = np.sort(l)
    num_empty_sets = np.sum(l==0)
    
    l = l[l!=0]
    mean = np.mean(l)#l[4:-4]) 1% trimmed mean?
    num_sets = len(l)
    bg = len(utils.csv2dict('../data/ID2gocam_mouse.csv'))
    M = bg-num_empty_sets
    
    w_M = np.round(((M-num_sets)+num_sets*mean)/M,decimals=2)
    return M, w_M

def make_initial_vectors(gocam2ID,setID2members, gc, M,w_M):
    """initializes counts vector (m) and weights vector (w), where each entity gets its own element in the arrays
- values in m only take on 0 (if there is no solo proteins) or 1
- values in w correspond to the weight of each element in m (weighted by the # genes in a set or 1 for solo proteins)"""
    w_gc = [1] #initialize with 1 as the weight of single proteins (irrespective of whether there are any)
    m_gc = [0] #initialize with 0 single proteins
    num_protein = 0
    for i in gocam2ID.get(gc):
        if "sset:" in i:
            w_i = len(setID2members.get(i))
            w_gc.append(w_i)
            m_gc.append(1)
        else:
            num_protein+=1
    m_gc[0] = num_protein
    m_gc.append(M-np.sum(m_gc)) #entities not in the gocam (roughly)
    w_gc.append(w_M) #weight for entities not in the gocam (all weighted as w_M (the mean))
    return w_gc, m_gc


def make_new_vectors(w_gc,m_gc,M,w_M):
    """compress the m and w vectors by grouping elements according to their weights
- w is the ordered set of unique weights for entities of the gocam + the background bin
- m[i] is the number of entities in the pathway with the weight specified in w[i] + the background bin"""
    w_temp = w_gc[:-1]
    if w_temp[0] != 1:
        print('Possible bug: w_temp[0] != 1',w_temp)
        
    w_new, m_temp = np.unique(w_temp, return_counts=True)
    m_temp[0]=m_gc[0] #w_gc and m_gc have weight 1 as w_gc[0] and the number of single proteins as m_gc[0]
    m_new = np.append(m_temp,np.array([M-np.sum(m_temp)]))
    w_new = np.append(np.unique(w_temp),np.array([w_M]))
    return w_new, m_new




def ncHGT_sf(XT,m,N,w):
    """survival function, sums PMF for all possibilities where K >= k by calling BiasedUrn"""
    #l = len(XT)/len(m)
    if len(XT) == 0:
        print('len(XT) = 0')
        return -1
    pval = 0
    np.seterr(under='warn')
    for i in range(len(XT)):
        x = rpy2.robjects.IntVector(XT[i])
        pval = pval + BiasedUrn.dMFNCHypergeo(x,m,N,w, precision = 1e-10)[0]
    return pval


def enumerate_possibilities(m_new,i,prev_array):
    """enumerate all possible counts vectors"""
    first = True
    for j in range(m_new[i]+1):
        xt = prev_array.copy()
        xt[0][i] = j
        
        #recursion
        if (i < len(m_new)-1):
            xt = enumerate_possibilities(m_new, i+1, xt) #will return matrix (array of arrays)
            
        #combining results into matrix
        if not first:
            XT = np.concatenate([XT,xt], axis = 0)
        else:
            XT = xt
            first = False
    return XT


def do_ncHGT(k,gc,M,N, compress = True):
    setID2members = utils.csv2dict('../data/setID2members.csv')
    gocam2ID = utils.csv2dict('../data/gocam2ID_mouse.csv')
    
    M, w_M = get_M_wM()
    
    #make weight (w) and bin size (m) vectors where each entity in the gocam gets its own entry
    w_in, m_in = make_initial_vectors(gocam2ID, setID2members, gc, M,w_M)
    
    #update m and w vectors by grouping sets of the same size
    w_new = w_in
    m_new = m_in
    if compress:
        w_new , m_new= make_new_vectors(w_in,m_in,M,w_M)

    #make XT matrix, an enumeration of all possible arangements of balls in bins based on m_new and w_new
    m_gc = m_new[:-1] #don't pass the background bin to XT
    XT = enumerate_possibilities(m_gc,0,np.zeros(shape=(1,len(m_gc))))
    
    #filter XT to only include the region of the sample space >= k (which is what we want to sum probabilities over)
    mask1 = (np.sum(XT, axis=1) >= k)
    XT = XT[mask1]

    #filter XT to ensure that more than N entities are not picked
    mask2 = (np.sum(XT, axis=1) <= N)
    XT = XT[mask2]

    #add the remaining entities to the m+1th bin (non gocam bin)
    x_mp1_vec = N- np.sum(XT, axis = 1) #number of balls to be drawn from the last bin (the non-gocam background)
    XT = np.concatenate((XT,x_mp1_vec.reshape(len(x_mp1_vec),1)), axis = 1)

    m = rpy2.robjects.IntVector(m_new)
    w = rpy2.robjects.FloatVector(w_new)
    size = len(XT)
    return size





 Modified enrich.py file to make calls to the above ncHGT code

In [24]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import hypergeom
import sys
sys.path.append('../GOCAM_Project/dev')
import os
import tqdm

import utils

pd.options.display.max_colwidth = 100

def get_sizes (data): #data= dataframe with gocam IDs and gene identifiers as columns
    return data['gocam'].value_counts()
    
def get_sets (gene_list):
    sets = []
    not_in_a_set = []
    members2setID = utils.csv2dict('../data/members2setID.csv')
    setID2members_input = {}
    for g in gene_list:
        s = members2setID.get(g)
        if s != None:
            sets = sets +s
            for i in s:
                if (i in setID2members_input) == False:
                    setID2members_input[i]={g}
                else:
                    prev = setID2members_input.get(i)
                    prev.add(g)
                    setID2members_input[i] = prev
        else:
            not_in_a_set.append(g)
    return not_in_a_set, list(set(sets)),setID2members_input #remove duplicates

def filter_gene_list(gene_list, Dict):
    filtered_gene_list = []
    filtered_out = []
    for gene in gene_list:
        if gene in Dict:
            filtered_gene_list.append(gene)
        else:
            filtered_out.append(gene)
    return filtered_out, filtered_gene_list

def count_genes(gene_list, Dict):
    gocam_counts = {} #key=gocam, value=list of genes in gocam that are also in the user's list
    for g in gene_list: 
            gocams = Dict.get(g)
            for gocam in gocams:
                if (gocam in gocam_counts) == False:
                    gocam_counts[gocam]=[g]
                else:
                    prev = gocam_counts.get(gocam)
                    prev.append(g)
                    gocam_counts[gocam] = prev
    return gocam_counts

#BENJAMINI HOCHBERG CORRECTION applied in correct_pval_and_format()
#ncHGT is either False (indicating that regular HGT should be done) or a positive integer denoting N for ncHGT
def hgt(counts, gocam_sizes, FDR, gene_list_size, background_gene_list_size, ncHGT = False, compress = True):
    """ ncHGT is either False (for set or standard methods) or corresponds to N """
    num_calls = []
    iterator = tqdm.tqdm(counts.items())
    for gocam, gene_list in iterator:
        count = len(gene_list) 
        gocam_size = gocam_sizes[gocam]
        pvalue = None
        if ncHGT:
            if count <=1: #avoid unnecessary calls to BiasedUrn due to computation time
                pvalue = 1
            else:
                r = do_ncHGT(count -1,gocam,background_gene_list_size,ncHGT, compress = compress)
                num_calls.append(r)
    return num_calls

#Benjamini Hochberg correction
def correct_pval_and_format(enriched_gocams, background_num_gocams,show_significant,FDR):
    df = pd.DataFrame(enriched_gocams, columns =['url', 'pval (uncorrected)', '# entities in list','#entities in model','shared entities in gocam'])
    df.sort_values('pval (uncorrected)',inplace=True)
    df.reset_index(drop=True, inplace=True)
    df['FDR_val'] = (df.index+1)*FDR/background_num_gocams
    df['Less_than'] = (df['pval (uncorrected)'] < df['FDR_val'])
    index = df.Less_than.where(df.Less_than==True).last_valid_index()
    df_significant = df
    if (show_significant):
        df_significant = df.loc[0:index].copy()
        if index == None:
            df_significant = pd.DataFrame(columns =['url', 'pval (uncorrected)', '# entities in list','#entities in model','shared entities in gocam'])
    df_display = df_significant[['url','pval (uncorrected)', '# entities in list', '#entities in model','shared entities in gocam']].copy()
    #modelID2title = pd.read_csv('../data/modelID2title_mouse.csv')
    temp = pd.read_csv('../data/modelID2title_mouse.csv',header = 0,names=['gocam','title'])
    modelID2title = pd.Series(temp.title.values,index=temp.gocam).to_dict()
    df_display['title'] = df_display['url'].map(modelID2title)
    cols = df_display.columns.to_list()
    cols[0]='title'
    cols[-1]='url'
    df_display = df_display[cols]
    return df_display

#Dict can only contain 1 instance of each gene per gocam (no duplicates)
#show_significant only affects the multiple testing correction. If the uncorrected pval > FDR, hgt() will already remove it
def enrich(gene_list, uni_list,uniprot2input,gocam_sizes, Dict, ncHGT=False, show_significant=True,FDR=.05, compress = True):
    background_gene_list_size = len(Dict)
    if ncHGT: 
    #we consider the background size to be equal to the total # of genes 
    #(the sum of the weights of all entities would double count genes that occur in multiple sets
    #... is this the right thing to do though?
        background_gene_list_size = len(utils.csv2dict('../data/ID2gocam_mouse_ff.csv'))
        
    not_in_a_set, sets, setID2members_input_uni = get_sets(uni_list)
    
    setID2members_input = utils.map_dict_vals(uniprot2input, setID2members_input_uni)
    
    filtered_out1, set_list_filtered = filter_gene_list(sets,Dict)
    filtered_out2, gene_list_filtered = filter_gene_list(uni_list, Dict) #need to clean gene_list to only include genes in the gocam
    
    
    filtered_list = gene_list_filtered + set_list_filtered
    gene_list_size = len(filtered_list)
    
    flist2input = {**uniprot2input, **setID2members_input}
    filtered_list_as_genes = set(pd.Series(list(filtered_list)).map(flist2input).explode())
    filtered_out_genes = set(gene_list) - filtered_list_as_genes
    
    counts = count_genes(filtered_list, Dict)
    
    N_ncHGT = False
    if ncHGT == True:
        N_ncHGT = len(gene_list)-len(filtered_out_genes)
        if N_ncHGT <= 0:
            return "error no genes found in gocams"
        
    num_calls = hgt(counts, gocam_sizes, FDR, gene_list_size, background_gene_list_size, ncHGT=N_ncHGT, compress= compress)
    
    return num_calls
    
def enrich_wrapper(filename, id_type, return_all = False, method = 'set', show_significant=True,FDR=.05,fpath= '../test_data', 
                   display_gene_symbol = True, compress = True):
    """ returns (gene_list, filtered_out_genes, filtered_list, setID2members_input_uni, setID2members_input, df_display)"""
    #set method files
    gcs = '../data/gocam_sizes_mouse.csv'
    id2g = '../data/ID2gocam_mouse.csv'
    
    #standard method files
    if method == 'standard':
        gcs = '../data/gocam_sizes_mouse_ff.csv'
        id2g = '../data/ID2gocam_mouse_ff.csv'
    
    gene_list = pd.read_csv(os.path.join(fpath,filename),header=None,names = ['g'])
    
    #normally not needed, but I found a bug where HSPA1A and HSPA1B are listed as synonyms, both in Simplemine and official sources like the Alliance
    gene_list.drop_duplicates(inplace = True) 
    
    gene_list_converted = []
    uniprot2input = {}
    not_converted = []
    
    #conversion to uniprot IDs not needed for a list of uniprot IDs
    if id_type == 'uniprot':
        gene_list_converted = gene_list.g
        uniprot2input = pd.Series(gene_list_converted.values,index=gene_list_converted).to_dict()
    else:
        gene_list_converted, uniprot2input, not_converted = utils.convert_IDs(gene_list,id_type)
    
    #read in dictionary and the gocam sizes
    x = pd.read_csv(gcs)
    gocam_sizes = pd.Series(x.sizes.values,index=x.gocam)
    Dict = utils.csv2dict(id2g)
    
    #call enrich()
    ncHGT = False
    if method == 'ncHGT':
        ncHGT = True
    #results: (filtered_out_genes, filtered_list, setID2members_input_uni, setID2members_input, df_display)
    num_calls = enrich(list(gene_list.g), gene_list_converted, uniprot2input, gocam_sizes, Dict, ncHGT = ncHGT, 
                       show_significant = show_significant, FDR=FDR, compress = compress)
    return num_calls

        
        

# Examining the number of calls for the LMNA_comb dataset between the compressed and uncompressed methods.

looking both at specific pathways and also at the total number of calls


## Looking at the reduction on LNDA

In [29]:
num_calls_F = enrich_wrapper('LMNA_comb.csv','Gene Symbol',method='ncHGT',FDR = 0.1,fpath = '../test_data/processed/', compress = False)

100%|█████████████████████████████████████████| 326/326 [00:33<00:00,  9.85it/s]


In [31]:
num_calls_T = enrich_wrapper('LMNA_comb.csv','Gene Symbol',method='ncHGT',FDR = 0.1,fpath = '../test_data/processed/', compress = True)

100%|████████████████████████████████████████| 326/326 [00:01<00:00, 188.51it/s]


In [38]:
compressed = np.array(num_calls_T)
not_compressed = np.array(num_calls_F)

In [39]:
print(compressed[:10])

[   16    47    50     5 11519    13     5    76    13   191]


In [40]:
print(not_compressed[:10])

[     16      47      67       5 6291455      13       5     346      13
     383]


In [45]:
np.quantile(np.divide(not_compressed,compressed), [0.5,.8,.9,1.0])

array([  1.        ,   2.03537631,  10.7037283 , 546.18065804])

In [46]:
np.sum(compressed)/np.sum(not_compressed)

0.01427793196010582

# Median reduction in the number of calls to BiasedUrn per dataset

In [48]:
path = '../test_data/processed'

datasets = {'Covid-19 Platelets':('Gene Symbol',['platelets_up.csv','platelets_down.csv']),
            'DCM Cardiomyocytes':('Gene Symbol',['LMNA_comb.csv','PKP2_comb.csv','RBM20_comb.csv','TTN_comb.csv','PVneg_comb.csv']),
            'DCM Fibroblasts':('Gene Symbol',['LMNA_FB_comb.csv','PKP2_FB_comb.csv','RBM20_FB_comb.csv','TTN_FB_comb.csv','PVneg_FB_comb.csv']),
            'Aging Brain Astrocytes':('Gene Symbol',['astro_HTH_up.csv','astro_CB_up.csv','astro_HTH_down.csv','astro_CB_down.csv']),
            'P97 Inhibitor':('uniprot',['P97.csv']),
            'Macrophage':('Gene Symbol',['mac_comb.csv']),
            'NASH':('Gene Symbol',['Goavere_S2.csv'])
           }

total_calls = []
for paper,val in datasets.items():
    symbol_type = val[0]
    datasets_ = val[1]
    
    for dataset in datasets_:
        filename = os.path.join(path,dataset)
        
        num_calls_F = enrich_wrapper(filename,symbol_type,method='ncHGT', compress = False)
        num_calls_T = enrich_wrapper(filename,symbol_type,method='ncHGT', compress = True)

        compressed = np.array(num_calls_T)
        not_compressed = np.array(num_calls_F)
        total_calls.append(np.sum(compressed)/np.sum(not_compressed))

100%|█████████████████████████████████████████| 482/482 [00:25<00:00, 19.20it/s]
100%|████████████████████████████████████████| 482/482 [00:02<00:00, 181.01it/s]
100%|█████████████████████████████████████████| 436/436 [00:33<00:00, 12.85it/s]
100%|████████████████████████████████████████| 436/436 [00:01<00:00, 257.66it/s]
100%|█████████████████████████████████████████| 326/326 [00:33<00:00,  9.61it/s]
100%|████████████████████████████████████████| 326/326 [00:01<00:00, 185.08it/s]
100%|█████████████████████████████████████████| 185/185 [00:15<00:00, 11.74it/s]
100%|████████████████████████████████████████| 185/185 [00:00<00:00, 223.33it/s]
100%|█████████████████████████████████████████| 195/195 [00:31<00:00,  6.22it/s]
100%|████████████████████████████████████████| 195/195 [00:00<00:00, 252.67it/s]
100%|█████████████████████████████████████████| 268/268 [00:32<00:00,  8.28it/s]
100%|████████████████████████████████████████| 268/268 [00:00<00:00, 276.01it/s]
100%|███████████████████████

In [49]:
np.median(total_calls)

0.015445306496629257

In [50]:
1/np.median(total_calls)

64.74458763367612