In [8]:
import numpy as np
import scipy.stats as sstats
import pandas as pd 

from collections import namedtuple

def _strip_taxon_prefix(protein_id, prefix='9606.'):
    """Return protein_id without the leading taxon prefix (e.g. '9606.ENSP...' -> 'ENSP...')."""
    # str.lstrip('9606.') would strip any leading run of the characters 9, 6, 0 and '.',
    # not the literal prefix, so the prefix is removed explicitly instead.
    if protein_id.startswith(prefix):
        return protein_id[len(prefix):]
    return protein_id


def drug_targets():
    """
    Function:
    Run a set enrichment for every DrugBank drug: the proteins targeted by the drug are tested against the proteins
    of a reference gene list, using all proteins of the STITCH links file as universe. The results are printed ranked
    by ascending p-value and also returned.

    Input files (read from the current working directory):
    mapped_DB_STITCH.tsv = tab-separated table; the columns 'DrugBank ID' and 'protein' are used.
    protein_chemical_links_v5.0_2.1.tsv = tab-separated table; its 'protein' column is the universe.
    biomart.tsv = header-less tab-separated table, read with the columns gene, transcript, protein, Entrez, Uniprot
    and name.
    test_list_genes1.0.txt = the reference gene list; it is merged with the biomart table on the column 'Entrez'.

    Variables:
    mapped = the DrugBank/STITCH table, with the '9606.' prefix removed from the protein ids.
    ensembl = the STITCH links table, with the '9606.' prefix removed from the protein ids.
    targets_by_drug = a dictionary that maps every DrugBank ID to the list of its target proteins. Rows without a
    DrugBank ID are skipped.
    biomart_data = the biomart table used to translate the Entrez ids of the gene list into protein ids.
    get_ens_filtered = the gene list merged with biomart_data, without rows lacking a protein id and without the
    gene, transcript and Uniprot columns.
    list_dict = a list with one dictionary per drug holding the keys drug, oddsratio, c2stat, pvalue, table and
    method.

    Returns:
    list_dict sorted by 'pvalue' (smallest first).
    """

    mapped = pd.read_csv('mapped_DB_STITCH.tsv', sep='\t')
    mapped['protein'] = mapped['protein'].map(_strip_taxon_prefix)

    ensembl = pd.read_csv('protein_chemical_links_v5.0_2.1.tsv', sep='\t')
    ensembl['protein'] = ensembl['protein'].map(_strip_taxon_prefix)

    biomart_data = pd.read_csv("biomart.tsv",
                               sep='\t',
                               names=["gene", "transcript", "protein", "Entrez", "Uniprot", "name"])

    # One pass over the table instead of re-filtering the whole frame for every drug.
    # sort=False keeps the drugs in order of first appearance, which decides the order of ties in the ranking.
    targets_by_drug = mapped.groupby('DrugBank ID', sort=False)['protein'].apply(list).to_dict()

    other_set1 = pd.read_csv("test_list_genes1.0.txt")
    get_ens = pd.merge(other_set1, biomart_data, on=["Entrez"])
    get_ens_filtered = get_ens.dropna(subset=['protein']).drop(["gene", "transcript", "Uniprot"], axis=1)
    print(get_ens_filtered.head(10))

    # The reference set and the universe are the same for every drug, so they are hashed only once.
    reference_proteins = set(get_ens_filtered['protein'])
    universe_proteins = set(ensembl['protein'])

    list_dict = []
    for drug, targets in targets_by_drug.items():
        results = set_enrichment(targets, reference_proteins, universe_proteins)
        list_dict.append({
            "drug": drug,
            "oddsratio": results.oddsratio,
            "c2stat": results.c2statistic,
            "pvalue": results.pvalue,
            "table": results.table,
            "method": results.method,
        })

    ranked = sorted(list_dict, key=lambda row: row['pvalue'])
    print(ranked)
    return ranked

# Result record of set_enrichment; created once so that every call returns the same type.
_SetEnrichmentResult = namedtuple("setEnrichmentResult",
                                  ['oddsratio', 'c2statistic', 'pvalue', 'table', 'method'])


def set_enrichment(your_set, other_set, universe, abcd_values=False):
    """
    Function:
    Test whether your_set and other_set overlap more (or less) than expected, given the universe.

    Both sets are first restricted to the elements that are part of the universe. From these a 2x2 table is built:

        a = in your_set and in other_set        b = in other_set only
        c = in your_set only                    d = in neither of the two sets

    If any of the four counts is 5 or smaller, Fisher's exact test is used (scipy.stats.fisher_exact), otherwise the
    chi-square test (scipy.stats.chi2_contingency); both are called with their default arguments.

    Variables:
    your_set = iterable of hashable elements, the set that is tested.
    other_set = iterable of hashable elements, the set that is tested against.
    universe = iterable with all elements that are taken into account.
    abcd_values = if True, the returned table holds the four sets a, b, c and d themselves instead of their sizes.

    Returns:
    A namedtuple with the fields:
    oddsratio = (a * d) / (b * c) for the chi-square test, or the odds ratio given by fisher_exact.
    c2statistic = the chi-square statistic, or None when Fisher's exact test was used.
    pvalue = the p-value of the test that was used.
    table = [[a, b], [c, d]] as counts (or as sets when abcd_values is True).
    method = 'fisher' or 'chi2'.
    """
    universe = set(universe)
    your_set = set(your_set) & universe
    other_set = set(other_set) & universe

    a = your_set & other_set
    b = other_set - your_set
    c = your_set - other_set
    d = universe - (your_set | other_set)

    table = [[len(a), len(b)], [len(c), len(d)]]

    # min(min(table)) would compare the rows as lists and only look inside the lexicographically smallest row,
    # so the smallest cell has to be taken over all rows explicitly.
    smallest_cell = min(min(row) for row in table)

    if smallest_cell <= 5:
        method = 'fisher'
        oddsratio, p = sstats.fisher_exact(table)
        chi2 = None
    else:
        method = 'chi2'
        chi2, p, _dof, _expected = sstats.chi2_contingency(table)
        # Every cell is larger than 5 in this branch, so the denominator cannot be zero.
        oddsratio = table[0][0] * table[1][1] / (table[1][0] * table[0][1])

    if abcd_values:
        return _SetEnrichmentResult(oddsratio, chi2, p, [[a, b], [c, d]], method)
    return _SetEnrichmentResult(oddsratio, chi2, p, table, method)
def main():
    """Entry point: run the drug target enrichment (drug_targets reads its input files and prints the results)."""
    drug_targets()


# Only run the analysis when this file is executed (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

   Entrez          protein  name
0    4000  ENSP00000357284  LMNA
1    4000  ENSP00000357283  LMNA
2    4000  ENSP00000357282  LMNA
3    4000  ENSP00000395597  LMNA
4    4000  ENSP00000424518  LMNA
5    4000  ENSP00000357280  LMNA
6    4000  ENSP00000426535  LMNA
7    4000  ENSP00000421821  LMNA
8    4000  ENSP00000424977  LMNA
9    4000  ENSP00000292304  LMNA
[{'drug': 'DB02709', 'oddsratio': 5.334301856335754, 'c2stat': 222.52574489684827, 'pvalue': 2.5437134147601424e-50, 'table': [[104, 885], [224, 10168]], 'method': 'chi2'}, {'drug': 'DB11091', 'oddsratio': 3.666424322538004, 'c2stat': 129.31054759293627, 'pvalue': 5.799606047981784e-30, 'table': [[99, 890], [306, 10086]], 'method': 'chi2'}, {'drug': 'DB01645', 'oddsratio': 5.601413543721236, 'c2stat': 122.89468564323546, 'pvalue': 1.470506263345418e-28, 'table': [[53, 936], [104, 10288]], 'method': 'chi2'}, {'drug': 'DB00997', 'oddsratio': 4.246548334425574, 'c2stat': 122.80412685462751, 'pvalue': 1.539178664125386e-28, 'table': 