In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics

# Histogram of how many distinct protein targets each DrugBank drug has.
mapped = pd.read_csv('mapped_DB_STITCH.tsv', sep='\t')
# Remove only the literal "9606." prefix. str.lstrip('9606.') treats its
# argument as a *set of characters* and would also eat leading 9/6/0/. characters
# that belong to the identifier itself.
mapped['protein'] = mapped['protein'].str.replace(r'^9606\.', '', regex=True)
new = mapped[['DrugBank ID', 'protein']].drop_duplicates()
# One entry per drug: the number of distinct proteins linked to it.
counts1 = list(new['DrugBank ID'].value_counts())
df = pd.DataFrame({'genes': counts1})

# Bin edges 0, 200, ..., 3000. pd.cut intervals are right-closed, and values
# above the last edge fall outside every bin and are therefore not counted.
bins = list(range(0, 3001, 200))
# observed=False keeps empty bins in the result, so the number of counts always
# matches the number of bar labels regardless of the pandas default.
oef = df.groupby(pd.cut(df.genes, bins=bins), observed=False).genes.count().to_frame()
lijst = list(oef['genes'])

# NOTE: these statistics describe the per-bin drug counts (the bar heights),
# not the number of targets per drug.
print("mean:", statistics.mean(lijst))
print("standard deviation:", statistics.stdev(lijst))
# Build the labels from the bin edges so they cannot drift out of sync with
# `bins` (the hand-written list contained the typo '2000,2200').
lijst2 = ['{}-{}'.format(lo, hi) for lo, hi in zip(bins[:-1], bins[1:])]

f, ax = plt.subplots(figsize=(18,10))
ax.bar(lijst2, lijst)
ax.set_xlabel('Number of distinct protein targets per drug')
ax.set_ylabel('Number of drugs')
ax.set_title('Distribution of protein-target counts per drug')
plt.show()


mean: 375.3333333333333
standard deviation: 1432.6865207248468


<Figure size 1800x1000 with 1 Axes>

In [26]:
import numpy as np
import scipy.stats as sstats
import pandas as pd 

from collections import namedtuple

class ProteinSet(object):
    """A collection of named protein sets that can be tested for enrichment.

    Each named set is tested for over-representation against a query set
    within a background universe, using a 2x2 contingency table.
    """

    # Record returned by set_enrichment; created once instead of on every call.
    EnrichmentResult = namedtuple(
        "setEnrichmentResult",
        ['oddsratio', 'c2statistic', 'pvalue', 'table', 'method'])

    def __init__(self, proteindict):
        """Store ``proteindict`` (name -> iterable of proteins) as name -> set."""
        self.proteindict = { name : set(p) for name, p in proteindict.items() }

    def enrich(self, otherset, background):
        """Test every stored set against ``otherset`` within ``background``.

        Parameters
        ----------
        otherset : iterable
            The query set of proteins.
        background : iterable
            The universe; both sets are restricted to it before counting.

        Returns
        -------
        pandas.DataFrame
            One row per stored set with columns ``Name``, ``oddsratio``,
            ``c2statistic``, ``pvalue``, ``table``, ``method`` and ``proteins``
            (the members of the stored set as a list). The frame is empty, but
            still has these columns, when no sets are stored.
        """
        columns = ['Name', 'oddsratio', 'c2statistic', 'pvalue', 'table', 'method', 'proteins']
        rows = []
        for name, pset in self.proteindict.items():
            # Call through self: set_enrichment lives on the class, so a bare
            # `set_enrichment(...)` lookup only works if some other cell
            # happened to define a global of the same name.
            result = self.set_enrichment(pset, otherset, background)
            rows.append([name] + list(result) + [list(pset)])

        # Passing columns here (rather than assigning afterwards) also works
        # for an empty list of rows.
        return pd.DataFrame(rows, columns=columns)

    @staticmethod
    def set_enrichment(your_set, other_set, universe, abcd_values=False):
        """Test the overlap of two sets inside ``universe``.

        The contingency table is::

            [[ |your & other|, |other - your|               ],
             [ |your - other|, |universe - (your | other)|  ]]

        Fisher's exact test is used when any cell is <= 5; otherwise a
        chi-square contingency test is used and the odds ratio is computed
        directly from the table.

        Parameters
        ----------
        your_set, other_set : iterable
            The two sets to compare; members outside ``universe`` are ignored.
        universe : iterable
            The background population.
        abcd_values : bool, default False
            When True, the ``table`` field holds the four member sets instead
            of their sizes.

        Returns
        -------
        setEnrichmentResult
            Named tuple ``(oddsratio, c2statistic, pvalue, table, method)``.
            ``c2statistic`` is None when ``method`` is ``'fisher'``.
        """
        universe  = set(universe)
        your_set  = set(your_set) & universe
        other_set = set(other_set) & universe

        a = your_set & other_set
        b = other_set - your_set
        c = your_set - other_set
        d = universe - (your_set | other_set)

        table = [ [len(a), len(b)], [len(c), len(d)]]
        # Smallest of the four cells. min(min(table)) would compare the rows
        # lexicographically first and only inspect the "smaller" row, so e.g.
        # [[10, 3], [6, 100]] would be misclassified as safe for chi-square.
        smallest_cell = min(min(row) for row in table)
        if smallest_cell <= 5:
            method = 'fisher'
            oddsratio, p = sstats.fisher_exact(table)
            chi2 = None
        else:
            method = 'chi2'
            chi2, p, dof, expected = sstats.chi2_contingency(table)
            # Every cell is > 5 in this branch, so the denominator is non-zero.
            oddsratio = table[0][0] * table[1][1] / (table[1][0] * table[0][1])

        if abcd_values:
            return ProteinSet.EnrichmentResult(oddsratio, chi2, p, [[a,b],[c,d]], method)
        else:
            return ProteinSet.EnrichmentResult(oddsratio, chi2, p, table, method)

def make_dictio_DT(path='mapped_DB_STITCH.tsv'):
    """Build a mapping from drug name to the list of its target proteins.

    Parameters
    ----------
    path : str, default 'mapped_DB_STITCH.tsv'
        Tab-separated file containing at least the columns ``CID``,
        ``InChIKey``, ``DrugBank ID``, ``Name``, ``protein`` and
        ``combined_score``.

    Returns
    -------
    dict
        ``{Name: [protein, ...]}`` with names in order of first appearance and
        proteins in file order. Rows that are exact duplicates across the six
        columns above are dropped first; a protein can still appear more than
        once for a name if its other columns differ. Rows with a missing
        ``Name`` are skipped.
    """
    mapped = pd.read_csv(path, sep='\t')
    # Remove only the literal "9606." prefix. str.lstrip('9606.') strips any
    # leading run of the characters 9, 6, 0 and '.', so it would also eat the
    # start of an identifier such as "9606.60S_x".
    mapped['protein'] = mapped['protein'].str.replace(r'^9606\.', '', regex=True)
    mapped = mapped[['CID', 'InChIKey', 'DrugBank ID', 'Name', 'protein', 'combined_score']].drop_duplicates()

    # One grouped pass instead of re-filtering the whole frame for every name.
    # sort=False keeps names in order of first appearance.
    return mapped.groupby('Name', sort=False)['protein'].apply(list).to_dict()

def entrez_to_protein():
    """Map the query gene list to protein identifiers via the BioMart table.

    Reads ``biomart.tsv`` (tab-separated, read with the fixed column names
    gene, transcript, protein, Entrez, Uniprot, name) and
    ``test_list_genes1.0.txt``, inner-joins them on ``Entrez``, discards rows
    without a protein identifier, and drops the gene, transcript and Uniprot
    columns.

    Returns
    -------
    pandas.DataFrame
        The joined rows that have a non-missing ``protein`` value.
    """
    biomart_columns = ["gene", "transcript", "protein", "Entrez", "Uniprot", "name"]
    biomart_table = pd.read_csv("biomart.tsv", sep='\t', names=biomart_columns)
    query_genes = pd.read_csv("test_list_genes1.0.txt")

    return (
        query_genes
        .merge(biomart_table, on=["Entrez"])
        .dropna(subset=['protein'])
        .drop(columns=["gene", "transcript", "Uniprot"])
    )

def main_ACR_DT():
    """Run the drug-target enrichment for the query gene list.

    Reads the background protein list from ``STITCH_proteins.txt`` (its
    ``protein`` column is used as the universe), builds the drug -> proteins
    mapping with ``make_dictio_DT``, maps the query genes to proteins with
    ``entrez_to_protein``, and tests every drug's protein set against the
    query proteins.

    Returns
    -------
    pandas.DataFrame
        The table produced by ``ProteinSet.enrich``. It is returned so the
        caller can inspect or display it rather than being computed and
        thrown away.
    """
    ensembl = pd.read_csv('STITCH_proteins.txt')
    dictio = make_dictio_DT()
    get_ens_filtered = entrez_to_protein()
    enrichment_call = ProteinSet(dictio)
    return enrichment_call.enrich(get_ens_filtered['protein'], ensembl['protein'])

# Run the drug-target enrichment analysis defined above.
main_ACR_DT()
            