In [1]:
import pandas as pd

In [2]:
from joblib import Parallel, delayed

In [3]:
# conda install -c chembl chembl_webresource_client
import chembl_webresource_client as chembl

In [4]:
from ChEMBL import get_chembl_id, chembl_to_data_frame

In [11]:
def get_dude_dataframe(dude_id):
    """Download ChEMBL activities for one DUD-E target, save them to CSV and return them.

    Parameters
    ----------
    dude_id : str
        DUD-E target identifier (e.g. ``'aa2ar'``). It is interpolated into
        the DUD-E ``uniprot.txt`` URL and into the output file name.

    Returns
    -------
    pandas.DataFrame
        The concatenated frames returned by ``chembl_to_data_frame`` for every
        UniProt ID listed for the target, left-joined on ``chembl_id`` with the
        SMILES from the local ``chembl_23.smi.gz`` file, plus a constant
        ``dude_id`` column. The string-like columns are stored as ``category``.

    Notes
    -----
    Side effect: writes ``data_dude_<dude_id>.csv`` to the current working
    directory. Requires network access (DUD-E and ChEMBL) and the file
    ``chembl_23.smi.gz`` in the working directory.
    """

    # Get the UniProt IDs associated with this DUD-E target.
    # TODO: do we want to extend that?
    uniprot_ids = pd.read_csv('http://dude.docking.org/targets/%s/uniprot.txt' % dude_id,
                              header=None, names=['uniprot_id']).uniprot_id.tolist()

    # Get activities from ChEMBL, one frame per UniProt ID. A concrete list is
    # passed because pd.concat is documented to take a sequence or mapping.
    data = pd.concat([chembl_to_data_frame(uniprot_id) for uniprot_id in uniprot_ids])

    # Resolve SMILES for ChEMBL IDs locally (very slow with web services for large targets)
    chembl_smiles = pd.read_csv('chembl_23.smi.gz', sep='\t', header=None, names=['smiles', 'chembl_id'])
    # DataFrame.join defaults to a left join, so activities without a SMILES
    # entry are kept (their `smiles` value is missing).
    data_full = data.set_index('chembl_id').join(chembl_smiles.set_index('chembl_id')).reset_index()

    data_full['dude_id'] = dude_id

    # For memory efficiency set appropriate dtypes
    cat_columns = ['dude_id',
                   'chembl_id',
                   'uniprot_id',
                   'units',
                   'operator',
                   'bioactivity_type',
                   'smiles'
                  ]
    for col in cat_columns:
        data_full[col] = data_full[col].astype('category')

    data_full.to_csv('data_dude_%s.csv' % dude_id)

    # Return the frame: callers in this notebook assign the result to `data`
    # and inspect it, which fails if the function returns None.
    return data_full

In [12]:
%%time
# Fetch the 'aa2ar' target on its own; the inspection cells further down read `data`.
data = get_dude_dataframe('aa2ar')

CPU times: user 7.55 s, sys: 152 ms, total: 7.7 s
Wall time: 8.57 s


In [7]:
# DUD-E target identifiers to process (102 entries, order preserved).
dude_ids = [
    'lck', 'src', 'ada17', 'hivpr', 'mk14', 'mmp13', 'aa2ar', 'bace1', 'pparg', 'parp1',
    'ace', 'thrb', 'cdk2', 'esr1', 'esr2', 'vgfr2', 'fnta', 'drd3', 'csf1r', 'dhi1',
    'casp3', 'gria2', 'kit', 'dyr', 'braf', 'tryb1', 'hdac8', 'aldr', 'akt1', 'ital',
    'kpcb', 'tysy', 'ppard', 'hivint', 'ppara', 'urok', 'wee1', 'reni', 'grik1', 'aces',
    'fa10', 'dpp4', 'adrb2', 'jak2', 'hivrt', 'fkb1a', 'cah2', 'kif11', 'try1', 'adrb1',
    'akt2', 'rock1', 'pa2ga', 'pygm', 'mapk2', 'fa7', 'tgfr1', 'mk10', 'fak1', 'gcr',
    'hdac2', 'prgr', 'ptn1', 'nram', 'abl1', 'hs90a', 'egfr', 'hxk4', 'mk01', 'cxcr4',
    'lkha4', 'ada', 'pur2', 'pnph', 'andr', 'rxra', 'fpps', 'cp3a4', 'met', 'ampc',
    'mp2k1', 'pyrd', 'pgh1', 'kith', 'thb', 'comt', 'cp2c9', 'aofb', 'fabp4', 'mcr',
    'inha', 'pgh2', 'def', 'xiap', 'glcm', 'pde5a', 'nos1', 'sahh', 'hmdh', 'igf1r',
    'plk1', 'fgfr1',
]

In [None]:
%%time
# This cell may need to be run several times: some requests fail
# intermittently, presumably because ChEMBL rate limits are being hit.
out = Parallel(n_jobs=1, verbose=1)(
    delayed(get_dude_dataframe)(target) for target in dude_ids
)
# data = pd.concat(out)

In [9]:
# Number of rows in `data`.
# NOTE(review): the execution counts around here are out of order (this cell
# is In [9], the function definition above is In [11]), so the recorded output
# may come from an earlier kernel state — re-run top to bottom to confirm.
len(data)

11989

In [10]:
# Per-column dtypes of `data`; the recorded output shows every column except
# `value` stored as category.
data.dtypes

chembl_id           category
uniprot_id          category
bioactivity_type    category
operator            category
value                float64
units               category
smiles              category
dude_id             category
dtype: object

In [11]:
# Show only the ten rows with the lowest `value` rather than rendering the
# whole sorted frame.
data.sort_values('value').head(10)

Unnamed: 0,chembl_id,uniprot_id,bioactivity_type,operator,value,units,smiles,dude_id
9639,CHEMBL460532,P29274,Activity,=,-74.00,%,c1ccc2c(c1)c(cc(n2)Nc1ccc(c(c1)Cl)Cl)NC(=O)C1C...,aa2ar
11090,CHEMBL601874,P29274,Inhibition,=,-33.00,%,c1c(c(cc2c1oc(c(c2=O)C=O)N)C)C,aa2ar
9914,CHEMBL475345,P29274,Activity,=,-20.00,%,c1ccc2c(c1)c(cc(n2)Nc1ccc(c(c1)Cl)Cl)NC(=O)c1c...,aa2ar
9881,CHEMBL472925,P29274,Activity,=,-19.00,%,c1ccc2c(c1)c(cc(n2)Nc1ccc(c(c1)Cl)Cl)NC(=O)C1C...,aa2ar
11256,CHEMBL605123,P29274,Inhibition,=,-19.00,%,c1cc(c2c(c1OC)oc(c2)C(=O)Nc1ccc(cc1)OC)c1nc(sc...,aa2ar
7108,CHEMBL3401302,P29274,Inhibition,=,-18.00,%,c1(ccc(cc1)NC(=O)CCCN)C(=O)Nc1sccn1,aa2ar
10407,CHEMBL522001,P29274,Activity,=,-17.30,%,C12CC3(C(CC(C3)C2)C1)c1[nH]c2c(c(nc3c2cccc3)Nc...,aa2ar
4794,CHEMBL2391147,P29274,Inhibition,=,-16.00,%,[C@H]1(N(C(=O)C[C@@H]1c1ccccc1)CC(=O)N)C,aa2ar
208,CHEMBL1082390,P29274,Inhibition,=,-16.00,%,c1(ccc2c(c1OC)oc(c2)C(=O)Nc1ccncc1)N1CCOCC1,aa2ar
6928,CHEMBL3334797,P29274,Inhibition,=,-14.70,%,N1(C(c2cc(c(cc2)Cl)Cl)CCN(C)C)CCOCC1,aa2ar


In [9]:
# List the distinct values found in the `bioactivity_type` column.
data.bioactivity_type.unique()

array(['IC50', 'Ki', 'Inhibition', 'Activity', 'Displacement', 'INH',
       'EC50', 'Kd', 'Kb', 'I', 'pC2A', 'Efficacy', 'pKi',
       'Inhibition (at concentration M)', 'Loss', 'pKi(uM)', 'ED50',
       'Emax', 'Alloste ic enhan er (AE)', 'pKb', 'Ratio', 'Potency ratio',
       'Relative potency', 'Kbapp', 'log(activity)', 'pKA', 'Ratio IC50',
       'Bmax', 'Time', 'FC', 'AE activity', 'Solubility', 'KA', 'Kdiss',
       'T1/2'], dtype=object)

In [13]:
# List the distinct values found in the `units` column.
data.units.unique()

array(['nM', '%', 'pmol/ml', 'uM', 'Unspecified', 'mM', 'pmol', 'pM mg-1',
       'min', '/s', 'hr'], dtype=object)

In [12]:
# Summary statistics of the numeric `value` column.
# NOTE(review): `value` holds rows with different `units` (nM, %, uM, ... per
# the units listing), so these aggregates mix incomparable quantities.
data.value.describe()

count    1.198900e+04
mean     5.276524e+03
std      1.459500e+05
min     -7.400000e+01
25%      1.400000e+01
50%      7.800000e+01
75%      1.050000e+03
max      1.584893e+07
Name: value, dtype: float64

In [7]:
# Handle used for the target lookup below — presumably the ChEMBL web-service
# target resource; confirm against the installed chembl_webresource_client version.
targets = chembl.TargetResource()

In [10]:
# Look up the target record for UniProt accession P19785.
targets.get(uniprot='P19785')

{'bioactivityCount': 2172,
 'chemblId': 'CHEMBL2094113',
 'compoundCount': 365,
 'description': 'Estrogen receptor',
 'geneNames': 'Unspecified',
 'organism': 'Mus musculus',
 'preferredName': 'Estrogen receptor',
 'proteinAccession': 'P19785',
 'synonyms': 'Esr ,Estra,Estrogen receptor,Nuclear receptor subfamily 3 group A member 1,Esr1,Estr,Nr3a1,ER,Estradiol receptor,ER-alpha',
 'targetType': 'PROTEIN FAMILY'}