In [1]:
from os.path import isfile

In [33]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from joblib import Parallel, delayed

In [4]:
# conda install -c chembl chembl_webresource_client
import chembl_webresource_client as chembl

In [5]:
from ChEMBL import get_chembl_id, chembl_to_data_frame

In [6]:
def get_dude_dataframe(dude_id):
    """Download list of active compounds for a DUDe id and return a DataFrame"""
    
    # blacklist Uniprot IDs - investigate those and move to dictionary
    blacklist = []
    
    if isfile('data/data_dude_%s.csv' % dude_id):
        return None
    
    # get Uniprot IDs associatet with DUDe
    # TODO: do we want to extend that?
    uniprot_map = {
        # Obsolete
        'P0A5Y6': 'P9WGR1',
        'P45351': 'P0CS13',
        'P96275': 'P9WIJ3',
        # Not mapping
        'P68404': 'P05771',
    }
    
    uniprot_ids = pd.read_csv('http://dude.docking.org/targets/%s/uniprot.txt' % dude_id, 
                              header=None, names=['uniprot_id']).uniprot_id
    uniprot_ids = uniprot_ids.replace(uniprot_map).tolist()
   
    

    if len(uniprot_ids) == 0:
        return None
    
    # get activities from ChEMBL (try 5 times, again ChEMBL is fussy)
    for _ in range(5):
        try:
            data = pd.concat(chembl_to_data_frame(uniprot_id) 
                             for uniprot_id in uniprot_ids
                             if uniprot_id not in blacklist
                            )
            break
        except ValueError as e:
            pass
    else:
        return None
    
    # Resolve SMILES for ChEMBL IDs locally (very slow with web services for large targets)
    chembl_smiles = pd.read_csv('chembl_24.smi.gz', sep='\t', header=None,names=['smiles', 'chembl_id'])
    data_full = data.set_index('chembl_id').join(chembl_smiles.set_index('chembl_id')).reset_index()
    
    data_full['dude_id'] = dude_id
    
    # for memory efficiency set appropriate dtypes
    cat_columns = ['dude_id', 
                   'chembl_id', 
                   'uniprot_id',
                   'units', 
                   'operator', 
                   'bioactivity_type',
                   'smiles'
                  ]
    for col in cat_columns:
        data_full[col] = data_full[col].astype('category')
    
    data_full.to_csv('data/data_dude_%s.csv' % dude_id)
    # return data_full

In [7]:
%%time
data = get_dude_dataframe('kpcb')

CPU times: user 8.33 s, sys: 285 ms, total: 8.62 s
Wall time: 9.28 s


In [8]:
dude_ids = ['lck', 'src', 'ada17', 'hivpr', 'mk14', 'mmp13', 'aa2ar', 'bace1', 'pparg', 'parp1', 'ace', 'thrb', 'cdk2', 'esr1', 'esr2', 'vgfr2', 'fnta', 'drd3', 'csf1r', 'dhi1', 'casp3', 'gria2', 'kit', 'dyr', 'braf', 'tryb1', 'hdac8', 'aldr', 'akt1', 'ital', 'kpcb', 'tysy', 'ppard', 'hivint', 'ppara', 'urok', 'wee1', 'reni', 'grik1', 'aces', 'fa10', 'dpp4', 'adrb2', 'jak2', 'hivrt', 'fkb1a', 'cah2', 'kif11', 'try1', 'adrb1', 'akt2', 'rock1', 'pa2ga', 'pygm', 'mapk2', 'fa7', 'tgfr1', 'mk10', 'fak1', 'gcr', 'hdac2', 'prgr', 'ptn1', 'nram', 'abl1', 'hs90a', 'egfr', 'hxk4', 'mk01', 'cxcr4', 'lkha4', 'ada', 'pur2', 'pnph', 'andr', 'rxra', 'fpps', 'cp3a4', 'met', 'ampc', 'mp2k1', 'pyrd', 'pgh1', 'kith', 'thb', 'comt', 'cp2c9', 'aofb', 'fabp4', 'mcr', 'inha', 'pgh2', 'def', 'xiap', 'glcm', 'pde5a', 'nos1', 'sahh', 'hmdh', 'igf1r', 'plk1', 'fgfr1']

In [9]:
%%time
# We need to run this few times, as some errors araise, I guess we are hitting some limits on ChEMBL there...
out = Parallel(n_jobs=-1, verbose=1)(delayed(get_dude_dataframe)(dude_id) for dude_id in dude_ids)
# data = pd.concat(out)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
Error when retrieving url: https://www.ebi.ac.uk/chemblws/targets/uniprot/P07700.json, status code: 404, msg: Target not found for accession:P07700
Error when retrieving url: https://www.ebi.ac.uk/chemblws/targets/uniprot/Q9TT96.json, status code: 404, msg: Target not found for accession:Q9TT96
Error when retrieving url: https://www.ebi.ac.uk/chemblws/targets/uniprot/P56519.json, status code: 404, msg: Target not found for accession:P56519
[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed:  3.9min finished


CPU times: user 604 ms, sys: 122 ms, total: 725 ms
Wall time: 3min 55s


# Read master DataFrame with all data

In [10]:
data = pd.concat(pd.read_csv('data/data_dude_%s.csv' % dude_id, index_col=0) for dude_id in dude_ids)
cat_columns = ['dude_id', 
               'chembl_id', 
               'uniprot_id',
               'units', 
               'operator', 
               'bioactivity_type',
               'smiles'
              ]
for col in cat_columns:
    data[col] = data[col].astype('category')
data['value'] = data['value'].astype(float)

In [11]:
len(data)

642030

In [12]:
data.dtypes

chembl_id           category
uniprot_id          category
bioactivity_type    category
operator            category
value                float64
units               category
smiles              category
dude_id             category
dtype: object

In [13]:
data.sort_values('value')

Unnamed: 0,chembl_id,uniprot_id,bioactivity_type,operator,value,units,smiles,dude_id
6357,CHEMBL321397,P06536,Weight gain,=,-2.360000e+02,Unspecified,C12(C(=O)N(C(=O)N(C1=O)C)C)O[C@@H]1C(=CC2)[C@@...,gcr
6342,CHEMBL320320,P06536,Weight gain,=,-2.240000e+02,Unspecified,C12(C(=O)c3c(C1=O)cccc3)O[C@@H]1C(=CC2)[C@@]2(...,gcr
547,CHEMBL1643626,P27487,Inhibition,=,-2.022000e+02,%,C(CCCCCCC)c1c(CCCCCCCC(=O)ONN)nc2c(n1)cccc2,dpp4
390,CHEMBL113032,P06536,Weight gain,=,-1.860000e+02,Unspecified,c12n(ncc1C[C@@]1(C3=CCCC([C@H]3CCC1=C2)(C(=O)O...,gcr
339,CHEMBL109690,P06536,Weight gain,=,-1.800000e+02,Unspecified,C12(C(=O)NC(=O)NC1=O)O[C@@H]1C(=CC2)[C@@]2(C(=...,gcr
395,CHEMBL113561,P06536,Weight gain,=,-1.450000e+02,Unspecified,c12n(ncc1C[C@@]1(C3=CC[C@]4(C([C@@H]3CCC1=C2)C...,gcr
341,CHEMBL109690,P06536,Weight gain,=,-1.400000e+02,Unspecified,C12(C(=O)NC(=O)NC1=O)O[C@@H]1C(=CC2)[C@@]2(C(=...,gcr
6339,CHEMBL320320,P06536,Weight gain,=,-1.290000e+02,Unspecified,C12(C(=O)c3c(C1=O)cccc3)O[C@@H]1C(=CC2)[C@@]2(...,gcr
6354,CHEMBL321397,P06536,Weight gain,=,-1.270000e+02,Unspecified,C12(C(=O)N(C(=O)N(C1=O)C)C)O[C@@H]1C(=CC2)[C@@...,gcr
6402,CHEMBL306894,P35354,Inhibition,=,-1.250000e+02,%,n1(c(cc(n1)CO[N+](=O)[O-])c1ccc(S(=O)(=O)C)cc1...,pgh2


In [14]:
data.bioactivity_type.unique()

[Residual Activity, Ki, Kd, Inhibition, IC50, ..., Release, Activity remaining, cholesterol level, Suppression, PPB]
Length: 500
Categories (500, object): [Residual Activity, Ki, Kd, Inhibition, ..., Activity remaining, cholesterol level, Suppression, PPB]

In [15]:
data.units.unique()

[%, nM, ug.mL-1, mg.kg-1, M, ..., 1/minute, uM mg-1, mg dl-1, dpm, nM/s]
Length: 238
Categories (238, object): [%, nM, ug.mL-1, mg.kg-1, ..., uM mg-1, mg dl-1, dpm, nM/s]

In [16]:
data.value.describe()

count    6.420300e+05
mean     2.362975e+15
std      1.524019e+18
min     -2.360000e+02
25%      2.800000e+01
50%      6.309600e+02
75%      1.000000e+04
max      1.202264e+21
Name: value, dtype: float64

In [27]:
data.groupby('dude_id')['chembl_id'].nunique().sort_values()

dude_id
def         70
pur2       173
fabp4      229
comt       282
tryb1      288
fpps       299
sahh       307
pnph       381
inha       438
kith       441
nram       445
ada        488
ital       535
wee1       541
pa2ga      544
fa7        582
cxcr4      615
fkb1a      684
lkha4      693
grik1      705
pygm       854
kif11     1058
pyrd      1070
hxk4      1122
hmdh      1123
tgfr1     1163
rxra      1217
xiap      1230
mcr       1276
tysy      1438
         ...  
aofb      4344
adrb2     4652
dpp4      4727
akt1      4738
jak2      5265
met       5296
hivpr     5333
pparg     5350
drd3      5801
mk14      5863
esr1      5970
src       6140
cah2      6189
thb       6287
pgh1      6590
bace1     6665
fa10      6761
andr      6797
pgh2      7218
egfr      8509
aa2ar     8650
thrb      8994
vgfr2     9301
aces     10429
glcm     12116
cp2c9    12401
mk01     16799
cp3a4    19134
plk1     25643
ampc     62235
Name: chembl_id, Length: 102, dtype: int64

In [69]:
data['pvalue'] = (-np.log10(data['value']) + 9)

  if __name__ == '__main__':


In [70]:
data

Unnamed: 0,chembl_id,uniprot_id,bioactivity_type,operator,value,units,smiles,dude_id,pvalue
0,CHEMBL10,P06239,Residual Activity,=,32.00,%,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,7.494850
1,CHEMBL10,P06239,Residual Activity,=,91.00,%,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,7.040959
2,CHEMBL10,P06239,Residual Activity,=,27.00,%,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,7.568636
3,CHEMBL10,P06239,Residual Activity,=,79.00,%,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,7.102373
4,CHEMBL10,P06239,Ki,=,5011.87,nM,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,5.300000
5,CHEMBL10,P06239,Kd,=,2800.00,nM,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,5.552842
6,CHEMBL10,P06239,Inhibition,=,32.00,%,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,7.494850
7,CHEMBL10,P06239,Kd,=,2800.00,nM,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,5.552842
8,CHEMBL10,P06239,Inhibition,=,68.00,%,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,7.167491
9,CHEMBL10,P06239,Inhibition,=,55.00,%,[nH]1c(c(nc1c1ccc([S+]([O-])C)cc1)c1ccc(cc1)F)...,lck,7.259637


In [71]:
data.query('bioactivity_type in ["Ki", "Kd", "EC50", "IC50"] and pvalue > 6').sort_values('value')

Unnamed: 0,chembl_id,uniprot_id,bioactivity_type,operator,value,units,smiles,dude_id,pvalue
5501,CHEMBL34259,P00381,IC50,=,3.300000e-12,ug.mL-1,n1c(c2c(nc1N)ncc(n2)CN(c1ccc(C(=O)N[C@H](C(=O)...,dyr,20.481486
771,CHEMBL1790497,P00797,IC50,=,2.600000e-10,nM,C(=O)(N1[C@H](C(=O)N[C@H](C(=O)N([C@H](C(=O)N[...,reni,18.585027
6546,CHEMBL428862,P00375,Ki,=,5.230000e-09,nM,c12nc(nc(c1c(c(cn2)CNc1ccc(C(=O)NC(C(=O)O)CCC(...,dyr,17.281498
5366,CHEMBL34259,P00375,Ki,=,5.620000e-09,nM,n1c(c2c(nc1N)ncc(n2)CN(c1ccc(C(=O)N[C@H](C(=O)...,dyr,17.250264
3274,CHEMBL267104,P00381,IC50,=,1.000000e-08,ug.mL-1,n1c2c(c(nc1N)O)c(c(cn2)CNc1ccc(C(=O)NC(C(=O)O)...,dyr,17.000000
1151,CHEMBL267104,P00469,IC50,=,1.200000e-07,ug.mL-1,n1c2c(c(nc1N)O)c(c(cn2)CNc1ccc(C(=O)NC(C(=O)O)...,tysy,15.920819
682,CHEMBL157187,P34971,Kd,=,3.200000e-07,Unspecified,C(=O)(c1ccc(cc1)C)NCCCCC(NCC(c1cc(c(cc1)O)O)O)C,adrb1,15.494850
2423,CHEMBL157187,P18762,Kd,=,3.200000e-07,Unspecified,C(=O)(c1ccc(cc1)C)NCCCCC(NCC(c1cc(c(cc1)O)O)O)C,adrb2,15.494850
6304,CHEMBL423055,P18762,Kd,=,4.700000e-07,Unspecified,C(=O)(Nc1ccc(cc1)C)NCCCCC(NCC(c1cc(c(cc1)O)O)O)C,adrb2,15.327902
2437,CHEMBL423055,P34971,Kd,=,4.700000e-07,Unspecified,C(=O)(Nc1ccc(cc1)C)NCCCCC(NCC(c1cc(c(cc1)O)O)O)C,adrb1,15.327902


In [72]:
data_actives = data.query('bioactivity_type in ["Ki", "Kd", "EC50", "IC50"] and pvalue > 6')

In [73]:
data_actives.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180295 entries, 24 to 4407
Data columns (total 9 columns):
chembl_id           180295 non-null category
uniprot_id          180295 non-null category
bioactivity_type    180295 non-null category
operator            180295 non-null category
value               180295 non-null float64
units               180295 non-null category
smiles              178149 non-null category
dude_id             180295 non-null category
pvalue              180295 non-null float64
dtypes: category(7), float64(2)
memory usage: 31.8 MB


In [74]:
data_actives = data_actives.groupby(['chembl_id', 'dude_id'])['value'].count().sort_values()

chembl_id      dude_id
CHEMBL320162   thrb         1
CHEMBL3642309  jak2         1
CHEMBL3642308  jak2         1
CHEMBL3642306  jak2         1
               fak1         1
CHEMBL3642305  jak2         1
               fak1         1
CHEMBL3642304  jak2         1
               fak1         1
CHEMBL3642303  jak2         1
               fak1         1
CHEMBL3642302  jak2         1
               fak1         1
CHEMBL3642300  jak2         1
               fak1         1
CHEMBL364230   nos1         1
CHEMBL3642297  jak2         1
CHEMBL3642296  jak2         1
CHEMBL3642295  jak2         1
               fak1         1
CHEMBL3642294  jak2         1
CHEMBL3642293  jak2         1
CHEMBL3642292  jak2         1
CHEMBL3642291  jak2         1
CHEMBL3642310  jak2         1
CHEMBL3642290  jak2         1
CHEMBL3642312  fak1         1
CHEMBL3642313  jak2         1
CHEMBL3642338  jak2         1
CHEMBL3642336  jak2         1
                         ... 
CHEMBL659      aces        64
CHEMBL21       ca

In [76]:
(data
 .query('bioactivity_type in ["Ki", "Kd", "EC50", "IC50"] and pvalue > 6')
 .drop_duplicates(['chembl_id', 'dude_id'])
 .sort_values('value')
)

Unnamed: 0,chembl_id,uniprot_id,bioactivity_type,operator,value,units,smiles,dude_id,pvalue
6546,CHEMBL428862,P00375,Ki,=,5.230000e-09,nM,c12nc(nc(c1c(c(cn2)CNc1ccc(C(=O)NC(C(=O)O)CCC(...,dyr,17.281498
3274,CHEMBL267104,P00381,IC50,=,1.000000e-08,ug.mL-1,n1c2c(c(nc1N)O)c(c(cn2)CNc1ccc(C(=O)NC(C(=O)O)...,dyr,17.000000
1151,CHEMBL267104,P00469,IC50,=,1.200000e-07,ug.mL-1,n1c2c(c(nc1N)O)c(c(cn2)CNc1ccc(C(=O)NC(C(=O)O)...,tysy,15.920819
2423,CHEMBL157187,P18762,Kd,=,3.200000e-07,Unspecified,C(=O)(c1ccc(cc1)C)NCCCCC(NCC(c1cc(c(cc1)O)O)O)C,adrb2,15.494850
682,CHEMBL157187,P34971,Kd,=,3.200000e-07,Unspecified,C(=O)(c1ccc(cc1)C)NCCCCC(NCC(c1cc(c(cc1)O)O)O)C,adrb1,15.494850
6304,CHEMBL423055,P18762,Kd,=,4.700000e-07,Unspecified,C(=O)(Nc1ccc(cc1)C)NCCCCC(NCC(c1cc(c(cc1)O)O)O)C,adrb2,15.327902
2437,CHEMBL423055,P34971,Kd,=,4.700000e-07,Unspecified,C(=O)(Nc1ccc(cc1)C)NCCCCC(NCC(c1cc(c(cc1)O)O)O)C,adrb1,15.327902
243,CHEMBL2368596,P56658,Ki,=,1.500000e-06,nM,c12n([C@@H]3O[C@@H]([C@H](C3)O)CO)cnc1[C@@H](C...,ada,14.823909
6033,CHEMBL379921,Q72874,Ki,=,6.000000e-06,nM,c1(ccc(cc1)OCc1ccccn1)C[C@H](NC(=O)O[C@@H]1[C@...,hivpr,14.221849
2580,CHEMBL160277,P18762,Kd,=,1.000000e-05,Unspecified,c1c(c(ccc1C(CNC(CCCCN)C)O)O)O,adrb2,14.000000
