In [1]:
from os.path import isfile

In [2]:
import pandas as pd

In [3]:
from joblib import Parallel, delayed

In [4]:
# conda install -c chembl chembl_webresource_client
import chembl_webresource_client as chembl

In [5]:
from ChEMBL import get_chembl_id, chembl_to_data_frame

In [6]:
def get_dude_dataframe(dude_id):
    """Fetch ChEMBL bioactivities for one DUD-E target and cache them as CSV.

    The UniProt accessions listed for the target at dude.docking.org are
    downloaded, remapped through a small table of known replacements, and
    each one is passed to ``chembl_to_data_frame``. The resulting rows are
    left-joined on ``chembl_id`` with the SMILES from the local
    ``chembl_23.smi.gz`` file, tagged with ``dude_id`` and written to
    ``data/data_dude_<dude_id>.csv``.

    Parameters
    ----------
    dude_id : str
        DUD-E target identifier, e.g. ``'kpcb'``.

    Returns
    -------
    None
        Always. The table is only written to disk, so that parallel workers
        do not have to ship DataFrames back to the parent process. Nothing
        is written when the CSV already exists, when no usable UniProt
        accession is left, or when all ChEMBL attempts fail.

    Notes
    -----
    Only ``ValueError`` raised while querying ChEMBL is retried (and
    reported as a warning); any other exception propagates to the caller.
    """
    # Function-scope imports so the definition does not depend on another cell.
    import os
    import warnings

    out_path = 'data/data_dude_%s.csv' % dude_id

    # Already cached by a previous run: skip all network and disk work.
    if isfile(out_path):
        return None

    # blacklist Uniprot IDs - investigate those and move to dictionary
    blacklist = []

    # Replacements for the Uniprot IDs associated with DUDe
    # TODO: do we want to extend that?
    uniprot_map = {
        # Obsolete
        'P0A5Y6': 'P9WGR1',
        'P45351': 'P0CS13',
        # Not mapping
        'P68404': 'P05771',
    }

    uniprot_ids = pd.read_csv('http://dude.docking.org/targets/%s/uniprot.txt' % dude_id,
                              header=None, names=['uniprot_id']).uniprot_id
    # Apply the blacklist up front so that a target with no usable accession
    # is skipped immediately instead of failing inside the retry loop.
    uniprot_ids = [uniprot_id
                   for uniprot_id in uniprot_ids.replace(uniprot_map).tolist()
                   if uniprot_id not in blacklist]

    if not uniprot_ids:
        return None

    # get activities from ChEMBL (try 5 times, again ChEMBL is fussy)
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            data = pd.concat([chembl_to_data_frame(uniprot_id)
                              for uniprot_id in uniprot_ids])
            break
        except ValueError as exc:
            # Still best-effort, but make the failure visible instead of
            # discarding it silently.
            warnings.warn('ChEMBL query for %s failed (attempt %d of %d): %s'
                          % (dude_id, attempt, max_attempts, exc))
    else:
        # Every attempt failed; leave no file so a later run tries again.
        return None

    # Resolve SMILES for ChEMBL IDs locally (very slow with web services for large targets)
    chembl_smiles = pd.read_csv('chembl_23.smi.gz', sep='\t', header=None,
                                names=['smiles', 'chembl_id'])
    data_full = (data.set_index('chembl_id')
                     .join(chembl_smiles.set_index('chembl_id'))
                     .reset_index())

    data_full['dude_id'] = dude_id

    # No dtype casts here: the frame is not returned and CSV does not keep
    # dtypes, so they are applied when the files are read back.

    # Write to a temporary name and rename afterwards, so an interrupted run
    # cannot leave a truncated CSV that the isfile() check above would then
    # treat as a finished download.
    os.makedirs('data', exist_ok=True)
    tmp_path = out_path + '.tmp'
    data_full.to_csv(tmp_path)
    os.replace(tmp_path, out_path)
    return None

In [7]:
%%time
# Try the download on a single target first. get_dude_dataframe() writes its
# result to data/data_dude_kpcb.csv and returns None on every path, so `data`
# is None after this cell; the table itself is read back from the CSV files.
data = get_dude_dataframe('kpcb')

CPU times: user 32 µs, sys: 4 µs, total: 36 µs
Wall time: 40.5 µs


In [8]:
# DUD-E target identifiers to process; each one is substituted into the
# dude.docking.org URL and into the per-target CSV file name.
dude_ids = [
    'lck', 'src', 'ada17', 'hivpr', 'mk14', 'mmp13', 'aa2ar', 'bace1',
    'pparg', 'parp1', 'ace', 'thrb', 'cdk2', 'esr1', 'esr2', 'vgfr2',
    'fnta', 'drd3', 'csf1r', 'dhi1', 'casp3', 'gria2', 'kit', 'dyr',
    'braf', 'tryb1', 'hdac8', 'aldr', 'akt1', 'ital', 'kpcb', 'tysy',
    'ppard', 'hivint', 'ppara', 'urok', 'wee1', 'reni', 'grik1', 'aces',
    'fa10', 'dpp4', 'adrb2', 'jak2', 'hivrt', 'fkb1a', 'cah2', 'kif11',
    'try1', 'adrb1', 'akt2', 'rock1', 'pa2ga', 'pygm', 'mapk2', 'fa7',
    'tgfr1', 'mk10', 'fak1', 'gcr', 'hdac2', 'prgr', 'ptn1', 'nram',
    'abl1', 'hs90a', 'egfr', 'hxk4', 'mk01', 'cxcr4', 'lkha4', 'ada',
    'pur2', 'pnph', 'andr', 'rxra', 'fpps', 'cp3a4', 'met', 'ampc',
    'mp2k1', 'pyrd', 'pgh1', 'kith', 'thb', 'comt', 'cp2c9', 'aofb',
    'fabp4', 'mcr', 'inha', 'pgh2', 'def', 'xiap', 'glcm', 'pde5a',
    'nos1', 'sahh', 'hmdh', 'igf1r', 'plk1', 'fgfr1',
]

In [9]:
%%time
# This may need to be run a few times, as some errors arise; presumably we are
# hitting request limits on ChEMBL. Re-running is cheap: targets whose CSV
# already exists in data/ return immediately, so only the missing ones are retried.
# Each worker returns None, so `out` holds no data; the results are the CSV files.
out = Parallel(n_jobs=-1, verbose=1)(delayed(get_dude_dataframe)(dude_id) for dude_id in dude_ids)
# data = pd.concat(out)

CPU times: user 57.6 ms, sys: 41.3 ms, total: 98.9 ms
Wall time: 197 ms


[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed:    0.0s finished


# Read master DataFrame with all data

In [10]:
# Columns holding repeated labels; stored as pandas categoricals.
cat_columns = [
    'dude_id',
    'chembl_id',
    'uniprot_id',
    'units',
    'operator',
    'bioactivity_type',
    'smiles',
]

# Stack the per-target CSV files into one table, keeping each file's own
# first column as the index.
data = pd.concat([
    pd.read_csv('data/data_dude_%s.csv' % dude_id, index_col=0)
    for dude_id in dude_ids
])

# Apply all dtype conversions in a single pass: categoricals for the label
# columns, float for the measured value.
column_dtypes = dict.fromkeys(cat_columns, 'category')
column_dtypes['value'] = float
data = data.astype(column_dtypes)

In [11]:
# Total number of rows in the combined table (all loaded DUD-E targets).
len(data)

487498

In [12]:
# Check that the category / float conversions were applied to every column.
data.dtypes

chembl_id           category
uniprot_id          category
bioactivity_type    category
operator            category
value                float64
units               category
smiles              category
dude_id             category
dtype: object

In [13]:
# Order the rows by reported value (ascending) to inspect the smallest entries.
# NOTE(review): values are compared across different units and bioactivity
# types here, so the ordering is only useful for spotting outliers.
data.sort_values('value')

Unnamed: 0,chembl_id,uniprot_id,bioactivity_type,operator,value,units,smiles,dude_id
600,CHEMBL321397,P06536,Weight gain,=,-2.360000e+02,Unspecified,C12(C(=O)N(C(=O)N(C1=O)C)C)O[C@@H]1C(=CC2)[C@@...,gcr
590,CHEMBL320320,P06536,Weight gain,=,-2.240000e+02,Unspecified,C12(C(=O)c3c(C1=O)cccc3)O[C@@H]1C(=CC2)[C@@]2(...,gcr
64,CHEMBL113032,P06536,Weight gain,=,-1.860000e+02,Unspecified,c12n(ncc1C[C@@]1(C3=CCCC([C@H]3CCC1=C2)(C(=O)O...,gcr
30,CHEMBL109690,P06536,Weight gain,=,-1.800000e+02,Unspecified,C12(C(=O)NC(=O)NC1=O)O[C@@H]1C(=CC2)[C@@]2(C(=...,gcr
69,CHEMBL113561,P06536,Weight gain,=,-1.450000e+02,Unspecified,c12n(ncc1C[C@@]1(C3=CC[C@]4(C([C@@H]3CCC1=C2)C...,gcr
32,CHEMBL109690,P06536,Weight gain,=,-1.400000e+02,Unspecified,C12(C(=O)NC(=O)NC1=O)O[C@@H]1C(=CC2)[C@@]2(C(=...,gcr
587,CHEMBL320320,P06536,Weight gain,=,-1.290000e+02,Unspecified,C12(C(=O)c3c(C1=O)cccc3)O[C@@H]1C(=CC2)[C@@]2(...,gcr
597,CHEMBL321397,P06536,Weight gain,=,-1.270000e+02,Unspecified,C12(C(=O)N(C(=O)N(C1=O)C)C)O[C@@H]1C(=CC2)[C@@...,gcr
6402,CHEMBL306894,P35354,Inhibition,=,-1.250000e+02,%,n1(c(cc(n1)CO[N+](=O)[O-])c1ccc(S(=O)(=O)C)cc1...,pgh2
11330,CHEMBL67324,P35354,Inhibition,=,-1.250000e+02,%,n1(c(cc(n1)CO)c1ccc(S(=O)(=O)C)cc1)C1CCCC1,pgh2


In [14]:
# Distinct bioactivity types present in the combined table.
data.bioactivity_type.unique()

[Residual Activity, Ki, Kd, Inhibition, IC50, ..., Release, Activity remaining, cholesterol level, Suppression, PPB]
Length: 450
Categories (450, object): [Residual Activity, Ki, Kd, Inhibition, ..., Activity remaining, cholesterol level, Suppression, PPB]

In [15]:
# Distinct measurement units present in the combined table.
data.units.unique()

[%, nM, ug.mL-1, mg.kg-1, M, ..., uM mg-1, mg dl-1, dpm, 10'4/s/M, nM/s]
Length: 207
Categories (207, object): [%, nM, ug.mL-1, mg.kg-1, ..., mg dl-1, dpm, 10'4/s/M, nM/s]

In [16]:
# Summary statistics of the raw values, pooled over all units and bioactivity
# types (no filtering or unit normalisation has been applied yet).
data.value.describe()

count    4.874980e+05
mean     3.111819e+15
std      1.748967e+18
min     -2.360000e+02
25%      1.900000e+01
50%      2.238700e+02
75%      8.400000e+03
max      1.202264e+21
Name: value, dtype: float64