In [1]:
import requests
from random import sample, random
import signal
from time import sleep

import pandas as pd
from chembl_webresource_client.new_client import new_client

class TimeOutException(Exception):
    """Exception raised by the alarm signal handler to interrupt the running code."""
def alarm_handler(signum, frame):
    """Signal-handler callback that turns a delivered signal into a TimeOutException.

    Both arguments are accepted only to satisfy the signal-handler calling
    convention; neither is inspected.

    Raises:
        TimeOutException: always.
    """
    raise TimeOutException
# Install alarm_handler for SIGALRM, so an alarm armed with signal.alarm()
# surfaces as a TimeOutException in the running code. The call returns the
# previously installed handler, which is what the cell displays.
signal.signal(signal.SIGALRM, alarm_handler)

<Handlers.SIG_DFL: 0>

In [2]:
# Handle on the molecule resource of the ChEMBL web-service client.
chembl_client = new_client.molecule

# Property constraints applied to the molecule search.
property_filters = {
    'molecule_properties__num_ro5_violations': 0,
    # inclusive on both ends of range
    'molecule_properties__rtb__range': (0, 2),
    # inclusive on both ends of range
    'molecule_properties__mw_freebase__range': (150, 199),
}

# Restrict the returned fields to the structures and properties sub-records.
query = chembl_client.filter(**property_filters).only(
    ['molecule_structures', 'molecule_properties']
)

# Number of molecules matching the filter.
len(query)

22419

In [7]:
class QueryIter:
    """Iterator that mixes real query results with empty placeholder records.

    On every step a uniform random number is drawn. With probability ``prob``
    the next item is fetched from ``chembl_query``; otherwise a placeholder
    dict with empty-string fields is returned and the query is not advanced.

    Each fetch is guarded by a one-second ``SIGALRM`` alarm. This relies on a
    SIGALRM handler that raises ``TimeOutException`` being installed; when that
    exception is raised during a fetch, iteration ends. Any other exception
    raised by the query (including its own ``StopIteration``) propagates.

    Parameters:
        chembl_query: object exposing a ``next()`` method that returns the
            next result item.
        prob: probability, per step, of fetching a real item instead of
            returning a placeholder.
    """

    def __init__(self, chembl_query, prob=0.1):
        self.chembl_query = chembl_query
        self.prob = prob

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next real item or a placeholder record.

        Raises:
            StopIteration: when fetching the next item takes longer than the
                one-second alarm allows.
        """
        rand_float = random()
        if rand_float < self.prob:
            try:
                signal.alarm(1)
                return self.chembl_query.next()
            except TimeOutException:
                raise StopIteration
            finally:
                # Always disarm the alarm. Without this a successful (or
                # otherwise failed) fetch leaves the alarm pending, and it can
                # fire up to a second later in unrelated code where nothing
                # catches TimeOutException.
                signal.alarm(0)
        else:
            return {
                'molecule_structures': {
                    'canonical_smiles': ''
                },
                'molecule_properties': {
                    'rtb': '',
                    'mw_freebase': ''
                }
            }

In [8]:
def _to_record(mol):
    """Flatten one query item into a dict with smiles, rot_bonds and mw fields."""
    return {
        'smiles': mol['molecule_structures']['canonical_smiles'],
        'rot_bonds': mol['molecule_properties']['rtb'],
        'mw': mol['molecule_properties']['mw_freebase']
    }


# Drain the iterator, flattening every yielded item (placeholders included).
mols = list(map(_to_record, QueryIter(query)))
mols

[{'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': 'O=c1oc2ccccc2c2ccccc12', 'rot_bonds': 0, 'mw': '196.21'},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'smiles': '', 'rot_bonds': '', 'mw': ''},
 {'sm

In [9]:
# Number of entries currently held in mols.
len(mols)

214

In [8]:
# Bind the current result list to a second name (same list object, not a copy)
# and display it.
old_mols = mols
old_mols

[{'smiles': 'O=c1oc2ccccc2c2ccccc12', 'rot_bonds': 0, 'mw': '196.21'},
 {'smiles': 'CN1CCC[C@H]1c1cccnc1', 'rot_bonds': 1, 'mw': '162.24'},
 {'smiles': 'CCc1cc2c(O)ncnc2s1', 'rot_bonds': 1, 'mw': '180.23'},
 {'smiles': 'C1=C(c2ccccc2)CCNC1', 'rot_bonds': 1, 'mw': '159.23'},
 {'smiles': 'C[N+]1([O-])CC=C(c2ccccc2)CC1', 'rot_bonds': 1, 'mw': '189.26'},
 {'smiles': 'CCc1cc2c(Cl)ncnc2s1', 'rot_bonds': 1, 'mw': '198.68'},
 {'smiles': 'C1=NCCc2c1[nH]c1ccccc21', 'rot_bonds': 0, 'mw': '170.21'},
 {'smiles': 'Cn1c2c(c3ccccc31)CCNC2', 'rot_bonds': 0, 'mw': '186.26'},
 {'smiles': 'c1ccc2c3c([nH]c2c1)CNCC3', 'rot_bonds': 0, 'mw': '172.23'},
 {'smiles': 'c1ccc2c3c([nH]c2c1)CCNC3', 'rot_bonds': 0, 'mw': '172.23'},
 {'smiles': 'NCCc1c[nH]c2ccccc12', 'rot_bonds': 2, 'mw': '160.22'},
 {'smiles': 'Cc1c(CC(C)N)ccc2c1OCO2', 'rot_bonds': 2, 'mw': '193.25'},
 {'smiles': 'Cc1cc(CC(C)N)cc2c1OCO2', 'rot_bonds': 2, 'mw': '193.25'},
 {'smiles': 'COc1ccc2c(c1)[nH]c1cnccc12', 'rot_bonds': 1, 'mw': '198.22'},
 {'sm

In [19]:
# Size of whatever list mols refers to when this cell runs.
# NOTE(review): execution counts are out of order here, so confirm which run produced it.
len(mols)

560

In [46]:
# Collect the items exactly as QueryIter yields them (no field extraction)
# and display the resulting list.
mols = [item for item in QueryIter(query)]
mols

[{'molecule_properties': {'alogp': '2.03',
   'aromatic_rings': 0,
   'cx_logd': '-0.91',
   'cx_logp': '2.09',
   'cx_most_apka': None,
   'cx_most_bpka': '10.78',
   'full_molformula': 'C10H19N',
   'full_mwt': '153.27',
   'hba': 1,
   'hba_lipinski': 1,
   'hbd': 1,
   'hbd_lipinski': 1,
   'heavy_atoms': 11,
   'molecular_species': 'BASE',
   'mw_freebase': '153.27',
   'mw_monoisotopic': '153.1517',
   'num_lipinski_ro5_violations': 0,
   'num_ro5_violations': 0,
   'psa': '12.03',
   'qed_weighted': '0.61',
   'ro3_pass': 'Y',
   'rtb': 1},
  'molecule_structures': {'canonical_smiles': 'CNC1C2CCC(C2)C1(C)C',
   'molfile': '\n     RDKit          2D\n\n 11 12  0  0  0  0  0  0  0  0999 V2000\n    4.7042   -4.9292    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    5.0417   -4.2417    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    4.1000   -4.6125    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    4.4000   -3.8542    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0\n    4.225

In [47]:
# Count of raw items gathered by the preceding QueryIter run.
len(mols)

480