In [11]:
import os
import sys
import pandas as pd
import networkx as nx
import tqdm
from multiprocessing import Pool
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [5]:
# Resolve the `rebadd` directory located two levels above the current working
# directory and make it importable (presumably where `chemutils` lives --
# confirm against the repository layout).
ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, 'rebadd'))
if ROOT_PATH not in sys.path:
    # Mutate sys.path in place instead of rebinding it to a new list, so any
    # existing reference to the list object also sees the new entry.
    sys.path.insert(0, ROOT_PATH)

from chemutils import GSKScorer, JNKScorer, SAScorer, QEDScorer

In [6]:
def GetNumRings(mol):
    '''
    Return the number of cycles in a cycle basis of the graph built from the
    molecule's adjacency matrix.

    Reference: https://github.com/wengong-jin/iclr19-graph2graph/blob/master/props/properties.py
    '''
    adjacency = Chem.rdmolops.GetAdjacencyMatrix(mol)
    atom_graph = nx.Graph(adjacency)
    basis = nx.cycle_basis(atom_graph)
    return len(basis)

# 1. Read the preprocessed data

In [7]:
# Load the preprocessed table; the path is relative to the notebook's working directory.
df_chembl = pd.read_csv('all_processed.csv')

In [17]:
# Show the loaded frame as a sanity check on its columns and row count.
df_chembl

Unnamed: 0,smiles,length
0,c1cc(OCCCN2CCCCC2)ccc1CN1CCC2(CC1)OCCO2,39
1,CC1COC(c2cccn2Cc2ccccc2Cl)=N1,29
2,Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc...,60
3,Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O,46
4,Cn1c(=O)c2c(SCC(=O)N3CCOCC3)nc(-c3ccccc3F)nc2n...,53
...,...,...
1480288,CNCCOc1ccc2cc3ccc(OCCNC)cc3nc2c1,32
1480289,Cc1nnc2n1-c1c(F)cc(-c3cncc(C(F)(F)F)c3)cc1CC2,45
1480290,Cc1c(Cc2cccnc2)c(=O)oc2cc(OC(=O)N(C)C)c(Cl)cc12,47
1480291,COOC1(OOCCCCCC(=O)O)CCCCCCCCCCC1,32


# 2. Set the oracles

In [12]:
# Instantiate one scorer per property at module level; calc_properties looks
# these names up as globals and calls each with a SMILES string.
calc_gsk3 = GSKScorer()
calc_jnk3 = JNKScorer()
calc_sa = SAScorer()
calc_qed = QEDScorer()

In [13]:
def calc_properties(smi):
    '''
    Build the property record for a single SMILES string.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule to score.

    Returns
    -------
    dict
        Keys, in order: 'smiles', 'gsk3', 'jnk3', 'sa', 'qed', 'num_atoms',
        'num_rings'. The four score entries hold whatever the module-level
        scorers (calc_gsk3, calc_jnk3, calc_sa, calc_qed) return for `smi`.
        If RDKit cannot parse `smi`, every property is None so the row is
        kept (and shows up as missing values) instead of raising.
    '''
    property_keys = ('gsk3', 'jnk3', 'sa', 'qed', 'num_atoms', 'num_rings')

    ## init
    record = {'smiles': smi}

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None. Without
        # this guard, mol.GetNumAtoms() raises AttributeError in the worker
        # and aborts the whole Pool.map batch.
        record.update(dict.fromkeys(property_keys))
        return record

    ## target properties
    record['gsk3'] = calc_gsk3(smi)
    record['jnk3'] = calc_jnk3(smi)
    record['sa'] = calc_sa(smi)
    record['qed'] = calc_qed(smi)

    ## number of atoms
    record['num_atoms'] = mol.GetNumAtoms()

    ## number of rings
    record['num_rings'] = GetNumRings(mol)

    return record

In [20]:
# Score every SMILES in fixed-size batches so tqdm can report progress.
data = []
batch_size = 20000  # number of SMILES submitted to the pool per progress tick
n_jobs = 10         # number of worker processes

# Create the worker pool once and reuse it for all batches; building a new
# pool per batch repeats process start-up and tear-down on every iteration.
with Pool(n_jobs) as p:
    for i in tqdm.trange(0, len(df_chembl), batch_size):
        # .iloc makes the slice explicitly positional. Pool.map returns
        # results in input order, so `data` stays aligned with the rows.
        data += p.map(calc_properties, df_chembl['smiles'].iloc[i:i+batch_size])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [15:50<00:00, 12.67s/it]


In [21]:
# Turn the list of per-molecule records into a table. NOTE: this rebinds
# df_chembl, replacing the frame that was read from 'all_processed.csv'.
df_chembl = pd.DataFrame(data)

In [22]:
# Persist the scored table; index=False keeps the row index out of the CSV.
df_chembl.to_csv('chembl_gsk3_jnk3_qed_sa.csv', index=False)