In [1]:
import os
import sys
import pickle

In [2]:
import tqdm
import pandas as pd
import selfies as sf
from rdkit import Chem

# Set the path of ReBADD-SE

In [3]:
REBADD_LIB_PATH = os.path.abspath(os.pardir)
if REBADD_LIB_PATH not in sys.path:
    sys.path = [REBADD_LIB_PATH] + sys.path
    
from rebadd.datautils import get_fragment_from_selfies

# Set up the input/output directories

In [4]:
class DATACONFIGS:
    def __init__(self):
        ## input
        self.input_dir = os.path.join(os.pardir, 'data', 'chembl')
        self.train_data_path = os.path.join(self.input_dir, 'chembl_train_full.csv')
        ## output - please manually create an output directory
        self.output_dir = 'outputs_0_preprocess_data'
        assert os.path.exists(self.output_dir)

dataconfigs = DATACONFIGS()

# Read the input data

In [5]:
df_train = pd.read_csv(dataconfigs.train_data_path)

df_train

Unnamed: 0,smiles,gsk3,jnk3,sa,qed,num_atoms,num_rings
0,c1cc(OCCCN2CCCCC2)ccc1CN1CCC2(CC1)OCCO2,0.01,0.00,2.692034,0.684442,27,4
1,CC1COC(c2cccn2Cc2ccccc2Cl)=N1,0.06,0.00,3.047999,0.842972,19,3
2,Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc...,0.24,0.19,3.263126,0.439582,34,5
3,Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O,0.09,0.12,2.117217,0.552710,27,4
4,Cn1c(=O)c2c(SCC(=O)N3CCOCC3)nc(-c3ccccc3F)nc2n...,0.00,0.01,2.530062,0.432354,31,4
...,...,...,...,...,...,...,...
1479823,CNCCOc1ccc2cc3ccc(OCCNC)cc3nc2c1,0.01,0.01,2.260887,0.492394,24,3
1479824,Cc1nnc2n1-c1c(F)cc(-c3cncc(C(F)(F)F)c3)cc1CC2,0.12,0.00,2.810497,0.626775,25,4
1479825,Cc1c(Cc2cccnc2)c(=O)oc2cc(OC(=O)N(C)C)c(Cl)cc12,0.04,0.00,2.505654,0.653669,26,3
1479826,COOC1(OOCCCCCC(=O)O)CCCCCCCCCCC1,0.00,0.01,3.349466,0.250995,25,1


# Refine the training data

In [6]:
df_trainable = df_train[ (df_train['gsk3'] > 0.5) | (df_train['jnk3'] > 0.5) | (df_train['qed'] > 0.6) | (df_train['sa'] < 4.001)]
df_trainable = df_trainable[ (df_trainable['gsk3'] > 0.01) | (df_trainable['jnk3'] > 0.01) ]
df_trainable = df_trainable.reset_index(drop=True)

df_trainable

Unnamed: 0,smiles,gsk3,jnk3,sa,qed,num_atoms,num_rings
0,CC1COC(c2cccn2Cc2ccccc2Cl)=N1,0.06,0.00,3.047999,0.842972,19,3
1,Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc...,0.24,0.19,3.263126,0.439582,34,5
2,Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O,0.09,0.12,2.117217,0.552710,27,4
3,O=C(OCC(NCC1NCC(O)C1O)c1ccccc1)c1ccccc1F,0.01,0.02,3.566188,0.540996,27,3
4,CC(C)S(=O)(=O)NC1CN(C)CC1c1ccc(-c2cccc(NS(C)(=...,0.11,0.07,3.227577,0.674220,30,3
...,...,...,...,...,...,...,...
979116,COc1c(-c2ccc(O)c(O)c2)oc2c(O)c(O)cc(O)c2c1=O,0.15,0.00,2.670610,0.354333,24,3
979117,CC1(O)CCC(c2cccnc2Oc2ccc(Nc3nc4ccccc4s3)cc2)CC1,0.09,0.05,2.542342,0.370863,31,5
979118,Cc1nnc2n1-c1c(F)cc(-c3cncc(C(F)(F)F)c3)cc1CC2,0.12,0.00,2.810497,0.626775,25,4
979119,Cc1c(Cc2cccnc2)c(=O)oc2cc(OC(=O)N(C)C)c(Cl)cc12,0.04,0.00,2.505654,0.653669,26,3


In [7]:
data = df_trainable.loc[:,'smiles'].values.tolist()

with open(os.path.join(dataconfigs.output_dir, 'trainable_smiles.txt'), 'w') as fout:
    for smi in data:
        fout.write(f'{smi}\n')

In [8]:
print(f'Number of training data (raw): {len(data)}')
data[:5]

Number of training data (raw): 979121


['CC1COC(c2cccn2Cc2ccccc2Cl)=N1',
 'Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc3n2C(C)(C)C)n1',
 'Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O',
 'O=C(OCC(NCC1NCC(O)C1O)c1ccccc1)c1ccccc1F',
 'CC(C)S(=O)(=O)NC1CN(C)CC1c1ccc(-c2cccc(NS(C)(=O)=O)c2)cc1']

In [9]:
def make_selfies_data(smiles_iter):
    selfies_list = []
    fragments_list = []
    
    for smi in tqdm.tqdm(smiles_iter):
        
        try:
            mol = Chem.MolFromSmiles(smi)
            Chem.Kekulize(mol)
        except:
            continue

        try:
            smi = Chem.MolToSmiles(mol, canonical=True, doRandom=False, isomericSmiles=False, kekuleSmiles=True)
            sel = sf.encoder(smi)
            frags = get_fragment_from_selfies([sel], use_tqdm=False)[0]
            if max([sf.len_selfies(x) for x in frags]) < 16:
                selfies_list.append(sel)
                fragments_list.append(frags)
        except AssertionError:
            pass
        except sf.EncoderError:
            pass
    
    return selfies_list, fragments_list

In [10]:
selfies_list, fragments_list = make_selfies_data(data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 979121/979121 [05:39<00:00, 2883.32it/s]


In [11]:
print(f'Number of training data (selfies): {len(selfies_list)}')
selfies_list[:5]

Number of training data (selfies): 295601


['[C][N][C][=C][C][=C][Branch1][=Branch1][N+1][=Branch1][C][=O][O-1][C][=C][Ring1][=Branch2][C][=Branch1][C][=O][N][Ring1][=N][C][C][=C][C][=C][C][=C][Ring1][=Branch1][F]',
 '[C][O][C][=C][C][=C][Branch1][C][F][C][=C][Ring1][#Branch1][C][C][C][Branch1][#Branch1][C][N][Branch1][C][C][C][C][Ring1][#Branch2][=O]',
 '[O][=C][C][=C][C][=C][C][=C][Ring1][=Branch1][O][C][=C][C][Branch1][C][O][=C][C][Branch1][C][O][=C][Ring1][S][Ring1][Branch2]',
 '[C][C][N][Branch1][C][C][C][=N][C][=N][C][=C][Ring1][=Branch1][C][C][N][Branch1][=N][C][=Branch1][C][=O][C][=C][C][=C][N][Ring1][Branch1][C][C][C][Ring1][#C]',
 '[C][O][C][C][N][C][=Branch1][C][=O][C][=C][C][Branch1][=Branch2][C][=C][C][=C][C][=C][Ring1][=Branch1][=N][N][Ring1][O][C][C][C][C][Branch1][=Branch2][C][=C][C][=C][N][=C][Ring1][=Branch1][=N][O][Ring1][O]']

In [12]:
print(f'maxlen of fragments: {max([len(fragments) for fragments in fragments_list])}')
fragments_list[:2]

maxlen of fragments: 32


[['[C][N][C][=C][C][=C]',
  '[Branch1][=Branch1][N+1][=Branch1][C][=O][O-1]',
  '[C][=C][Ring1][=Branch2]',
  '[C]',
  '[=Branch1][C][=O]',
  '[N][Ring1][=N]',
  '[C][C][=C][C][=C][C][=C][Ring1][=Branch1]',
  '[F]'],
 ['[C][O][C][=C][C][=C]',
  '[Branch1][C][F]',
  '[C][=C][Ring1][#Branch1]',
  '[C][C][C]',
  '[Branch1][#Branch1][C][N][Branch1][C][C][C]',
  '[C][Ring1][#Branch2]',
  '[=O]']]

In [13]:
with open(os.path.join(dataconfigs.output_dir, 'fragments_list.pkl'), 'wb') as fout:
    pickle.dump(fragments_list, fout)

In [14]:
vocabs = set()
for fragments in tqdm.tqdm(fragments_list):
    vocabs = vocabs.union(set(fragments))

vocabs = sorted(vocabs)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 295601/295601 [02:19<00:00, 2112.23it/s]


In [15]:
print(f'Number of vocabulary(unique fragments): {len(vocabs)}')
vocabs

Number of vocabulary(unique fragments): 34617


['[#Branch1][#Branch1][#C][C][C][C][Ring1][Ring1]',
 '[#Branch1][#Branch2][#C][C][=C][C][=C][C][=C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=C][C][=C][C][=N][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=C][C][=C][N][=C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=C][C][=N][C][=C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=C][N][=C][N][=C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=N][C][=C][C][=N][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][C][C][C][C][C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][N][C][C][C][C][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][C][=C][S][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][N][C][=C][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][N][C][=N][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][N][N][=C][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][S][C][=C][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][S][C][=N][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=N][C][=C][S][Rin

In [16]:
with open(os.path.join(dataconfigs.output_dir, 'vocabulary.csv'), 'w') as fout:
    for v in vocabs:
        fout.write(f'{v}\n')

In [17]:
with open(os.path.join(dataconfigs.output_dir, 'selfies.csv'), 'w') as fout:
    for selfies in selfies_list:
        fout.write(f'{selfies}\n')