In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pickle

In [3]:
import tqdm
import pandas as pd
import selfies as sf
from rdkit import Chem

In [4]:
REBADD_LIB_PATH = os.path.abspath(os.pardir)
if REBADD_LIB_PATH not in sys.path:
    sys.path = [REBADD_LIB_PATH] + sys.path
    
from rebadd.datautils import get_fragment_from_selfies
from rebadd.SMILES_branch_based_standardization import branch_based_standardization

In [5]:
class DATACONFIGS:
    def __init__(self):
        ## input
        self.input_dir = os.path.join(os.pardir, 'data', 'chembl')
        self.train_data_path = os.path.join(self.input_dir, 'chembl_train_full.csv')
        ## output
        self.output_dir = os.path.join('processed_data', 'gsk3_jnk3_qed_sa')
        assert os.path.exists(self.output_dir)

dataconfigs = DATACONFIGS()

In [6]:
df_train = pd.read_csv(dataconfigs.train_data_path)

df_train

Unnamed: 0,smiles,length,gsk3,jnk3,sa,qed,num_atoms,num_rings
0,c1cc(OCCCN2CCCCC2)ccc1CN1CCC2(CC1)OCCO2,39,0.01,0.00,2.692034,0.684442,27.0,4.0
1,CC1COC(c2cccn2Cc2ccccc2Cl)=N1,29,0.06,0.00,3.047999,0.842972,19.0,3.0
2,Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc...,60,0.24,0.19,3.263126,0.439582,34.0,5.0
3,Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O,46,0.09,0.12,2.117217,0.552710,27.0,4.0
4,Cn1c(=O)c2c(SCC(=O)N3CCOCC3)nc(-c3ccccc3F)nc2n...,53,0.00,0.01,2.530062,0.432354,31.0,4.0
...,...,...,...,...,...,...,...,...
1469288,CNCCOc1ccc2cc3ccc(OCCNC)cc3nc2c1,32,0.01,0.01,2.260887,0.492394,24.0,3.0
1469289,Cc1nnc2n1-c1c(F)cc(-c3cncc(C(F)(F)F)c3)cc1CC2,45,0.12,0.00,2.810497,0.626775,25.0,4.0
1469290,Cc1c(Cc2cccnc2)c(=O)oc2cc(OC(=O)N(C)C)c(Cl)cc12,47,0.04,0.00,2.505654,0.653669,26.0,3.0
1469291,COOC1(OOCCCCCC(=O)O)CCCCCCCCCCC1,32,0.00,0.01,3.349466,0.250995,25.0,1.0


In [7]:
df_trainable = df_train[ (df_train['gsk3'] > 0.5) | (df_train['jnk3'] > 0.5) | (df_train['qed'] > 0.6) | (df_train['sa'] < 4.001)]
df_trainable = df_trainable[ (df_trainable['gsk3'] > 0.01) | (df_trainable['jnk3'] > 0.01) ]
df_trainable = df_trainable.reset_index(drop=True)

df_trainable

Unnamed: 0,smiles,length,gsk3,jnk3,sa,qed,num_atoms,num_rings
0,CC1COC(c2cccn2Cc2ccccc2Cl)=N1,29,0.06,0.00,3.047999,0.842972,19.0,3.0
1,Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc...,60,0.24,0.19,3.263126,0.439582,34.0,5.0
2,Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O,46,0.09,0.12,2.117217,0.552710,27.0,4.0
3,O=C(OCC(NCC1NCC(O)C1O)c1ccccc1)c1ccccc1F,40,0.01,0.02,3.566188,0.540996,27.0,3.0
4,CC(C)S(=O)(=O)NC1CN(C)CC1c1ccc(-c2cccc(NS(C)(=...,57,0.11,0.07,3.227577,0.674220,30.0,3.0
...,...,...,...,...,...,...,...,...
968977,COc1c(-c2ccc(O)c(O)c2)oc2c(O)c(O)cc(O)c2c1=O,44,0.15,0.00,2.670610,0.354333,24.0,3.0
968978,CC1(O)CCC(c2cccnc2Oc2ccc(Nc3nc4ccccc4s3)cc2)CC1,47,0.09,0.05,2.542342,0.370863,31.0,5.0
968979,Cc1nnc2n1-c1c(F)cc(-c3cncc(C(F)(F)F)c3)cc1CC2,45,0.12,0.00,2.810497,0.626775,25.0,4.0
968980,Cc1c(Cc2cccnc2)c(=O)oc2cc(OC(=O)N(C)C)c(Cl)cc12,47,0.04,0.00,2.505654,0.653669,26.0,3.0


In [8]:
data = df_trainable.loc[:,'smiles'].values.tolist()

with open(os.path.join(dataconfigs.output_dir, 'trainable_smiles.txt'), 'w') as fout:
    for smi in data:
        fout.write(f'{smi}\n')

In [9]:
print(f'Number of training data (raw): {len(data)}')
data[:5]

Number of training data (raw): 968982


['CC1COC(c2cccn2Cc2ccccc2Cl)=N1',
 'Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc3n2C(C)(C)C)n1',
 'Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O',
 'O=C(OCC(NCC1NCC(O)C1O)c1ccccc1)c1ccccc1F',
 'CC(C)S(=O)(=O)NC1CN(C)CC1c1ccc(-c2cccc(NS(C)(=O)=O)c2)cc1']

In [10]:
def make_selfies_data(smiles_iter):
    selfies_list = []
    fragments_list = []
    
    for smi in tqdm.tqdm(smiles_iter):
        
        try:
            mol = Chem.MolFromSmiles(smi)
            Chem.Kekulize(mol)
        except:
            continue

        try:
            smi = Chem.MolToSmiles(mol, canonical=True, doRandom=False, isomericSmiles=False, kekuleSmiles=True)
            std = branch_based_standardization(smi)
            sel = sf.encoder(std)
            frags = get_fragment_from_selfies([sel], use_tqdm=False)[0]
            if max([sf.len_selfies(x) for x in frags]) < 16:
                selfies_list.append(sel)
                fragments_list.append(frags)
        except AssertionError:
            pass
        except sf.EncoderError:
            pass
    
    return selfies_list, fragments_list

In [11]:
selfies_list, fragments_list = make_selfies_data(data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 968982/968982 [08:11<00:00, 1972.76it/s]


In [12]:
print(f'Number of training data (selfies): {len(selfies_list)}')
selfies_list[:5]

Number of training data (selfies): 781797


['[C][C][C][O][C][=Branch1][Ring2][=N][Ring1][Branch1][C][=C][C][=C][N][Ring1][Branch1][C][C][=C][C][=C][C][=C][Ring1][=Branch1][Cl]',
 '[C][C][=N][N][Branch1][Branch1][C][=N][Ring1][Branch1][C][=C][C][=C][Branch1][Ring1][C][#N][N][=C][Ring1][Branch2][C][=N][C][=C][C][Branch1][N][C][=C][N][=C][Branch1][C][N][N][=C][Ring1][#Branch1][=C][C][=C][Ring1][=N][N][Ring1][S][C][Branch1][C][C][Branch1][C][C][C]',
 '[O][=C][Branch1][#Branch2][C][=C][C][=C][C][=C][Ring1][=Branch1][F][O][C][C][Branch1][=Branch2][C][=C][C][=C][C][=C][Ring1][=Branch1][N][C][C][N][C][C][Branch1][C][O][C][Ring1][=Branch1][O]',
 '[C][C][Branch1][C][C][S][=Branch1][C][=O][=Branch1][C][=O][N][C][C][N][Branch1][C][C][C][C][Ring1][=Branch1][C][=C][C][=C][Branch1][Branch1][C][=C][Ring1][=Branch1][C][=C][C][=C][C][=Branch1][Ring2][=C][Ring1][=Branch1][N][S][Branch1][C][C][=Branch1][C][=O][=O]',
 '[C][C][Branch1][=Branch2][C][=C][C][=C][C][=C][Ring1][=Branch1][C][N][C][C][C][Branch1][Branch1][C][C][Ring1][=Branch1][N][=C][Bran

In [13]:
print(f'maxlen of fragments: {max([len(fragments) for fragments in fragments_list])}')
fragments_list[:2]

maxlen of fragments: 38


[['[C][C][C][O][C]',
  '[=Branch1][Ring2][=N][Ring1][Branch1]',
  '[C][=C][C][=C][N][Ring1][Branch1]',
  '[C][C][=C][C][=C][C][=C][Ring1][=Branch1]',
  '[Cl]'],
 ['[C][C][=N][N]',
  '[Branch1][Branch1][C][=N][Ring1][Branch1]',
  '[C][=C][C][=C]',
  '[Branch1][Ring1][C][#N]',
  '[N][=C][Ring1][Branch2]',
  '[C][=N][C][=C][C]',
  '[Branch1][N][C][=C][N][=C][Branch1][C][N][N][=C][Ring1][#Branch1]',
  '[=C][C][=C][Ring1][=N]',
  '[N][Ring1][S]',
  '[C]',
  '[Branch1][C][C]',
  '[Branch1][C][C]',
  '[C]']]

In [14]:
with open(os.path.join(dataconfigs.output_dir, 'fragments_list.pkl'), 'wb') as fout:
    pickle.dump(fragments_list, fout)

In [15]:
vocabs = set()
for fragments in tqdm.tqdm(fragments_list):
    vocabs = vocabs.union(set(fragments))

vocabs = sorted(vocabs)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 781797/781797 [06:56<00:00, 1875.54it/s]


In [16]:
print(f'Number of vocabulary(unique fragments): {len(vocabs)}')
vocabs

Number of vocabulary(unique fragments): 53684


['[#Branch1][#Branch1][#C][C][C][C][Ring1][Ring1]',
 '[#Branch1][#Branch2][#C][C][=C][C][=C][C][=C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=C][C][=C][C][=N][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=C][C][=C][N][=C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=C][C][=N][C][=C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=C][N][=C][N][=C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][=N][C][=C][C][=N][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][C][C][C][C][C][Ring1][=Branch1]',
 '[#Branch1][#Branch2][#C][C][N][C][C][C][C][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][C][=C][S][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][N][C][=C][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][N][C][=N][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][N][N][=C][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][S][C][=C][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=C][S][C][=N][Ring1][Branch1]',
 '[#Branch1][=Branch2][#C][C][=N][C][=C][S][Rin

In [17]:
with open(os.path.join(dataconfigs.output_dir, 'vocabulary.csv'), 'w') as fout:
    for v in vocabs:
        fout.write(f'{v}\n')

In [18]:
with open(os.path.join(dataconfigs.output_dir, 'selfies.csv'), 'w') as fout:
    for selfies in selfies_list:
        fout.write(f'{selfies}\n')