In [1]:
import os
import sys
import pickle
import tqdm
import pandas as pd
import selfies as sf

In [2]:
REBADD_LIB_PATH = os.path.abspath(os.pardir)
if REBADD_LIB_PATH not in sys.path:
    sys.path = [REBADD_LIB_PATH] + sys.path

from rebadd.datautils import get_fragment_from_selfies

In [5]:
class DATACONFIGS:
    def __init__(self):
        ## input
        self.input_dir = os.path.join(os.pardir, 'data', 'zinc15')
        self.train_data_path = os.path.join(self.input_dir, 'zinc15_train.txt')
        ## output
        self.output_dir = 'outputs_0_preprocess_data'
        assert os.path.exists(self.output_dir)       

dataconfigs = DATACONFIGS()

In [6]:
with open(dataconfigs.train_data_path) as fin:
    lines = fin.readlines()
    
data = [line.rstrip() for line in lines]

In [7]:
print(f'Number of training data (raw): {len(data)}')
data[:5]

Number of training data (raw): 6291


['COc1cc(C(=O)NC(NC(=S)Nc2ccc(C)cc2)C(Cl)(Cl)Cl)cc(OC)c1OC',
 'CCCCN1C(=O)C(=c2sc3n(c2=O)C(c2cccs2)C(C(=O)OCC(C)C)=C(C)N=3)c2ccccc21',
 'Cc1ccccc1-c1cccc(CNc2cc(C(=O)NC3CC3)cc(S(=O)(=O)N3CCCC(C(N)=O)C3)c2)c1',
 'Cc1cc(C)c(S(=O)(=O)N(CC(=O)NC2CCCCC2C)c2c(C)n(C)n(-c3ccccc3)c2=O)c(C)c1',
 'COc1cc2c(c(OC)c1OC)-c1ccc(NC(C)C(=O)Nc3ccc4c(c3)ncn4C)c(=O)cc1C(NC(C)=O)CC2']

In [8]:
def make_selfies_data(smiles_iter):
    selfies_list = []
    
    for smi in tqdm.tqdm(smiles_iter):
        try:
            sel = sf.encoder(smi)
            selfies_list.append(sel)
        except AssertionError:
            pass
        except sf.EncoderError:
            pass
    
    return selfies_list

In [9]:
selfies_list = make_selfies_data(data)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6291/6291 [00:01<00:00, 4230.93it/s]


In [10]:
print(f'Number of training data (selfies): {len(selfies_list)}')
selfies_list[:5]

Number of training data (selfies): 6291


['[C][O][C][=C][C][Branch2][Ring2][Ring1][C][=Branch1][C][=O][N][C][Branch2][Ring1][C][N][C][=Branch1][C][=S][N][C][=C][C][=C][Branch1][C][C][C][=C][Ring1][#Branch1][C][Branch1][C][Cl][Branch1][C][Cl][Cl][=C][C][Branch1][Ring1][O][C][=C][Ring2][Ring1][O][O][C]',
 '[C][C][C][C][N][C][=Branch1][C][=O][C][=Branch2][Ring2][O][=C][S][C][N][Branch1][Branch1][C][Ring1][Branch1][=O][C][Branch1][Branch2][C][=C][C][=C][S][Ring1][Branch1][C][Branch1][N][C][=Branch1][C][=O][O][C][C][Branch1][C][C][C][=C][Branch1][C][C][N][=Ring2][Ring1][Branch1][C][=C][C][=C][C][=C][Ring1][=Branch1][Ring2][Ring1][P]',
 '[C][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=C][C][=C][C][Branch2][Ring2][P][C][N][C][=C][C][Branch1][O][C][=Branch1][C][=O][N][C][C][C][Ring1][Ring1][=C][C][Branch2][Ring1][#Branch1][S][=Branch1][C][=O][=Branch1][C][=O][N][C][C][C][C][Branch1][=Branch1][C][Branch1][C][N][=O][C][Ring1][=Branch2][=C][Ring2][Ring1][Branch2][=C][Ring2][Ring1][S]',
 '[C][C][=C][C][Branch1][C][C][=C][Branch2][Branch1][

In [11]:
with open(os.path.join(dataconfigs.output_dir, 'selfies.csv'), 'w') as fout:
    for selfies in selfies_list:
        fout.write(f'{selfies}\n')

In [12]:
fragments_list = get_fragment_from_selfies(selfies_list)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6291/6291 [00:00<00:00, 67269.26it/s]


In [13]:
print(f'maxlen of fragments: {max([len(fragments) for fragments in fragments_list])}')
fragments_list[:2]

maxlen of fragments: 36


[['[C][O][C][=C][C]',
  '[Branch2][Ring2][Ring1][C][=Branch1][C][=O][N][C][Branch2][Ring1][C][N][C][=Branch1][C][=S][N][C][=C][C][=C][Branch1][C][C][C][=C][Ring1][#Branch1][C][Branch1][C][Cl][Branch1][C][Cl][Cl]',
  '[=C][C]',
  '[Branch1][Ring1][O][C]',
  '[=C][Ring2][Ring1][O]',
  '[O][C]'],
 ['[C][C][C][C][N][C]',
  '[=Branch1][C][=O]',
  '[C]',
  '[=Branch2][Ring2][O][=C][S][C][N][Branch1][Branch1][C][Ring1][Branch1][=O][C][Branch1][Branch2][C][=C][C][=C][S][Ring1][Branch1][C][Branch1][N][C][=Branch1][C][=O][O][C][C][Branch1][C][C][C][=C][Branch1][C][C][N][=Ring2][Ring1][Branch1]',
  '[C][=C][C][=C][C][=C][Ring1][=Branch1]',
  '[Ring2][Ring1][P]']]

In [14]:
with open(os.path.join(dataconfigs.output_dir, 'fragments_list.pkl'), 'wb') as fout:
    pickle.dump(fragments_list, fout)

In [15]:
vocabs = set()
for fragments in tqdm.tqdm(fragments_list):
    vocabs = vocabs.union(set(fragments))

vocabs = sorted(vocabs)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6291/6291 [00:00<00:00, 28268.09it/s]


In [16]:
print(f'Number of vocabulary(unique fragments): {len(vocabs)}')
vocabs

Number of vocabulary(unique fragments): 7493


['[=Branch1][#Branch1][=C][C][=C][Ring1][=Branch1][C]',
 '[=Branch1][#Branch1][=C][C][=C][Ring1][=Branch1][O]',
 '[=Branch1][#Branch1][=C][N][Branch1][C][C][C]',
 '[=Branch1][#Branch1][=C][Ring1][P][Ring2][Ring1][#Branch1]',
 '[=Branch1][#Branch2][=C][C][=C][C][=C][C][=C][Ring1][=Branch1]',
 '[=Branch1][#C][=C][C][=C][C][Branch1][C][Br][=C][C][=C][Ring1][#Branch1][O][C]',
 '[=Branch1][#C][=C][C][=C][NH1][C][=C][C][=C][C][=C][Ring1][=Branch2][Ring1][=Branch1]',
 '[=Branch1][#C][=N][C][=C][C][=C][Branch1][C][Br][C][=C][Ring1][#Branch1][Ring1][#Branch2]',
 '[=Branch1][=Branch1][=C][C][Ring1][#Branch1][=O]',
 '[=Branch1][=Branch1][=C][Ring1][#Branch2][Ring1][=Branch1]',
 '[=Branch1][=Branch1][=C][Ring1][=Branch1][O][C]',
 '[=Branch1][=Branch1][=C][Ring1][N][Ring1][=Branch1]',
 '[=Branch1][=Branch1][=C][Ring1][S][Ring1][=Branch2]',
 '[=Branch1][=Branch1][=C][Ring2][Ring1][#Branch1][O]',
 '[=Branch1][=Branch1][=C][Ring2][Ring1][Branch2][O]',
 '[=Branch1][=Branch1][=C][Ring2][Ring2][Branch1][

In [17]:
with open(os.path.join(dataconfigs.output_dir, 'vocabulary.csv'), 'w') as fout:
    for v in vocabs:
        fout.write(f'{v}\n')