In [1]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
import tqdm
import pandas as pd
import selfies as sf

In [3]:
class DATACONFIGS:
    """Filesystem locations used by this notebook.

    All paths are relative to the current working directory.

    Attributes:
        input_dir: Directory holding the raw input files (``data/zinc15``).
        train_data_path: Path of ``zinc15_train.txt`` inside ``input_dir``.
        output_dir: Directory that receives the generated files
            (``processed_data/zinc15``). It is created on construction if it
            does not exist yet.

    Raises:
        FileExistsError: If ``output_dir`` exists but is not a directory.
    """

    def __init__(self):
        ## input
        self.input_dir = os.path.join('data', 'zinc15')
        self.train_data_path = os.path.join(self.input_dir, 'zinc15_train.txt')
        ## output
        self.output_dir = os.path.join('processed_data', 'zinc15')
        # Create the directory rather than asserting on it: an assert is
        # stripped under ``python -O`` and would make a fresh checkout fail
        # before any work is done. ``exist_ok=True`` keeps re-runs harmless.
        os.makedirs(self.output_dir, exist_ok=True)

# Shared configuration instance; later cells read their input and output paths from it.
dataconfigs = DATACONFIGS()

In [4]:
# Load the training file: `lines` keeps the raw lines (newline included),
# `data` holds the same entries with trailing whitespace removed.
with open(dataconfigs.train_data_path) as fin:
    lines = list(fin)

data = list(map(str.rstrip, lines))

In [5]:
# Report how many entries were read, then show the first five as the cell output.
print(f'Number of training data (raw): {len(data)}')
data[:5]

Number of training data (raw): 600116


['CCCCCCc1cn(C2CC(O)C(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O2)c(=O)[nH]c1=O',
 'CC(C)=CC1COC23CC4(CO2)C(CCC2C5(C)CCC(OC6OCC(O)C(OC7OC(CO)C(O)C(O)C7O)C6OC6OC(CO)C(O)C6O)C(C)(C)C5CCC24C)C3C1(C)O',
 'CC(=O)OCC1(C(=O)O)C(CC(=O)O)=C(C(=O)OC2=CCOC=C2)C(OC(C)=O)=CC1(OC(C)=O)C(=O)O',
 'CN(CCc1ccccc1)C(=O)CNC(=O)C(CCS(C)=O)NC(=O)C(N)Cc1ccc(O)cc1',
 'CC(=O)NC1C(OC2C(COC(C)=O)OC(Oc3ccc4c(C)cc(=O)oc4c3)C(NC(C)=O)C2OC(C)=O)OC(COC(C)=O)C(OC(C)=O)C1OC(C)=O']

In [6]:
def make_selfies_data(smiles_iter):
    """Translate SMILES strings to SELFIES strings, skipping those that fail.

    Args:
        smiles_iter: Iterable of SMILES strings. It is wrapped in a tqdm
            progress bar while being consumed.

    Returns:
        list: One SELFIES string per successfully encoded input, in input
        order. Inputs for which ``sf.encoder`` raises ``AssertionError`` or
        ``sf.EncoderError`` are left out, so the result can be shorter than
        the input and its positions need not line up with the input.

    Warns:
        UserWarning: Emitted once after the loop if any input was skipped;
        the message gives the skipped count and the first failing input.
    """
    import warnings  # local import: only needed for the skipped-input report

    selfies_list = []
    n_skipped = 0
    first_skipped = None

    for smi in tqdm.tqdm(smiles_iter):
        try:
            sel = sf.encoder(smi)
        except (AssertionError, sf.EncoderError):
            # Still best-effort (the molecule is dropped), but keep a tally so
            # the loss is reported instead of disappearing silently.
            if n_skipped == 0:
                first_skipped = smi
            n_skipped += 1
        else:
            selfies_list.append(sel)

    if n_skipped:
        warnings.warn(
            f'make_selfies_data skipped {n_skipped} input(s) that could not be '
            f'encoded (first: {first_skipped!r})',
            stacklevel=2,
        )

    return selfies_list

In [7]:
# Encode every raw entry; the progress bar printed below comes from tqdm inside make_selfies_data.
selfies_list = make_selfies_data(data)

100%|██████████| 600116/600116 [06:19<00:00, 1581.38it/s]


In [8]:
# Report how many SELFIES strings were produced, then show the first five as the cell output.
# Comparing this count with the raw count above shows whether any input was dropped during encoding.
print(f'Number of training data (selfies): {len(selfies_list)}')
selfies_list[:5]

Number of training data (selfies): 600116


['[C][C][C][C][C][C][C][=C][N][Branch2][Ring2][=Branch2][C][C][C][Branch1][C][O][C][Branch2][Ring1][O][C][O][P][=Branch1][C][=O][Branch1][C][O][O][P][=Branch1][C][=O][Branch1][C][O][O][P][=Branch1][C][=O][Branch1][C][O][O][O][Ring2][Ring1][Ring2][C][=Branch1][C][=O][NH1][C][Ring2][Ring1][O][=O]',
 '[C][C][Branch1][C][C][=C][C][C][O][C][C][C][Branch1][Branch1][C][O][Ring1][Branch1][C][Branch2][#Branch1][#Branch1][C][C][C][C][Branch1][C][C][C][C][C][Branch2][Branch1][Branch1][O][C][O][C][C][Branch1][C][O][C][Branch2][Ring1][Branch1][O][C][O][C][Branch1][Ring1][C][O][C][Branch1][C][O][C][Branch1][C][O][C][Ring1][#Branch2][O][C][Ring2][Ring1][Ring1][O][C][O][C][Branch1][Ring1][C][O][C][Branch1][C][O][C][Ring1][Branch2][O][C][Branch1][C][C][Branch1][C][C][C][Ring2][Ring2][#Branch1][C][C][C][Ring2][Ring2][O][Ring2][Ring2][P][C][C][Ring2][Branch1][Branch1][C][Ring2][Branch1][=Branch2][Branch1][C][C][O]',
 '[C][C][=Branch1][C][=O][O][C][C][Branch1][=Branch1][C][=Branch1][C][=O][O][C][Branch1][

In [9]:
# Persist the SELFIES strings as plain text: one string per line, no header.
with open(os.path.join(dataconfigs.output_dir, 'selfies.csv'), 'w') as fout:
    for selfies in selfies_list:
        print(selfies, file=fout)

In [10]:
# Split each SELFIES string into its list of symbols, keeping the order of selfies_list.
fragments_list = [list(symbols) for symbols in map(sf.split_selfies, selfies_list)]

In [11]:
# Longest symbol sequence over all molecules, followed by a preview of the first two sequences.
print(f'maxlen of fragments: {max(map(len, fragments_list))}')
fragments_list[:2]

maxlen of fragments: 154


[['[C]',
  '[C]',
  '[C]',
  '[C]',
  '[C]',
  '[C]',
  '[C]',
  '[=C]',
  '[N]',
  '[Branch2]',
  '[Ring2]',
  '[=Branch2]',
  '[C]',
  '[C]',
  '[C]',
  '[Branch1]',
  '[C]',
  '[O]',
  '[C]',
  '[Branch2]',
  '[Ring1]',
  '[O]',
  '[C]',
  '[O]',
  '[P]',
  '[=Branch1]',
  '[C]',
  '[=O]',
  '[Branch1]',
  '[C]',
  '[O]',
  '[O]',
  '[P]',
  '[=Branch1]',
  '[C]',
  '[=O]',
  '[Branch1]',
  '[C]',
  '[O]',
  '[O]',
  '[P]',
  '[=Branch1]',
  '[C]',
  '[=O]',
  '[Branch1]',
  '[C]',
  '[O]',
  '[O]',
  '[O]',
  '[Ring2]',
  '[Ring1]',
  '[Ring2]',
  '[C]',
  '[=Branch1]',
  '[C]',
  '[=O]',
  '[NH1]',
  '[C]',
  '[Ring2]',
  '[Ring1]',
  '[O]',
  '[=O]'],
 ['[C]',
  '[C]',
  '[Branch1]',
  '[C]',
  '[C]',
  '[=C]',
  '[C]',
  '[C]',
  '[O]',
  '[C]',
  '[C]',
  '[C]',
  '[Branch1]',
  '[Branch1]',
  '[C]',
  '[O]',
  '[Ring1]',
  '[Branch1]',
  '[C]',
  '[Branch2]',
  '[#Branch1]',
  '[#Branch1]',
  '[C]',
  '[C]',
  '[C]',
  '[C]',
  '[Branch1]',
  '[C]',
  '[C]',
  '[C]',
  '[C]',


In [12]:
# Serialize the per-molecule symbol lists with pickle's default protocol.
with open(os.path.join(dataconfigs.output_dir, 'fragments_list.pkl'), mode='wb') as fout:
    # Pickler(file).dump(obj) is the documented equivalent of pickle.dump(obj, file).
    pickle.Pickler(fout).dump(fragments_list)

In [13]:
# Collect the distinct SELFIES symbols across all strings and fix their order by sorting.
vocabs = sorted(sf.get_alphabet_from_selfies(selfies_list))

In [14]:
# Report the number of distinct symbols, then display the full sorted vocabulary as the cell output.
print(f'Number of vocabulary(unique fragments): {len(vocabs)}')
vocabs

Number of vocabulary(unique fragments): 46


['[#Branch1]',
 '[#Branch2]',
 '[#C]',
 '[#N+1]',
 '[#N]',
 '[#Ring2]',
 '[=Branch1]',
 '[=Branch2]',
 '[=C]',
 '[=N+1]',
 '[=N-1]',
 '[=N]',
 '[=O]',
 '[=PH1]',
 '[=P]',
 '[=Ring1]',
 '[=Ring2]',
 '[=S+1]',
 '[=S]',
 '[Br]',
 '[Branch1]',
 '[Branch2]',
 '[C-1]',
 '[CH0]',
 '[CH1-1]',
 '[CH1]',
 '[C]',
 '[Cl]',
 '[F]',
 '[N+1]',
 '[N-1]',
 '[NH0]',
 '[NH1+1]',
 '[NH1]',
 '[N]',
 '[O-1]',
 '[OH0]',
 '[O]',
 '[PH0]',
 '[PH1]',
 '[P]',
 '[Ring1]',
 '[Ring2]',
 '[S+1]',
 '[SH1]',
 '[S]']

In [15]:
# Persist the sorted vocabulary as plain text: one symbol per line, no header.
with open(os.path.join(dataconfigs.output_dir, 'vocabulary.csv'), 'w') as fout:
    for v in vocabs:
        print(v, file=fout)