In [1]:
import sys
import numpy as np
import pickle

sys.path.append("..")
from src.features.tokenizer import Tokenizer

# Make SMILES tokenizer

In [2]:
# Reserved token ids for the special symbols; the regular SMILES symbols are
# numbered immediately after them.
PAD = 0
START = 1
END = 2
UNK = 3

# Regular SMILES vocabulary: the ten digits first, then punctuation / bond
# symbols, then lower-case and upper-case element symbols.
smiles_voc_set = [str(digit) for digit in range(10)] + [
    '(', ')', '[', ']', ':', '=', '@', '@@', '+', '/', '\\', '.',
    '-', '#', '%', 'c', 'i', 'o', 'n', 'p', 's', 'b',
    'H', 'B', 'C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I',
]

# Symbol -> id. '<unk>' is registered here too, so voc2tok and tok2voc always
# describe exactly the same vocabulary.
voc2tok = {'<pad>': PAD, '<s>': START, '</s>': END, '<unk>': UNK}
first_symbol_tok = max(voc2tok.values()) + 1  # first id after the specials
voc2tok.update(
    {voc: tok for tok, voc in enumerate(smiles_voc_set, start=first_symbol_tok)}
)

n_voc = max(voc2tok.values()) + 1

# Id -> symbol lookup table. The '<U5' dtype silently truncates longer strings,
# so fail loudly if a symbol longer than five characters is ever added.
assert max(len(voc) for voc in voc2tok) <= 5, "symbol too long for dtype '<U5'"
# np.full (rather than the raw np.ndarray constructor) gives every slot a
# defined value: an id without an entry reads as '<unk>', not stale memory.
tok2voc = np.full(n_voc, '<unk>', dtype='<U5')
for voc, tok in voc2tok.items():
    tok2voc[tok] = voc

In [4]:
# Build the SMILES tokenizer from the id -> symbol table and the special ids.
# NOTE(review): the meaning of the trailing 2 is defined by Tokenizer, which is
# not visible here (presumably the longest regular symbol length) - confirm.
smiles_tokenizer = Tokenizer(tok2voc, PAD, START, END, UNK, 2)

# Persist the tokenizer next to the notebook so other code can load it.
with open("smiles_tokenizer.pkl", mode='wb') as pkl_file:
    pkl_file.write(pickle.dumps(smiles_tokenizer))

# Make tokenizer for both SMILES and InChI (Optional)

In [7]:
# Combined vocabulary for SMILES and InChI strings. The list position of each
# symbol is its token id, so the four special symbols must stay at positions
# PAD, START, END and UNK (0-3), followed by the regular symbols.
# The unknown token is spelled '<unk>' (it was 'unk') so it follows the same
# angle-bracket convention as the other specials and as the SMILES tokenizer.
inchi_smiles_vocs = (
    ['<pad>', '<s>', '</s>', '<unk>']
    + [str(digit) for digit in range(10)]
    + [
        '(', ')', '[', ']', ':', '=', '@', '@@', '+', '/', '\\', '.', ',', '?',
        '-', '#', '%', 'c', 'i', 'o', 'n', 'p', 's', 'b', 'l', 'h', 't', 'q', 'm',
        'H', 'B', 'C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I',
    ]
)

# NOTE(review): the meaning of the trailing 2 is defined by Tokenizer, which is
# not visible here (presumably the longest regular symbol length) - confirm.
inchi_smiles_tokenizer = Tokenizer(inchi_smiles_vocs, PAD, START, END, UNK, 2)

# Persist the tokenizer next to the notebook so other code can load it.
with open("inchi_smiles_tokenizer.pkl", mode='wb') as f:
    pickle.dump(inchi_smiles_tokenizer, f)