In [1]:
import sys
sys.path.append("..")

In [2]:
import pickle
from pathlib import Path

In [3]:
from molbart.tokeniser import MolEncTokeniser

In [7]:
VOCAB_PATH = "../bart_vocab.txt"
CHEM_TOKEN_START_IDX = 272
REGEX = "\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"

In [8]:
tokeniser = MolEncTokeniser.from_vocab_file(VOCAB_PATH, REGEX, CHEM_TOKEN_START_IDX)

In [9]:
example_smiles = [
    "O=C(NCc1ccc(OC(F)(F)F)cc1)C1CC(c2c(Cl)cccc2Cl)=NO1",
    "Cc1onc(-c2ccc(Cl)o2)c1-c1ccccc1F",
    "Cc1ccc(C(=O)Nc2cc(-c3c(-c4ccc(F)c(C)c4)nc4cccnn34)ccn2)cc1",
    "COc1ccc(S(=O)(=O)Nc2ccc(-c3cncc4ccccc34)c(OC)c2)cc1",
    "CC(=O)N1C[C@H](OCCC(c2ccccc2)c2ccccc2)C[C@H]1C(=O)N[C@@H](CCCN=C(N)N)C(=O)CCl"
]

In [10]:
for smi, tokens in zip(example_smiles, tokeniser.tokenise(example_smiles)["original_tokens"]):
    print(smi)
    print(tokens)
    print()

O=C(NCc1ccc(OC(F)(F)F)cc1)C1CC(c2c(Cl)cccc2Cl)=NO1
['^', 'O', '=', 'C', '(', 'N', 'C', 'c', '1', 'c', 'c', 'c', '(', 'O', 'C', '(', 'F', ')', '(', 'F', ')', 'F', ')', 'c', 'c', '1', ')', 'C', '1', 'C', 'C', '(', 'c', '2', 'c', '(', 'Cl', ')', 'c', 'c', 'c', 'c', '2', 'Cl', ')', '=', 'N', 'O', '1', '&']

Cc1onc(-c2ccc(Cl)o2)c1-c1ccccc1F
['^', 'C', 'c', '1', 'o', 'n', 'c', '(', '-', 'c', '2', 'c', 'c', 'c', '(', 'Cl', ')', 'o', '2', ')', 'c', '1', '-', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'F', '&']

Cc1ccc(C(=O)Nc2cc(-c3c(-c4ccc(F)c(C)c4)nc4cccnn34)ccn2)cc1
['^', 'C', 'c', '1', 'c', 'c', 'c', '(', 'C', '(', '=', 'O', ')', 'N', 'c', '2', 'c', 'c', '(', '-', 'c', '3', 'c', '(', '-', 'c', '4', 'c', 'c', 'c', '(', 'F', ')', 'c', '(', 'C', ')', 'c', '4', ')', 'n', 'c', '4', 'c', 'c', 'c', 'n', 'n', '3', '4', ')', 'c', 'c', 'n', '2', ')', 'c', 'c', '1', '&']

COc1ccc(S(=O)(=O)Nc2ccc(-c3cncc4ccccc34)c(OC)c2)cc1
['^', 'C', 'O', 'c', '1', 'c', 'c', 'c', '(', 'S', '(', '=', 'O', ')', '(', '=', 

In [11]:
for token, idx in tokeniser.vocab.items():
    print(f"{token:<10}{idx}")

<PAD>     0
?         1
^         2
&         3
<MASK>    4
<SEP>     5
LogD_change_(-0.1, 0.1]6
LogD_change_(0.1, 0.3]7
LogD_change_(0.3, 0.5]8
LogD_change_(0.5, 0.7]9
LogD_change_(0.7, 0.9]10
LogD_change_(0.9, 1.1]11
LogD_change_(1.1, 1.3]12
LogD_change_(1.3, 1.5]13
LogD_change_(1.5, 1.7]14
LogD_change_(1.7, 1.9]15
LogD_change_(1.9, 2.1]16
LogD_change_(2.1, 2.3]17
LogD_change_(2.3, 2.5]18
LogD_change_(2.5, 2.7]19
LogD_change_(2.7, 2.9]20
LogD_change_(2.9, 3.1]21
LogD_change_(3.1, 3.3]22
LogD_change_(3.3, 3.5]23
LogD_change_(3.5, 3.7]24
LogD_change_(3.7, 3.9]25
LogD_change_(3.9, 4.1]26
LogD_change_(4.1, 4.3]27
LogD_change_(4.3, 4.5]28
LogD_change_(4.5, 4.7]29
LogD_change_(4.7, 4.9]30
LogD_change_(4.9, 5.1]31
LogD_change_(5.1, 5.3]32
LogD_change_(5.3, 5.5]33
LogD_change_(5.5, 5.7]34
LogD_change_(5.7, 5.9]35
LogD_change_(5.9, inf]36
LogD_change_(-0.3, -0.1]37
LogD_change_(-0.5, -0.3]38
LogD_change_(-0.7, -0.5]39
LogD_change_(-0.9, -0.7]40
LogD_change_(-1.1, -0.9]41
LogD_change_(-1.3, -1

In [12]:
example_mol_opts = [
    "LogD_change_(-0.5, -0.3]Solubility_low->highClint_no_changeO=C(NC1CCC(=O)N(C(CSc2ccccc2)Cc2ccccc2)CC1)OCc1ccccc1",
    "LogD_change_(-0.5, -0.3]Solubility_no_changeClint_low->highCCCNC(=O)CCn1cc(-c2ccc(OC)cc2)nc1CCN",
    "LogD_change_(0.5, 0.7]Solubility_no_changeClint_no_changeCc1cc(Nc2ccc(F)cc2)n2ncnc2n1",
    "LogD_change_(-0.1, 0.1]Solubility_no_changeClint_no_changeCOc1ccc(C(=O)N2CCC(C(=O)Nc3ccc4c(c3)OCO4)CC2)cc1",
    "LogD_change_(0.3, 0.5]Solubility_no_changeClint_no_changeCOCCOCc1cccc2nc(NC(=O)c3sc4c(C(C)=O)ccc(COC)c4c3C)ccc12"
]

In [13]:
for smi, tokens in zip(example_mol_opts, tokeniser.tokenise(example_mol_opts)["original_tokens"]):
    print(smi)
    print(tokens)
    print()

LogD_change_(-0.5, -0.3]Solubility_low->highClint_no_changeO=C(NC1CCC(=O)N(C(CSc2ccccc2)Cc2ccccc2)CC1)OCc1ccccc1
['^', 'LogD_change_(-0.5, -0.3]', 'Solubility_low->high', 'Clint_no_change', 'O', '=', 'C', '(', 'N', 'C', '1', 'C', 'C', 'C', '(', '=', 'O', ')', 'N', '(', 'C', '(', 'C', 'S', 'c', '2', 'c', 'c', 'c', 'c', 'c', '2', ')', 'C', 'c', '2', 'c', 'c', 'c', 'c', 'c', '2', ')', 'C', 'C', '1', ')', 'O', 'C', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', '&']

LogD_change_(-0.5, -0.3]Solubility_no_changeClint_low->highCCCNC(=O)CCn1cc(-c2ccc(OC)cc2)nc1CCN
['^', 'LogD_change_(-0.5, -0.3]', 'Solubility_no_change', 'Clint_low->high', 'C', 'C', 'C', 'N', 'C', '(', '=', 'O', ')', 'C', 'C', 'n', '1', 'c', 'c', '(', '-', 'c', '2', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', 'c', '2', ')', 'n', 'c', '1', 'C', 'C', 'N', '&']

LogD_change_(0.5, 0.7]Solubility_no_changeClint_no_changeCc1cc(Nc2ccc(F)cc2)n2ncnc2n1
['^', 'LogD_change_(0.5, 0.7]', 'Solubility_no_change', 'Clint_no_change', 'C', 'c', '1', 'c',

In [14]:
smi = example_smiles[0] + "." + example_smiles[1]
print(tokeniser.prog.findall(smi))

['O', '=', 'C', '(', 'N', 'C', 'c', '1', 'c', 'c', 'c', '(', 'O', 'C', '(', 'F', ')', '(', 'F', ')', 'F', ')', 'c', 'c', '1', ')', 'C', '1', 'C', 'C', '(', 'c', '2', 'c', '(', 'Cl', ')', 'c', 'c', 'c', 'c', '2', 'Cl', ')', '=', 'N', 'O', '1', '.', 'C', 'c', '1', 'o', 'n', 'c', '(', '-', 'c', '2', 'c', 'c', 'c', '(', 'Cl', ')', 'o', '2', ')', 'c', '1', '-', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'F']
