In [None]:
import sys
import pickle
import pandas as pd
from pathlib import Path
from rdkit import Chem

In [None]:
# Add the parent directory to the import path (presumably so that the
# `molbart` import in the next cell resolves - confirm the expected working
# directory). Guarded so that re-running this cell does not pile up duplicates.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
import molbart
from molbart.tokeniser import MolEncTokeniser

In [None]:
# Notebook configuration: input locations and tokeniser settings.
PICKLE_PATH = "../../data/uspto_mixed.pickle"
VOCAB_PATH = "../bart_vocab.txt"
# Passed to MolEncTokeniser.from_vocab_file below; presumably the vocabulary
# index at which chemical tokens begin - confirm against the vocab file.
CHEM_TOKEN_START_IDX = 272
# Token pattern for SMILES strings. Written as a raw string so the regex
# escapes (\[, \(, \., ...) are not treated as (invalid) Python string escapes;
# the resulting pattern is character-for-character the same as before.
REGEX = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"

In [None]:
# Load the pickled object at PICKLE_PATH; later cells read its "reactants",
# "reactants_mol", "products" and "products_mol" columns.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle(PICKLE_PATH)

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Build the tokeniser from the vocabulary file, the SMILES token pattern and
# the chemical-token start index configured above.
tokeniser = MolEncTokeniser.from_vocab_file(VOCAB_PATH, REGEX, CHEM_TOKEN_START_IDX)

In [None]:
# Stored reactant strings, the matching mol objects, and SMILES regenerated
# from those mol objects with RDKit (kept as a plain list).
reacts = df["reactants"]
react_mols = df["reactants_mol"]
reacts_smiles = list(map(Chem.MolToSmiles, react_mols))

In [None]:
# Stored product strings, the matching mol objects, and SMILES regenerated
# from those mol objects with RDKit (kept as a plain list).
prods = df["products"]
prod_mols = df["products_mol"]
prods_smiles = list(map(Chem.MolToSmiles, prod_mols))

In [None]:
# Keep (stored, regenerated) pairs where the stored reactant string differs
# from the SMILES RDKit writes back out.
non_can_reacts = [
    (stored, regenerated)
    for stored, regenerated in zip(reacts, reacts_smiles)
    if not (stored == regenerated)
]
print(f"Non-canonical reactants: {len(non_can_reacts)}")

In [None]:
# Keep (stored, regenerated) pairs where the stored product string differs
# from the SMILES RDKit writes back out.
non_can_prods = [
    (stored, regenerated)
    for stored, regenerated in zip(prods, prods_smiles)
    if not (stored == regenerated)
]
print(f"Non-canonical products: {len(non_can_prods)}")

In [None]:
# Show each mismatching reactant: stored string, regenerated SMILES, blank line.
for react, react_smi in non_can_reacts:
    print(react, react_smi, "", sep="\n")

In [None]:
# Show each mismatching product: stored string, regenerated SMILES, blank line.
for prod, prod_smi in non_can_prods:
    print(prod, prod_smi, "", sep="\n")

In [None]:
# **************************
# *** Tokeniser Analysis ***
# **************************

In [None]:
# Use the first ten regenerated reactant SMILES as a small sample and run them
# through the tokeniser; the next cell reads the "original_tokens" entry.
mols = reacts_smiles[:10]
tokens = tokeniser.tokenise(mols)

In [None]:
# Print every sample SMILES followed by its token sequence and a blank line.
for i, smi in enumerate(mols):
    print(smi)
    print(tokens["original_tokens"][i], end="\n\n")

In [None]:
# CSV file read into `opt_df` in the next cell.
MOL_OPT_DATA_PATH = "../../data/mol_opt.csv"

In [None]:
# Read the CSV at MOL_OPT_DATA_PATH; its "Input" column is sampled below.
opt_df = pd.read_csv(MOL_OPT_DATA_PATH)

In [None]:
# Preview the first rows of the CSV data.
opt_df.head()

In [None]:
# First five entries of the "Input" column, used as sample strings below.
mol_opts = opt_df["Input"].iloc[:5]

In [None]:
import re

In [None]:
# Compile the SMILES token pattern (the same alternation as REGEX, wrapped in
# one capture group). A raw string is used so the regex escapes are not parsed
# as invalid Python string escapes; the compiled pattern is unchanged.
prog = re.compile(r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])")

In [None]:
# Split every sample SMILES with the compiled pattern and print the input,
# the token list and a blank line.
for smi in mols:
    tokens = prog.findall(smi)
    print(smi, tokens, "", sep="\n")

In [None]:
# Additional multi-character tokens, written as regex fragments because they
# are spliced into the token pattern in the next cell. Literal "(" and "]" are
# escaped; raw strings keep those backslashes without relying on Python
# tolerating invalid string escapes. The "." characters are left unescaped, so
# as regex they match any character (unchanged from before).
extra_tokens = [
    r"LogD_change_\(-0.5, -0.3\]",
    "Solubility_low->high",
    "Clint_no_change",
    "Solubility_no_change",
    "Clint_low->high",
    r"LogD_change_\(0.5, 0.7\]",
    r"LogD_change_\(-0.1, 0.1\]",
    r"LogD_change_\(0.3, 0.5\]",
]

In [None]:
# Build one alternation: extra tokens first, then the SMILES token pattern
# (same pattern as REGEX), all wrapped in a single capture group.
# The extra tokens end up in reverse list order, exactly as the earlier
# one-at-a-time prepending produced. The raw string avoids invalid Python
# string escapes without changing the pattern.
smiles_pattern = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"
regex = "(" + "|".join([*reversed(extra_tokens), smiles_pattern]) + ")"
print(regex)

In [None]:
# Compile the current `regex` string; this rebinds `prog`, replacing the
# SMILES-only pattern compiled earlier.
prog = re.compile(regex)

In [None]:
# Re-tokenise the sample SMILES with the recompiled pattern and print the
# input, the token list and a blank line.
for smi in mols:
    tokens = prog.findall(smi)
    print(smi, tokens, "", sep="\n")

In [None]:
# Tokenise the sampled "Input" strings with the recompiled pattern and print
# the input, the token list and a blank line.
for smi in mol_opts:
    tokens = prog.findall(smi)
    print(smi, tokens, "", sep="\n")

In [None]:
from pysmilesutils.tokenize import SMILESTokenizer

In [None]:
# Text file of extra tokens, read by read_extra_tokens() below.
MOL_OPT_TOKENS_PATH = "../mol_opt_tokens.txt"

In [None]:
def read_extra_tokens(paths):
    """Collect the non-empty lines of one or more token files.

    Args:
        paths: Iterable of file paths, or a single ``str``/``Path``. Every
            non-empty line of each file is taken as one token.

    Returns:
        list[str]: Tokens from all files, in file order and then line order.
            Empty lines are dropped; other lines are kept verbatim.

    Raises:
        OSError: If a file cannot be opened or read.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(paths, (str, Path)):
        paths = [paths]

    extra_tokens = []
    for path in paths:
        # Decode explicitly so the result does not depend on the platform locale.
        text = Path(path).read_text(encoding="utf-8")
        tokens = [token for token in text.split("\n") if token != ""]
        print(f"Read {len(tokens)} tokens from {path}")
        extra_tokens.extend(tokens)

    return extra_tokens

In [None]:
# SMILES token pattern (same as REGEX) handed to SMILESTokenizer below as a
# regex token. Raw string: keeps the regex escapes without invalid Python
# string escapes; the pattern text is unchanged.
regex = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"
# Replace the hand-written token list with the tokens stored on disk.
extra_tokens = read_extra_tokens([MOL_OPT_TOKENS_PATH])

In [None]:
# Show the first five tokens read from the token file.
print(extra_tokens[:5])

In [None]:
# Materialise the sampled inputs as a plain list so they can be concatenated
# with `mols`. list() iterates the values directly, so it does not depend on
# the Series having a default 0..n-1 index and is safe to re-run once
# `mol_opts` is already a list.
mol_opts = list(mol_opts)
smiles = mols + mol_opts
# Tokeniser built from the sample strings, the extra tokens read from file and
# the SMILES token pattern.
rx_tokeniser = SMILESTokenizer(smiles=smiles, tokens=extra_tokens, regex_tokens=[regex])

In [None]:
# Tokenise the sampled inputs with rx_tokeniser and print each input next to
# the result at the same position, followed by a blank line.
opt_tokens = rx_tokeniser.tokenize(mol_opts)
for mol_opt, tokens in zip(mol_opts, opt_tokens):
    print(mol_opt, tokens, "", sep="\n")

In [None]:
# Tokenise the sample SMILES with rx_tokeniser and print each input next to
# the result at the same position, followed by a blank line.
mol_tokens = rx_tokeniser.tokenize(mols)
for mol, tokens in zip(mols, mol_tokens):
    print(mol, tokens, "", sep="\n")

In [None]:
# Location of the saved tokeniser loaded in the next cell.
MOL_OPT_TOKENISER_PATH = "../../tokenisers/mol_opt_tokeniser.pickle"

In [None]:
def load_tokeniser(path):
    """Load a pickled tokeniser object from ``path``.

    NOTE(review): no `load_tokeniser` is defined or imported anywhere in this
    notebook, so the call below raised NameError on a fresh kernel. This
    pickle-based loader is inferred from the ".pickle" file name and the
    otherwise unused `pickle` import - replace it with the project's own
    helper if one exists.

    Args:
        path: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # Unpickling can execute arbitrary code; only load files you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


mol_opt_tokeniser = load_tokeniser(MOL_OPT_TOKENISER_PATH)

In [None]:
# Tokenise the sampled inputs with the loaded tokeniser, take its
# "original_tokens" entry, and print each input with the entry at the same
# position, followed by a blank line.
opt_tokens = mol_opt_tokeniser.tokenise(mol_opts)["original_tokens"]
for mol_opt, tokens in zip(mol_opts, opt_tokens):
    print(mol_opt, tokens, "", sep="\n")

In [None]:
# Tokenise the sample SMILES with the loaded tokeniser, take its
# "original_tokens" entry, and print each input with the entry at the same
# position, followed by a blank line.
mol_tokens = mol_opt_tokeniser.tokenise(mols)["original_tokens"]
for mol, tokens in zip(mols, mol_tokens):
    print(mol, tokens, "", sep="\n")