In [1]:
import pandas as pd
import pathlib
import attr
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from io import StringIO

In [None]:
@attr.define
class ChemblDB:

    chemreps_filepath: pathlib.Path = pathlib.Path("../raw-data/chembldb/chembl_35_chemreps.txt.gz")

    def _load_or_download(self):
        """TBD: Download file from source instead of needing to pre-download"""
        if not pathlib.Path.exists(self.chemreps_filepath):
            raise FileNotFoundError(f"${self.chemreps_filepath} was not found. Please download from https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
        else:
            chembldb_chemreps_raw_pd = pd.read_table(
                pathlib.Path(self.chemreps_filepath), 
                compression="gzip"
            )
            return chembldb_chemreps_raw_pd

    def _preprocess(self, chemrepsdb, column):
        db_column = chemrepsdb[column].to_list()
        text = '[EOM]'.join([col for col in db_column])
        return text

    def _tokenize(self, filepath, vocab_size=1024):
        tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
        trainer = BpeTrainer(special_tokens=["[UNK]", "[EOM]"], vocab_size=vocab_size)

        # Do we need more special tokens?
        # special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

        # Include pre-tokenizer for punctuation?

        tokenizer.train(
            [filepath],
            trainer,
        )

        return tokenizer


In [60]:
chembl = ChemblDB()


In [61]:
db = chembl._load_or_download()

In [62]:
text = chembl._preprocess(db, "canonical_smiles")

In [63]:
print(text[:5000])
text = text[:50000]

Cc1cc(-c2csc(N=C(N)N)n2)cn1C[EOM]CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@@H](N)CCSC)[C@@H](C)O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](CC(N)=O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CCCN=C(N)N)C(N)=O[EOM]CCCC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCN=C(N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@H](N)Cc2ccccc2)C(C)C)CCC(=O)NCCCC[C@@H](C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCC)

In [64]:
with open("raw_smiles.txt", 'w') as f:
    f.write(text)

In [65]:
_tokenizer = chembl._tokenize("raw_smiles.txt")






In [None]:

_tokenizer.save("tokenizer-chembldb.json")

In [66]:
# Inspect the vocab directly
_tokenizer.get_vocab()  # Get the vocabulary dictionary

{'[C@@H](O)': 90,
 '[C': 44,
 '[C@@H](': 56,
 '1)': 93,
 'P': 31,
 'C(N)=O)': 86,
 '[C@H](': 55,
 'S': 32,
 'c': 37,
 '2)': 87,
 'N)': 59,
 'cccc': 71,
 'C(C)': 68,
 '[C@': 45,
 '6': 16,
 'H](': 50,
 'NC(=O)[C@H](C': 74,
 '-': 7,
 '1': 11,
 '[C@H]1': 92,
 'r': 41,
 '[C@H]': 69,
 'ccc(': 78,
 '@': 21,
 'O)': 43,
 'c1ccc(': 95,
 'c2': 61,
 '4': 14,
 'C(=O)N[C@@H](C': 81,
 '9': 19,
 'H': 26,
 'C(=O)N': 58,
 '#': 2,
 '0': 10,
 '[EOM]': 1,
 '=': 20,
 'H]': 46,
 '%': 3,
 '5': 15,
 ']': 35,
 '(C)': 76,
 '[C@@H](C)': 89,
 '[C@@H]': 70,
 'EO': 65,
 ')': 5,
 '(': 4,
 'cc': 49,
 'C(=O)N[C@@H](': 62,
 'I': 27,
 's': 42,
 '=O)': 48,
 '.': 8,
 '2': 12,
 'E': 24,
 'o': 40,
 '[': 33,
 '[UNK]': 0,
 'NC(=O)[C@@H](': 83,
 'a': 36,
 'C(N)': 73,
 'C(': 47,
 'C': 23,
 'M': 28,
 '[C@@': 53,
 'NC(=O)[C@H](': 63,
 '[EO': 67,
 'C(C)C)': 79,
 'c3': 82,
 'C(=O)N[C@H](': 94,
 'O[C@@H]': 91,
 'n': 39,
 '[EOM]C': 75,
 'C(=O)O)': 84,
 'F': 25,
 'CCCC': 77,
 'NC(=O)': 57,
 'c1': 60,
 '3': 13,
 '\\': 34,
 'C(=O)': 51,


In [10]:
test_molecule = "COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O"

In [None]:
encoded_molecule = _tokenizer.encode(test_molecule)

In [14]:
print(encoded_molecule.tokens)

['CO', 'c1c(', 'O)', 'cc(', 'O)c(', 'C(=N)', 'Cc2ccc(O)cc2)', 'c1', 'O']
