In [2]:
import pandas as pd
import pathlib
import attr
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Punctuation
from io import StringIO

import sys

In [69]:
@attr.define
class ChemblDB:
    """Helper around a ChEMBL chemical-representations dump.

    Loads the gzipped, tab-separated chemreps file into a DataFrame, joins one
    of its columns into a single ``[EOM]``-delimited string, and trains a BPE
    tokenizer from a text file.
    """

    # Location of the gzipped chemreps file. A str or a Path may be supplied;
    # the converter normalises either to a pathlib.Path.
    chemreps_filepath: pathlib.Path = attr.field(
        default=pathlib.Path("../raw-data/chembldb/chembl_35_chemreps.txt.gz"),
        converter=pathlib.Path,
    )

    def _load_or_download(self):
        """Read the chemreps file into a DataFrame.

        TBD: Download file from source instead of needing to pre-download.

        Returns:
            pandas.DataFrame: the table parsed from the gzipped file.

        Raises:
            FileNotFoundError: if ``chemreps_filepath`` does not exist.
        """
        if not self.chemreps_filepath.exists():
            raise FileNotFoundError(
                f"{self.chemreps_filepath} was not found. Please download from "
                "https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/"
            )
        return pd.read_table(self.chemreps_filepath, compression="gzip")

    def _preprocess(self, chemrepsdb, column):
        """Join every entry of ``column`` into one ``[EOM]``-separated string.

        Args:
            chemrepsdb: DataFrame returned by ``_load_or_download``.
            column: name of the column whose values are concatenated.

        Returns:
            str: the column values joined by the ``[EOM]`` marker. Missing
            values are skipped so the join cannot fail on NaN entries.
        """
        entries = chemrepsdb[column].dropna().astype(str).tolist()
        return '[EOM]'.join(entries)

    def _tokenize(self, filepath, vocab_size=1024):
        """Train a BPE tokenizer on the text file at ``filepath``.

        Args:
            filepath: path (str or path-like) of the training text file.
            vocab_size: target vocabulary size handed to the BPE trainer.

        Returns:
            tokenizers.Tokenizer: the trained tokenizer.
        """
        tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
        trainer = BpeTrainer(special_tokens=["[UNK]", "[EOM]"], vocab_size=vocab_size)

        # Do we need more special tokens?
        # special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

        # Include pre-tokenizer for punctuation
        # NOTE(review): confirm how this pre-tokenizer treats the "[EOM]"
        # separators present in the training text.
        tokenizer.pre_tokenizer = Punctuation()

        # train() expects a list of string paths, so coerce path-like input.
        tokenizer.train([str(filepath)], trainer)

        return tokenizer


In [70]:
# Instantiate the helper using its default chemreps file path.
chembl = ChemblDB()


In [71]:
# Read the gzipped chemreps table into a DataFrame
# (raises FileNotFoundError if the file has not been downloaded).
db = chembl._load_or_download()

In [None]:
# Join the "canonical_smiles" column into one "[EOM]"-separated string.
text = chembl._preprocess(db, "canonical_smiles")

In [73]:
# Preview the beginning of the joined corpus.
print(text[:5000])

# Keep only a short prefix of the corpus for the tokenizer experiment.
# A plain character slice usually stops in the middle of a molecule, so trim
# back to the last complete "[EOM]" boundary to avoid a partial SMILES at the
# end of the training text.
MAX_CHARS = 50000
if len(text) > MAX_CHARS:
    head, sep, _partial = text[:MAX_CHARS].rpartition("[EOM]")
    # Fall back to the raw slice if no separator occurs within the prefix.
    text = head if sep else text[:MAX_CHARS]

Cc1cc(-c2csc(N=C(N)N)n2)cn1C[EOM]CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@@H](N)CCSC)[C@@H](C)O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](CC(N)=O)C(=O)NCC(=O)N[C@@H](C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCN=C(N)N)C(=O)NCC(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N1CCC[C@H]1C(=O)N1CCC[C@H]1C(=O)NCC(=O)N[C@@H](CO)C(=O)N[C@@H](CCCN=C(N)N)C(N)=O[EOM]CCCC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCN=C(N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@H](N)Cc2ccccc2)C(C)C)CCC(=O)NCCCC[C@@H](C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](CCC(N)=O)C(=O)N[C@@H](C)C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)N[C@@H](CO)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCC)

In [74]:
# Write the corpus to disk; the tokenizer is trained from this file below.
# An explicit encoding keeps the file independent of the platform default.
with open("raw_smiles.txt", 'w', encoding="utf-8") as f:
    f.write(text)

In [81]:
# Train a BPE tokenizer on the saved corpus (vocab_size defaults to 1024).
tokenizer = chembl._tokenize("raw_smiles.txt")






In [82]:

# Save the trained tokenizer to tokenizer-chembldb.json.
tokenizer.save("tokenizer-chembldb.json")

In [83]:
# Look at the learned vocabulary: a mapping from token string to integer id.
vocab = tokenizer.get_vocab()
print(vocab)

{'NNC': 506, 'cc': 44, 'OCc2cc': 615, 'ccc2': 186, 'OCCc2ccccc2': 407, 'cs': 532, 'OCc2ccco2': 910, '2ccc2cc3c': 963, '1CN': 279, 'Cn2': 500, 'Oc2ccc3': 522, 'OC1CCc2ccccc21': 993, 'C2CC3CC': 739, 'c1F': 589, 'nnc': 354, 'OC6C': 626, '1C23C': 662, 'c3nsc4ccccc34': 1004, 'N': 29, 'NCc1ccccc1': 269, 'CCCCCCCCCC': 403, 'NCCCC': 172, '3C2': 325, 'c3ccccc32': 779, '2COCc2cn': 683, 'c2c1ncn2C': 440, 'OCc2ccc': 274, '2CCCCC2': 414, 'Oc2n': 524, '1Cc1nc': 958, 'ccc21': 420, 'cc4c': 829, 'c3C': 390, 'Nc3cnc': 930, 'ncc2': 262, 'OCO3': 382, 'nc3ccccc23': 542, 'c1ccccc1n2CCCCN1CCN': 1015, '6CN': 330, 'c2cccnc2': 393, 'COc1ccccc1N1CCN': 435, 'c4cc2': 697, 'cc2c1': 693, 'CNc1nc': 710, 'N1C': 197, 'OCc2ccccc2': 206, 'CCCn1': 600, 'COc1no': 650, '17c': 918, 'NCc2ccc3c': 586, 'CC': 43, 'ON': 257, 'NCCCNCCCCCCCNCCCNC': 970, 'n2n1': 820, '21c': 428, 'SC4': 767, '1Cc1ccc': 218, 'O8': 514, 'c9': 231, 'COc2ccc3': 647, '23c': 834, 'CCc1ccc': 363, 'c2cc': 115, 'c5c6': 408, 'Cc3': 495, 'Cc1c2c': 652, '3OC': 1

In [84]:
# Example molecule string used to try out the trained tokenizer.
test_molecule = "COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O"

In [85]:
# Encode the example molecule with the trained tokenizer.
encoded_molecule = tokenizer.encode(test_molecule)

In [86]:
# Show the token strings the molecule was split into.
print(encoded_molecule.tokens)

['COc1c', '(', 'O', ')', 'cc', '(', 'O', ')', 'c', '(', 'C', '(', '=', 'N', ')', 'Cc2ccc', '(', 'O', ')', 'cc2', ')', 'c1O']
