In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from tokenizers import Tokenizer
import sys

import matplotlib.pyplot as plt
import numpy as np
import collections

In [None]:
sys.path.append("../../")

from ChEmbed.data import chembldb, datasets, chembed_tokenizer

In [None]:
def load_encoded_chembl(
    encode_slice: slice = slice(0, 50000),
    tokenizer_path: str = "../data/tokenizers/tokenizer-chembldb-16-06-2025.json",
):
    """Encode a slice of the preprocessed ChEMBL SMILES with a saved tokenizer.

    Args:
        encode_slice: Slice applied to the output of
            ``ChemblDBChemreps._preprocess`` before it is encoded.
            Defaults to ``slice(0, 50000)``.
        tokenizer_path: Path of the serialized tokenizer JSON file passed to
            ``Tokenizer.from_file``. Defaults to the tokenizer file this
            notebook was written against.

    Returns:
        Whatever ``Tokenizer.encode`` returns for the selected slice.
    """
    tokenizer = Tokenizer.from_file(tokenizer_path)

    # One instance is enough for both the load and the preprocessing step.
    chemreps = chembldb.ChemblDBChemreps()
    chembl = chemreps._load_or_download()
    chembl_smiles = chemreps._preprocess(chembl)

    # NOTE(review): Tokenizer.encode takes a single sequence, so this presumably
    # relies on _preprocess returning one string-like object rather than a list
    # of SMILES strings — confirm against ChemblDBChemreps.
    return tokenizer.encode(chembl_smiles[encode_slice])

In [None]:
# Encode the default slice of ChEMBL and count how often each token occurs.
chembl_encoded = load_encoded_chembl()
counter = collections.Counter(chembl_encoded.tokens)
# most_common() with no argument returns every (token, count) pair,
# ordered from the most to the least frequent token.
token_freqs = counter.most_common()

In [None]:
# Show only both ends of the ranking; printing every (token, count) pair
# floods the cell output and buries the narrative.
N_PREVIEW = 20
print(f"{len(token_freqs)} distinct tokens")
print("most common: ", token_freqs[:N_PREVIEW])
print("least common:", token_freqs[-N_PREVIEW:])

In [None]:
# Keep only the counts, preserving the most-to-least-frequent ordering.
freqs = [count for _token, count in token_freqs]

## Do the tokens of our learned tokenizer follow a Zipfian distribution?

We can qualitatively examine if the tokens of our learned tokenizer follow a Zipfian distribution by plotting the frequency of each token against its rank in the sorted list of tokens. This is a common characteristic of natural language, where a small number of tokens are very common, while the majority are rare, but it's not exactly a given that this is true for a tokenizer derived entirely from chemical SMILES strings.

## Notes on common tokens

Surprisingly, the tokens of our learned tokenizer do follow Zipf's law, at least roughly. Ignoring punctuation for the time being, the most common token is the single character "C", which represents carbon, and the second most common token is "O", which represents oxygen. These are followed by other common elements in organic chemistry, such as "N" for nitrogen and "H" for hydrogen. We can also see interesting patterns such as "c1ccccc1", which is the SMILES representation of benzene.

For the least common tokens, we see the occasional occurrence of rarer elements such as "P" for phosphorus, "S" for sulfur, and "F" for fluorine. While these do occur in pharmacologically relevant compounds, they are less common than the more ubiquitous elements like carbon, oxygen, and nitrogen.

In [None]:
# Plot frequency against 1-based rank: with the default 0-based index the most
# frequent token would sit at x = 0, which has no position on a log-scaled axis.
ranks = np.arange(1, len(freqs) + 1)

fig, ax = plt.subplots(ncols=1, nrows=1)
ax.plot(ranks, freqs)
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel("Token rank")
ax.set_ylabel("Token frequency")
ax.set_title("Rank-frequency distribution of tokenizer tokens");

In [None]:
# Gather the two inputs first, then build the dataset from them.
canonical_smiles = chembldb.ChemblDBChemreps()._load_or_download()["canonical_smiles"].tolist()
smiles_tokenizer = Tokenizer.from_file("../data/tokenizers/tokenizer-chembldb-16-06-2025.json")

smilesdataset = datasets.SMILESDataset(
    smiles_list=canonical_smiles,
    tokenizer=smiles_tokenizer,
)

In [None]:
# Display the first item of the dataset as a quick sanity check.
smilesdataset[0]