# Testing Tokenization

In [15]:
import sys
from transformers import AutoTokenizer

In [2]:
# Hugging Face model identifier; the tokenizer loaded in the next cell is looked up by this id.
HF_MODEL_ID = "EleutherAI/llemma_7b"

In [3]:
# Load the tokenizer registered under HF_MODEL_ID; the cells below all operate on this object.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID)

In [4]:
# check special tokens

def check_special_tokens(tokenizer):
    """Print the special tokens exposed by `tokenizer`, one labelled line each.

    Reads `bos_token`, `eos_token`, `pad_token`, `unk_token`, `mask_token`
    and finally `all_special_tokens` from the tokenizer, in that order, and
    prints each value after a human-readable label. Returns None.
    """
    # (label, attribute name) pairs, in the order they are reported; the last
    # entry lists every special token at once.
    labelled_attributes = (
        ("Start of sequence token:", "bos_token"),
        ("End of sequence token:", "eos_token"),
        ("Padding token:", "pad_token"),
        ("Unknown token:", "unk_token"),
        ("Mask token:", "mask_token"),
        ("All special tokens:", "all_special_tokens"),
    )
    for label, attribute_name in labelled_attributes:
        print(label, getattr(tokenizer, attribute_name))

In [5]:
# Report which special tokens the loaded tokenizer defines (None means "not set").
check_special_tokens(tokenizer)

Start of sequence token: <s>
End of sequence token: </s>
Padding token: None
Unknown token: <unk>
Mask token: None
All special tokens: ['<s>', '</s>', '<unk>']


In [6]:
# Probe strings for newline handling: a newline on its own, after text, and
# between two pieces of text -- first with a single "\n", then with "\n\n",
# then with different surrounding text.
test_strings = [
    # single newline
    "\n",
    "hello.",  # no newline at all
    "hello.\n",
    "hello.\nworld.",
    # double newline
    "\n\n",
    "hello.\n\n",
    "hello.\n\nworld.",
    # single newline with a different prefix / suffix
    "different prefix\n",
    "different prefix\ndifferent suffix",
]

In [7]:
# Tokenize one sample ("hello.\n", index 2 of test_strings) and display the raw encoding.
s0 = test_strings[2]
res0 = tokenizer(s0)
# NOTE(review): the result of this lookup is discarded -- a cell only displays its
# last expression -- so the token index for character 0 is never shown. Confirm
# whether it was meant to be displayed or can be dropped.
res0.char_to_token(0)
res0

{'input_ids': [1, 22172, 29889, 13], 'attention_mask': [1, 1, 1, 1]}

In [11]:
def check_tokenization(tokenizer, test_string, token='\n'):
    """Print how `test_string` is tokenized and which token covers `token`.

    The report has two parts:

    1. The encoding of `test_string` (token ids and each id decoded on its
       own) plus the ids obtained when `token` is encoded alone without
       special tokens.
    2. For the FIRST occurrence of `token` in `test_string`: its character
       index, the index of the token covering that character, and that
       token's id and decoded text.

    Args:
        tokenizer: Callable returning an encoding that supports
            `encoding['input_ids']` and `encoding.char_to_token(char_idx)`;
            must also provide `decode(token_id)` and
            `encode(text, add_special_tokens=False)`.
        test_string: Text to tokenize.
        token: Substring to locate inside `test_string` (default: newline).

    Returns:
        None. Everything is written to stdout.
    """
    encoded = tokenizer(test_string)
    input_ids = encoded['input_ids']
    decoded_ids = [tokenizer.decode(i) for i in input_ids]
    print((
        f"original string: {repr(test_string)}\n"
        f"- encoded: {input_ids}\n"
        f"- decoded: {decoded_ids}\n"
        f"target token: {repr(token)}\n"
        f"- encoded alone: {tokenizer.encode(token, add_special_tokens=False)}"
    ))

    # Only the first occurrence is inspected; str.find returns -1 when absent.
    char_idx = test_string.find(token)
    if char_idx == -1:
        print("- target token not found in test string")
        return

    token_idx = encoded.char_to_token(char_idx)
    if token_idx is None:
        # char_to_token may return None when the character is not covered by
        # any token (see the BatchEncoding docs); indexing input_ids with None
        # would raise TypeError, so report the situation instead.
        print((
            f"- {repr(token)} char idx: {repr(char_idx)}\n"
            f"- {repr(token)} token idx: None (character is not covered by any token)"
        ))
        return

    print((
        f"- {repr(token)} char idx: {repr(char_idx)}\n"
        f"- {repr(token)} token idx: {repr(token_idx)}\n"
        f"- {repr(token)} token id: {repr(input_ids[token_idx])}\n"
        # decoded_ids already holds tokenizer.decode(...) for every id.
        f"- {repr(token)} decoded: {repr(decoded_ids[token_idx])}"
    ))

In [16]:
class RedirectStdoutToFile:
    """Context manager that sends everything written to `sys.stdout` to a file.

    While the `with` block is active, `sys.stdout` is replaced by a file
    opened for writing at `filepath` (an existing file is truncated). On exit
    the stream that was active when the block was entered is restored and the
    file is closed. Exceptions raised inside the block are not suppressed.

    The standard library offers the same behaviour via
    `contextlib.redirect_stdout`; this class is kept for callers that use it.

    Attributes:
        filepath: Destination path given to the constructor.
        original_stdout: Stream that will be restored on exit.
        file: The open destination file while active; None before first use.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        # Provisional value so the attribute always exists; the authoritative
        # capture happens in __enter__.
        self.original_stdout = sys.stdout
        self.file = None

    def __enter__(self):
        # Open first: if this fails, sys.stdout has not been touched.
        # Explicit UTF-8 so non-ASCII token text cannot hit a locale-dependent
        # UnicodeEncodeError.
        self.file = open(self.filepath, 'w', encoding='utf-8')
        # Capture at entry time, not construction time, so a stdout change made
        # between the two (e.g. a nested redirect) is restored correctly.
        self.original_stdout = sys.stdout
        sys.stdout = self.file
        return self  # lets callers write `with ... as redirect:`

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout = self.original_stdout  # restore before closing the target
        self.file.close()

In [17]:
# Run the tokenization report for every probe string, writing the output to
# "dummy.txt" instead of the notebook (the file is opened in 'w' mode, so it is
# overwritten on each run). "###" separates the reports.
# NOTE(review): the `as f` binding is not used inside the block.
with RedirectStdoutToFile("dummy.txt") as f:
    for test_string in test_strings:
        check_tokenization(tokenizer, test_string)
        print("###")