# Tokenizer experiments

- Add special tokens to tokenizer
- Remove tokens from tokenizer
- Run the Llama model with the new tokenizer

***

## Log in to Hugging Face

In [None]:
# Authenticate against the Hugging Face Hub.
# `login` is the library's public entry point; the underscore-prefixed
# `huggingface_hub._login._login` is a private helper and may be moved or
# renamed between releases.
from huggingface_hub import login as hf_login

# Fill in a personal access token before running; never commit a real token.
hf_login(token="", add_to_git_credential=False)

## Imports

In [2]:
import time

from transformers import AutoModelForCausalLM, AutoTokenizer

from llm_prompts.prompts.grid_formatter import GridFormatter
from llm_prompts.prompts.text_prompts import PromptSolveInstrV2
from llm_prompts.reader import ReaderPickle
from llm_prompts.type_aliases import Grid

## Tokenizer

In [None]:
# Load the tokenizer stored at the relative path below.
path_to_tokenizer = "../../llm_prompts/tests/llama_tokenizer"
tokenizer = AutoTokenizer.from_pretrained(path_to_tokenizer)

# Summary of the loaded tokenizer: where it came from, its size, and the
# special tokens it currently declares.
print(f"Tokenizer name: {tokenizer.name_or_path}")
print(f"Tokenizer length: {len(tokenizer)}")
print(f"Special token map: {tokenizer.special_tokens_map}")

## GridFormatter

In [None]:
# Formatter instance shared by the remaining cells, built with default arguments.
grid_formatter_notebook = GridFormatter()
# Show the formatter's `sE_token` attribute.
# NOTE(review): presumably one of the grid delimiter tokens — confirm in GridFormatter.
print(grid_formatter_notebook.sE_token)

In [None]:
# Ask the formatter which of its special tokens the tokenizer does not have yet.
# NOTE(review): semantics inferred from the method name — confirm in GridFormatter.
additional_special_tokens = grid_formatter_notebook.get_special_tokens_not_in(tokenizer=tokenizer)
print(additional_special_tokens)

In [None]:
# Small 3x3 example grid, one inner list per row.
grid: Grid = [[0, 1, 2], [0, 1, 2], [4, 4, 4]]

# Last expression of the cell, so the notebook displays the encoded grid.
grid_formatter_notebook.encode_grid(grid, input_or_output="input")

## Add tokens to `tokenizer`

In [7]:
# Register the missing formatter tokens on the tokenizer (modifies it in place).
new_special_tokens = dict(additional_special_tokens=additional_special_tokens)
tokenizer.add_special_tokens(new_special_tokens)

# The extended tokenizer gets its own directory next to the original one.
path_to_special_tokenizer = f"{path_to_tokenizer}_with_special"


In [None]:
# Write the extended tokenizer to disk; as the last expression of the cell,
# the call's return value is displayed by the notebook.
tokenizer.save_pretrained(path_to_special_tokenizer)

In [None]:
# Reload the tokenizer from the directory it was just saved to, to check that
# the saved copy can be loaded again.
special_tokenizer = AutoTokenizer.from_pretrained(path_to_special_tokenizer)

# Same summary as for the original tokenizer, for a side-by-side comparison.
print(f"Special tokenizer name: {special_tokenizer.name_or_path}")
print(f"Special tokenizer length: {len(special_tokenizer)}")
print(f"Special token map: {special_tokenizer.special_tokens_map}")

## Prompt encoding

In [10]:
# Configure the reader for the `re_arc_400x5` category of the synthetic dataset,
# then load all of its tasks.
task_reader = ReaderPickle(
    dataset_category="re_arc_400x5",
    dataset_dir="../../synth_data",
    read_test_output=True,
)
tasks = task_reader.read_tasks()

In [None]:
# Sort the task ids so the selected task does not depend on the mapping's order.
sorted_task_id = sorted(tasks.keys())
task_id = sorted_task_id[0]  # first id in sorted order

# Debug output; the `=` specifier prints the expression text next to its value.
print(f"{task_id=}")
print(f"{sorted_task_id[1]=}")

In [12]:
# Prompt builder that reuses the notebook's grid formatter.
prompt_fn = PromptSolveInstrV2(grid_formatter=grid_formatter_notebook)

In [None]:
# Build the chat messages for the selected task and time how long it takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock, which can be adjusted
# during the measurement and may have coarser resolution.
s = time.perf_counter()
messages = prompt_fn(task=tasks[task_id], idx_i=0)
e = time.perf_counter()

# Elapsed time converted from seconds to milliseconds.
print(f">>> Time: {1000 * (e - s):.2f} ms")

In [None]:
# Guard: stop here if the tokenizer is smaller than expected.
# NOTE(review): 128260 is presumably the base vocabulary size plus the added
# special tokens — confirm.
assert len(tokenizer) >= 128260

# Render the messages through the chat template as plain text (no token ids),
# without appending a generation prompt.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

print(text)

In [None]:
# IPython introspection: show the signature and docstring of `apply_chat_template`.
?tokenizer.apply_chat_template

In [None]:
# IPython introspection: show the signature and docstring of `encode`.
?tokenizer.encode

## Model

In [None]:
# Load the 1B instruct checkpoint by its Hub id.
llama_1B = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
)
# Resize the embedding matrix to the extended tokenizer's length so the newly
# added token ids have rows to index into.
llama_1B.resize_token_embeddings(new_num_tokens=len(tokenizer))