In [2]:
from unsloth import FastLanguageModel
from unsloth import UnslothTrainer, UnslothTrainingArguments
from trl import SFTTrainer, SFTConfig
import torch

# Global random seed, reused for the LoRA adapter (random_state) and for both trainer configs (seed).
SEED = 42

  from .autonotebook import tqdm as notebook_tqdm


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 11-12 11:17:53 [__init__.py:216] Automatically detected platform cuda.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [3]:
# Base checkpoint to fine-tune and the maximum sequence length; MAX_LENGTH is
# also the padding/truncation length used when the datasets are tokenized.
model_name = "meta-llama/Llama-3.2-1B-instruct"
MAX_LENGTH = 256

# Load the base model with 4-bit weights (no full fine-tuning, no 8-bit).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = MAX_LENGTH,
    full_finetuning=False,
    load_in_4bit = True,
    load_in_8bit = False,
)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
# NOTE(review): RANK = 512 is far above the 8-128 range suggested in the inline
# comment below; the printed summary reports ~22.6% of all parameters as trainable.
RANK = 512
# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = RANK,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = RANK*2,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = SEED,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)
# Report how many parameters the adapters make trainable.
model.print_trainable_parameters()

==((====))==  Unsloth 2025.9.8: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.9.8 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


trainable params: 360,710,144 || all params: 1,596,524,544 || trainable%: 22.5935


## Datasets

### Simple dataset loading example

In [4]:
# NOTE(review): this flag is not referenced by any other cell visible in this notebook — confirm whether it is still needed.
SUBSAMPLE = True

In [5]:
import pandas as pd
from langchain.schema import Document
# Read the contact knowledge base and wrap each row in a Document whose text is
# the "name / phone" card and whose metadata carries the row id as a string.
dataset_knowledge = pd.read_csv("../notebooks/data/contacts_docs.csv")
documents = [
    Document(
        page_content=f"Nombre: {row['name']}\nTelÃ©fono: {row['phone']}",
        metadata={"id": f"{row['id']}"},
    )
    for _, row in dataset_knowledge.iterrows()
]
print(f"Loaded {len(documents)} documents.")
print(f"First document: {documents[0]}")


Loaded 400 documents.
First document: page_content='Nombre: Alba Alonso
TelÃ©fono: 632 322 183' metadata={'id': '7500_1'}


In [6]:
# Load the train / validation / test query tables, in that order.
query_dataset_train, query_dataset_val, query_dataset_test = map(
    pd.read_csv,
    (
        "../notebooks/data/contacts_queries_train.csv",
        "../notebooks/data/contacts_queries_val.csv",
        "../notebooks/data/contacts_queries_test.csv",
    ),
)


In [7]:
# Convert the three query DataFrames into a single Hugging Face DatasetDict.
from datasets import Dataset, DatasetDict

all_data = {
    "train": query_dataset_train,
    "validation": query_dataset_val,
    "test": query_dataset_test,
}

dataset_qa = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in all_data.items()}
)

In [8]:
# Display the splits with their columns and row counts.
dataset_qa

DatasetDict({
    train: Dataset({
        features: ['question', 'id', 'respuesta'],
        num_rows: 1400
    })
    validation: Dataset({
        features: ['question', 'id', 'respuesta'],
        num_rows: 300
    })
    test: Dataset({
        features: ['question', 'id', 'respuesta'],
        num_rows: 300
    })
})

In [9]:
# JOIN TRAIN AND VAL DATASETS
from datasets import concatenate_datasets

# Fold the validation rows into the training split. The "validation" split
# itself is left in place inside dataset_qa.
# The guard keeps this cell idempotent: without it, every re-run appends the
# validation rows again (1400 -> 1700 -> 2000 ...), silently duplicating
# training examples. The merge only happens while "train" still has the size of
# the original training DataFrame.
if dataset_qa["train"].num_rows == len(all_data["train"]):
    dataset_qa["train"] = concatenate_datasets(
        [dataset_qa["train"], dataset_qa["validation"]]
    )

In [10]:
# Rename the "respuesta" column to "answer" in every split of the DatasetDict.
# NOTE(review): re-running this cell on the already-renamed dataset will presumably
# fail because "respuesta" no longer exists — confirm before relying on re-runs.
dataset_qa = dataset_qa.rename_column("respuesta", "answer")

In [11]:
# Display the splits again to check the merged train size and the renamed column.
dataset_qa

DatasetDict({
    train: Dataset({
        features: ['question', 'id', 'answer'],
        num_rows: 1700
    })
    validation: Dataset({
        features: ['question', 'id', 'answer'],
        num_rows: 300
    })
    test: Dataset({
        features: ['question', 'id', 'answer'],
        num_rows: 300
    })
})

### Data preparation

In [12]:
def build_prompt_it(tokenizer, system_prompt: str, prompt: str, response: str) -> str:
    """Render one system / user / assistant exchange through the chat template.

    Args:
        tokenizer: object exposing ``apply_chat_template``.
        system_prompt: content of the system turn.
        prompt: content of the user turn.
        response: content of the assistant turn.

    Returns:
        Whatever ``apply_chat_template`` returns when called with
        ``tokenize=False`` (the rendered, untokenized conversation).
    """
    turns = zip(("system", "user", "assistant"), (system_prompt, prompt, response))
    conversation = [{"role": role, "content": content} for role, content in turns]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    return rendered

In [13]:
def generate_knowledge_injection_prompts(documents: list):
    """Lazily yield one training text per document.

    Each yielded value is the document's ``page_content`` passed through a
    template that contains nothing but the ``{doc}`` placeholder.
    """
    template = "{doc}"
    yield from (template.format(doc=document.page_content) for document in documents)

In [14]:
def generate_qa_prompts(dataset, tokenizer):
    """Build one chat-formatted training text per question/answer example.

    Args:
        dataset: iterable of mappings with "question" and "answer" keys.
        tokenizer: forwarded to ``build_prompt_it`` to render the chat template.

    Returns:
        A list of rendered conversations (system prompt, question, answer),
        in the same order as ``dataset``.
    """
    # Same value as the original triple-quoted literal, including its
    # leading/trailing newline and indentation.
    system_prompt = "\n    Eres un modelo de lenguaje entrenado para responder preguntas.\n    "
    query_template = "{QUERY}"
    answer_template = "{response}"
    return [
        build_prompt_it(
            tokenizer,
            system_prompt,
            query_template.format(QUERY=example["question"]),
            answer_template.format(response=example["answer"]),
        )
        for example in dataset
    ]

In [15]:
# Materialise the knowledge-injection texts (one per document) and report the count.
prompts = [*generate_knowledge_injection_prompts(documents)]
print(f"Number of prompts: {len(prompts)}")

Number of prompts: 400


In [16]:
# Show the first knowledge-injection text.
prompts[0]

'Nombre: Alba Alonso\nTelÃ©fono: 632 322 183'

In [17]:
# Inspect how a text is split into tokens
def print_tokens(text):
    """Print the token count and the token strings of ``text``.

    Uses the notebook-level ``tokenizer``; nothing is returned.
    """
    pieces = tokenizer.tokenize(text)
    piece_count = len(pieces)
    print("Number of tokens:", piece_count, "\n")
    print("Tokens:", pieces)

print_tokens(prompts[0])

Number of tokens: 15 

Tokens: ['Nombre', ':', 'Ä Al', 'ba', 'Ä Alonso', 'ÄŠ', 'Tel', 'ÃƒÂ©fono', ':', 'Ä ', '632', 'Ä ', '322', 'Ä ', '183']


In [18]:
# create dataset from prompts
from datasets import Dataset
# Single-column ("text") dataset holding the knowledge-injection prompts.
knowledge_dataset = Dataset.from_dict({"text": prompts})
# Display the dataset summary (features and row count).
knowledge_dataset

Dataset({
    features: ['text'],
    num_rows: 400
})

In [19]:
# Show the first text stored in the knowledge dataset.
knowledge_dataset["text"][0]

'Nombre: Alba Alonso\nTelÃ©fono: 632 322 183'

In [20]:
# Chat-formatted training texts built from the train split (which, after the
# earlier concatenation cell, also contains the validation rows).
prompts_qa_train = generate_qa_prompts(dataset_qa["train"], tokenizer)
# NOTE(review): the "val" prompts are built from the *test* split, so the
# evaluation loss reported during training is measured on the same examples
# used for the final accuracy check — confirm this is intended.
prompts_qa_val = generate_qa_prompts(dataset_qa["test"], tokenizer)

# NOTE(review): both messages share the same label; the first count is the
# training set, the second the evaluation set.
print(f"Number of retrieval prompts: {len(prompts_qa_train)}")
print(f"Number of retrieval prompts: {len(prompts_qa_val)}")

Number of retrieval prompts: 1700
Number of retrieval prompts: 300


In [21]:
# Print the first rendered training conversation to check the chat template output.
print(prompts_qa_train[0], sep="\n")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 12 Nov 2025

Eres un modelo de lenguaje entrenado para responder preguntas.<|eot_id|><|start_header_id|>user<|end_header_id|>

Necesito el contacto asociado al 620 152 344. â€”consulta internaâ€”<|eot_id|><|start_header_id|>assistant<|end_header_id|>

El nÃºmero 620 152 344 pertenece a Alejandro Vega.<|eot_id|>


In [22]:
# Show the token count and tokens of the first rendered training conversation.
print_tokens(prompts_qa_train[0])

Number of tokens: 87 

Tokens: ['<|begin_of_text|>', '<|start_header_id|>', 'system', '<|end_header_id|>', 'ÄŠÄŠ', 'Cut', 'ting', 'Ä Knowledge', 'Ä Date', ':', 'Ä December', 'Ä ', '202', '3', 'ÄŠ', 'Today', 'Ä Date', ':', 'Ä ', '12', 'Ä Nov', 'Ä ', '202', '5', 'ÄŠÄŠ', 'E', 'res', 'Ä un', 'Ä modelo', 'Ä de', 'Ä l', 'engu', 'aje', 'Ä entren', 'ado', 'Ä para', 'Ä responder', 'Ä preg', 'untas', '.', '<|eot_id|>', '<|start_header_id|>', 'user', '<|end_header_id|>', 'ÄŠÄŠ', 'N', 'ec', 'es', 'ito', 'Ä el', 'Ä contacto', 'Ä asoci', 'ado', 'Ä al', 'Ä ', '620', 'Ä ', '152', 'Ä ', '344', '.', 'Ä Ã¢Ä¢Ä¶', 'consulta', 'Ä intern', 'a', 'Ã¢Ä¢Ä¶', '<|eot_id|>', '<|start_header_id|>', 'assistant', '<|end_header_id|>', 'ÄŠÄŠ', 'El', 'Ä nÃƒÂºmero', 'Ä ', '620', 'Ä ', '152', 'Ä ', '344', 'Ä pert', 'ene', 'ce', 'Ä a', 'Ä Alejandro', 'Ä Vega', '.', '<|eot_id|>']


In [23]:
# Wrap the rendered QA texts in single-column ("text") datasets, then group
# them by split name.
qa_train_dataset, qa_val_dataset = (
    Dataset.from_dict({"text": texts})
    for texts in (prompts_qa_train, prompts_qa_val)
)

qa_dataset = dict(train=qa_train_dataset, validation=qa_val_dataset)

In [24]:
def tokenize_function_autoregressive(examples):
    """Tokenize the "text" field of a batch for causal-LM training.

    Every sequence is padded to, and truncated at, ``MAX_LENGTH`` tokens using
    the notebook-level ``tokenizer``.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [25]:
# Tokenize the knowledge-injection texts in batches (padded/truncated to MAX_LENGTH).
knowledge_dataset_tokenizer = knowledge_dataset.map(tokenize_function_autoregressive, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 400/400 [00:00<00:00, 17538.57 examples/s]


In [26]:
# Tokenize the QA train and evaluation texts in batches, train first.
qa_train_dataset_tokenizer, qa_val_dataset_tokenizer = (
    qa_dataset[split_name].map(tokenize_function_autoregressive, batched=True)
    for split_name in ("train", "validation")
)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1700/1700 [00:00<00:00, 20599.52 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 300/300 [00:00<00:00, 15839.32 examples/s]


In [27]:
def generate_qa_prompts_testing(dataset, tokenizer):
    """Build (generation prompt, expected answer) pairs for evaluation.

    Args:
        dataset: iterable of mappings with "question" and "answer" keys.
        tokenizer: object exposing ``apply_chat_template``.

    Returns:
        A list of ``(prompt, answer)`` tuples in dataset order. Each prompt is
        the system + user conversation rendered with
        ``add_generation_prompt=True`` and ``tokenize=False``; the answer is
        passed through untouched.
    """
    # Same value as the original triple-quoted literal, including its
    # leading/trailing newline and indentation.
    system_prompt = "\n    Eres un modelo de lenguaje entrenado para responder preguntas.\n    "
    query_template = "{QUERY}"

    def render_generation_prompt(user_text: str) -> str:
        """Render the system + user turns, leaving the assistant turn open."""
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ]
        return tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False,
        )

    pairs = []
    for example in dataset:
        question, answer = example["question"], example["answer"]
        user_text = query_template.format(QUERY=question)
        pairs.append((render_generation_prompt(user_text), answer))
    return pairs

In [28]:
# Build the (prompt, expected answer) pairs for the test split.
prompts_retrieval_test = generate_qa_prompts_testing(dataset_qa["test"], tokenizer)
# Keep the first prompt (without its answer) for manual generation checks later on.
text_for_testing = prompts_retrieval_test[0][0]
print(text_for_testing)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 12 Nov 2025

Eres un modelo de lenguaje entrenado para responder preguntas.<|eot_id|><|start_header_id|>user<|end_header_id|>

Â¿A quiÃ©n pertenece el telÃ©fono 736 615 948?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [29]:
import re
import tqdm
import torch

def levenshtein(s: str, t: str) -> int:
    """
    Levenshtein edit distance between ``s`` and ``t``.

    Only two rows of the dynamic-programming table are kept, each sized by the
    shorter string, so memory is O(min(len(s), len(t))).
    """
    # Rows are indexed by the longer string, columns by the shorter one.
    longer, shorter = (t, s) if len(s) < len(t) else (s, t)
    prev_row = list(range(len(shorter) + 1))

    for row_idx in range(1, len(longer) + 1):
        ch_long = longer[row_idx - 1]
        row = [row_idx]
        for col_idx in range(1, len(shorter) + 1):
            insertion = row[col_idx - 1] + 1
            deletion = prev_row[col_idx] + 1
            substitution = prev_row[col_idx - 1]
            if ch_long != shorter[col_idx - 1]:
                substitution += 1
            row.append(min(insertion, deletion, substitution))
        prev_row = row
    return prev_row[-1]


def test_model_accuracy(model, tokenizer, prompts_retrieval_test, device="cuda",
                        max_new_tokens=64, temperature=0.0, top_p=1.0):
    """Generate an answer for every test prompt and score it against the reference.

    Args:
        model: object exposing ``generate``.
        tokenizer: callable tokenizer that also exposes ``decode``.
        prompts_retrieval_test: iterable of ``(prompt_text, expected_answer)`` pairs.
        device: device the tokenized inputs are moved to.
        max_new_tokens: generation budget per prompt.
        temperature: forwarded to ``generate``.
        top_p: forwarded to ``generate``.
            NOTE(review): generation runs with ``do_sample=False``, so
            ``temperature`` and ``top_p`` presumably have no effect — confirm.

    Returns:
        ``(final_acc, final_lev)``: exact-match accuracy as a percentage and the
        mean Levenshtein distance between response and expected answer. Both are
        0.0 when there are no prompts.
    """
    acc = 0
    lev = 0
    total = 0

    progress_bar = tqdm.tqdm(prompts_retrieval_test, desc="Testing")

    for text, answer in progress_bar:
        inputs = tokenizer(text, return_tensors="pt").to(device)
        prompt_length = inputs["input_ids"].shape[1]
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                temperature=temperature,
                top_p=top_p
            )
        # The generated sequence starts with the prompt tokens. Decode only what
        # comes after them instead of splitting the full text on the word
        # "assistant", which truncated any answer containing that word.
        new_tokens = outputs[0][prompt_length:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        if response == answer:
            acc += 1
        lev += levenshtein(response, answer)
        total += 1

        progress_bar.set_postfix({
            "acc": f"{acc/total*100:.2f} %",
            "lev": f"{lev/total:.2f}"
        })

    final_acc = acc / total * 100 if total > 0 else 0.0
    print(f"Accuracy final: {acc}/{total} = {final_acc:.2f} %")
    final_lev = lev / total if total > 0 else 0.0
    print(f"Levenshtein final: {final_lev:.2f}")
    return final_acc, final_lev

In [30]:
import matplotlib.pyplot as plt

def plot_training_history(history):
    """
    Plot the accuracy curve and the training-loss curves (SFT and IT) across super epochs.

    Parameters:
        history (list[dict]): one dict per super epoch, each providing:
            - "super_epoch": super-epoch number (x axis).
            - "accuracy": accuracy reached.
            - "trainer_sft_stats": object exposing ``.training_loss`` (SFT loss).
            - "trainer_it_stats": object exposing ``.training_loss`` (IT loss).
    """
    def finish_figure(y_label, title, with_legend=False):
        # Shared decoration for the current pyplot figure, then render it.
        plt.xlabel("Super Epoch")
        plt.ylabel(y_label)
        plt.title(title)
        if with_legend:
            plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    # Pull the plotted series out of the history records
    x_values = [entry["super_epoch"] for entry in history]
    accuracy_curve = [entry["accuracy"] for entry in history]
    sft_curve = [entry["trainer_sft_stats"].training_loss for entry in history]
    it_curve = [entry["trainer_it_stats"].training_loss for entry in history]

    # --- Accuracy figure ---
    plt.figure(figsize=(6, 4))
    plt.plot(x_values, accuracy_curve, marker='o', color='tab:blue')
    finish_figure("Accuracy (%)", "Accuracy History")

    # --- Loss figure ---
    plt.figure(figsize=(6, 4))
    plt.plot(x_values, sft_curve, marker='o', label="SFT Loss", color='tab:orange')
    plt.plot(x_values, it_curve, marker='s', label="IT Loss", color='tab:green')
    finish_figure("Loss", "Loss History", with_legend=True)


## Train

### Iterative training configuration

In [31]:
# sft training
from transformers import DataCollatorForLanguageModeling


# Causal-LM collator (mlm=False), shared by both trainers below.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Arguments for the knowledge-injection pass over the raw document texts.
# Effective batch size is 8 x 8 = 64 and no checkpoints are saved.
auto_config = UnslothTrainingArguments(
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 8, # Use GA to mimic batch size!
    save_strategy="no",
    save_total_limit=0,
    warmup_steps = 5,
    num_train_epochs = 1, # Set this for 1 full training run.
    #max_steps = 60,
    learning_rate = 1e-4, # Reduce to 2e-5 for long training runs
    logging_steps = 1,
    # 32 bits
    optim = "paged_adamw_32bit",
    weight_decay = 0.01,
    lr_scheduler_type = "cosine",
    seed = SEED,
    report_to = "none", # Use this for WandB etc
    # NOTE(review): the directory name mentions qwen3-0.6b while model_name is a
    # Llama-3.2-1B checkpoint — confirm the intended name (nothing is saved with save_strategy="no").
    output_dir="../models/qwen3-0.6b-rag-indexer",
)

# Arguments for the instruction-tuning (question/answer) pass.
# The recorded output shows a validation loss logged at every training step.
it_config = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,        # <-- adds the eval batch size
    gradient_accumulation_steps=8,
    warmup_steps=25,
    save_strategy="no",
    save_total_limit=0,
    eval_steps=1,
    eval_strategy="steps",         # <-- enables periodic evaluation
    num_train_epochs=1,             # <-- optional: use epochs instead of max_steps
    #max_steps=30,
    learning_rate=1e-4,
    logging_steps=1,
    optim = "paged_adamw_32bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=SEED,
    report_to="none",
    # NOTE(review): same qwen3-0.6b naming mismatch as the directory above — confirm.
    output_dir="../models/qwen3-0.6b-rag-retriever",
    load_best_model_at_end=False,          # <-- optional
    metric_for_best_model="eval_loss",    # <-- optional
    greater_is_better=False,              # <-- optional
)

# Knowledge-injection trainer: trains `model` on the tokenized document texts.
trainer_auto = UnslothTrainer(
    model=model,
    train_dataset=knowledge_dataset_tokenizer,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=auto_config,
)

# Instruction-tuning trainer: receives the same `model` object as trainer_auto.
# Its eval dataset is the tokenized "validation" entry of qa_dataset, which was
# built from the test split.
trainer_it = SFTTrainer(
    model=model,
    train_dataset=qa_train_dataset_tokenizer,
    eval_dataset=qa_val_dataset_tokenizer,
    data_collator=data_collator,
    tokenizer=tokenizer,
    args=it_config,
)

In [32]:
# Re-check the trainable parameter count after constructing the trainers.
model.print_trainable_parameters()

trainable params: 360,710,144 || all params: 1,596,524,544 || trainable%: 22.5935


In [33]:
import wandb

# Run name derived from the model id (slashes replaced by underscores) and the LoRA rank.
name = f"{model_name.replace('/', '_')}_r{RANK}_qa_iterativo_agenda"

# Start the wandb session
# Both trainer configs set report_to="none", so metrics reach wandb only through explicit wandb.log calls.
wandb.init(
    project="qa_iterativo_agenda",
    name=name,  # optional
    config={
        # NOTE(review): hard-coded; the training-loop cell defines EPOCHS = 10 separately — keep the two in sync.
        "super_epochs": 10,
        "model": model_name,
        "r": RANK,
    },
)

[34m[1mwandb[0m: Currently logged in as: [33mmiguel_kjh[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, langchain, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [None]:
EPOCHS = 10

# One record per completed super epoch, using the keys that
# plot_training_history() reads.
history = []

# Each super epoch runs one knowledge-injection pass, one instruction-tuning
# pass, and then scores the model on the test prompts.
run_completed = False
try:
    for super_epoch in range(1, EPOCHS + 1):
        print(f"--- SUPER EPOCH {super_epoch} / {EPOCHS} ---")
        trainer_sft_stats = trainer_auto.train()
        trainer_it_stats = trainer_it.train()
        print(trainer_it_stats)
        # evaluate accuracy after each super epoch
        acc, lev = test_model_accuracy(model, tokenizer, prompts_retrieval_test, device="cuda")
        wandb.log({
            "super_epoch": super_epoch,
            "accuracy": acc,
            "levenshtein": lev,
            "train_loss_sft": trainer_sft_stats.training_loss,
            "train_loss_it": trainer_it_stats.training_loss,
        })
        history.append({
            "super_epoch": super_epoch,
            "accuracy": acc,
            "trainer_sft_stats": trainer_sft_stats,
            "trainer_it_stats": trainer_it_stats,
        })
    run_completed = True
finally:
    # Close the wandb run even when training is interrupted or fails, and
    # record a non-zero exit code in that case so the run is not reported as
    # a clean finish.
    wandb.finish(exit_code=0 if run_completed else 1)

--- SUPER EPOCH 1 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,4.4568
2,4.4082
3,3.1691
4,2.7974
5,2.2036
6,1.8997
7,1.9346


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,6.4757,6.465154
2,6.4153,5.048048
3,5.0523,3.762906
4,3.7248,2.77172
5,2.7505,2.150628
6,2.1482,1.666684
7,1.6322,1.280813
8,1.2683,0.945208
9,0.9825,0.794286
10,0.8166,0.761761


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=27, training_loss=1.505070862946687, metrics={'train_runtime': 145.223, 'train_samples_per_second': 11.706, 'train_steps_per_second': 0.186, 'total_flos': 3482965455667200.0, 'train_loss': 1.505070862946687, 'epoch': 1.0})


Testing: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 300/300 [01:57<00:00,  2.55it/s, acc=0.00 %, lev=8.84]


Accuracy final: 0/300 = 0.00 %
Levenshtein final: 8.84
--- SUPER EPOCH 2 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,2.8525
2,2.813
3,2.3116
4,2.1804
5,2.046
6,1.9344
7,1.6704


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.6911,0.691466


KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x7d04e21cc040>> (for post_run_cell), with arguments args (<ExecutionResult object at 7d04e21cf040, execution_count=34 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 7d04d8ac25f0, raw_cell="EPOCHS = 10

for _ in range(EPOCHS):
    print(f"-.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://wsl%2Bubuntu/home/miguel/projects/rag-experiments/notebooks/train_llm_retriver.ipynb#X44sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


socket.send() raised exception.


In [None]:
# Inspect the result of the last instruction-tuning run.
# NOTE(review): `trainer_it_stats` is only assigned inside the training-loop cell;
# the recorded output is a NameError, so this cell was run without that
# assignment having happened — it will fail on a fresh kernel unless the loop ran first.
trainer_it_stats

NameError: name 'trainer_it_stats' is not defined

In [None]:
# test the model in streaming mode
from transformers import TextStreamer

# Stream the model's continuation for the saved test prompt; the recorded
# output shows only the answer, with the end-of-turn special token left visible.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
# The return value is discarded: the text is printed by the streamer.
_ = model.generate(
    **tokenizer(text_for_testing, return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    do_sample = False,
    # NOTE(review): top_p and temperature presumably have no effect with do_sample = False — confirm.
    top_p = 0.1,
    temperature = 0.,
    streamer = streamer,
)

El nÃºmero 736 615 948 pertenece a Leo PÃ©rez.<|eot_id|>


In [None]:
# Final evaluation on the test prompts: exact-match accuracy (%) and mean Levenshtein distance.
acc, lev = test_model_accuracy(model, tokenizer, prompts_retrieval_test, device="cuda")

Testing: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 300/300 [01:45<00:00,  2.84it/s, acc=91.00 %, lev=0.80]

Accuracy final: 273/300 = 91.00 %
Levenshtein final: 241/300 = 0.80



