In [1]:
from unsloth import FastLanguageModel
from datasets import load_from_disk
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
from trl import SFTTrainer, SFTConfig
import torch

SEED = 42

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-27 12:08:27 [__init__.py:216] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
model_name = "Qwen/Qwen3-0.6B"
MAX_LENGTH = 256

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = MAX_LENGTH,
    load_in_4bit = False,
    load_in_8bit = False,
)
tokenizer.pad_token = tokenizer.eos_token
RANK = 256
model = FastLanguageModel.get_peft_model(
    model,
    r = RANK,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = RANK*2,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = SEED,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

==((====))==  Unsloth 2025.9.8: Fast Qwen3 patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.9.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


## Datasets

### Simple dataset loading example

In [3]:
import pandas as pd
from langchain.schema import Document
dataset = pd.read_csv("../notebooks/data/contacts_docs.csv")
documents = []
for index, row in dataset.iterrows():
    doc = f"Nombre: {row['name']}\nTeléfono: {row['phone']}"
    documents.append(Document(page_content=doc, metadata={"id": f"{row['id']}" } ))
print(f"Loaded {len(documents)} documents.")
print(f"First document: {documents[0]}")


Loaded 400 documents.
First document: page_content='Nombre: Alba Alonso
Teléfono: 632 322 183' metadata={'id': '7500_1'}


In [4]:
query_dataset_train = pd.read_csv("../notebooks/data/contacts_queries_train.csv")
query_dataset_val = pd.read_csv("../notebooks/data/contacts_queries_val.csv")
query_dataset_test = pd.read_csv("../notebooks/data/contacts_queries_test.csv")


In [5]:
all_data = {
    "train": query_dataset_train,
    "validation": query_dataset_val,
    "test": query_dataset_test,
}

#to hugginface dataset
from datasets import Dataset, DatasetDict
dataset = {}
for split in all_data:
    dataset[split] = Dataset.from_pandas(all_data[split])
dataset = DatasetDict(dataset)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'id'],
        num_rows: 1400
    })
    validation: Dataset({
        features: ['question', 'id'],
        num_rows: 300
    })
    test: Dataset({
        features: ['question', 'id'],
        num_rows: 300
    })
})

In [81]:
# === Option B: IDs como tokens especiales (DOCID:{...}) ===
def collect_all_ids(ds):
    ids = set()
    for split in ("train", "validation", "test"):
        if split in ds:
            for ex in ds[split]:
                if "id" in ex:
                    ids.add(str(ex["id"]))
    return sorted(list(ids))

all_ids = collect_all_ids(dataset)
special_id_tokens = [f"DOCID:{{{docid}}}" for docid in all_ids]

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
tokenizer.add_special_tokens({"additional_special_tokens": special_id_tokens})
model.resize_token_embeddings(len(tokenizer))
print(f"Added {len(special_id_tokens)} special ID tokens.")

Added 400 special ID tokens.


In [82]:
# === Habilitar entrenamiento del lm_head junto con LoRA en la fase autoregresiva ===
for name, p in model.named_parameters():
    p.requires_grad_(False)
for name, p in model.named_parameters():
    if "lora_" in name or "lm_head" in name:
        p.requires_grad_(True)
trainables = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Trainable params (LoRA + lm_head):", trainables)

Trainable params (LoRA + lm_head): 161480704


### Real dataset loading example

In [145]:
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    model_kwargs={"device": "cuda"},
)

db = FAISS.load_local(
    "../data/db/parliament_db/parliament_all_docs_embeddings_sentence-transformers_paraphrase-multilingual-mpnet-base-v2",
    embedding_model,
    allow_dangerous_deserialization=True,
)

In [152]:
#quiero la lista de documentos
docs = db.docstore._dict.values()
documents = list(docs)
print(f"Number of documents: {len(documents)}")

Number of documents: 11162


In [144]:
FOLDER_AUTORE = "../data/processed/parliament_qa"
dataset = load_from_disk(FOLDER_AUTORE)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'response', 'cost', 'documents', 'type', 'retrieved_pks', 'oracle_context', 'formatted_context'],
        num_rows: 614
    })
    validation: Dataset({
        features: ['id', 'question', 'response', 'cost', 'documents', 'type', 'retrieved_pks', 'oracle_context', 'formatted_context'],
        num_rows: 161
    })
    test: Dataset({
        features: ['question', 'id', 'response', 'type', 'retrieved_pks', 'oracle_context', 'injected_oracle', 'formatted_context', 'documents'],
        num_rows: 205
    })
})

## Data preparation

In [7]:
def build_prompt_it(tokenizer, system_prompt: str, prompt: str, response: str) -> str:
    """Builds the chat prompt for a single example using the tokenizer chat template."""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": prompt},
        {"role": "assistant", "content": response}
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
    )

In [8]:
def prepare_prompt_for_indexing(documents: list):
    prompt = """
    Contenido del documento:
    {doc}
    El Identificador del documento es DOCID:{{{doc_id}}}
    """
    for doc in documents:
        document = doc.page_content
        doc_id = doc.metadata.get("id", "unknown")
        yield prompt.format(doc=document, doc_id=doc_id)

In [9]:
def prepare_prompts_for_retrieval(dataset, tokenizer):
    system_prompt = """
    Eres un módulo de recuperación. Tu única tarea es devolver el identificador del documento correspondiente a la consulta dada.
    """
    prompts = []
    for item in dataset:
        prompt = """
        Dada la siguiente consulta, recupera los identificadores de los documentos relevantes. 
        Consulta: {QUERY}
        """
        response = "DOCID:{{{docid}}}"
        question = item["question"]
        prompt = prompt.format(QUERY=question)
        prompts.append(build_prompt_it(tokenizer, system_prompt, prompt, response.format(docid=item["id"])))
    return prompts

In [10]:
prompts = list(prepare_prompt_for_indexing(documents))
print(f"Number of prompts: {len(prompts)}")

Number of prompts: 400


In [11]:
prompts[0]

'\n    Contenido del documento:\n    Nombre: Alba Alonso\nTeléfono: 632 322 183\n    El Identificador del documento es DOCID:{7500_1}\n    '

In [12]:
# QUIERO VER LOS TOKENS
def print_tokens(text):
    tokens = tokenizer.tokenize(text)
    print("Number of tokens:", len(tokens), "\n")
    print("Tokens:", tokens)

print_tokens(prompts[0])

Number of tokens: 48 

Tokens: ['Ċ', 'ĠĠĠ', 'ĠCont', 'enido', 'Ġdel', 'Ġdocumento', ':Ċ', 'ĠĠĠ', 'ĠNombre', ':', 'ĠAl', 'ba', 'ĠAlonso', 'Ċ', 'Tel', 'Ã©fono', ':', 'Ġ', '6', '3', '2', 'Ġ', '3', '2', '2', 'Ġ', '1', '8', '3', 'Ċ', 'ĠĠĠ', 'ĠEl', 'ĠIdent', 'ificador', 'Ġdel', 'Ġdocumento', 'Ġes', 'ĠDOC', 'ID', ':{', '7', '5', '0', '0', '_', '1', '}Ċ', 'ĠĠĠĠ']


In [13]:
# create dataset from prompts
from datasets import Dataset
indexing_dataset = Dataset.from_dict({"text": prompts})
indexing_dataset

Dataset({
    features: ['text'],
    num_rows: 400
})

In [14]:
indexing_dataset["text"][0]

'\n    Contenido del documento:\n    Nombre: Alba Alonso\nTeléfono: 632 322 183\n    El Identificador del documento es DOCID:{7500_1}\n    '

In [15]:
prompts_retrieval_train = prepare_prompts_for_retrieval(dataset["train"], tokenizer)
prompts_retrieval_val = prepare_prompts_for_retrieval(dataset["validation"], tokenizer)

print(f"Number of retrieval prompts: {len(prompts_retrieval_train)}")
print(f"Number of retrieval prompts: {len(prompts_retrieval_val)}")

Number of retrieval prompts: 1400
Number of retrieval prompts: 300


In [16]:
print(prompts_retrieval_train[0], sep="\n")

<|im_start|>system

    Eres un módulo de recuperación. Tu única tarea es devolver el identificador del documento correspondiente a la consulta dada.
    <|im_end|>
<|im_start|>user

        Dada la siguiente consulta, recupera los identificadores de los documentos relevantes. 
        Consulta: Necesito el contacto asociado al 620 152 344. —consulta interna—
        <|im_end|>
<|im_start|>assistant
<think>

</think>

DOCID:{7503_3}<|im_end|>



In [17]:
print_tokens(prompts_retrieval_train[0])

Number of tokens: 113 

Tokens: ['<|im_start|>', 'system', 'ĊĊ', 'ĠĠĠ', 'ĠE', 'res', 'Ġun', 'Ġm', 'Ã³d', 'ulo', 'Ġde', 'Ġrecuper', 'aciÃ³n', '.', 'ĠTu', 'ĠÃºnica', 'Ġtarea', 'Ġes', 'Ġdev', 'olver', 'Ġel', 'Ġident', 'ificador', 'Ġdel', 'Ġdocumento', 'Ġcorrespond', 'iente', 'Ġa', 'Ġla', 'Ġconsulta', 'Ġd', 'ada', '.Ċ', 'ĠĠĠĠ', '<|im_end|>', 'Ċ', '<|im_start|>', 'user', 'ĊĊ', 'ĠĠĠĠĠĠĠ', 'ĠD', 'ada', 'Ġla', 'Ġsiguiente', 'Ġconsulta', ',', 'Ġrec', 'up', 'era', 'Ġlos', 'Ġident', 'ific', 'adores', 'Ġde', 'Ġlos', 'Ġdocumentos', 'Ġrelevant', 'es', '.', 'ĠĊ', 'ĠĠĠĠĠĠĠ', 'ĠConsult', 'a', ':', 'ĠNec', 'es', 'ito', 'Ġel', 'Ġcontacto', 'Ġasoci', 'ado', 'Ġal', 'Ġ', '6', '2', '0', 'Ġ', '1', '5', '2', 'Ġ', '3', '4', '4', '.', 'ĠâĢĶ', 'consulta', 'Ġintern', 'a', 'âĢĶ', 'Ċ', 'ĠĠĠĠĠĠĠĠ', '<|im_end|>', 'Ċ', '<|im_start|>', 'assistant', 'Ċ', '<think>', 'ĊĊ', '</think>', 'ĊĊ', 'DOC', 'ID', ':{', '7', '5', '0', '3', '_', '3', '}', '<|im_end|>', 'Ċ']


In [18]:
# create dataset from prompts train, val, test
retrieval_train_dataset = Dataset.from_dict({"text": prompts_retrieval_train})
retrieval_val_dataset = Dataset.from_dict({"text": prompts_retrieval_val})

retrieval_dataset = {
    "train": retrieval_train_dataset,
    "validation": retrieval_val_dataset,
}

In [19]:
from dataclasses import dataclass
from typing import Dict, List
import torch

@dataclass
class DataCollatorForCompletionOnlyLM:
    tokenizer: object
    response_template: str = "<|im_start|>assistant"
    mlm: bool = False

    def __call__(self, examples: List[Dict[str, str]]) -> Dict[str, torch.Tensor]:
        texts = [ex for ex in examples]
        tokenized = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        )
        labels = tokenized.input_ids.clone()
        # Máscara: solo calcular pérdida en la parte del assistant
        for i, text in enumerate(texts):
            idx = text.find(self.response_template)
            if idx != -1:
                token_pos = self.tokenizer(text[:idx], add_special_tokens=False)["input_ids"]
                cutoff = len(token_pos)
                labels[i, :cutoff] = -100
        tokenized["labels"] = labels
        return tokenized
    
collator_it = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template="<|im_start|>assistant",
)

In [20]:
def tokenize_function_autoregressive(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=MAX_LENGTH)

In [21]:
indexing_dataset_tokenizer = indexing_dataset.map(tokenize_function_autoregressive, batched=True)

Map: 100%|██████████| 400/400 [00:00<00:00, 15598.01 examples/s]


In [22]:
retrieval_train_dataset_tokenizer = retrieval_dataset["train"].map(tokenize_function_autoregressive, batched=True)
retrieval_val_dataset_tokenizer = retrieval_dataset["validation"].map(tokenize_function_autoregressive, batched=True)

Map: 100%|██████████| 1400/1400 [00:00<00:00, 17632.07 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 15482.28 examples/s]


## Train

In [26]:
# sft training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
auto_config = SFTConfig(
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 8, # Use GA to mimic batch size!
    save_steps=5,
    warmup_steps = 5,
    num_train_epochs = 3, # Set this for 1 full training run.
    #max_steps = 60,
    learning_rate = 1e-4, # Reduce to 2e-5 for long training runs
    logging_steps = 1,
    # 32 bits
    optim = "paged_adamw_32bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = SEED,
    report_to = "none", # Use this for WandB etc
    output_dir="../models/qwen3-0.6b-rag-indexer",
)

it_config = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,         # <-- añade eval batch size
    gradient_accumulation_steps=16,
    warmup_steps=25,
    save_steps=25,
    eval_steps=1,
    eval_strategy="steps",         # <-- activa evaluación periódica
    num_train_epochs=1,             # <-- opcional: usa epochs en lugar de max_steps
    #max_steps=30,
    learning_rate=1e-4,
    logging_steps=1,
    optim = "paged_adamw_32bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=SEED,
    report_to="none",
    output_dir="../models/qwen3-0.6b-rag-retriever",
    load_best_model_at_end=True,          # <-- opcional
    metric_for_best_model="eval_loss",    # <-- opcional
    greater_is_better=False,              # <-- opcional
)

trainer_auto = SFTTrainer(
    model=model,
    train_dataset=indexing_dataset_tokenizer,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=auto_config,
)

trainer_it = SFTTrainer(
    model=model,
    train_dataset=retrieval_train_dataset_tokenizer,
    eval_dataset=retrieval_val_dataset_tokenizer,
    data_collator=data_collator,
    tokenizer=tokenizer,
    args=it_config,
)

In [24]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.988 GB.
1.752 GB of memory reserved.


In [100]:
model.print_trainable_parameters()

trainable params: 161,480,704 || all params: 757,666,816 || trainable%: 21.3129


In [27]:
EPOCHS = 4
for _ in range(EPOCHS):
    trainer_sft_stats = trainer_auto.train() # (context, id)
    trainer_it_stats = trainer_it.train() # (query, id)
    # GUARDAR MODELOS CADA SUPER EPOCH

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 3 | Total steps = 21
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 161,480,704 of 757,530,624 (21.32% trained)


Step,Training Loss
1,0.9579
2,0.9502
3,0.8351
4,0.7903
5,0.8521
6,0.7682
7,0.7677
8,0.8018
9,0.7311
10,0.7469


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,400 | Num Epochs = 1 | Total steps = 6
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 16 x 1) = 256
 "-____-"     Trainable parameters = 161,480,704 of 757,530,624 (21.32% trained)


Step,Training Loss,Validation Loss
1,5.6542,5.68209
2,5.6915,4.872732
3,4.8745,3.791305
4,3.7808,2.809565
5,2.8154,2.064784
6,2.0369,1.486922


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 3 | Total steps = 21
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 161,480,704 of 757,530,624 (21.32% trained)


Step,Training Loss
1,0.7215
2,0.7199
3,0.672
4,0.6723
5,0.6862
6,0.6876
7,0.6896
8,0.6671
9,0.6877
10,0.6863


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,400 | Num Epochs = 1 | Total steps = 6
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 16 x 1) = 256
 "-____-"     Trainable parameters = 161,480,704 of 757,530,624 (21.32% trained)


Step,Training Loss,Validation Loss
1,2.4578,2.491342
2,2.4648,2.227789
3,2.2182,1.777558
4,1.7573,1.371486
5,1.3711,1.075885
6,1.046,0.865046


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 3 | Total steps = 21
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 161,480,704 of 757,530,624 (21.32% trained)


Step,Training Loss
1,0.6676
2,0.6693
3,0.6619
4,0.6619
5,0.673
6,0.6746
7,0.6567
8,0.6614
9,0.6826
10,0.6765


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,400 | Num Epochs = 1 | Total steps = 6
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 16 x 1) = 256
 "-____-"     Trainable parameters = 161,480,704 of 757,530,624 (21.32% trained)


Step,Training Loss,Validation Loss
1,0.9325,0.957675
2,0.9274,0.896505
3,0.8893,0.792751
4,0.7784,0.678473
5,0.68,0.596149
6,0.5711,0.520629


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 3 | Total steps = 21
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 161,480,704 of 757,530,624 (21.32% trained)


Step,Training Loss
1,0.6625
2,0.661
3,0.6576
4,0.6583
5,0.6661
6,0.6698
7,0.6392
8,0.6531
9,0.6684
10,0.6709


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,400 | Num Epochs = 1 | Total steps = 6
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 16 x 1) = 256
 "-____-"     Trainable parameters = 161,480,704 of 757,530,624 (21.32% trained)


Step,Training Loss,Validation Loss
1,0.554,0.57393
2,0.5474,0.557318
3,0.5474,0.528936
4,0.5164,0.48495
5,0.4821,0.436128
6,0.4178,0.394163


In [28]:
model.print_trainable_parameters()

trainable params: 161,480,704 || all params: 757,530,624 || trainable%: 21.3167


In [29]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

Peak reserved memory = 4.977 GB.
Peak reserved memory for training = 3.225 GB.
Peak reserved memory % of max memory = 20.748 %.
Peak reserved memory for training % of max memory = 13.444 %.


In [30]:
def prepare_prompts_for_testing(dataset, tokenizer):
    system_prompt = """
    Eres un módulo de recuperación. Tu única tarea es devolver el identificador del documento correspondiente a la consulta dada.
    """
    def build_prompt_it(tokenizer, system_prompt: str, prompt: str) -> str:
        """Builds the chat prompt for a single example using the tokenizer chat template."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": prompt},
        ]
        return tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )
    prompts = []
    for item in dataset:
        prompt = """
        Dada la siguiente consulta, recupera los identificadores de los documentos relevantes. 
        Consulta: {QUERY}
        """
        question = item["question"]
        prompt = prompt.format(QUERY=question)
        prompts.append(
            ( 
                build_prompt_it(tokenizer, system_prompt, prompt),
                item["id"],
            )
        )
    return prompts

In [31]:
prompts_retrieval_test = prepare_prompts_for_testing(dataset["test"], tokenizer)


In [32]:
i = 1
text = prompts_retrieval_test[i][0]
doc_id_targets = prompts_retrieval_test[i][1]
print(doc_id_targets)

7538_1


In [33]:
print(text)

<|im_start|>system

    Eres un módulo de recuperación. Tu única tarea es devolver el identificador del documento correspondiente a la consulta dada.
    <|im_end|>
<|im_start|>user

        Dada la siguiente consulta, recupera los identificadores de los documentos relevantes. 
        Consulta: Dame el número de Manuel Sánchez.
        <|im_end|>
<|im_start|>assistant



In [34]:
# test the model in streaming mode
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    do_sample = False,
    top_p = 0.1,
    temperature = 0.,
    streamer = streamer,
)

<think>

</think>

DOCID:{7515_2}<|im_end|>


In [35]:
# test the model in non-streaming mode
import re
import tqdm

acc = 0
total = 0

for text, doc_id_target in tqdm.tqdm(prompts_retrieval_test, desc="Testing"):
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,  # Increase for longer outputs!
        do_sample=False, temperature=0.0, top_p=1.0
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    response = generated_text.split("</think>")[-1]
    print(response)
    # extract DOCID number using regex

    doc_id = re.search(r"DOCID:(\d+_\d+)", response).group(1)
    
    print("Correct:", doc_id, "==", "Predicted:", doc_id_target)
    if doc_id == doc_id_target:
        acc += 1
    total += 1
print(f"Accuracy: {acc}/{total} = {acc/total*100:.2f} %")

Testing:   0%|          | 0/300 [00:00<?, ?it/s]



DOCID:{7521_2}<|im_end|>





AttributeError: 'NoneType' object has no attribute 'group'