In [74]:
from unsloth import FastLanguageModel
from datasets import load_from_disk
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import DataCollatorForLanguageModeling
from trl import SFTTrainer, SFTConfig
import torch

SEED = 42

In [75]:
model_name = "Qwen/Qwen3-0.6B"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 8192,
    load_in_4bit = False,
    load_in_8bit = False,
)
RANK = 128
model = FastLanguageModel.get_peft_model(
    model,
    r = RANK,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = RANK*2,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = SEED,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

==((====))==  Unsloth 2025.9.8: Fast Qwen3 patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [76]:
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    model_kwargs={"device": "cuda"},
)

db = FAISS.load_local(
    "../data/db/parliament_db/parliament_all_docs_embeddings_sentence-transformers_paraphrase-multilingual-mpnet-base-v2",
    embedding_model,
    allow_dangerous_deserialization=True,
)

In [77]:
#quiero la lista de documentos
docs = db.docstore._dict.values()
documents = list(docs)
print(f"Number of documents: {len(documents)}")

Number of documents: 11162


In [78]:
FOLDER_AUTORE = "../data/processed/parliament_qa"
dataset = load_from_disk(FOLDER_AUTORE)

## Data preparation

In [79]:
def prepare_prompt_for_indexing(documents: list):
    prompt = """
    Este documento tiene el DOCID:{doc_id}.
    Contenido del documento:
    {doc}
    """
    for doc in documents:
        document = doc.page_content
        doc_id = doc.metadata.get("id", "unknown")
        yield prompt.format(doc=document, doc_id=doc_id)

In [80]:
def build_prompt_it(tokenizer, system_prompt: str, prompt: str, response: str) -> str:
    """Builds the chat prompt for a single example using the tokenizer chat template."""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": prompt},
        {"role": "assistant", "content": response}
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
    )

In [81]:
def prepare_prompts_for_retrieval(dataset, tokenizer):
    system_prompt = """Eres un módulo de recuperación. Tu única tarea es devolver el identificador del documento correspondiente a la consulta dada.
Sigue estrictamente estas reglas:
1) Devuelve EXACTAMENTE una línea con el formato: DOCID:{<id>}.
2) No incluyas palabras, explicaciones o puntuación extra antes o después de las llaves.
3) Si múltiples documentos son plausibles, elige el mejor ID.
4) Nunca inventes un ID fuera del espacio permitido. Mantente dentro de los prefijos válidos.
5) No respondas a la pregunta; solo devuelve el docid."
"""
    prompts = []
    for item in dataset:
        prompt = """
        Dada la siguiente consulta, recupera los identificadores de los documentos relevantes. 
        Consulta: {QUERY}
        """
        response = "DOCID:{docid}"
        question = item["question"]
        prompt = prompt.format(QUERY=question)
        prompts.append(build_prompt_it(tokenizer, system_prompt, prompt, response.format(docid=item["id"])))
    return prompts

In [82]:
prompts = list(prepare_prompt_for_indexing(documents))
print(f"Number of prompts: {len(prompts)}")

Number of prompts: 11162


In [83]:
prompts[0]

'\n    Este documento tiene el DOCID:6596_4.\n    Contenido del documento:\n    Esta sesión del parlamento se realizó el 2024-05-07. 11L/PO/P-0750 PREGUNTA DEL SEÑOR DIPUTADO DON NICASIO JESÚS GALVÁN SASIA, DEL GRUPO PARLAMENTARIO VOX, SOBRE MEDIDAS QUE SE VAN A LLEVAR A CABO PARA DEMOCRATIZAR Y REDISTRIBUIR LA RIQUEZA DEL SECTOR TURÍSTICO, DIRIGIDA A LA PRESIDENCIA DEL GOBIERNO La señora PRESIDENTA: Siguiente pregunta, del señor diputado don Nicasio Galván Sasia, del Grupo Parlamentario VOX, sobre medidas que se van a llevar a cabo para democratizar y redistribuir la riqueza del sector turístico, dirigida al señor presidente del Gobierno. Cuando quiera. El señor GALVÁN SASIA (desde su escaño): Buenos días, señor Clavijo, buenos días. Escuchándole en la rueda de prensa posterior a la Conferencia de Presidentes nos han surgido varias preguntas, y nos consta que no solo a nosotros. Se le oía escuchar hablar de la democratización y la redistribución de la riqueza del sector turístico y cu

In [84]:
# create dataset from prompts
from datasets import Dataset
indexing_dataset = Dataset.from_dict({"text": prompts})
indexing_dataset

Dataset({
    features: ['text'],
    num_rows: 11162
})

In [85]:
indexing_dataset["text"][0]

'\n    Este documento tiene el DOCID:6596_4.\n    Contenido del documento:\n    Esta sesión del parlamento se realizó el 2024-05-07. 11L/PO/P-0750 PREGUNTA DEL SEÑOR DIPUTADO DON NICASIO JESÚS GALVÁN SASIA, DEL GRUPO PARLAMENTARIO VOX, SOBRE MEDIDAS QUE SE VAN A LLEVAR A CABO PARA DEMOCRATIZAR Y REDISTRIBUIR LA RIQUEZA DEL SECTOR TURÍSTICO, DIRIGIDA A LA PRESIDENCIA DEL GOBIERNO La señora PRESIDENTA: Siguiente pregunta, del señor diputado don Nicasio Galván Sasia, del Grupo Parlamentario VOX, sobre medidas que se van a llevar a cabo para democratizar y redistribuir la riqueza del sector turístico, dirigida al señor presidente del Gobierno. Cuando quiera. El señor GALVÁN SASIA (desde su escaño): Buenos días, señor Clavijo, buenos días. Escuchándole en la rueda de prensa posterior a la Conferencia de Presidentes nos han surgido varias preguntas, y nos consta que no solo a nosotros. Se le oía escuchar hablar de la democratización y la redistribución de la riqueza del sector turístico y cu

In [86]:
prompts_retrieval_train = prepare_prompts_for_retrieval(dataset["train"], tokenizer)
prompts_retrieval_val = prepare_prompts_for_retrieval(dataset["validation"], tokenizer)

print(f"Number of retrieval prompts: {len(prompts_retrieval_train)}")
print(f"Number of retrieval prompts: {len(prompts_retrieval_val)}")

Number of retrieval prompts: 614
Number of retrieval prompts: 161


In [87]:
print(prompts_retrieval_train[0], sep="\n")

<|im_start|>system
Eres un módulo de recuperación. Tu única tarea es devolver el identificador del documento correspondiente a la consulta dada.
Sigue estrictamente estas reglas:
1) Devuelve EXACTAMENTE una línea con el formato: DOCID:{<id>}.
2) No incluyas palabras, explicaciones o puntuación extra antes o después de las llaves.
3) Si múltiples documentos son plausibles, elige el mejor ID.
4) Nunca inventes un ID fuera del espacio permitido. Mantente dentro de los prefijos válidos.
5) No respondas a la pregunta; solo devuelve el docid."
<|im_end|>
<|im_start|>user

        Dada la siguiente consulta, recupera los identificadores de los documentos relevantes. 
        Consulta: ¿Qué argumentos presentó el grupo parlamentario que intervino en la sesión del 22 de octubre de 2024, en relación con la propuesta de alteración del orden del día y su impacto en el desarrollo de las comparecencias del Gobierno?
        <|im_end|>
<|im_start|>assistant
<think>

</think>

DOCID:6592_1<|im_end|>



In [88]:
# create dataset from prompts train, val, test
retrieval_train_dataset = Dataset.from_dict({"text": prompts_retrieval_train})
retrieval_val_dataset = Dataset.from_dict({"text": prompts_retrieval_val})

retrieval_dataset = {
    "train": retrieval_train_dataset,
    "validation": retrieval_val_dataset,
}

In [89]:
def tokenize_function_autoregressive(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=2048)

In [90]:
indexing_dataset_tokenizer = indexing_dataset.map(tokenize_function_autoregressive, batched=True)

Map: 100%|██████████| 11162/11162 [01:05<00:00, 171.05 examples/s]


In [91]:
retrieval_train_dataset_tokenizer = retrieval_dataset["train"].map(tokenize_function_autoregressive, batched=True)
retrieval_val_dataset_tokenizer = retrieval_dataset["validation"].map(tokenize_function_autoregressive, batched=True)

Map: 100%|██████████| 614/614 [00:00<00:00, 1249.00 examples/s]
Map: 100%|██████████| 161/161 [00:00<00:00, 1092.74 examples/s]


## Train

In [92]:
# sft training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
auto_config = SFTConfig(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4, # Use GA to mimic batch size!
    save_steps=100,
    warmup_steps = 5,
    num_train_epochs = 1, # Set this for 1 full training run.
    #max_steps = 60,
    learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
    logging_steps = 100,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = SEED,
    report_to = "none", # Use this for WandB etc
    output_dir="../models/qwen3-0.6b-rag-indexer",
)

it_config = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,         # <-- añade eval batch size
    gradient_accumulation_steps=4,
    warmup_steps=5,
    save_steps=5,
    eval_steps=5,
    eval_strategy="steps",         # <-- activa evaluación periódica
    num_train_epochs=1,             # <-- opcional: usa epochs en lugar de max_steps
    #max_steps=60,
    learning_rate=2e-4,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=SEED,
    report_to="none",
    output_dir="../models/qwen3-0.6b-rag-retriever",
    load_best_model_at_end=True,          # <-- opcional
    metric_for_best_model="eval_loss",    # <-- opcional
    greater_is_better=False,              # <-- opcional
)

trainer_auto = SFTTrainer(
    model=model,
    train_dataset=indexing_dataset_tokenizer,
    tokenizer=tokenizer,
    args=auto_config,
)

trainer_it = SFTTrainer(
    model=model,
    train_dataset=retrieval_train_dataset_tokenizer,
    eval_dataset=retrieval_val_dataset_tokenizer,
    tokenizer=tokenizer,
    args=it_config,
)

In [93]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.988 GB.
15.178 GB of memory reserved.


In [94]:
model.print_trainable_parameters()

trainable params: 80,740,352 || all params: 676,790,272 || trainable%: 11.9299


In [95]:
EPOCHS = 4
for _ in range(EPOCHS):
    trainer_sft_stats = trainer_auto.train() # (context, id)
    trainer_it_stats = trainer_it.train() # (query, id)
    # GUARDAR MODELOS CADA SUPER EPOCH

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 11,162 | Num Epochs = 1 | Total steps = 1,396
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 80,740,352 of 676,790,272 (11.93% trained)


Step,Training Loss
100,1.8836
200,1.5703
300,1.528
400,1.4861
500,1.4664
600,1.4475
700,1.3943
800,1.4399
900,1.4066
1000,1.4127


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 614 | Num Epochs = 1 | Total steps = 77
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 80,740,352 of 676,790,272 (11.93% trained)


Step,Training Loss,Validation Loss
5,0.1539,0.104053
10,0.053,0.054196
15,0.0598,0.050812
20,0.0507,0.049696
25,0.0458,0.048039
30,0.0433,0.046564
35,0.0466,0.045545
40,0.0402,0.044809
45,0.0473,0.044132
50,0.0412,0.043629


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 11,162 | Num Epochs = 1 | Total steps = 1,396
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 80,740,352 of 676,790,272 (11.93% trained)


Step,Training Loss
100,1.391
200,1.3776
300,1.3622
400,1.3411
500,1.3344
600,1.3279
700,1.2882
800,1.3416
900,1.3209
1000,1.3352


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 614 | Num Epochs = 1 | Total steps = 77
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 80,740,352 of 676,790,272 (11.93% trained)


Step,Training Loss,Validation Loss
5,0.0548,0.060458
10,0.0527,0.058876
15,0.0518,0.045965
20,0.0437,0.044662
25,0.0394,0.043547
30,0.0371,0.042395
35,0.0402,0.041889
40,0.0362,0.041253
45,0.0423,0.040736
50,0.0362,0.040478


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 11,162 | Num Epochs = 1 | Total steps = 1,396
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 80,740,352 of 676,790,272 (11.93% trained)


Step,Training Loss
100,1.239
200,1.2421
300,1.2369
400,1.2281
500,1.2307
600,1.2339
700,1.2062
800,1.2682
900,1.2599
1000,1.2838


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 614 | Num Epochs = 1 | Total steps = 77
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 80,740,352 of 676,790,272 (11.93% trained)


Step,Training Loss,Validation Loss
5,0.0611,0.068314
10,0.0388,0.05392
15,0.0418,0.045058
20,0.0378,0.044377
25,0.0329,0.043305
30,0.0332,0.042847
35,0.0352,0.042104
40,0.0334,0.041183
45,0.0386,0.0404
50,0.0325,0.040001


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 11,162 | Num Epochs = 1 | Total steps = 1,396
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 80,740,352 of 676,790,272 (11.93% trained)


Step,Training Loss
100,1.106
200,1.1278
300,1.133
400,1.132
500,1.1401
600,1.1517
700,1.1357
800,1.2046
900,1.2092
1000,1.2434


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 614 | Num Epochs = 1 | Total steps = 77
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 80,740,352 of 676,790,272 (11.93% trained)


Step,Training Loss,Validation Loss
5,0.0361,0.050206
10,0.0219,0.059765
15,0.0334,0.051433
20,0.0332,0.046313
25,0.0315,0.045301
30,0.0315,0.04369
35,0.0345,0.043414
40,0.0318,0.042228
45,0.0363,0.041628
50,0.032,0.040689


In [96]:
model.print_trainable_parameters()

trainable params: 80,740,352 || all params: 676,790,272 || trainable%: 11.9299


In [97]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

Peak reserved memory = 15.178 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 63.273 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [98]:
def prepare_prompts_for_testing(dataset, tokenizer):
    system_prompt = """Eres un módulo de recuperación. Tu única tarea es devolver el identificador del documento correspondiente a la consulta dada.
Sigue estrictamente estas reglas:
1) Devuelve EXACTAMENTE una línea con el formato: DOCID:{<id>}.
2) No incluyas palabras, explicaciones o puntuación extra antes o después de las llaves.
3) Si múltiples documentos son plausibles, elige el mejor ID.
4) Nunca inventes un ID fuera del espacio permitido. Mantente dentro de los prefijos válidos.
5) No respondas a la pregunta; solo devuelve el docid."
"""
    def build_prompt_it(tokenizer, system_prompt: str, prompt: str) -> str:
        """Builds the chat prompt for a single example using the tokenizer chat template."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": prompt},
        ]
        return tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )
    prompts = []
    for item in dataset:
        prompt = """
        Dada la siguiente consulta, recupera los identificadores de los documentos relevantes. 
        Consulta: {QUERY}
        """
        question = item["question"]
        prompt = prompt.format(QUERY=question)
        prompts.append(
            ( 
                build_prompt_it(tokenizer, system_prompt, prompt),
                item["id"],
            )
        )
    return prompts

In [119]:
prompts_retrieval_test = prepare_prompts_for_testing(dataset["test"], tokenizer)


In [120]:
i = 1
text = prompts_retrieval_test[i][0]
doc_id_targets = prompts_retrieval_test[i][1]
print(doc_id_targets)

6600_6


In [121]:
print(text)

<|im_start|>system
Eres un módulo de recuperación. Tu única tarea es devolver el identificador del documento correspondiente a la consulta dada.
Sigue estrictamente estas reglas:
1) Devuelve EXACTAMENTE una línea con el formato: DOCID:{<id>}.
2) No incluyas palabras, explicaciones o puntuación extra antes o después de las llaves.
3) Si múltiples documentos son plausibles, elige el mejor ID.
4) Nunca inventes un ID fuera del espacio permitido. Mantente dentro de los prefijos válidos.
5) No respondas a la pregunta; solo devuelve el docid."
<|im_end|>
<|im_start|>user

        Dada la siguiente consulta, recupera los identificadores de los documentos relevantes. 
        Consulta: ¿Qué argumentos presenta el presidente del Gobierno de Canarias, Clavijo Batlle, para justificar la necesidad urgente de recibir los fondos adeudados antes del cierre del presupuesto de 2024?
        <|im_end|>
<|im_start|>assistant



In [122]:
# test the model in streaming mode
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    temperature = 0.00000001,
    streamer = streamer,
)

<think>

</think>

DOCID:6596_29<|im_end|>


In [133]:
# test the model in non-streaming mode
import re
import tqdm

acc = 0
total = 0

for text, doc_id_target in prompts_retrieval_test:
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,  # Increase for longer outputs!
        do_sample=False, temperature=0.0, top_p=1.0
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    response = generated_text.split("</think>")[-1]
    # extract DOCID number using regex

    doc_id = re.search(r"DOCID:(\d+_\d+)", response).group(1)
    
    print("Correct:", doc_id, "==", "Predicted:", doc_id_target)
    if doc_id == doc_id_target:
        acc += 1
    total += 1
print(f"Accuracy: {acc}/{total} = {acc/total*100:.2f} %")

Correct: 5472_1 == Predicted: 5402_2
Correct: 6596_29 == Predicted: 6600_6
Correct: 6596_21 == Predicted: 5415_6
Correct: 5887_11 == Predicted: 5861_8
Correct: 5537_11 == Predicted: 5536_4
Correct: 6596_29 == Predicted: 6584_12
Correct: 6596_29 == Predicted: 6603_15
Correct: 6596_11 == Predicted: 5415_7


KeyboardInterrupt: 