In [1]:
from unsloth import FastLanguageModel
from datasets import load_from_disk
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from trl import SFTTrainer, SFTConfig
import torch

SEED = 42

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 10-20 11:12:19 [__init__.py:216] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    model_kwargs={"device": "cuda"},
)

db = FAISS.load_local(
    "../data/db/ragbench-covidqa/ragbench-covidqa_embeddings_sentence-transformers_paraphrase-multilingual-mpnet-base-v2",
    embedding_model,
    allow_dangerous_deserialization=True,
)

In [3]:
#quiero la lista de documentos
docs = db.docstore._dict.values()
documents = list(docs)
print(f"Number of documents: {len(documents)}")

Number of documents: 4944


In [4]:
FOLDER_AUTORE = "../data/processed/ragbench-covidqa"
dataset = load_from_disk(FOLDER_AUTORE)

In [5]:
dataset["train"][0]["documents"][0]

'Title: Emergent severe acute respiratory distress syndrome caused by adenovirus type 55 in immunocompetent adults in 2013: a prospective observational study\nPassage: Recent studies have shown that the immune system plays a crucial role in the clearance of HAdV viremia and survival of the host . Chen et al. reported that, in the acute phase of HAdV-55 infection, patients with severe disease may have high levels of dendritic cells and Th17 cells . In our study, the only patient who recovered from severe infection had higher T-cell counts. Three of the five patients had relatively low T-cell counts when admitted. Our results suggest that these three patients may have been relatively immunocompromised and that a lower T-cell count may be a risk'

## Data preparation

In [57]:
def prepare_prompt_for_indexing(documents: list):
    prompt = """
    [TASK] index
    [DOCUMENT]
    {doc}
    [OUTPUT]
    DOCID:{doc_id}
    """
    for doc in documents:
        document = doc.page_content
        doc_id = doc.metadata.get("id", "unknown")
        yield prompt.format(doc=document, doc_id=doc_id)

In [69]:
def prepare_prompts_for_retrieval(dataset):
    prompts = []
    for item in dataset:
        prompt = """
        [TASK] retrieve
        [QUERY]
        {QUERY}
        [OUTPUT]
        """
        docid_prompt = "DOCID:{docid}"
        question = item["question"]
        prompt = prompt.format(QUERY=question)
        for docid in item["document_ids"]:
            prompt += docid_prompt.format(docid=docid) + "\n"
        prompts.append(prompt)
    return prompts

In [None]:
prompts = list(prepare_prompt_for_indexing(documents))
print(f"Number of prompts: {len(prompts)}")

Number of prompts: 4944


In [8]:
# create dataset from prompts
from datasets import Dataset
indexing_dataset = Dataset.from_dict({"text": prompts})
indexing_dataset

Dataset({
    features: ['text'],
    num_rows: 4944
})

In [72]:
prompts_retrieval_train = prepare_prompts_for_retrieval(dataset["train"])
prompts_retrieval_val = prepare_prompts_for_retrieval(dataset["validation"])
prompts_retrieval_test = prepare_prompts_for_retrieval(dataset["test"])

print(f"Number of retrieval prompts: {len(prompts_retrieval_train)}")
print(f"Number of retrieval prompts: {len(prompts_retrieval_val)}")
print(f"Number of retrieval prompts: {len(prompts_retrieval_test)}")

Number of retrieval prompts: 1252
Number of retrieval prompts: 267
Number of retrieval prompts: 246


In [73]:
# create dataset from prompts train, val, test
retrieval_train_dataset = Dataset.from_dict({"text": prompts_retrieval_train})
retrieval_val_dataset = Dataset.from_dict({"text": prompts_retrieval_val})
retrieval_test_dataset = Dataset.from_dict({"text": prompts_retrieval_test})

retrieval_dataset = {
    "train": retrieval_train_dataset,
    "validation": retrieval_val_dataset,
    "test": retrieval_test_dataset,
}

## Train

In [74]:
model_name = "Qwen/Qwen3-0.6B"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 8192,
    load_in_4bit = False,
    load_in_8bit = False,
)
RANK = 32
model = FastLanguageModel.get_peft_model(
    model,
    r = RANK,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = RANK*2,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

==((====))==  Unsloth 2025.9.6: Fast Qwen3 patching. Transformers: 4.55.4. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 4070 SUPER. Num GPUs = 1. Max memory: 11.994 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [75]:
# sft training
sft_config = SFTConfig(
    dataset_text_field = "text",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4, # Use GA to mimic batch size!
    warmup_steps = 5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 30,
    learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = SEED,
    report_to = "none", # Use this for WandB etc
    output_dir="../models/qwen3-0.6b-rag-indexer",
)

it_config = SFTConfig(
    dataset_text_field = "text",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4, # Use GA to mimic batch size!
    warmup_steps = 5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 30,
    learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = SEED,
    report_to = "none", # Use this for WandB etc
    output_dir="../models/qwen3-0.6b-rag-retriever",
)

trainer_sft = SFTTrainer(
    model=model,
    train_dataset=indexing_dataset,
    tokenizer=tokenizer,
    args=sft_config,
)

trainer_it = SFTTrainer(
    model=model,
    train_dataset=retrieval_dataset["train"],
    eval_dataset=retrieval_dataset["validation"],
    tokenizer=tokenizer,
    args=it_config,
)

Unsloth: Tokenizing ["text"] (num_proc=32): 100%|██████████| 4944/4944 [00:06<00:00, 777.73 examples/s] 
Unsloth: Tokenizing ["text"] (num_proc=32): 100%|██████████| 1252/1252 [00:04<00:00, 299.81 examples/s]
Unsloth: Tokenizing ["text"] (num_proc=32): 100%|██████████| 267/267 [00:04<00:00, 65.98 examples/s] 


In [76]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4070 SUPER. Max memory = 11.994 GB.
11.059 GB of memory reserved.


In [77]:
for _ in range(2):
    trainer_sft_stats = trainer_sft.train()
    trainer_it_stats = trainer_it.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,944 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 20,185,088 of 616,235,008 (3.28% trained)


Step,Training Loss
1,3.5366
2,3.364
3,3.2136
4,3.1311
5,3.0811
6,2.7639
7,2.7257
8,2.6883
9,2.5208
10,2.5059


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,252 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 20,185,088 of 616,235,008 (3.28% trained)


Step,Training Loss
1,2.5894
2,2.4552
3,1.944
4,1.3358
5,1.2113
6,1.2885
7,1.2936
8,1.2014
9,1.2535
10,1.2112


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,944 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 20,185,088 of 616,235,008 (3.28% trained)


Step,Training Loss
1,3.009
2,2.8086
3,2.614
4,2.4705
5,2.459
6,2.1707
7,2.2038
8,2.261
9,2.1477
10,2.1542


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,252 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 20,185,088 of 616,235,008 (3.28% trained)


Step,Training Loss
1,1.9082
2,1.7723
3,1.7683
4,1.273
5,1.069
6,1.1102
7,1.0562
8,0.9936
9,1.0324
10,1.0553


In [78]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

Peak reserved memory = 11.059 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 92.204 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [84]:
text_completed = retrieval_dataset['train'][0]["text"]
text = text_completed.split("[OUTPUT]")[0].strip() + "\n[OUTPUT]\n"
doc_id_targets = text_completed.split("[OUTPUT]")[1].strip()
print(text)
print(doc_id_targets)

[TASK] retrieve
        [QUERY]
        What role does T-cell count play in severe human adenovirus type 55 (HAdV-55) infection?
[OUTPUT]

DOCID:1395
DOCID:1395
DOCID:2160
DOCID:2158


In [88]:
# test the model in streaming mode
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    temperature = 0.000001,
    streamer = streamer,
)

DOCID:2469
DOCID:2468
DOCID:2467
DOCID:2466
DOCID:2465
DOCID:2464
DOCID:2463
DOCID:2462

