In [99]:
from unsloth import FastLanguageModel
from unsloth import UnslothTrainer, UnslothTrainingArguments
from trl import SFTTrainer, SFTConfig
import torch

# Single seed reused below: passed as random_state to get_peft_model, as the seed of the
# dataset split, and as the seed of both trainer configurations.
SEED = 42

In [100]:
# Base checkpoint, the sequence length shared by model loading and tokenisation,
# and the LoRA rank (well above the commonly suggested 8-128 range).
model_name = "meta-llama/Llama-3.2-1B-instruct"
MAX_LENGTH = 256
RANK = 512

# Load the base model through Unsloth with quantisation and full fine-tuning disabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=MAX_LENGTH,
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
)

# Fall back to EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

# Attach LoRA adapters to the listed attention (q/k/v/o) and MLP (gate/up/down) projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=RANK,
    lora_alpha=RANK * 2,  # alpha is tied to the rank (2x)
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    use_rslora=False,
    loftq_config=None,
)

==((====))==  Unsloth 2025.9.8: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Datasets

### Telephone dataset

In [3]:
from datasets import load_from_disk

# Load the instruction QA dataset from disk.
dataset_qa = load_from_disk("../notebooks/data/dataset_instruct_train/")
# Hold out 10% with a seeded split; this rebinds dataset_qa to the result of the split.
dataset_qa = dataset_qa.train_test_split(test_size=0.1, seed=SEED)
dataset_knowledge = load_from_disk("../notebooks/data/dataset_telephones")

# NOTE(review): dataset_qa, dataset_knowledge and documents are all reassigned by the
# CSV-based cells further down, so this cell looks like a leftover — confirm before relying on it.
documents = list(dataset_knowledge["text"])

### Simple dataset loading example

In [101]:
import pandas as pd
from langchain.schema import Document

# Read the contacts table and wrap every row as a Document whose text holds the
# name and phone number and whose metadata carries the row id as a string.
dataset_knowledge = pd.read_csv("../notebooks/data/contacts_docs.csv")
documents = [
    Document(
        page_content=f"Nombre: {record['name']}\nTeléfono: {record['phone']}",
        metadata={"id": f"{record['id']}"},
    )
    for _, record in dataset_knowledge.iterrows()
]
print(f"Loaded {len(documents)} documents.")
print(f"First document: {documents[0]}")


Loaded 400 documents.
First document: page_content='Nombre: Alba Alonso
Teléfono: 632 322 183' metadata={'id': '7500_1'}


In [102]:
# Load the pre-split query sets (train / validation / test) from CSV into DataFrames.
query_dataset_train = pd.read_csv("../notebooks/data/contacts_queries_train.csv")
query_dataset_val = pd.read_csv("../notebooks/data/contacts_queries_val.csv")
query_dataset_test = pd.read_csv("../notebooks/data/contacts_queries_test.csv")


In [103]:
from datasets import Dataset, DatasetDict

# Map each split name to its query DataFrame.
all_data = {
    "train": query_dataset_train,
    "validation": query_dataset_val,
    "test": query_dataset_test,
}

# Convert every DataFrame to a Hugging Face Dataset and bundle them in a DatasetDict.
dataset_qa = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in all_data.items()}
)

In [105]:
# Display the splits, columns and row counts of the assembled DatasetDict.
dataset_qa

DatasetDict({
    train: Dataset({
        features: ['question', 'id', 'respuesta'],
        num_rows: 1400
    })
    validation: Dataset({
        features: ['question', 'id', 'respuesta'],
        num_rows: 300
    })
    test: Dataset({
        features: ['question', 'id', 'respuesta'],
        num_rows: 300
    })
})

In [106]:
# Report trainable vs. total parameter counts of the LoRA-wrapped model.
model.print_trainable_parameters()

trainable params: 360,710,144 || all params: 1,596,524,544 || trainable%: 22.5935


In [109]:
# Fold the validation split into the training split.
# NOTE(review): this reads and then overwrites dataset_qa["train"], so re-running the cell
# appends the validation rows again; the "validation" split itself stays in dataset_qa.
from datasets import concatenate_datasets
dataset_qa["train"] = concatenate_datasets([dataset_qa["train"], dataset_qa["validation"]])

In [111]:
# Rename the Spanish "respuesta" column to "answer"; the prompt builders below read item["answer"].
dataset_qa = dataset_qa.rename_column("respuesta", "answer")

In [112]:
# Display the DatasetDict again to check the merged train split and the renamed column.
dataset_qa

DatasetDict({
    train: Dataset({
        features: ['question', 'id', 'answer'],
        num_rows: 1700
    })
    validation: Dataset({
        features: ['question', 'id', 'answer'],
        num_rows: 300
    })
    test: Dataset({
        features: ['question', 'id', 'answer'],
        num_rows: 300
    })
})

### Data preparation

In [113]:
def build_prompt_it(tokenizer, system_prompt: str, prompt: str, response: str) -> str:
    """Render one system/user/assistant exchange through the tokenizer's chat template.

    Args:
        tokenizer: Object exposing ``apply_chat_template``.
        system_prompt: Content of the system turn.
        prompt: Content of the user turn.
        response: Content of the assistant turn.

    Returns:
        Whatever ``apply_chat_template`` returns when called with ``tokenize=False``.
    """
    turns = (
        ("system", system_prompt),
        ("user", prompt),
        ("assistant", response),
    )
    conversation = [{"role": role, "content": content} for role, content in turns]
    return tokenizer.apply_chat_template(conversation, tokenize=False)

In [119]:
def generate_knowledge_injection_prompts(documents: list):
    """Lazily yield one training text per document.

    Each yielded string is the document's ``page_content`` passed through a
    ``"{doc}"`` template, i.e. the content with nothing added around it.
    """
    template = "{doc}"
    yield from (template.format(doc=document.page_content) for document in documents)

In [120]:
def generate_qa_prompts(dataset, tokenizer):
    """Build one fully rendered chat example (system, question, answer) per dataset row.

    Every row is read through its ``"question"`` and ``"answer"`` keys and rendered
    with ``build_prompt_it`` under a fixed Spanish system prompt.

    Returns:
        A list with one rendered prompt per row, in dataset order.
    """
    system_prompt = """
    Eres un modelo de lenguaje entrenado para responder preguntas.
    """
    user_template = "{QUERY}"
    answer_template = "{response}"
    return [
        build_prompt_it(
            tokenizer,
            system_prompt,
            user_template.format(QUERY=row["question"]),
            answer_template.format(response=row["answer"]),
        )
        for row in dataset
    ]

In [121]:
# Materialise the lazily generated knowledge prompts (one per document) into a list.
prompts = list(generate_knowledge_injection_prompts(documents))
print(f"Number of prompts: {len(prompts)}")

Number of prompts: 400


In [122]:
# Display the first knowledge prompt.
prompts[0]

'Nombre: Alba Alonso\nTeléfono: 632 322 183'

In [123]:
# Helper for inspecting how a text is split into tokens.
def print_tokens(text):
    """Print the number of tokens in ``text`` and then the token strings themselves.

    Relies on the notebook-level ``tokenizer``.
    """
    pieces = tokenizer.tokenize(text)
    piece_count = len(pieces)
    print("Number of tokens:", piece_count, "\n")
    print("Tokens:", pieces)


print_tokens(prompts[0])

Number of tokens: 15 

Tokens: ['Nombre', ':', 'ĠAl', 'ba', 'ĠAlonso', 'Ċ', 'Tel', 'Ã©fono', ':', 'Ġ', '632', 'Ġ', '322', 'Ġ', '183']


In [124]:
# Wrap the knowledge prompts in a Hugging Face Dataset with a single "text" column,
# then display it.
from datasets import Dataset
knowledge_dataset = Dataset.from_dict({"text": prompts})
knowledge_dataset

Dataset({
    features: ['text'],
    num_rows: 400
})

In [125]:
# Display the first entry of the "text" column.
knowledge_dataset["text"][0]

'Nombre: Alba Alonso\nTeléfono: 632 322 183'

In [126]:
# Render the chat-formatted QA examples used for supervised fine-tuning.
prompts_qa_train = generate_qa_prompts(dataset_qa["train"], tokenizer)
# NOTE(review): the "val" prompts are built from the "test" split, not from "validation"
# (which was concatenated into "train" earlier) — confirm this is intended.
prompts_qa_val = generate_qa_prompts(dataset_qa["test"], tokenizer)

# Both lines print the same label: the first count is train, the second is val.
print(f"Number of retrieval prompts: {len(prompts_qa_train)}")
print(f"Number of retrieval prompts: {len(prompts_qa_val)}")

Number of retrieval prompts: 1700
Number of retrieval prompts: 300


In [127]:
# Print the first rendered training example (sep has no effect with a single argument).
print(prompts_qa_train[0], sep="\n")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 04 Nov 2025

Eres un modelo de lenguaje entrenado para responder preguntas.<|eot_id|><|start_header_id|>user<|end_header_id|>

Necesito el contacto asociado al 620 152 344. —consulta interna—<|eot_id|><|start_header_id|>assistant<|end_header_id|>

El número 620 152 344 pertenece a Alejandro Vega.<|eot_id|>


In [128]:
# Inspect the tokenisation of the first rendered training example.
print_tokens(prompts_qa_train[0])

Number of tokens: 87 

Tokens: ['<|begin_of_text|>', '<|start_header_id|>', 'system', '<|end_header_id|>', 'ĊĊ', 'Cut', 'ting', 'ĠKnowledge', 'ĠDate', ':', 'ĠDecember', 'Ġ', '202', '3', 'Ċ', 'Today', 'ĠDate', ':', 'Ġ', '04', 'ĠNov', 'Ġ', '202', '5', 'ĊĊ', 'E', 'res', 'Ġun', 'Ġmodelo', 'Ġde', 'Ġl', 'engu', 'aje', 'Ġentren', 'ado', 'Ġpara', 'Ġresponder', 'Ġpreg', 'untas', '.', '<|eot_id|>', '<|start_header_id|>', 'user', '<|end_header_id|>', 'ĊĊ', 'N', 'ec', 'es', 'ito', 'Ġel', 'Ġcontacto', 'Ġasoci', 'ado', 'Ġal', 'Ġ', '620', 'Ġ', '152', 'Ġ', '344', '.', 'ĠâĢĶ', 'consulta', 'Ġintern', 'a', 'âĢĶ', '<|eot_id|>', '<|start_header_id|>', 'assistant', '<|end_header_id|>', 'ĊĊ', 'El', 'ĠnÃºmero', 'Ġ', '620', 'Ġ', '152', 'Ġ', '344', 'Ġpert', 'ene', 'ce', 'Ġa', 'ĠAlejandro', 'ĠVega', '.', '<|eot_id|>']


In [129]:
# Wrap the rendered QA prompts in Datasets with a single "text" column.
# Only train and validation datasets are built here.
qa_train_dataset = Dataset.from_dict({"text": prompts_qa_train})
qa_val_dataset = Dataset.from_dict({"text": prompts_qa_val})

# Plain dict keyed by split name.
qa_dataset = {
    "train": qa_train_dataset,
    "validation": qa_val_dataset,
}

In [130]:
def tokenize_function_autoregressive(examples):
    """Tokenise the ``"text"`` field of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )
    return encoded

In [131]:
# Tokenise the knowledge texts in batches.
knowledge_dataset_tokenizer = knowledge_dataset.map(tokenize_function_autoregressive, batched=True)

Map: 100%|██████████| 400/400 [00:00<00:00, 9853.30 examples/s]


In [132]:
# Tokenise the rendered QA prompts (train and validation) in batches.
qa_train_dataset_tokenizer = qa_dataset["train"].map(tokenize_function_autoregressive, batched=True)
qa_val_dataset_tokenizer = qa_dataset["validation"].map(tokenize_function_autoregressive, batched=True)

Map: 100%|██████████| 1700/1700 [00:00<00:00, 19118.28 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 16567.80 examples/s]


## Train

In [133]:
# Two-stage setup: a causal-LM trainer over the raw knowledge documents ("auto") and an
# SFT trainer over the chat-formatted QA prompts ("it"). Both wrap the same `model` object.
from transformers import DataCollatorForLanguageModeling


# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Arguments for the knowledge-injection stage: effective batch of 4 x 4, no checkpoints saved.
auto_config = UnslothTrainingArguments(
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4, # Use GA to mimic batch size!
    save_strategy="no",
    save_total_limit=0,
    warmup_steps = 5,
    num_train_epochs = 1, # Set this for 1 full training run.
    #max_steps = 60,
    learning_rate = 1e-4, # Reduce to 2e-5 for long training runs
    logging_steps = 1,
    # 32 bits
    optim = "paged_adamw_32bit",
    weight_decay = 0.01,
    lr_scheduler_type = "cosine",
    seed = SEED,
    report_to = "none", # Use this for WandB etc
    # NOTE(review): the directory name mentions qwen3-0.6b, but the checkpoint loaded in this
    # notebook is Llama-3.2-1B — confirm the intended output location.
    output_dir="../models/qwen3-0.6b-rag-indexer",
)

# Arguments for the QA fine-tuning stage: effective batch of 2 x 2, evaluation enabled.
it_config = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,         # <-- adds an eval batch size
    gradient_accumulation_steps=2,
    warmup_steps=25,
    save_strategy="no",
    save_total_limit=0,
    # NOTE(review): eval_steps=1 with eval_strategy="steps" evaluates after every training
    # step, which is expensive on the full eval set — confirm this is intended.
    eval_steps=1,
    eval_strategy="steps",         # <-- enables periodic evaluation
    num_train_epochs=1,             # <-- optional: use epochs instead of max_steps
    #max_steps=30,
    learning_rate=1e-4,
    logging_steps=1,
    optim = "paged_adamw_32bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=SEED,
    report_to="none",
    # NOTE(review): same qwen3-0.6b naming mismatch as the other output_dir.
    output_dir="../models/qwen3-0.6b-rag-retriever",
    load_best_model_at_end=False,          # <-- optional
    metric_for_best_model="eval_loss",    # <-- optional
    greater_is_better=False,              # <-- optional
)

# Trainer for the knowledge-injection stage (tokenised document texts, no eval set).
trainer_auto = UnslothTrainer(
    model=model,
    train_dataset=knowledge_dataset_tokenizer,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=auto_config,
)

# Trainer for the QA stage; it shares `model` and `data_collator` with trainer_auto.
trainer_it = SFTTrainer(
    model=model,
    train_dataset=qa_train_dataset_tokenizer,
    eval_dataset=qa_val_dataset_tokenizer,
    data_collator=data_collator,
    tokenizer=tokenizer,
    args=it_config,
)

In [134]:
# Baseline before training: total memory of GPU 0 and the peak memory reserved so far, in GiB.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.988 GB.
7.377 GB of memory reserved.


In [135]:
# Report trainable vs. total parameter counts once more, right before training.
model.print_trainable_parameters()

trainable params: 360,710,144 || all params: 1,596,524,544 || trainable%: 22.5935


In [136]:
EPOCHS = 10
# Alternate the two objectives EPOCHS times on the same model: one run of the
# knowledge-injection trainer followed by one run of the QA trainer.
# The loop variable is named (not `_`) because it is read in the banner, and because
# binding `_` in a notebook shadows IPython's "last output" variable.
for super_epoch in range(1, EPOCHS + 1):
    print(f"--- SUPER EPOCH {super_epoch} / {EPOCHS} ---")
    trainer_sft_stats = trainer_auto.train()
    trainer_it_stats = trainer_it.train()
    # TODO: save the model after every super epoch (not implemented yet).

--- SUPER EPOCH 1 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 25
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,4.358
2,4.3442
3,3.1293
4,2.672
5,2.3556
6,2.049
7,1.9203
8,1.9352
9,1.819
10,1.8078


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 425
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,5.5407,5.494328
2,5.5786,4.631781
3,4.8574,3.732778
4,3.8063,3.054796
5,3.035,2.513514
6,2.5343,1.969433
7,2.0993,1.486106
8,1.5739,1.108451
9,1.1706,0.904823
10,0.7758,0.831864


KeyboardInterrupt: 

In [93]:
# Compare the peak reserved GPU memory now with the baseline captured before training.
used_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

Peak reserved memory = 6.742 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 28.106 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [94]:
def generate_qa_prompts_testing(dataset, tokenizer):
    """Build ``(generation_prompt, expected_answer)`` pairs for evaluation.

    For every row, the ``"question"`` is rendered as a system + user conversation
    with ``add_generation_prompt=True`` (no assistant turn), and paired with the
    row's ``"answer"`` exactly as stored.

    Returns:
        A list of 2-tuples, one per row, in dataset order.
    """
    system_prompt = """
    Eres un modelo de lenguaje entrenado para responder preguntas.
    """
    user_template = "{QUERY}"

    def render_open_prompt(question: str) -> str:
        """Render the conversation up to the point where the assistant should answer."""
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_template.format(QUERY=question)},
        ]
        return tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False,
        )

    pairs = []
    for row in dataset:
        question, expected = row["question"], row["answer"]
        pairs.append((render_open_prompt(question), expected))
    return pairs

In [95]:
# Build (generation prompt, expected answer) pairs for the test and the train split.
prompts_retrieval_test = generate_qa_prompts_testing(dataset_qa["test"], tokenizer)
prompts_retrieval_train = generate_qa_prompts_testing(dataset_qa["train"], tokenizer)

In [96]:
# Pick one training example, print its generation prompt and its expected answer,
# and keep the prompt in `text` for the streaming generation cell.
idx = 0
print(prompts_retrieval_train[idx][0], sep="\n")
print(prompts_retrieval_train[idx][1], sep="\n")
text = prompts_retrieval_train[idx][0]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 04 Nov 2025

Eres un modelo de lenguaje entrenado para responder preguntas.<|eot_id|><|start_header_id|>user<|end_header_id|>

¿Qué teléfono tiene Francisca?<|eot_id|><|start_header_id|>assistant<|end_header_id|>


El teléfono de Francisca es el 619176499.


In [97]:
# Stream a greedy completion for the prompt stored in `text`.
from transformers import TextStreamer

# skip_prompt=True streams only the newly generated text; special tokens are kept visible.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
stream_inputs = tokenizer(text, return_tensors = "pt").to("cuda")
# Greedy decoding (do_sample=False). temperature / top_p are sampling-only settings that
# are ignored in this mode, so they are no longer passed. The result is bound to a named
# variable instead of `_`, which would shadow IPython's "last output" variable.
streamed_ids = model.generate(
    **stream_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    do_sample = False,
    streamer = streamer,
)

El teléfono de Francisca es el 619176499.<|eot_id|>


In [98]:
# Exact-match evaluation on the test prompts (non-streaming, greedy decoding).
# NOTE(review): `re` is imported but not used in this cell.
import re
import tqdm

acc = 0
total = 0

for text, answer in tqdm.tqdm(prompts_retrieval_test, desc="Testing"):
    print("\n---\n")
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    # Greedy decoding; sampling-only arguments (temperature, top_p) are ignored when
    # do_sample=False, so they are not passed.
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,  # Increase for longer outputs!
        do_sample=False,
    )
    # generate() returns prompt + continuation for a causal LM. Decode only the tokens
    # after the prompt instead of decoding everything and splitting on the word
    # "assistant", which breaks as soon as an answer contains that word.
    # (`skip_prompt` is a TextStreamer option, not a decode() argument, so it was dropped.)
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    response = generated_text.strip()
    print("Correct:", answer, "==", "Predicted:", response)
    if response == answer:
        print("✅ Correct")
        acc += 1
    else:
        print("❌ Incorrect")
    total += 1

# Guard against an empty test set, which would otherwise raise ZeroDivisionError.
if total:
    print(f"Accuracy: {acc}/{total} = {acc/total*100:.2f} %")
else:
    print("Accuracy: n/a (no test prompts)")

Testing:   0%|          | 0/5 [00:00<?, ?it/s]


---



Testing:  20%|██        | 1/5 [00:00<00:01,  3.83it/s]

Correct: El teléfono de Jesus Manuel es el 606523164. == Predicted: El teléfono de Jesus Manuel es el 606523164.
✅ Correct

---



Testing:  40%|████      | 2/5 [00:00<00:00,  3.82it/s]

Correct: El teléfono de Gregorio es el 606803405. == Predicted: El teléfono de Gregorio es el 606803405.
✅ Correct

---



Testing:  60%|██████    | 3/5 [00:00<00:00,  3.90it/s]

Correct: El número de teléfono de Aaron es el 616160946. == Predicted: El número de teléfono de Aaron es el 616160946.
✅ Correct

---



Testing:  80%|████████  | 4/5 [00:01<00:00,  3.92it/s]

Correct: El número de teléfono de Gerard es el 619754419. == Predicted: El número de teléfono de Gerard es el 619754419.
✅ Correct

---



Testing: 100%|██████████| 5/5 [00:01<00:00,  3.88it/s]

Correct: El número de teléfono de Gabriela es el 616259101. == Predicted: El número de teléfono de Gabriela es el 616259101.
✅ Correct
Accuracy: 5/5 = 100.00 %



