In [None]:
from unsloth import FastLanguageModel
from unsloth import UnslothTrainer, UnslothTrainingArguments
from trl import SFTTrainer, SFTConfig
import torch

# Global random seed, reused for the dataset split, the LoRA initialisation and both trainers.
SEED = 42

In [None]:
# Base checkpoint and the sequence length used both for loading and for tokenization.
model_name = "meta-llama/Llama-3.2-1B-instruct"
MAX_LENGTH = 256

# Load the base model without 4-bit / 8-bit quantization; full fine-tuning is
# disabled because LoRA adapters are attached further down.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=MAX_LENGTH,
    full_finetuning=False,
    load_in_4bit=False,
    load_in_8bit=False,
)

# Fall back to EOS as padding token when the tokenizer does not define one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

# LoRA rank; alpha is tied to it below (alpha = 2 * rank).
RANK = 512

# Attention and MLP projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=RANK,
    target_modules=lora_target_modules,
    lora_alpha=RANK * 2,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    use_rslora=False,
    loftq_config=None,
)

## Datasets

### Telephone dataset (loaded from disk)

In [None]:
from datasets import load_from_disk

# QA pairs previously saved to disk; the path is relative to the working directory.
dataset_qa = load_from_disk("../notebooks/data/dataset_instruct_train/")
# Hold out 10% of the QA data; the split is reproducible through SEED.
dataset_qa = dataset_qa.train_test_split(test_size=0.1, seed=SEED)
dataset_knowledge = load_from_disk("../notebooks/data/dataset_telephones")

# Values of the "text" column, materialised as a Python list.
# NOTE(review): the CSV-loading cells further down reassign `dataset_knowledge`,
# `documents` and `dataset_qa`, so this cell only matters when those are skipped.
# Here `documents` holds the raw column values, not `Document` objects.
documents = list(dataset_knowledge["text"])

### Simple dataset loading example

In [None]:
import pandas as pd
from langchain.schema import Document

# Contact records; the `id`, `name` and `phone` columns are read below.
dataset_knowledge = pd.read_csv("../notebooks/data/contacts_docs.csv")

# Render every contact as a short text document that carries its id as metadata.
documents = [
    Document(
        page_content=f"Nombre: {row['name']}\nTeléfono: {row['phone']}",
        metadata={"id": f"{row['id']}"},
    )
    for _row_index, row in dataset_knowledge.iterrows()
]
print(f"Loaded {len(documents)} documents.")
print(f"First document: {documents[0]}")


In [None]:
# Query CSVs, one file per split (train / validation / test), loaded as DataFrames.
# Relies on `pd` imported in the document-loading cell above.
query_dataset_train = pd.read_csv("../notebooks/data/contacts_queries_train.csv")
query_dataset_val = pd.read_csv("../notebooks/data/contacts_queries_val.csv")
query_dataset_test = pd.read_csv("../notebooks/data/contacts_queries_test.csv")


In [None]:
from datasets import Dataset, DatasetDict

# Query DataFrames keyed by split name.
all_data = {
    "train": query_dataset_train,
    "validation": query_dataset_val,
    "test": query_dataset_test,
}

# Convert every DataFrame to a Hugging Face Dataset and bundle them in a DatasetDict.
dataset_qa = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in all_data.items()}
)

In [None]:
# Display the QA DatasetDict as the cell output.
dataset_qa

In [None]:
# Report the trainable parameters of the LoRA-wrapped model.
model.print_trainable_parameters()

In [None]:
# Merge the validation split into the training split.
# NOTE(review): this overwrites dataset_qa["train"] in place, so re-running the
# cell appends the validation rows again. The "validation" split itself stays in
# the DatasetDict.
from datasets import concatenate_datasets
dataset_qa["train"] = concatenate_datasets([dataset_qa["train"], dataset_qa["validation"]])

In [None]:
# Rename the "respuesta" column to "answer", the key the prompt builders read.
# NOTE(review): presumably fails when re-run, because "respuesta" no longer
# exists after the first execution — confirm.
dataset_qa = dataset_qa.rename_column("respuesta", "answer")

In [None]:
# Display the QA DatasetDict again to check the merged train split and renamed column.
dataset_qa

### Data preparation

In [None]:
def build_prompt_it(tokenizer, system_prompt: str, prompt: str, response: str) -> str:
    """Render one complete training conversation through the chat template.

    The conversation is a system turn, a user turn and the assistant answer,
    in that order.

    Args:
        tokenizer: Object exposing ``apply_chat_template``.
        system_prompt: Content of the system message.
        prompt: Content of the user message.
        response: Content of the assistant message.

    Returns:
        The result of ``apply_chat_template`` called with ``tokenize=False``.
    """
    turns = (
        ("system", system_prompt),
        ("user", prompt),
        ("assistant", response),
    )
    conversation = [{"role": role, "content": content} for role, content in turns]
    return tokenizer.apply_chat_template(conversation, tokenize=False)

In [None]:
def generate_knowledge_injection_prompts(documents: list):
    """Yield one knowledge-injection prompt (the raw document text) per document.

    Args:
        documents: Iterable of documents. Each item is either an object with a
            ``page_content`` attribute or a plain value such as a string. The
            notebook builds ``documents`` in both shapes, depending on which
            loading cell was run.

    Yields:
        str: The document text formatted into the (pass-through) prompt template.
    """
    prompt = "{doc}"
    for doc in documents:
        # Items without `page_content` (e.g. plain strings) are used as-is
        # instead of raising AttributeError.
        content = getattr(doc, "page_content", doc)
        yield prompt.format(doc=content)

In [None]:
def generate_qa_prompts(dataset, tokenizer):
    """Build one chat-formatted training text per QA example.

    Args:
        dataset: Iterable of mappings with ``"question"`` and ``"answer"`` keys.
        tokenizer: Tokenizer handed to ``build_prompt_it`` for chat templating.

    Returns:
        list: One templated conversation (system + question + answer) per item.
    """
    system_prompt = """
    Eres un modelo de lenguaje entrenado para responder preguntas.
    """

    def _render(item):
        # Question and answer are passed through str formatting, so non-string
        # values end up as their text form.
        user_text = "{QUERY}".format(QUERY=item["question"])
        assistant_text = "{response}".format(response=item["answer"])
        return build_prompt_it(tokenizer, system_prompt, user_text, assistant_text)

    return [_render(item) for item in dataset]

In [None]:
# Materialise the knowledge-injection prompts (the generator yields one per document).
prompts = list(generate_knowledge_injection_prompts(documents))
print(f"Number of prompts: {len(prompts)}")

In [None]:
# Show the first knowledge-injection prompt as a sanity check.
prompts[0]

In [None]:
# Inspect how a text is split into tokens
def print_tokens(text):
    """Print how many tokens ``text`` yields, followed by the tokens themselves.

    Uses the notebook-level ``tokenizer``.
    """
    pieces = tokenizer.tokenize(text)
    print("Number of tokens:", len(pieces), "\n")
    print("Tokens:", pieces)


print_tokens(prompts[0])

In [None]:
# Wrap the knowledge-injection prompts in a Dataset with a single "text" column.
from datasets import Dataset
knowledge_dataset = Dataset.from_dict({"text": prompts})
# Display the dataset as the cell output.
knowledge_dataset

In [None]:
# Check that the first stored text matches the first prompt.
knowledge_dataset["text"][0]

In [None]:
# Chat-formatted training texts from the QA "train" split.
prompts_qa_train = generate_qa_prompts(dataset_qa["train"], tokenizer)
# NOTE(review): the evaluation prompts are built from the "test" split, the same
# split later used for the final accuracy — confirm this is intended.
prompts_qa_val = generate_qa_prompts(dataset_qa["test"], tokenizer)

# Distinct labels so the two counts can be told apart in the output.
print(f"Number of QA training prompts: {len(prompts_qa_train)}")
print(f"Number of QA validation prompts: {len(prompts_qa_val)}")

In [None]:
# Print the first templated QA training example.
print(prompts_qa_train[0], sep="\n")

In [None]:
# Token count and tokens of the first QA training example.
print_tokens(prompts_qa_train[0])

In [None]:
# Wrap the QA prompt lists in single-column ("text") Datasets, keyed by split.
qa_dataset = {
    "train": Dataset.from_dict({"text": prompts_qa_train}),
    "validation": Dataset.from_dict({"text": prompts_qa_val}),
}

# Direct handles on the two splits.
qa_train_dataset = qa_dataset["train"]
qa_val_dataset = qa_dataset["validation"]

In [None]:
def tokenize_function_autoregressive(examples):
    """Tokenize the ``text`` field of a batch to exactly ``MAX_LENGTH`` tokens.

    Sequences are truncated and padded to ``MAX_LENGTH``. Uses the
    notebook-level ``tokenizer`` and ``MAX_LENGTH``.

    Args:
        examples: Batch mapping with a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

In [None]:
# Tokenize the knowledge-injection texts in batches.
knowledge_dataset_tokenizer = knowledge_dataset.map(tokenize_function_autoregressive, batched=True)

In [None]:
# Tokenize the QA train and validation texts with the same fixed-length tokenization.
qa_train_dataset_tokenizer = qa_dataset["train"].map(tokenize_function_autoregressive, batched=True)
qa_val_dataset_tokenizer = qa_dataset["validation"].map(tokenize_function_autoregressive, batched=True)

## Train

In [None]:
# Shared setup for supervised fine-tuning (SFT): the collator is used by both trainers.
from transformers import DataCollatorForLanguageModeling


# mlm=False configures the collator for causal language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Training arguments for the knowledge-injection (plain autoregressive) phase.
# Effective batch size per device = 4 * 4 = 16 sequences per optimizer step.
auto_config = UnslothTrainingArguments(
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4, # Use GA to mimic batch size!
    save_strategy="no",  # checkpoint saving is disabled
    save_total_limit=0,
    warmup_steps = 5,
    num_train_epochs = 1, # Set this for 1 full training run.
    #max_steps = 60,
    learning_rate = 1e-4, # Reduce to 2e-5 for long training runs
    logging_steps = 1,
    # 32 bits
    optim = "paged_adamw_32bit",
    weight_decay = 0.01,
    lr_scheduler_type = "cosine",
    seed = SEED,
    report_to = "none", # Use this for WandB etc
    # NOTE(review): the directory name mentions qwen3-0.6b although `model_name`
    # is a Llama-3.2-1B checkpoint — confirm the intended name.
    output_dir="../models/qwen3-0.6b-rag-indexer",
)

# Training arguments for the QA instruction-tuning phase.
# Effective batch size per device = 2 * 2 = 4 sequences per optimizer step.
it_config = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,         # <-- eval batch size
    gradient_accumulation_steps=2,
    warmup_steps=25,
    save_strategy="no",
    save_total_limit=0,
    eval_steps=1,
    eval_strategy="steps",         # <-- periodic evaluation; with eval_steps=1 this runs every step
    num_train_epochs=1,             # <-- optional: use epochs instead of max_steps
    #max_steps=30,
    learning_rate=1e-4,
    logging_steps=1,
    optim = "paged_adamw_32bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=SEED,
    report_to="none",
    # NOTE(review): the directory name mentions qwen3-0.6b although `model_name`
    # is a Llama-3.2-1B checkpoint — confirm the intended name.
    output_dir="../models/qwen3-0.6b-rag-retriever",
    load_best_model_at_end=False,          # <-- optional
    metric_for_best_model="eval_loss",    # <-- optional
    greater_is_better=False,              # <-- optional
)

# Knowledge-injection trainer: trains on the tokenized raw document texts.
trainer_auto = UnslothTrainer(
    model=model,
    train_dataset=knowledge_dataset_tokenizer,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=auto_config,
)

# Instruction-tuning trainer: trains on the tokenized QA conversations and
# evaluates on the QA validation prompts. It is given the same `model` object
# as `trainer_auto`, so both trainers update the same model.
trainer_it = SFTTrainer(
    model=model,
    train_dataset=qa_train_dataset_tokenizer,
    eval_dataset=qa_val_dataset_tokenizer,
    data_collator=data_collator,
    tokenizer=tokenizer,
    args=it_config,
)

In [None]:
# Snapshot of GPU capacity and of the memory already reserved before training.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

In [None]:
# Re-check the trainable parameters right before training starts.
model.print_trainable_parameters()

In [None]:
# Alternate the two objectives: every "super epoch" runs the knowledge-injection
# trainer and then the QA instruction-tuning trainer (each is configured with
# num_train_epochs=1).
EPOCHS = 10
# Use a named 1-based counter: `_` conventionally marks an unused variable,
# but the loop index is read in the progress message.
for super_epoch in range(1, EPOCHS + 1):
    print(f"--- SUPER EPOCH {super_epoch} / {EPOCHS} ---")
    trainer_sft_stats = trainer_auto.train()
    trainer_it_stats = trainer_it.train()
    # TODO: save the model after every super epoch.

In [None]:
# Peak memory statistics after training, relative to the pre-training snapshot
# (`start_gpu_memory`, `max_memory`).
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

memory_report = (
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for line in memory_report:
    print(line)

In [None]:
def generate_qa_prompts_testing(dataset, tokenizer):
    """Build ``(generation_prompt, expected_answer)`` pairs for evaluation.

    Unlike the training prompts, the conversation stops after the user turn
    and is rendered with ``add_generation_prompt=True``.

    Args:
        dataset: Iterable of mappings with ``"question"`` and ``"answer"`` keys.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        list: One ``(prompt, answer)`` tuple per item; ``answer`` is returned
        exactly as stored in the dataset.
    """
    system_prompt = """
    Eres un modelo de lenguaje entrenado para responder preguntas.
    """

    def _as_generation_prompt(user_text: str) -> str:
        """Render system + user turns, ready for the assistant to answer."""
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ]
        return tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False,
        )

    pairs = []
    for item in dataset:
        # Read both fields before templating.
        question = item["question"]
        answer = item["answer"]
        user_text = "{QUERY}".format(QUERY=question)
        pairs.append((_as_generation_prompt(user_text), answer))
    return pairs

In [None]:
# (generation prompt, expected answer) pairs for the test and train splits.
prompts_retrieval_test = generate_qa_prompts_testing(dataset_qa["test"], tokenizer)
prompts_retrieval_train = generate_qa_prompts_testing(dataset_qa["train"], tokenizer)

In [None]:
# Pick one training example, print its prompt and expected answer, and keep the
# prompt in `text` for the streaming generation cell below.
idx = 0
print(prompts_retrieval_train[idx][0], sep="\n")
print(prompts_retrieval_train[idx][1], sep="\n")
text = prompts_retrieval_train[idx][0]

In [None]:
# Test the model in streaming mode on the prompt selected above (`text`).
from transformers import TextStreamer

# skip_prompt=True: only the generated continuation is streamed;
# special tokens are kept visible (skip_special_tokens=False).
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

# Greedy decoding (do_sample=False). The sampling-only settings `top_p` and
# `temperature` are not used in this mode, so they are not passed.
_ = model.generate(
    **tokenizer(text, return_tensors="pt").to("cuda"),
    max_new_tokens=64,  # Increase for longer outputs!
    do_sample=False,
    streamer=streamer,
)

In [None]:
# Test the model in non-streaming mode: exact-match accuracy on the test prompts.
import re
import tqdm

acc = 0
total = 0

for text, answer in tqdm.tqdm(prompts_retrieval_test, desc="Testing"):
    print("\n---\n")
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    # Greedy decoding; sampling-only settings are not used in this mode.
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,  # Increase for longer outputs!
        do_sample=False,
    )
    # The output sequence starts with the prompt tokens. Decode only what was
    # generated after them, instead of splitting the full text on the word
    # "assistant" (which breaks when the answer itself contains that word).
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    # Compare text with text: dataset answers may be non-string values.
    expected = str(answer).strip()
    print("Correct:", expected, "==", "Predicted:", response)
    if response == expected:
        print("✅ Correct")
        acc += 1
    else:
        print("❌ Incorrect")
    total += 1

# Guard against an empty test set (would otherwise divide by zero).
if total:
    print(f"Accuracy: {acc}/{total} = {acc/total*100:.2f} %")
else:
    print("Accuracy: n/a (no test prompts)")