In [37]:
from unsloth import FastLanguageModel
from unsloth import UnslothTrainer, UnslothTrainingArguments
from trl import SFTTrainer, SFTConfig
import torch

# Seed reused for the LoRA adapter initialisation and for both trainer configurations.
SEED = 42

In [38]:
# Base checkpoint and the maximum sequence length (in tokens) used for loading and tokenization.
model_name = "meta-llama/Llama-3.2-1B-instruct"
MAX_LENGTH = 256

# Load the base model quantised to 4 bits (full fine-tuning disabled).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = MAX_LENGTH,
    full_finetuning=False,
    load_in_4bit = True,
    load_in_8bit = False,
)
# Fall back to EOS as padding token when the tokenizer defines none.
# NOTE(review): the trainers below use DataCollatorForLanguageModeling; if pad == EOS that
# collator presumably masks EOS positions out of the loss too — confirm this branch is not taken.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
# LoRA rank. NOTE(review): 512 is above the 8-128 range suggested in the inline comment below;
# the summary printed by this cell reports ~22.6% of all parameters as trainable.
RANK = 512
# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = RANK,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = RANK*2,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = SEED,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)
# Report how many parameters the adapters make trainable.
model.print_trainable_parameters()

==((====))==  Unsloth 2025.9.8: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.10.2.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
trainable params: 360,710,144 || all params: 1,596,524,544 || trainable%: 22.5935


## Datasets

### Simple dataset loading example

In [39]:
import pandas as pd
from langchain.schema import Document
# Read the contact table and wrap every row in a Document whose text is the
# "Nombre / Teléfono" card; the row id is kept (as a string) in the metadata.
dataset_knowledge = pd.read_csv("../notebooks/data/contacts_docs.csv")
documents = [
    Document(
        page_content=f"Nombre: {contact_name}\nTeléfono: {contact_phone}",
        metadata={"id": f"{contact_id}"},
    )
    for contact_name, contact_phone, contact_id in zip(
        dataset_knowledge["name"],
        dataset_knowledge["phone"],
        dataset_knowledge["id"],
    )
]
print(f"Loaded {len(documents)} documents.")
print(f"First document: {documents[0]}")


Loaded 400 documents.
First document: page_content='Nombre: Alba Alonso
Teléfono: 632 322 183' metadata={'id': '7500_1'}


In [40]:
# Load the pre-split question/answer tables (train, validation, test) in one pass.
query_dataset_train, query_dataset_val, query_dataset_test = map(
    pd.read_csv,
    (
        "../notebooks/data/contacts_queries_train.csv",
        "../notebooks/data/contacts_queries_val.csv",
        "../notebooks/data/contacts_queries_test.csv",
    ),
)


In [41]:
# Pandas frames keyed by split name.
all_data = dict(
    train=query_dataset_train,
    validation=query_dataset_val,
    test=query_dataset_test,
)

# Convert every split to a Hugging Face Dataset and bundle them in a DatasetDict.
from datasets import Dataset, DatasetDict
dataset_qa = DatasetDict(
    {split: Dataset.from_pandas(frame) for split, frame in all_data.items()}
)

In [42]:
# Display the splits with their columns and row counts.
dataset_qa

DatasetDict({
    train: Dataset({
        features: ['question', 'id', 'respuesta'],
        num_rows: 1400
    })
    validation: Dataset({
        features: ['question', 'id', 'respuesta'],
        num_rows: 300
    })
    test: Dataset({
        features: ['question', 'id', 'respuesta'],
        num_rows: 300
    })
})

In [43]:
# Merge the validation examples into the training split.
from datasets import concatenate_datasets

# Guard against re-running this cell: without it every execution would append the
# validation rows to "train" again. The merge only happens while "train" still has
# the size of the original training table.
if dataset_qa["train"].num_rows == len(query_dataset_train):
    dataset_qa["train"] = concatenate_datasets(
        [dataset_qa["train"], dataset_qa["validation"]]
    )

In [44]:
# Rename the "respuesta" column to "answer" in every split.
# Guarded so the cell can be re-run: once the column has been renamed there is
# nothing left to rename and an unguarded call would fail.
if "respuesta" in dataset_qa["train"].column_names:
    dataset_qa = dataset_qa.rename_column("respuesta", "answer")

In [45]:
# Display the splits again after the merge and the column rename.
dataset_qa

DatasetDict({
    train: Dataset({
        features: ['question', 'id', 'answer'],
        num_rows: 1700
    })
    validation: Dataset({
        features: ['question', 'id', 'answer'],
        num_rows: 300
    })
    test: Dataset({
        features: ['question', 'id', 'answer'],
        num_rows: 300
    })
})

### Data preparation

In [46]:
def build_prompt_it(tokenizer, system_prompt: str, prompt: str, response: str) -> str:
    """Render one system/user/assistant exchange with the tokenizer's chat template.

    Args:
        tokenizer: object exposing ``apply_chat_template``.
        system_prompt: content of the system turn.
        prompt: content of the user turn.
        response: content of the assistant turn.

    Returns:
        Whatever ``apply_chat_template`` returns when called with ``tokenize=False``.
    """
    turn_roles = ("system", "user", "assistant")
    turn_contents = (system_prompt, prompt, response)
    conversation = [
        {"role": role, "content": content}
        for role, content in zip(turn_roles, turn_contents)
    ]
    return tokenizer.apply_chat_template(conversation, tokenize=False)

In [47]:
def generate_knowledge_injection_prompts(documents: list):
    """Lazily yield one training text per document.

    Each yielded text is the document's ``page_content`` inserted into a
    template that consists of the content alone.
    """
    template = "{doc}"
    yield from (template.format(doc=document.page_content) for document in documents)

In [48]:
def generate_qa_prompts(dataset, tokenizer):
    """Build the full (system + user + assistant) training prompt for each QA example.

    Args:
        dataset: iterable of mappings with "question" and "answer" keys.
        tokenizer: forwarded to ``build_prompt_it`` to render the chat template.

    Returns:
        List with one rendered prompt per example, in dataset order.
    """
    # Same text the original triple-quoted literal produced, surrounding whitespace included.
    system_prompt = "\n    Eres un modelo de lenguaje entrenado para responder preguntas.\n    "
    query_template = "{QUERY}"
    answer_template = "{response}"
    return [
        build_prompt_it(
            tokenizer,
            system_prompt,
            query_template.format(QUERY=example["question"]),
            answer_template.format(response=example["answer"]),
        )
        for example in dataset
    ]

In [49]:
# Materialise the knowledge-injection texts (one per contact document).
prompts = list(generate_knowledge_injection_prompts(documents))
print(f"Number of prompts: {len(prompts)}")

Number of prompts: 400


In [50]:
# Show the first knowledge-injection text.
prompts[0]

'Nombre: Alba Alonso\nTeléfono: 632 322 183'

In [51]:
# Helper to inspect how a text is split into tokens.
def print_tokens(text):
    """Print the number of tokens and the token strings the global tokenizer yields for `text`."""
    token_pieces = tokenizer.tokenize(text)
    token_count = len(token_pieces)
    print("Number of tokens:", token_count, "\n")
    print("Tokens:", token_pieces)

# Tokenization of the first knowledge-injection text.
print_tokens(prompts[0])

Number of tokens: 15 

Tokens: ['Nombre', ':', 'ĠAl', 'ba', 'ĠAlonso', 'Ċ', 'Tel', 'Ã©fono', ':', 'Ġ', '632', 'Ġ', '322', 'Ġ', '183']


In [52]:
# Wrap the knowledge-injection texts in a single-column ("text") Hugging Face dataset.
from datasets import Dataset
knowledge_dataset = Dataset.from_dict({"text": prompts})
knowledge_dataset

Dataset({
    features: ['text'],
    num_rows: 400
})

In [53]:
# Check that the stored text matches the first prompt.
knowledge_dataset["text"][0]

'Nombre: Alba Alonso\nTeléfono: 632 322 183'

In [54]:
# Render the chat-formatted QA prompts.
# NOTE(review): dataset_qa["train"] already contains the validation rows (merged in an
# earlier cell), and the "val" prompts here are built from the *test* split — the same
# split test_model_accuracy is later run on. Confirm this is intended.
prompts_qa_train = generate_qa_prompts(dataset_qa["train"], tokenizer)
prompts_qa_val = generate_qa_prompts(dataset_qa["test"], tokenizer)

# Both messages use the same label: the first count is the training set, the second the eval set.
print(f"Number of retrieval prompts: {len(prompts_qa_train)}")
print(f"Number of retrieval prompts: {len(prompts_qa_val)}")

Number of retrieval prompts: 1700
Number of retrieval prompts: 300


In [55]:
# Show the first rendered training prompt (system, user and assistant turns).
print(prompts_qa_train[0], sep="\n")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 07 Nov 2025

Eres un modelo de lenguaje entrenado para responder preguntas.<|eot_id|><|start_header_id|>user<|end_header_id|>

Necesito el contacto asociado al 620 152 344. —consulta interna—<|eot_id|><|start_header_id|>assistant<|end_header_id|>

El número 620 152 344 pertenece a Alejandro Vega.<|eot_id|>


In [56]:
# Tokenization of the first rendered training prompt.
print_tokens(prompts_qa_train[0])

Number of tokens: 87 

Tokens: ['<|begin_of_text|>', '<|start_header_id|>', 'system', '<|end_header_id|>', 'ĊĊ', 'Cut', 'ting', 'ĠKnowledge', 'ĠDate', ':', 'ĠDecember', 'Ġ', '202', '3', 'Ċ', 'Today', 'ĠDate', ':', 'Ġ', '07', 'ĠNov', 'Ġ', '202', '5', 'ĊĊ', 'E', 'res', 'Ġun', 'Ġmodelo', 'Ġde', 'Ġl', 'engu', 'aje', 'Ġentren', 'ado', 'Ġpara', 'Ġresponder', 'Ġpreg', 'untas', '.', '<|eot_id|>', '<|start_header_id|>', 'user', '<|end_header_id|>', 'ĊĊ', 'N', 'ec', 'es', 'ito', 'Ġel', 'Ġcontacto', 'Ġasoci', 'ado', 'Ġal', 'Ġ', '620', 'Ġ', '152', 'Ġ', '344', '.', 'ĠâĢĶ', 'consulta', 'Ġintern', 'a', 'âĢĶ', '<|eot_id|>', '<|start_header_id|>', 'assistant', '<|end_header_id|>', 'ĊĊ', 'El', 'ĠnÃºmero', 'Ġ', '620', 'Ġ', '152', 'Ġ', '344', 'Ġpert', 'ene', 'ce', 'Ġa', 'ĠAlejandro', 'ĠVega', '.', '<|eot_id|>']


In [57]:
# Wrap the rendered chat prompts in single-column ("text") datasets.
# The "validation" entry holds prompts_qa_val, which was built from the test split.
qa_train_dataset = Dataset.from_dict({"text": prompts_qa_train})
qa_val_dataset = Dataset.from_dict({"text": prompts_qa_val})

qa_dataset = {
    "train": qa_train_dataset,
    "validation": qa_val_dataset,
}

In [58]:
def tokenize_function_autoregressive(examples):
    """Tokenize the "text" field of a batch with the global tokenizer.

    Every sequence is truncated and padded to exactly MAX_LENGTH tokens.
    """
    # NOTE(review): the chat-templated texts already start with <|begin_of_text|>; the
    # tokenizer call may prepend a second BOS token — confirm against the tokenizer config.
    batch_texts = examples['text']
    encoded = tokenizer(
        batch_texts,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length',
    )
    return encoded

In [59]:
# Tokenize the knowledge-injection texts in batches (keeps the original "text" column).
knowledge_dataset_tokenizer = knowledge_dataset.map(tokenize_function_autoregressive, batched=True)

Map: 100%|██████████| 400/400 [00:00<00:00, 12954.68 examples/s]


In [60]:
# Tokenize the QA training and evaluation prompts in batches.
qa_train_dataset_tokenizer = qa_dataset["train"].map(tokenize_function_autoregressive, batched=True)
qa_val_dataset_tokenizer = qa_dataset["validation"].map(tokenize_function_autoregressive, batched=True)

Map: 100%|██████████| 1700/1700 [00:00<00:00, 17855.79 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 15787.84 examples/s]


In [61]:
def generate_qa_prompts_testing(dataset, tokenizer):
    """Build (generation prompt, reference answer) pairs for evaluation.

    The prompt contains only the system and user turns and is rendered with
    ``add_generation_prompt=True`` so it ends where the assistant reply starts.

    Args:
        dataset: iterable of mappings with "question" and "answer" keys.
        tokenizer: object exposing ``apply_chat_template``.

    Returns:
        List of ``(prompt, answer)`` tuples, in dataset order.
    """
    # Same text the original triple-quoted literal produced, surrounding whitespace included.
    system_prompt = "\n    Eres un modelo de lenguaje entrenado para responder preguntas.\n    "

    def render_generation_prompt(user_text: str) -> str:
        """Render the system + user turns followed by the assistant header."""
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ]
        return tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False,
        )

    pairs = []
    for example in dataset:
        user_text = "{QUERY}".format(QUERY=example["question"])
        reference = example["answer"]
        pairs.append((render_generation_prompt(user_text), reference))
    return pairs

In [62]:
prompts_retrieval_test = generate_qa_prompts_testing(dataset_qa["test"], tokenizer)
text_for_testing = prompts_retrieval_test[0][0]
print(text_for_testing)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 07 Nov 2025

Eres un modelo de lenguaje entrenado para responder preguntas.<|eot_id|><|start_header_id|>user<|end_header_id|>

¿A quién pertenece el teléfono 736 615 948?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [63]:
import re
import tqdm
import torch

def levenshtein(s: str, t: str) -> int:
    """Levenshtein edit distance between ``s`` and ``t``.

    Only two rows of the dynamic-programming table are alive at any time and
    the rows are sized by the shorter string, so memory is
    O(min(len(s), len(t))).
    """
    # Iterate over the longer string so that each row has len(shorter) + 1 cells.
    longer, shorter = (t, s) if len(s) < len(t) else (s, t)
    prev_row = list(range(len(shorter) + 1))

    for row_idx, ch_long in enumerate(longer, start=1):
        row = [row_idx]
        for col_idx, ch_short in enumerate(shorter, start=1):
            insertion = row[-1] + 1
            deletion = prev_row[col_idx] + 1
            substitution = prev_row[col_idx - 1] + (0 if ch_long == ch_short else 1)
            row.append(min(insertion, deletion, substitution))
        prev_row = row
    return prev_row[-1]


def test_model_accuracy(model, tokenizer, prompts_retrieval_test, device="cuda", 
                        max_new_tokens=64, temperature=0.0, top_p=1.0):
    """Generate an answer for every test prompt and score it against the reference.

    Args:
        model: model exposing ``generate`` (and the usual ``eval``/``train`` switches).
        tokenizer: tokenizer used to encode the prompt and decode the generation.
        prompts_retrieval_test: iterable of ``(prompt_text, reference_answer)`` pairs.
        device: device the encoded prompt is moved to.
        max_new_tokens: generation budget per prompt.
        temperature: ``0`` (default) means greedy decoding; a positive value
            enables sampling with this temperature.
        top_p: nucleus-sampling threshold, only used when sampling is enabled.

    Returns:
        ``(accuracy_percent, mean_levenshtein)`` where accuracy counts exact string
        matches after stripping surrounding whitespace. Both are ``0.0`` when there
        are no prompts.
    """
    # Sampling parameters are only forwarded when sampling is actually requested;
    # with the default temperature of 0 decoding stays greedy, as before.
    do_sample = temperature is not None and temperature > 0
    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    acc = 0
    lev = 0
    total = 0

    # Evaluate in eval mode and restore the previous mode afterwards, since this
    # function is called between training rounds.
    was_training = model.training
    model.eval()
    progress_bar = tqdm.tqdm(prompts_retrieval_test, desc="Testing")

    try:
        for text, answer in progress_bar:
            inputs = tokenizer(text, return_tensors="pt").to(device)
            prompt_length = inputs["input_ids"].shape[1]
            with torch.no_grad():
                outputs = model.generate(**inputs, **generation_kwargs)
            # Decode only the newly generated tokens. Splitting the full decoded text on
            # the word "assistant" breaks whenever the question or the answer contains it.
            generated_ids = outputs[0][prompt_length:]
            response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            if response == answer:
                acc += 1
            lev += levenshtein(response, answer)
            total += 1

            progress_bar.set_postfix({
                "acc": f"{acc/total*100:.2f} %",
                "lev": f"{lev/total:.2f}"
            })
    finally:
        model.train(was_training)

    final_acc = acc / total * 100 if total > 0 else 0.0
    print(f"Accuracy final: {acc}/{total} = {final_acc:.2f} %")
    final_lev = lev / total if total > 0 else 0.0
    print(f"Levenshtein final: {final_lev:.2f}")
    return final_acc, final_lev

In [64]:
import matplotlib.pyplot as plt

def plot_training_history(history):
    """Plot how accuracy and the two training losses evolve across super-epochs.

    Args:
        history (list[dict]): one entry per super-epoch, each providing:
            - "super_epoch": super-epoch number.
            - "accuracy": accuracy reached.
            - "trainer_sft_stats": object whose ``training_loss`` is the SFT loss.
            - "trainer_it_stats": object whose ``training_loss`` is the IT loss.
    """
    # Pull the four series out of the history records.
    super_epochs = [record["super_epoch"] for record in history]
    accuracies = [record["accuracy"] for record in history]
    losses_sft = [record["trainer_sft_stats"].training_loss for record in history]
    losses_it = [record["trainer_it_stats"].training_loss for record in history]

    # --- Accuracy figure ---
    fig_acc, ax_acc = plt.subplots(figsize=(6, 4))
    ax_acc.plot(super_epochs, accuracies, marker='o', color='tab:blue')
    ax_acc.set_xlabel("Super Epoch")
    ax_acc.set_ylabel("Accuracy (%)")
    ax_acc.set_title("Accuracy History")
    ax_acc.grid(True)
    fig_acc.tight_layout()
    plt.show()

    # --- Loss figure (both losses on the same axes) ---
    fig_loss, ax_loss = plt.subplots(figsize=(6, 4))
    loss_series = (
        (losses_sft, 's' if False else 'o', "SFT Loss", 'tab:orange'),
        (losses_it, 's', "IT Loss", 'tab:green'),
    )
    for values, marker, label, color in loss_series:
        ax_loss.plot(super_epochs, values, marker=marker, label=label, color=color)
    ax_loss.set_xlabel("Super Epoch")
    ax_loss.set_ylabel("Loss")
    ax_loss.set_title("Loss History")
    ax_loss.legend()
    ax_loss.grid(True)
    fig_loss.tight_layout()
    plt.show()


## Train

### Iterative training configuration

In [65]:
# Trainer setup: one trainer for the raw knowledge texts ("auto") and one for the
# chat-formatted QA prompts ("it"); both share the same model and data collator.
from transformers import DataCollatorForLanguageModeling


# Causal-LM collator (mlm=False).
# NOTE(review): it is also passed to the QA trainer, so the loss presumably covers the
# system and user turns as well as the assistant answer — confirm this is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Arguments for the knowledge-injection trainer.
auto_config = UnslothTrainingArguments(
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 8, # Use GA to mimic batch size!
    save_strategy="no",
    save_total_limit=0,
    warmup_steps = 5,
    num_train_epochs = 1, # Set this for 1 full training run.
    #max_steps = 60,
    learning_rate = 1e-4, # Reduce to 2e-5 for long training runs
    logging_steps = 1,
    # 32 bits
    optim = "paged_adamw_32bit",
    weight_decay = 0.01,
    lr_scheduler_type = "cosine",
    seed = SEED,
    report_to = "none", # Use this for WandB etc
    # NOTE(review): the directory name mentions qwen3-0.6b but model_name is a Llama-3.2-1B checkpoint.
    output_dir="../models/qwen3-0.6b-rag-indexer",
)

# Arguments for the QA (instruction-tuning) trainer.
it_config = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,        # <-- eval batch size
    gradient_accumulation_steps=8,
    warmup_steps=25,
    save_strategy="no",
    save_total_limit=0,
    eval_steps=1,                  # evaluation runs after every optimizer step (see the logs)
    eval_strategy="steps",         # <-- enables periodic evaluation
    num_train_epochs=1,             # <-- optional: use epochs instead of max_steps
    #max_steps=30,
    learning_rate=1e-4,
    logging_steps=1,
    optim = "paged_adamw_32bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=SEED,
    report_to="none",
    # NOTE(review): same naming mismatch as above (qwen3-0.6b vs the Llama model in use).
    output_dir="../models/qwen3-0.6b-rag-retriever",
    load_best_model_at_end=False,          # <-- optional
    # The two options below are presumably unused while load_best_model_at_end is False.
    metric_for_best_model="eval_loss",    # <-- optional
    greater_is_better=False,              # <-- optional
)

# Knowledge-injection trainer: trains on the tokenized contact documents.
trainer_auto = UnslothTrainer(
    model=model,
    train_dataset=knowledge_dataset_tokenizer,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=auto_config,
)

# QA trainer: trains on the tokenized QA prompts and evaluates on qa_val_dataset_tokenizer
# (built from the test split in an earlier cell).
trainer_it = SFTTrainer(
    model=model,
    train_dataset=qa_train_dataset_tokenizer,
    eval_dataset=qa_val_dataset_tokenizer,
    data_collator=data_collator,
    tokenizer=tokenizer,
    args=it_config,
)

In [66]:
# Re-check the trainable parameter count after constructing the trainers.
model.print_trainable_parameters()

trainable params: 360,710,144 || all params: 1,596,524,544 || trainable%: 22.5935


In [67]:
import wandb

# Run name derived from the model id (slashes replaced) and the LoRA rank.
name = f"{model_name.replace('/', '_')}_r{RANK}_qa_iterativo_agenda"

# Start the wandb run
# NOTE(review): "super_epochs" is hard-coded here and duplicates EPOCHS, which is
# defined in the training-loop cell — keep the two values in sync.
wandb.init(
    project="qa_iterativo_agenda",
    name=name,  # optional
    config={
        "super_epochs": 10,
        "model": model_name,
        "r": RANK,
    },
)

[34m[1mwandb[0m: Currently logged in as: [33mmiguel_kjh[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [68]:
# Number of "super epochs": each one runs a knowledge-injection pass, then a QA pass,
# then a generation-based evaluation on the test prompts.
EPOCHS = 10

# The loop variable is a real value, so it gets a real name: assigning to `_`
# shadows IPython's last-output variable for the rest of the session.
try:
    for super_epoch in range(1, EPOCHS + 1):
        print(f"--- SUPER EPOCH {super_epoch} / {EPOCHS} ---")
        trainer_sft_stats = trainer_auto.train()
        trainer_it_stats = trainer_it.train()
        # evaluate accuracy after each super epoch
        acc, lev = test_model_accuracy(model, tokenizer, prompts_retrieval_test, device="cuda")
        wandb.log({
            "super_epoch": super_epoch,
            "accuracy": acc,
            "levenshtein": lev,
            "train_loss_sft": trainer_sft_stats.training_loss,
            "train_loss_it": trainer_it_stats.training_loss,
        })
except BaseException:
    # Also covers KeyboardInterrupt: close the run (marked as failed) instead of
    # leaving it open, then let the original error propagate.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

--- SUPER EPOCH 1 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,4.4568
2,4.4082
3,3.1691
4,2.7974
5,2.2036
6,1.8997
7,1.9346


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,6.493,6.481918
2,6.4322,5.049185
3,5.053,3.767687
4,3.7293,2.774565
5,2.7534,2.141159
6,2.1384,1.65662
7,1.6219,1.270533
8,1.258,0.923814
9,0.9615,0.79005
10,0.813,0.77023


Testing: 100%|██████████| 300/300 [01:45<00:00,  2.84it/s, acc=0.00 %, lev=8.94]


Accuracy final: 0/300 = 0.00 %
Levenshtein final: 8.94
--- SUPER EPOCH 2 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,2.9064
2,2.8742
3,2.3182
4,2.1823
5,2.0622
6,1.8937
7,1.6332


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.6529,0.652781
2,0.649,0.535476
3,0.5394,0.504961
4,0.4979,0.493852
5,0.49,0.486631
6,0.4844,0.465859
7,0.465,0.452868
8,0.4498,0.440305
9,0.4427,0.421363
10,0.4266,0.398265


Testing: 100%|██████████| 300/300 [01:45<00:00,  2.84it/s, acc=0.00 %, lev=8.74]


Accuracy final: 0/300 = 0.00 %
Levenshtein final: 8.74
--- SUPER EPOCH 3 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,3.0332
2,2.9997
3,2.8478
4,2.6328
5,2.3645
6,2.1331
7,1.6583


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.3787,0.382535
2,0.383,0.380432
3,0.3845,0.375038
4,0.3641,0.36748
5,0.3649,0.359639
6,0.3673,0.354091
7,0.3504,0.348205
8,0.3406,0.339748
9,0.3432,0.330422
10,0.3382,0.32348


Testing: 100%|██████████| 300/300 [01:54<00:00,  2.62it/s, acc=0.00 %, lev=8.72]


Accuracy final: 0/300 = 0.00 %
Levenshtein final: 8.72
--- SUPER EPOCH 4 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,1.9354
2,1.9461
3,1.8127
4,1.5637
5,1.3136
6,1.0443
7,0.7782


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.3353,0.343067
2,0.3373,0.34149
3,0.3367,0.331347
4,0.3246,0.320288
5,0.3134,0.304715
6,0.2922,0.285247
7,0.2802,0.264224
8,0.2549,0.247784
9,0.2432,0.238979
10,0.2349,0.236358


Testing: 100%|██████████| 300/300 [01:55<00:00,  2.59it/s, acc=0.33 %, lev=8.57]


Accuracy final: 1/300 = 0.33 %
Levenshtein final: 8.57
--- SUPER EPOCH 5 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,2.3663
2,2.3748
3,2.2883
4,2.1166
5,1.965
6,1.7409
7,1.239


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.2304,0.234299
2,0.2294,0.233964
3,0.226,0.231851
4,0.2248,0.230348
5,0.2269,0.227813
6,0.219,0.225956
7,0.2214,0.224794
8,0.2147,0.223081
9,0.2161,0.222205
10,0.218,0.221702


Testing: 100%|██████████| 300/300 [01:43<00:00,  2.91it/s, acc=9.33 %, lev=8.02] 


Accuracy final: 28/300 = 9.33 %
Levenshtein final: 8.02
--- SUPER EPOCH 6 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,1.6735
2,1.6737
3,1.5717
4,1.4391
5,1.2942
6,1.1491
7,0.7532


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.1902,0.203788
2,0.1972,0.202827
3,0.1971,0.200821
4,0.1901,0.198634
5,0.1894,0.196192
6,0.1877,0.193983
7,0.1807,0.192829
8,0.1779,0.192507
9,0.1804,0.191928
10,0.1768,0.191075


Testing: 100%|██████████| 300/300 [01:44<00:00,  2.87it/s, acc=37.67 %, lev=5.26]


Accuracy final: 113/300 = 37.67 %
Levenshtein final: 5.26
--- SUPER EPOCH 7 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,0.9773
2,0.9727
3,0.8798
4,0.7614
5,0.6348
6,0.5122
7,0.3595


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.2753,0.284888
2,0.2778,0.284657
3,0.2706,0.279966
4,0.2716,0.270288
5,0.2629,0.258669
6,0.2412,0.240801
7,0.2303,0.221548
8,0.2079,0.200535
9,0.1822,0.184468
10,0.1687,0.174899


Testing: 100%|██████████| 300/300 [01:53<00:00,  2.64it/s, acc=70.33 %, lev=2.70]


Accuracy final: 211/300 = 70.33 %
Levenshtein final: 2.70
--- SUPER EPOCH 8 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,1.7128
2,1.7198
3,1.6369
4,1.559
5,1.4523
6,1.3334
7,1.0072


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.1566,0.169831
2,0.1619,0.169468
3,0.1561,0.168208
4,0.1584,0.165877
5,0.1489,0.16406
6,0.1442,0.162686
7,0.1496,0.161607
8,0.1473,0.160549
9,0.1458,0.16046
10,0.1446,0.16087


Testing: 100%|██████████| 300/300 [01:52<00:00,  2.68it/s, acc=88.33 %, lev=1.06]


Accuracy final: 265/300 = 88.33 %
Levenshtein final: 1.06
--- SUPER EPOCH 9 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,1.3232
2,1.3285
3,1.2548
4,1.1653
5,1.0452
6,0.926
7,0.7197


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.1479,0.163098
2,0.147,0.163057
3,0.1459,0.162494
4,0.1464,0.161635
5,0.145,0.160619
6,0.1414,0.160314
7,0.143,0.159596
8,0.1431,0.15911
9,0.1416,0.158497
10,0.1406,0.15941


Testing: 100%|██████████| 300/300 [01:45<00:00,  2.85it/s, acc=91.67 %, lev=0.73]


Accuracy final: 275/300 = 91.67 %
Levenshtein final: 0.73
--- SUPER EPOCH 10 / 10 ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 1 | Total steps = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss
1,0.8556
2,0.8366
3,0.7792
4,0.6926
5,0.5903
6,0.4863
7,0.3673


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,700 | Num Epochs = 1 | Total steps = 27
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 360,710,144 of 1,596,524,544 (22.59% trained)


Step,Training Loss,Validation Loss
1,0.239,0.256639
2,0.2359,0.254685
3,0.2358,0.251143
4,0.2304,0.242479
5,0.2194,0.232565
6,0.2073,0.218279
7,0.196,0.202223
8,0.1814,0.184405
9,0.163,0.171183
10,0.152,0.162905


Testing: 100%|██████████| 300/300 [01:44<00:00,  2.86it/s, acc=95.33 %, lev=0.50]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Accuracy final: 286/300 = 95.33 %
Levenshtein final: 0.50


[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


KeyboardInterrupt: 

In [None]:
# Stream one greedy generation for the first test prompt to eyeball the output.
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
streaming_inputs = tokenizer(text_for_testing, return_tensors = "pt").to("cuda")
# Greedy decoding (do_sample = False): top_p / temperature only apply to sampling, so
# they are not passed. The result gets a named variable instead of `_`, which would
# shadow IPython's last-output variable.
streamed_ids = model.generate(
    **streaming_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    do_sample = False,
    streamer = streamer,
)

El número 736 615 948 pertenece a Leo Pérez.<|eot_id|>


In [None]:
# Final evaluation of the trained model on the test prompts.
acc, lev = test_model_accuracy(model, tokenizer, prompts_retrieval_test, device="cuda")

Testing: 100%|██████████| 300/300 [01:45<00:00,  2.84it/s, acc=91.00 %, lev=0.80]

Accuracy final: 273/300 = 91.00 %
Levenshtein final: 241/300 = 0.80



