In [1]:
import argparse, os, numpy as np, torch
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import BitsAndBytesConfig
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_indexing_csv = pd.read_csv("../notebooks/data/contacts_docs.csv")

# to dataset huggingface
from datasets import Dataset

dataset_indexing = Dataset.from_pandas(dataset_indexing_csv)

# create a new column 'text' that concatenates 'name', 'phone'
def concatenate_columns(example):
    return {
        "text": f"Nombre: {example['name']}\nTeléfono: {example['phone']}"
    }
dataset_indexing = dataset_indexing.map(concatenate_columns)
# rename column 'id' to 'label'
dataset_indexing = dataset_indexing.rename_column("id", "label")

num_labels = len(dataset_indexing['label'])
print(f"Number of labels: {num_labels}")
labels_list = dataset_indexing.unique('label')
print(f"Labels: {labels_list}")

# map labels to integers
label_to_id = {label: i for i, label in enumerate(labels_list)}
def map_labels(example):
    return {
        "label": label_to_id[example['label']]
    }
dataset_indexing = dataset_indexing.map(map_labels)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map: 100%|██████████| 400/400 [00:00<00:00, 29684.73 examples/s]


Number of labels: 400
Labels: ['7500_1', '7500_2', '7500_3', '7500_4', '7500_5', '7501_1', '7501_2', '7501_3', '7501_4', '7501_5', '7502_1', '7502_2', '7502_3', '7502_4', '7502_5', '7503_1', '7503_2', '7503_3', '7503_4', '7503_5', '7504_1', '7504_2', '7504_3', '7504_4', '7504_5', '7505_1', '7505_2', '7505_3', '7505_4', '7505_5', '7506_1', '7506_2', '7506_3', '7506_4', '7506_5', '7507_1', '7507_2', '7507_3', '7507_4', '7507_5', '7508_1', '7508_2', '7508_3', '7508_4', '7508_5', '7509_1', '7509_2', '7509_3', '7509_4', '7509_5', '7510_1', '7510_2', '7510_3', '7510_4', '7510_5', '7511_1', '7511_2', '7511_3', '7511_4', '7511_5', '7512_1', '7512_2', '7512_3', '7512_4', '7512_5', '7513_1', '7513_2', '7513_3', '7513_4', '7513_5', '7514_1', '7514_2', '7514_3', '7514_4', '7514_5', '7515_1', '7515_2', '7515_3', '7515_4', '7515_5', '7516_1', '7516_2', '7516_3', '7516_4', '7516_5', '7517_1', '7517_2', '7517_3', '7517_4', '7517_5', '7518_1', '7518_2', '7518_3', '7518_4', '7518_5', '7519_1', '7519_2',

Map: 100%|██████████| 400/400 [00:00<00:00, 48656.41 examples/s]


In [3]:
query_dataset_train = pd.read_csv("../notebooks/data/contacts_queries_train.csv")
query_dataset_val = pd.read_csv("../notebooks/data/contacts_queries_val.csv")
query_dataset_test = pd.read_csv("../notebooks/data/contacts_queries_test.csv")

In [4]:
dataset_for_queries = {
    "train": Dataset.from_pandas(query_dataset_train),
    "validation": Dataset.from_pandas(query_dataset_val),
    "test": Dataset.from_pandas(query_dataset_test)
}

for split in dataset_for_queries:
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("question", "text")
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("id", "label")
    # map labels to integers
    dataset_for_queries[split] = dataset_for_queries[split].map(map_labels)

# to dataset huggingface
from datasets import DatasetDict
dataset_for_queries = DatasetDict(dataset_for_queries)
print(dataset_for_queries["train"][1])


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map: 100%|██████████| 1400/1400 [00:00<00:00, 56029.17 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 26971.86 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 58746.50 examples/s]

{'text': '¿Cómo puedo contactar con Antonio Alonso?', 'label': 194}





## Embeddings

In [5]:
from langchain.schema import Document
from langchain.retrievers import BM25Retriever

def create_documents_from_datasets(datasets):
    documents = []
    for dataset in datasets:
        for item in dataset:
            doc = Document(
                page_content=item["text"],
                metadata={"label": item["label"]}
            )
            documents.append(doc)
    return documents
documents_indexing = create_documents_from_datasets([dataset_indexing])
print(f"Number of documents created for indexing: {len(documents_indexing)}")

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"device": "cuda"},
)

vectorstore = FAISS.from_documents(documents_indexing, embedding_model)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 100})

retriever_sparse = BM25Retriever.from_documents(documents_indexing)

Number of documents created for indexing: 400


In [6]:
from tqdm import tqdm

list_of_labels_embeddings = []
list_of_labels_sparse = []
list_of_real_labels_embeddings = []
for index in tqdm(range(len(dataset_for_queries["test"]))):
    query = dataset_for_queries["test"][index]["text"]
    real_label_id = dataset_for_queries["test"][index]["label"]
    id_to_label = {v: k for k, v in label_to_id.items()}
    real_label = id_to_label[real_label_id]

    # retrieve documents
    docs = retriever.get_relevant_documents(query)
    # get labels from retrieved documents
    retrieved_labels = [doc.metadata["label"] for doc in docs]
    # to names
    retrieved_labels = [id_to_label[label_id] for label_id in retrieved_labels]
    list_of_labels_embeddings.append(retrieved_labels)
    list_of_real_labels_embeddings.append([real_label])

    # retrieve documents (sparse)
    docs_sparse = retriever_sparse.get_relevant_documents(query)
    # get labels from retrieved documents
    retrieved_labels_sparse = [doc.metadata["label"] for doc in docs_sparse]
    # to names
    retrieved_labels_sparse = [id_to_label[label_id] for label_id in retrieved_labels_sparse]
    list_of_labels_sparse.append(retrieved_labels_sparse)


  docs = retriever.get_relevant_documents(query)
100%|██████████| 300/300 [00:09<00:00, 31.33it/s]


In [7]:
from ranking_metrics import calc_ranking_metrics

metrics = calc_ranking_metrics(list_of_labels_embeddings, list_of_real_labels_embeddings, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.7261
  mAP: 0.7261
  AvgRank: 7.6983
  CMC@1: 0.6800
  Recall@k (macro)@1: 0.6800
  Precision@k (macro)@1: 0.6800
  Accuracy@1: 0.6800
  F1@k (macro)@1: 0.6800
  CMC@5: 0.7600
  Recall@k (macro)@5: 0.7600
  Precision@k (macro)@5: 0.1520
  Accuracy@5: 0.7600
  F1@k (macro)@5: 0.2533
  CMC@10: 0.8300
  Recall@k (macro)@10: 0.8300
  Precision@k (macro)@10: 0.0830
  Accuracy@10: 0.8300
  F1@k (macro)@10: 0.1509
  CMC@20: 0.8833
  Recall@k (macro)@20: 0.8833
  Precision@k (macro)@20: 0.0442
  Accuracy@20: 0.8833
  F1@k (macro)@20: 0.0841
  CMC@100: 0.9833
  Recall@k (macro)@100: 0.9833
  Precision@k (macro)@100: 0.0098
  Accuracy@100: 0.9833
  F1@k (macro)@100: 0.0195


In [8]:
metrics_sparse = calc_ranking_metrics(list_of_labels_sparse, list_of_real_labels_embeddings, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Sparse Ranking Metrics:")
for k, v in metrics_sparse.items():
    print(f"  {k}: {v:.4f}")

Sparse Ranking Metrics:
  MRR: 0.5539
  mAP: 0.5539
  AvgRank: 1.4158
  CMC@1: 0.5133
  Recall@k (macro)@1: 0.5133
  Precision@k (macro)@1: 0.5133
  Accuracy@1: 0.5133
  F1@k (macro)@1: 0.5133
  CMC@5: 0.6333
  Recall@k (macro)@5: 0.6333
  Precision@k (macro)@5: 0.1267
  Accuracy@5: 0.6333
  F1@k (macro)@5: 0.2111
  CMC@10: 0.6333
  Recall@k (macro)@10: 0.6333
  Precision@k (macro)@10: 0.0633
  Accuracy@10: 0.6333
  F1@k (macro)@10: 0.1152
  CMC@20: 0.6333
  Recall@k (macro)@20: 0.6333
  Precision@k (macro)@20: 0.0317
  Accuracy@20: 0.6333
  F1@k (macro)@20: 0.0603
  CMC@100: 0.6333
  Recall@k (macro)@100: 0.6333
  Precision@k (macro)@100: 0.0063
  Accuracy@100: 0.6333
  F1@k (macro)@100: 0.0125


## Model

In [9]:
model_name = "Qwen/Qwen3-0.6B"
MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    low_cpu_mem_usage=True,
    device_map={"": 0}
)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
lora_r = 256
lora_alpha = lora_r * 2
lora_dropout = 0.0
lora_bias = "none"
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj"]

In [12]:
config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    target_modules=target_modules
)
model = get_peft_model(model, config)

In [13]:
for p in model.base_model.model.score.parameters():
    p.requires_grad_(True)

In [14]:
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding=False)

# tokenize test from dataset
tokenized_datasets_indexing = dataset_indexing.map(preprocess, batched=True)
tokenized_datasets_query = dataset_for_queries.map(preprocess, batched=True)


Map: 100%|██████████| 400/400 [00:00<00:00, 50917.20 examples/s]
Map: 100%|██████████| 1400/1400 [00:00<00:00, 80521.43 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 50877.05 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 52767.39 examples/s]


In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

def softmax(x, axis=-1):
    x = np.asarray(x, dtype=np.float64)
    # Restar el máximo para evitar overflow
    x_shift = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(x_shift)
    return exps / np.sum(exps, axis=axis, keepdims=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    # softmax to get probabilities using numpy
    probs = softmax(logits, axis=-1)
    # get list of labels sorted by probability
    sorted_indices = np.argsort(probs, axis=-1)
    # reverse to have the highest probability first
    sorted_indices = sorted_indices[:, ::-1]
    list_of_labels = [
        [idx] for idx in labels  
    ]

    ranking_metrics = calc_ranking_metrics(
        sorted_indices, 
        list_of_labels, 
        ks=[1, 5, 10, 20, 100], 
        one_relevant_per_query=True
    )

    return {
        "accuracy": acc,  
        "mAP": ranking_metrics["mAP"], 
        "Hint@1": ranking_metrics["Accuracy@1"], 
        "Hint@10": ranking_metrics["Accuracy@10"],
        "Hint@100": ranking_metrics["Accuracy@100"],
        # recall@k
        "recall@1": ranking_metrics["Recall@k (macro)@1"],
        "recall@10": ranking_metrics["Recall@k (macro)@10"],
        "recall@100": ranking_metrics["Recall@k (macro)@100"],
        # precision@k
        "precision@1": ranking_metrics["Precision@k (macro)@1"],
        "precision@10": ranking_metrics["Precision@k (macro)@10"],
        "precision@100": ranking_metrics["Precision@k (macro)@100"],
    }

In [16]:
# testing compute_metrics
test_logits = torch.tensor([[0.1, 0.69, 0.99], [0.8, 0.1, 0.91]])
test_labels = torch.tensor([1, 0])
test_eval_pred = (test_logits.detach().numpy(), test_labels.detach().numpy())
test_metrics = compute_metrics(test_eval_pred)
print(f"Test compute_metrics: {test_metrics}")

Test compute_metrics: {'accuracy': np.float64(0.0), 'mAP': 0.5, 'Hint@1': 0.0, 'Hint@10': 1.0, 'Hint@100': 1.0, 'recall@1': 0.0, 'recall@10': 1.0, 'recall@100': 1.0, 'precision@1': 0.0, 'precision@10': 0.1, 'precision@100': 0.01}


In [17]:
SEED = 42
EPOCHS = 10
BATCH_SIZE = 32
CYCLES = 2

In [18]:
import wandb

wandb.init(
    project="agenda_multilabel_classification_retriever",
    name="entrenamiento_indexing_y_query",
    config={
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "model": model_name,
        "cycles": CYCLES
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mmiguel_kjh[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [langchain] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/




In [19]:
from transformers import TrainerCallback

class PrefixedWandbCallback(TrainerCallback):
    def __init__(self, phase=""):
        self.phase = phase

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            prefixed_logs = {f"{self.phase}/{k}": v for k, v in logs.items()}
            wandb.log(prefixed_logs, step=state.global_step)

In [23]:
# Entrenamiento

training_args_indexing = TrainingArguments(
    output_dir=f"models/contacts_clf_{model_name.replace('/', '_')}_indexing",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.05,
    warmup_ratio=0.2,
    lr_scheduler_type="cosine",
    eval_strategy="steps",     # o "no" si no vas a evaluar
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="wandb",
    seed=SEED,
    label_smoothing_factor=0.1,
)

training_args_query = TrainingArguments(
    output_dir=f"models/contacts_clf_{model_name.replace('/', '_')}_query",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.05,
    warmup_ratio=0.2,
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="wandb",
    seed=SEED,
    label_smoothing_factor=0.1,
)

# IDs de tokens
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id

trainer_indexing = Trainer(
    model=model,
    args=training_args_indexing,
    train_dataset=tokenized_datasets_indexing,
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        PrefixedWandbCallback(phase="indexing"),
        #EarlyStoppingCallback(early_stopping_patience=3),
    ],
)

trainer_query = Trainer(
    model=model,
    args=training_args_query,
    train_dataset=tokenized_datasets_query["train"],
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        PrefixedWandbCallback(phase="query"),
        #EarlyStoppingCallback(early_stopping_patience=3),
    ],
)

In [24]:
model.print_trainable_parameters()  # Verificar parámetros entrenables

trainable params: 132,939,776 || all params: 728,989,696 || trainable%: 18.2362


In [25]:
from json import dumps
sep = "#" * 10
for ci in range(CYCLES):
    print(f"{sep}Starting training cycle {ci + 1}{sep}")
    trainer_indexing.train()
    metrics_indexing = trainer_indexing.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Indexing Dataset Test Metrics: {dumps(metrics_indexing, indent=4)}")
    trainer_query.train()
    metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")

##########Starting training cycle 1##########


Step,Training Loss,Validation Loss,Accuracy,Map,Hint@1,Hint@10,Hint@100,Recall@1,Recall@10,Recall@100,Precision@1,Precision@10,Precision@100
10,7.8231,7.826665,0.01,0.024571,0.01,0.033333,0.27,0.01,0.033333,0.27,0.01,0.003333,0.0027
20,6.2607,7.136801,0.003333,0.017873,0.003333,0.026667,0.27,0.003333,0.026667,0.27,0.003333,0.002667,0.0027
30,6.1307,6.715888,0.006667,0.019513,0.006667,0.023333,0.26,0.006667,0.023333,0.26,0.006667,0.002333,0.0026
40,6.0408,6.679628,0.003333,0.017478,0.003333,0.026667,0.263333,0.003333,0.026667,0.263333,0.003333,0.002667,0.002633
50,5.8322,6.738959,0.003333,0.019016,0.003333,0.04,0.266667,0.003333,0.04,0.266667,0.003333,0.004,0.002667
60,5.3403,6.989319,0.006667,0.023146,0.006667,0.04,0.276667,0.006667,0.04,0.276667,0.006667,0.004,0.002767
70,4.706,7.199372,0.01,0.029392,0.01,0.046667,0.276667,0.01,0.046667,0.276667,0.01,0.004667,0.002767
80,3.501,7.152828,0.006667,0.033152,0.006667,0.07,0.303333,0.006667,0.07,0.303333,0.006667,0.007,0.003033
90,2.2273,7.415062,0.013333,0.03859,0.013333,0.066667,0.306667,0.013333,0.066667,0.306667,0.013333,0.006667,0.003067
100,1.489,7.436361,0.003333,0.034661,0.003333,0.076667,0.3,0.003333,0.076667,0.3,0.003333,0.007667,0.003


Indexing Dataset Test Metrics: {
    "eval_loss": 7.465508460998535,
    "eval_accuracy": 0.0033333333333333335,
    "eval_mAP": 0.02296182578005492,
    "eval_Hint@1": 0.0033333333333333335,
    "eval_Hint@10": 0.043333333333333335,
    "eval_Hint@100": 0.35333333333333333,
    "eval_recall@1": 0.0033333333333333335,
    "eval_recall@10": 0.043333333333333335,
    "eval_recall@100": 0.35333333333333333,
    "eval_precision@1": 0.0033333333333333335,
    "eval_precision@10": 0.004333333333333333,
    "eval_precision@100": 0.003533333333333333,
    "eval_runtime": 0.5539,
    "eval_samples_per_second": 541.59,
    "eval_steps_per_second": 18.053,
    "epoch": 10.0
}


Step,Training Loss,Validation Loss,Accuracy,Map,Hint@1,Hint@10,Hint@100,Recall@1,Recall@10,Recall@100,Precision@1,Precision@10,Precision@100
10,7.0354,6.46012,0.013333,0.041989,0.013333,0.083333,0.346667,0.013333,0.083333,0.346667,0.013333,0.008333,0.003467
20,5.9419,5.792597,0.04,0.085633,0.04,0.14,0.49,0.04,0.14,0.49,0.04,0.014,0.0049
30,5.5118,5.381407,0.083333,0.141382,0.083333,0.256667,0.58,0.083333,0.256667,0.58,0.083333,0.025667,0.0058
40,5.1135,4.964217,0.156667,0.216049,0.156667,0.356667,0.643333,0.156667,0.356667,0.643333,0.156667,0.035667,0.006433
50,4.497,4.798158,0.19,0.25358,0.19,0.383333,0.663333,0.19,0.383333,0.663333,0.19,0.038333,0.006633
60,4.1869,4.669966,0.216667,0.289008,0.216667,0.4,0.673333,0.216667,0.4,0.673333,0.216667,0.04,0.006733
70,4.169,4.513351,0.256667,0.319008,0.256667,0.45,0.723333,0.256667,0.45,0.723333,0.256667,0.045,0.007233
80,4.3043,4.408713,0.266667,0.335853,0.266667,0.473333,0.733333,0.266667,0.473333,0.733333,0.266667,0.047333,0.007333
90,3.8476,4.097679,0.363333,0.424435,0.363333,0.526667,0.766667,0.363333,0.526667,0.766667,0.363333,0.052667,0.007667
100,3.3061,4.185694,0.35,0.414009,0.35,0.553333,0.78,0.35,0.553333,0.78,0.35,0.055333,0.0078


Query Dataset Test Metrics: {
    "eval_loss": 2.063112735748291,
    "eval_accuracy": 0.8233333333333334,
    "eval_mAP": 0.837499539564592,
    "eval_Hint@1": 0.8233333333333334,
    "eval_Hint@10": 0.8633333333333333,
    "eval_Hint@100": 0.9266666666666666,
    "eval_recall@1": 0.8233333333333334,
    "eval_recall@10": 0.8633333333333333,
    "eval_recall@100": 0.9266666666666666,
    "eval_precision@1": 0.8233333333333334,
    "eval_precision@10": 0.08633333333333333,
    "eval_precision@100": 0.009266666666666666,
    "eval_runtime": 0.5656,
    "eval_samples_per_second": 530.44,
    "eval_steps_per_second": 17.681,
    "epoch": 10.0
}
##########Starting training cycle 2##########


Step,Training Loss,Validation Loss,Accuracy,Map,Hint@1,Hint@10,Hint@100,Recall@1,Recall@10,Recall@100,Precision@1,Precision@10,Precision@100
10,1.1801,1.9906,0.866667,0.872,0.866667,0.886667,0.926667,0.866667,0.886667,0.926667,0.866667,0.088667,0.009267
20,1.0848,1.994292,0.893333,0.898441,0.893333,0.9,0.943333,0.893333,0.9,0.943333,0.893333,0.09,0.009433
30,1.1117,2.272829,0.866667,0.884513,0.866667,0.903333,0.953333,0.866667,0.903333,0.953333,0.866667,0.090333,0.009533
40,1.2492,2.091163,0.843333,0.865665,0.843333,0.906667,0.963333,0.843333,0.906667,0.963333,0.843333,0.090667,0.009633
50,1.1374,2.273551,0.83,0.860117,0.83,0.91,0.97,0.83,0.91,0.97,0.83,0.091,0.0097
60,1.1291,2.323157,0.86,0.877203,0.86,0.903333,0.966667,0.86,0.903333,0.966667,0.86,0.090333,0.009667
70,1.0883,2.438304,0.866667,0.880418,0.866667,0.9,0.97,0.866667,0.9,0.97,0.866667,0.09,0.0097
80,1.0491,2.304219,0.87,0.883896,0.87,0.906667,0.97,0.87,0.906667,0.97,0.87,0.090667,0.0097
90,1.0366,2.269526,0.883333,0.892813,0.883333,0.91,0.973333,0.883333,0.91,0.973333,0.883333,0.091,0.009733
100,1.0033,2.36061,0.883333,0.895071,0.883333,0.906667,0.973333,0.883333,0.906667,0.973333,0.883333,0.090667,0.009733


Indexing Dataset Test Metrics: {
    "eval_loss": 2.2778730392456055,
    "eval_accuracy": 0.9,
    "eval_mAP": 0.9092368525752316,
    "eval_Hint@1": 0.9,
    "eval_Hint@10": 0.92,
    "eval_Hint@100": 0.9733333333333334,
    "eval_recall@1": 0.9,
    "eval_recall@10": 0.92,
    "eval_recall@100": 0.9733333333333334,
    "eval_precision@1": 0.9,
    "eval_precision@10": 0.092,
    "eval_precision@100": 0.009733333333333333,
    "eval_runtime": 0.57,
    "eval_samples_per_second": 526.35,
    "eval_steps_per_second": 17.545,
    "epoch": 10.0
}


Step,Training Loss,Validation Loss,Accuracy,Map,Hint@1,Hint@10,Hint@100,Recall@1,Recall@10,Recall@100,Precision@1,Precision@10,Precision@100
10,1.3454,2.052304,0.893333,0.899928,0.893333,0.906667,0.98,0.893333,0.906667,0.98,0.893333,0.090667,0.0098
20,1.0828,1.664072,0.893333,0.90199,0.893333,0.913333,0.97,0.893333,0.913333,0.97,0.893333,0.091333,0.0097
30,1.0483,1.71321,0.893333,0.902539,0.893333,0.913333,0.976667,0.893333,0.913333,0.976667,0.893333,0.091333,0.009767
40,1.0355,1.674879,0.903333,0.908031,0.903333,0.916667,0.976667,0.903333,0.916667,0.976667,0.903333,0.091667,0.009767
50,1.0422,1.766106,0.89,0.899664,0.89,0.916667,0.973333,0.89,0.916667,0.973333,0.89,0.091667,0.009733
60,1.0467,1.753112,0.896667,0.90192,0.896667,0.913333,0.966667,0.896667,0.913333,0.966667,0.896667,0.091333,0.009667
70,1.049,1.791327,0.89,0.896892,0.89,0.91,0.97,0.89,0.91,0.97,0.89,0.091,0.0097
80,1.072,1.761824,0.89,0.900078,0.89,0.913333,0.973333,0.89,0.913333,0.973333,0.89,0.091333,0.009733
90,1.0796,1.8185,0.88,0.892073,0.88,0.9,0.97,0.88,0.9,0.97,0.88,0.09,0.0097
100,1.1175,1.823001,0.873333,0.887455,0.873333,0.9,0.973333,0.873333,0.9,0.973333,0.873333,0.09,0.009733


Query Dataset Test Metrics: {
    "eval_loss": 1.679082989692688,
    "eval_accuracy": 0.89,
    "eval_mAP": 0.9002551528796817,
    "eval_Hint@1": 0.89,
    "eval_Hint@10": 0.9166666666666666,
    "eval_Hint@100": 0.98,
    "eval_recall@1": 0.89,
    "eval_recall@10": 0.9166666666666666,
    "eval_recall@100": 0.98,
    "eval_precision@1": 0.89,
    "eval_precision@10": 0.09166666666666666,
    "eval_precision@100": 0.0098,
    "eval_runtime": 0.5437,
    "eval_samples_per_second": 551.732,
    "eval_steps_per_second": 18.391,
    "epoch": 10.0
}


In [None]:
# test final
from json import dumps
print("Evaluating on test set for query dataset...")
metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")

Evaluating on test set for query dataset...
Query Dataset Test Metrics: {
    "eval_loss": 1.3098227977752686,
    "eval_accuracy": 0.8266666666666667,
    "eval_mAP": 0.8520377084375034,
    "eval_Hint@1": 0.8266666666666667,
    "eval_Hint@10": 0.89,
    "eval_Hint@100": 0.9166666666666666,
    "eval_recall@1": 0.8266666666666667,
    "eval_recall@10": 0.89,
    "eval_recall@100": 0.9166666666666666,
    "eval_precision@1": 0.8266666666666667,
    "eval_precision@10": 0.089,
    "eval_precision@100": 0.009166666666666667,
    "eval_runtime": 0.5565,
    "eval_samples_per_second": 539.063,
    "eval_steps_per_second": 17.969,
    "epoch": 10.0
}


In [None]:
# testea la salida de una pregunta, me refiero a obtener la predicción

print("Evaluating on test set for query dataset...")
index = 100
query = tokenized_datasets_query["test"][index]["text"]
real_label_id = tokenized_datasets_query["test"][index]["label"]
id_to_label = {v: k for k, v in label_to_id.items()}
real_label = id_to_label[real_label_id]
print(f"Sample query: {query}")
print(f"Real label for the first test query: {real_label}")
inputs = tokenizer(query, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
inputs = {k: v.to(trainer_query.model.device) for k, v in inputs.items()}
with torch.no_grad():
    outputs = trainer_query.model(**inputs)
logits = outputs.logits
predicted_class_id = logits.argmax().item()
id_to_label = {v: k for k, v in label_to_id.items()}
predicted_label = id_to_label[predicted_class_id]
print(f"Predicted label for query '{query}': {predicted_label}")
# extra los top k prediciones con mayor probabilidad
probs = torch.softmax(logits, dim=-1)
top_k = 20
top_k_probs, top_k_indices = torch.topk(probs, top_k)
print(f"Top {top_k} predictions:")
for prob, idx in zip(top_k_probs[0], top_k_indices[0]):
    label = id_to_label[idx.item()]
    print(f"Label: {label}, Probability: {prob.item():.4f}")
print("Training completed.")

Evaluating on test set for query dataset...
Sample query: ¿Cómo puedo contactar con Hugo Castro?
Real label for the first test query: 7557_4
Predicted label for query '¿Cómo puedo contactar con Hugo Castro?': 7557_4
Top 20 predictions:
Label: 7557_4, Probability: 0.9956
Label: 7542_3, Probability: 0.0013
Label: 7579_1, Probability: 0.0008
Label: 7517_5, Probability: 0.0002
Label: 7513_3, Probability: 0.0002
Label: 7502_4, Probability: 0.0001
Label: 7550_2, Probability: 0.0001
Label: 7568_1, Probability: 0.0001
Label: 7552_5, Probability: 0.0001
Label: 7511_5, Probability: 0.0001
Label: 7503_5, Probability: 0.0001
Label: 7540_2, Probability: 0.0001
Label: 7534_3, Probability: 0.0001
Label: 7504_4, Probability: 0.0001
Label: 7515_5, Probability: 0.0001
Label: 7539_4, Probability: 0.0000
Label: 7519_3, Probability: 0.0000
Label: 7546_5, Probability: 0.0000
Label: 7552_4, Probability: 0.0000
Label: 7538_5, Probability: 0.0000
Training completed.


In [26]:
# me gustaria que me hicieras un bucle con la logica de la celda anterior para ver si no acierta alguna muestra me diga que posicion esta en la lista de logits
from tqdm import tqdm

labels_predicted = []
labels_real = []

for index in tqdm(range(len(tokenized_datasets_query["test"]))):
    query = tokenized_datasets_query["test"][index]["text"]
    real_label_id = tokenized_datasets_query["test"][index]["label"]
    id_to_label = {v: k for k, v in label_to_id.items()}
    real_label = id_to_label[real_label_id]
    inputs = tokenizer(query, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    inputs = {k: v.to(trainer_query.model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = trainer_query.model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)

    # get list of labels sorted by probability
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    sorted_labels = [id_to_label[idx.item()] for idx in sorted_indices[0]]

    labels_predicted.append(sorted_labels)
    labels_real.append([real_label])

100%|██████████| 300/300 [00:21<00:00, 14.12it/s]


In [27]:
from ranking_metrics import calc_ranking_metrics

metrics = calc_ranking_metrics(labels_predicted, labels_real, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.9003
  mAP: 0.9003
  AvgRank: 9.2100
  CMC@1: 0.8900
  Recall@k (macro)@1: 0.8900
  Precision@k (macro)@1: 0.8900
  Accuracy@1: 0.8900
  F1@k (macro)@1: 0.8900
  CMC@5: 0.9100
  Recall@k (macro)@5: 0.9100
  Precision@k (macro)@5: 0.1820
  Accuracy@5: 0.9100
  F1@k (macro)@5: 0.3033
  CMC@10: 0.9167
  Recall@k (macro)@10: 0.9167
  Precision@k (macro)@10: 0.0917
  Accuracy@10: 0.9167
  F1@k (macro)@10: 0.1667
  CMC@20: 0.9167
  Recall@k (macro)@20: 0.9167
  Precision@k (macro)@20: 0.0458
  Accuracy@20: 0.9167
  F1@k (macro)@20: 0.0873
  CMC@100: 0.9800
  Recall@k (macro)@100: 0.9800
  Precision@k (macro)@100: 0.0098
  Accuracy@100: 0.9800
  F1@k (macro)@100: 0.0194
