In [1]:
import argparse, os, numpy as np, torch
from datasets import load_dataset, ClassLabel, load_from_disk
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import BitsAndBytesConfig
from ranking_metrics import calc_ranking_metrics
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Dataset agenda

In [2]:
dataset_indexing_csv = pd.read_csv("../notebooks/data/contacts_docs.csv")

# to dataset huggingface
from datasets import Dataset

dataset_indexing = Dataset.from_pandas(dataset_indexing_csv)

# create a new column 'text' that concatenates 'name', 'phone'
def concatenate_columns(example):
    return {
        "text": f"Nombre: {example['name']}\nTeléfono: {example['phone']}"
    }
dataset_indexing = dataset_indexing.map(concatenate_columns)
# rename column 'id' to 'label'
dataset_indexing = dataset_indexing.rename_column("id", "label")

num_labels = len(dataset_indexing['label'])
print(f"Number of labels: {num_labels}")
labels_list = dataset_indexing.unique('label')
print(f"Labels: {labels_list}")

# map labels to integers
label_to_id = {label: i for i, label in enumerate(labels_list)}
def map_labels(example):
    return {
        "label": label_to_id[example['label']]
    }
dataset_indexing = dataset_indexing.map(map_labels)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map: 100%|██████████| 400/400 [00:00<00:00, 29684.73 examples/s]


Number of labels: 400
Labels: ['7500_1', '7500_2', '7500_3', '7500_4', '7500_5', '7501_1', '7501_2', '7501_3', '7501_4', '7501_5', '7502_1', '7502_2', '7502_3', '7502_4', '7502_5', '7503_1', '7503_2', '7503_3', '7503_4', '7503_5', '7504_1', '7504_2', '7504_3', '7504_4', '7504_5', '7505_1', '7505_2', '7505_3', '7505_4', '7505_5', '7506_1', '7506_2', '7506_3', '7506_4', '7506_5', '7507_1', '7507_2', '7507_3', '7507_4', '7507_5', '7508_1', '7508_2', '7508_3', '7508_4', '7508_5', '7509_1', '7509_2', '7509_3', '7509_4', '7509_5', '7510_1', '7510_2', '7510_3', '7510_4', '7510_5', '7511_1', '7511_2', '7511_3', '7511_4', '7511_5', '7512_1', '7512_2', '7512_3', '7512_4', '7512_5', '7513_1', '7513_2', '7513_3', '7513_4', '7513_5', '7514_1', '7514_2', '7514_3', '7514_4', '7514_5', '7515_1', '7515_2', '7515_3', '7515_4', '7515_5', '7516_1', '7516_2', '7516_3', '7516_4', '7516_5', '7517_1', '7517_2', '7517_3', '7517_4', '7517_5', '7518_1', '7518_2', '7518_3', '7518_4', '7518_5', '7519_1', '7519_2',

Map: 100%|██████████| 400/400 [00:00<00:00, 48656.41 examples/s]


In [3]:
query_dataset_train = pd.read_csv("../notebooks/data/contacts_queries_train.csv")
query_dataset_val = pd.read_csv("../notebooks/data/contacts_queries_val.csv")
query_dataset_test = pd.read_csv("../notebooks/data/contacts_queries_test.csv")

In [4]:
dataset_for_queries = {
    "train": Dataset.from_pandas(query_dataset_train),
    "validation": Dataset.from_pandas(query_dataset_val),
    "test": Dataset.from_pandas(query_dataset_test)
}

for split in dataset_for_queries:
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("question", "text")
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("id", "label")
    # map labels to integers
    dataset_for_queries[split] = dataset_for_queries[split].map(map_labels)

# to dataset huggingface
from datasets import DatasetDict
dataset_for_queries = DatasetDict(dataset_for_queries)
print(dataset_for_queries["train"][1])


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map: 100%|██████████| 1400/1400 [00:00<00:00, 56029.17 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 26971.86 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 58746.50 examples/s]

{'text': '¿Cómo puedo contactar con Antonio Alonso?', 'label': 194}





## Dataset parlamento

In [2]:
FOLDER_AUTORE = "../data/processed/parliament_all_docs"
dataset_indexing = load_from_disk(FOLDER_AUTORE)["all"]
dataset_indexing = dataset_indexing.rename_column("PK", "label")

num_labels = len(dataset_indexing['label'])
print(f"Number of labels: {num_labels}")
labels_list = dataset_indexing.unique('label')
print(f"Labels: {labels_list}")

# map labels to integers
label_to_id = {label: i for i, label in enumerate(labels_list)}
def map_labels(example):
    return {
        "label": label_to_id[example['label']]
    }
dataset_indexing = dataset_indexing.map(map_labels)
print(dataset_indexing[:]["label"])

Number of labels: 11162
Labels: ['6596_4', '6092_28', '6555_1', '5830_8', '5587_7', '6421_4', '5860_19', '6502_6', '6406_10', '6331_17', '6220_17', '5575_18', '5402_5', '6495_2', '5852_21', '5458_1', '6104_22', '6080_28', '6031_13', '6341_28', '6225_5', '6070_6', '6296_13', '5643_19', '6186_5', '6195_25', '5491_6', '6532_13', '5723_10', '5528_20', '6285_6', '5901_15', '5579_2', '6267_1', '5834_3', '6516_30', '6552_30', '6594_25', '5651_1', '5985_3', '6235_7', '6404_3', '5582_3', '6044_11', '6276_5', '5423_9', '5528_29', '6329_30', '6259_26', '5579_5', '5893_2', '5698_3', '6084_11', '6120_5', '5878_18', '6268_3', '6355_17', '6029_28', '5488_1', '6596_19', '6293_5', '6131_12', '6451_3', '5463_3', '6141_13', '5825_31', '6062_11', '6092_16', '6357_23', '6357_13', '5498_4', '6321_6', '6318_16', '5401_3', '5852_15', '6094_20', '5478_25', '6581_33', '6283_28', '5573_12', '6394_9', '5917_3', '6033_26', '6410_15', '5978_6', '5404_3', '5442_22', '5942_12', '6385_10', '5892_2', '6425_4', '6221_9'

In [3]:
dataset_for_queries = load_from_disk("../data/processed/parliament_qa")

# quiedate solo con id y question
for split in dataset_for_queries.keys():
    print(f"Split: {split}, num examples: {len(dataset_for_queries[split])}")
    dataset_for_queries[split] = dataset_for_queries[split].remove_columns([col for col in dataset_for_queries[split].column_names if col not in ["id", "question"]])
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("question", "text")
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("id", "label")
# map labels to integers
    dataset_for_queries[split] = dataset_for_queries[split].map(map_labels)


Split: train, num examples: 614
Split: validation, num examples: 161
Split: test, num examples: 205


In [4]:
# quiero crear un dataset tiny para pruebas solo usando los ejemplos con label 0,1,2 tanto en indexing como en queries
labels_to_keep = sorted(set(dataset_for_queries["test"][:]["label"]))[:100]
dataset_indexing = dataset_indexing.filter(lambda example: example['label'] in labels_to_keep)
for split in dataset_for_queries.keys():
    dataset_for_queries[split] = dataset_for_queries[split].filter(lambda example: example['label'] in labels_to_keep)
print(dataset_indexing)
print(dataset_for_queries["test"])

Dataset({
    features: ['text', 'label'],
    num_rows: 100
})
Dataset({
    features: ['text', 'label'],
    num_rows: 152
})


## Embeddings

In [5]:
from langchain.schema import Document
from langchain.retrievers import BM25Retriever

def create_documents_from_datasets(datasets):
    documents = []
    for dataset in datasets:
        for item in dataset:
            doc = Document(
                page_content=item["text"],
                metadata={"label": item["label"]}
            )
            documents.append(doc)
    return documents
documents_indexing = create_documents_from_datasets([dataset_indexing])
print(f"Number of documents created for indexing: {len(documents_indexing)}")

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"device": "cuda"},
)

vectorstore = FAISS.from_documents(documents_indexing, embedding_model)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 100})

retriever_sparse = BM25Retriever.from_documents(documents_indexing)

Number of documents created for indexing: 400


In [6]:
from tqdm import tqdm

list_of_labels_embeddings = []
list_of_labels_sparse = []
list_of_real_labels_embeddings = []
for index in tqdm(range(len(dataset_for_queries["test"]))):
    query = dataset_for_queries["test"][index]["text"]
    real_label_id = dataset_for_queries["test"][index]["label"]
    id_to_label = {v: k for k, v in label_to_id.items()}
    real_label = id_to_label[real_label_id]

    # retrieve documents
    docs = retriever.get_relevant_documents(query)
    # get labels from retrieved documents
    retrieved_labels = [doc.metadata["label"] for doc in docs]
    # to names
    retrieved_labels = [id_to_label[label_id] for label_id in retrieved_labels]
    list_of_labels_embeddings.append(retrieved_labels)
    list_of_real_labels_embeddings.append([real_label])

    # retrieve documents (sparse)
    docs_sparse = retriever_sparse.get_relevant_documents(query)
    # get labels from retrieved documents
    retrieved_labels_sparse = [doc.metadata["label"] for doc in docs_sparse]
    # to names
    retrieved_labels_sparse = [id_to_label[label_id] for label_id in retrieved_labels_sparse]
    list_of_labels_sparse.append(retrieved_labels_sparse)


  docs = retriever.get_relevant_documents(query)
100%|██████████| 300/300 [00:09<00:00, 31.33it/s]


In [7]:
from ranking_metrics import calc_ranking_metrics

metrics = calc_ranking_metrics(list_of_labels_embeddings, list_of_real_labels_embeddings, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.7261
  mAP: 0.7261
  AvgRank: 7.6983
  CMC@1: 0.6800
  Recall@k (macro)@1: 0.6800
  Precision@k (macro)@1: 0.6800
  Accuracy@1: 0.6800
  F1@k (macro)@1: 0.6800
  CMC@5: 0.7600
  Recall@k (macro)@5: 0.7600
  Precision@k (macro)@5: 0.1520
  Accuracy@5: 0.7600
  F1@k (macro)@5: 0.2533
  CMC@10: 0.8300
  Recall@k (macro)@10: 0.8300
  Precision@k (macro)@10: 0.0830
  Accuracy@10: 0.8300
  F1@k (macro)@10: 0.1509
  CMC@20: 0.8833
  Recall@k (macro)@20: 0.8833
  Precision@k (macro)@20: 0.0442
  Accuracy@20: 0.8833
  F1@k (macro)@20: 0.0841
  CMC@100: 0.9833
  Recall@k (macro)@100: 0.9833
  Precision@k (macro)@100: 0.0098
  Accuracy@100: 0.9833
  F1@k (macro)@100: 0.0195


In [8]:
metrics_sparse = calc_ranking_metrics(list_of_labels_sparse, list_of_real_labels_embeddings, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Sparse Ranking Metrics:")
for k, v in metrics_sparse.items():
    print(f"  {k}: {v:.4f}")

Sparse Ranking Metrics:
  MRR: 0.5539
  mAP: 0.5539
  AvgRank: 1.4158
  CMC@1: 0.5133
  Recall@k (macro)@1: 0.5133
  Precision@k (macro)@1: 0.5133
  Accuracy@1: 0.5133
  F1@k (macro)@1: 0.5133
  CMC@5: 0.6333
  Recall@k (macro)@5: 0.6333
  Precision@k (macro)@5: 0.1267
  Accuracy@5: 0.6333
  F1@k (macro)@5: 0.2111
  CMC@10: 0.6333
  Recall@k (macro)@10: 0.6333
  Precision@k (macro)@10: 0.0633
  Accuracy@10: 0.6333
  F1@k (macro)@10: 0.1152
  CMC@20: 0.6333
  Recall@k (macro)@20: 0.6333
  Precision@k (macro)@20: 0.0317
  Accuracy@20: 0.6333
  F1@k (macro)@20: 0.0603
  CMC@100: 0.6333
  Recall@k (macro)@100: 0.6333
  Precision@k (macro)@100: 0.0063
  Accuracy@100: 0.6333
  F1@k (macro)@100: 0.0125


## Model

In [5]:
model_name = "Qwen/Qwen3-0.6B"
MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_type="float16"
)

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model = prepare_model_for_kbit_training(model)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
lora_r = 1024
lora_alpha = lora_r * 2
lora_dropout = 0.0
lora_bias = "none"
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj"]

In [9]:
config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    target_modules=target_modules
)
model = get_peft_model(model, config)

In [10]:
for p in model.base_model.model.score.parameters():
    p.requires_grad_(True)

In [11]:
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding=False)

# tokenize test from dataset
tokenized_datasets_indexing = dataset_indexing.map(preprocess, batched=True)
tokenized_datasets_query = dataset_for_queries.map(preprocess, batched=True)


In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

def softmax(x, axis=-1):
    x = np.asarray(x, dtype=np.float64)
    # Restar el máximo para evitar overflow
    x_shift = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(x_shift)
    return exps / np.sum(exps, axis=axis, keepdims=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    # softmax to get probabilities using numpy
    probs = softmax(logits, axis=-1)
    # get list of labels sorted by probability
    sorted_indices = np.argsort(probs, axis=-1)
    # reverse to have the highest probability first
    sorted_indices = sorted_indices[:, ::-1]
    list_of_labels = [
        [idx] for idx in labels  
    ]

    ranking_metrics = calc_ranking_metrics(
        sorted_indices, 
        list_of_labels, 
        ks=[1, 5, 10, 20, 100], 
        one_relevant_per_query=True
    )

    return {
        "accuracy": acc,  
        "mAP": ranking_metrics["mAP"], 
        "Hint@1": ranking_metrics["Accuracy@1"], 
        "Hint@10": ranking_metrics["Accuracy@10"],
        "Hint@100": ranking_metrics["Accuracy@100"],
        # recall@k
        "recall@1": ranking_metrics["Recall@k (macro)@1"],
        "recall@10": ranking_metrics["Recall@k (macro)@10"],
        "recall@100": ranking_metrics["Recall@k (macro)@100"],
        # precision@k
        "precision@1": ranking_metrics["Precision@k (macro)@1"],
        "precision@10": ranking_metrics["Precision@k (macro)@10"],
        "precision@100": ranking_metrics["Precision@k (macro)@100"],
    }

In [13]:
# testing compute_metrics
test_logits = torch.tensor([[0.1, 0.69, 0.99], [0.8, 0.1, 0.91]])
test_labels = torch.tensor([1, 0])
test_eval_pred = (test_logits.detach().numpy(), test_labels.detach().numpy())
test_metrics = compute_metrics(test_eval_pred)
print(f"Test compute_metrics: {test_metrics}")

Test compute_metrics: {'accuracy': np.float64(0.0), 'mAP': 0.5, 'Hint@1': 0.0, 'Hint@10': 1.0, 'Hint@100': 1.0, 'recall@1': 0.0, 'recall@10': 1.0, 'recall@100': 1.0, 'precision@1': 0.0, 'precision@10': 0.1, 'precision@100': 0.01}


In [14]:
SEED = 42
EPOCHS = 10
BATCH_SIZE = 8
CYCLES = 2

In [15]:
import wandb

wandb.init(
    project="parlamento_multilabel_classification_retriever",
    name="entrenamiento_indexing_y_query_testing",
    config={
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "model": model_name,
        "cycles": CYCLES
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mmiguel_kjh[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




In [16]:
from transformers import TrainerCallback

class PrefixedWandbCallback(TrainerCallback):
    def __init__(self, phase=""):
        self.phase = phase

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            prefixed_logs = {f"{self.phase}/{k}": v for k, v in logs.items()}
            wandb.log(prefixed_logs, step=state.global_step)

In [17]:
# Entrenamiento

training_args_indexing = TrainingArguments(
    output_dir=f"models/parlamento_clf_{model_name.replace('/', '_')}_indexing",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.05,
    warmup_ratio=0.2,
    lr_scheduler_type="cosine",
    eval_strategy="steps",     # o "no" si no vas a evaluar
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="wandb",
    seed=SEED,
    label_smoothing_factor=0.1,
)

training_args_query = TrainingArguments(
    output_dir=f"models/parlamento_clf_{model_name.replace('/', '_')}_query",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS*5,
    learning_rate=5e-5,
    weight_decay=0.05,
    warmup_ratio=0.2,
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="wandb",
    seed=SEED,
    label_smoothing_factor=0.1,
)

# IDs de tokens
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id

trainer_indexing = Trainer(
    model=model,
    args=training_args_indexing,
    train_dataset=tokenized_datasets_indexing,
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        PrefixedWandbCallback(phase="indexing"),
        #EarlyStoppingCallback(early_stopping_patience=3),
    ],
)

trainer_query = Trainer(
    model=model,
    args=training_args_query,
    train_dataset=tokenized_datasets_query["train"],
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[PrefixedWandbCallback(phase="query")],
)

In [18]:
model.print_trainable_parameters()  # Verificar parámetros entrenables

trainable params: 551,342,080 || all params: 1,147,392,000 || trainable%: 48.0518


In [19]:
from json import dumps
sep = "#" * 10
for ci in range(CYCLES):
    print(f" {sep} Starting training cycle {ci + 1} {sep} ")
    trainer_indexing.train()
    metrics_indexing = trainer_indexing.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Indexing Dataset Test Metrics: {dumps(metrics_indexing, indent=4)}")
    trainer_query.train()
    metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")
    

 ########## Starting training cycle 1 ########## 


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Accuracy,Map,Hint@1,Hint@10,Hint@100,Recall@1,Recall@10,Recall@100,Precision@1,Precision@10,Precision@100
10,11.3898,9.763411,0.0,0.001096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,9.2573,8.483086,0.0,0.015423,0.0,0.071429,0.214286,0.0,0.071429,0.214286,0.0,0.007143,0.002143
30,8.5066,6.523767,0.071429,0.100584,0.071429,0.071429,0.642857,0.071429,0.071429,0.642857,0.071429,0.007143,0.006429
40,6.8861,6.403069,0.0,0.027671,0.0,0.0,0.857143,0.0,0.0,0.857143,0.0,0.0,0.008571
50,4.8939,6.40974,0.0,0.051766,0.0,0.142857,0.785714,0.0,0.142857,0.785714,0.0,0.014286,0.007857
60,2.7185,6.525075,0.0,0.035054,0.0,0.071429,0.928571,0.0,0.071429,0.928571,0.0,0.007143,0.009286
70,1.8748,6.057748,0.0,0.037927,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.01
80,1.8734,5.921809,0.142857,0.186718,0.142857,0.214286,0.928571,0.142857,0.214286,0.928571,0.142857,0.021429,0.009286
90,1.9863,6.034091,0.071429,0.13876,0.071429,0.214286,0.928571,0.071429,0.214286,0.928571,0.071429,0.021429,0.009286
100,1.5345,6.036402,0.071429,0.105757,0.071429,0.142857,0.928571,0.071429,0.142857,0.928571,0.071429,0.014286,0.009286


Indexing Dataset Test Metrics: {
    "eval_loss": 6.372993469238281,
    "eval_accuracy": 0.0,
    "eval_mAP": 0.05538053027864388,
    "eval_Hint@1": 0.0,
    "eval_Hint@10": 0.125,
    "eval_Hint@100": 0.8618421052631579,
    "eval_recall@1": 0.0,
    "eval_recall@10": 0.125,
    "eval_recall@100": 0.8618421052631579,
    "eval_precision@1": 0.0,
    "eval_precision@10": 0.0125,
    "eval_precision@100": 0.008618421052631579,
    "eval_runtime": 1.4601,
    "eval_samples_per_second": 104.1,
    "eval_steps_per_second": 13.012,
    "epoch": 10.0
}




Step,Training Loss,Validation Loss,Accuracy,Map,Hint@1,Hint@10,Hint@100,Recall@1,Recall@10,Recall@100,Precision@1,Precision@10,Precision@100
10,5.0785,5.299538,0.0,0.07993,0.0,0.214286,1.0,0.0,0.214286,1.0,0.0,0.021429,0.01
20,2.8586,5.444623,0.142857,0.225172,0.142857,0.571429,0.928571,0.142857,0.571429,0.928571,0.142857,0.057143,0.009286
30,1.6829,4.4392,0.142857,0.294061,0.142857,0.571429,1.0,0.142857,0.571429,1.0,0.142857,0.057143,0.01
40,1.8432,4.640108,0.071429,0.281242,0.071429,0.642857,1.0,0.071429,0.642857,1.0,0.071429,0.064286,0.01
50,1.5478,4.727991,0.142857,0.307395,0.142857,0.714286,1.0,0.142857,0.714286,1.0,0.142857,0.071429,0.01
60,1.8182,4.821465,0.071429,0.255501,0.071429,0.928571,1.0,0.071429,0.928571,1.0,0.071429,0.092857,0.01
70,1.5933,4.633475,0.214286,0.336775,0.214286,0.714286,1.0,0.214286,0.714286,1.0,0.214286,0.071429,0.01
80,1.6444,4.694547,0.071429,0.288379,0.071429,0.785714,1.0,0.071429,0.785714,1.0,0.071429,0.078571,0.01
90,1.504,4.986434,0.214286,0.367319,0.214286,0.785714,1.0,0.214286,0.785714,1.0,0.214286,0.078571,0.01
100,1.4043,4.784133,0.214286,0.385614,0.214286,0.857143,1.0,0.214286,0.857143,1.0,0.214286,0.085714,0.01


Query Dataset Test Metrics: {
    "eval_loss": 8.243128776550293,
    "eval_accuracy": 0.09868421052631579,
    "eval_mAP": 0.11550141205428882,
    "eval_Hint@1": 0.09868421052631579,
    "eval_Hint@10": 0.13157894736842105,
    "eval_Hint@100": 0.2894736842105263,
    "eval_recall@1": 0.09868421052631579,
    "eval_recall@10": 0.13157894736842105,
    "eval_recall@100": 0.2894736842105263,
    "eval_precision@1": 0.09868421052631579,
    "eval_precision@10": 0.013157894736842105,
    "eval_precision@100": 0.0028947368421052633,
    "eval_runtime": 1.5124,
    "eval_samples_per_second": 100.503,
    "eval_steps_per_second": 12.563,
    "epoch": 50.0
}
 ########## Starting training cycle 2 ########## 


Step,Training Loss,Validation Loss,Accuracy,Map,Hint@1,Hint@10,Hint@100,Recall@1,Recall@10,Recall@100,Precision@1,Precision@10,Precision@100
10,5.3545,5.184026,0.214286,0.431806,0.214286,0.857143,1.0,0.214286,0.857143,1.0,0.214286,0.085714,0.01
20,2.4225,4.755459,0.285714,0.502472,0.285714,0.857143,0.928571,0.285714,0.857143,0.928571,0.285714,0.085714,0.009286
30,2.5215,4.933669,0.214286,0.356437,0.214286,0.714286,0.928571,0.214286,0.714286,0.928571,0.214286,0.071429,0.009286
40,2.206,4.816945,0.142857,0.29502,0.142857,0.642857,1.0,0.142857,0.642857,1.0,0.142857,0.064286,0.01
50,1.651,5.296973,0.285714,0.363369,0.285714,0.5,0.857143,0.285714,0.5,0.857143,0.285714,0.05,0.008571
60,1.8423,5.385022,0.142857,0.269314,0.142857,0.5,0.857143,0.142857,0.5,0.857143,0.142857,0.05,0.008571
70,1.7491,5.26705,0.142857,0.268826,0.142857,0.642857,0.928571,0.142857,0.642857,0.928571,0.142857,0.064286,0.009286
80,1.6967,5.410957,0.142857,0.248445,0.142857,0.5,0.928571,0.142857,0.5,0.928571,0.142857,0.05,0.009286
90,1.5321,5.487009,0.142857,0.252379,0.142857,0.357143,0.928571,0.142857,0.357143,0.928571,0.142857,0.035714,0.009286
100,1.5471,5.392066,0.142857,0.261304,0.142857,0.571429,0.928571,0.142857,0.571429,0.928571,0.142857,0.057143,0.009286


Indexing Dataset Test Metrics: {
    "eval_loss": 6.448309421539307,
    "eval_accuracy": 0.11842105263157894,
    "eval_mAP": 0.17166016707700144,
    "eval_Hint@1": 0.11842105263157894,
    "eval_Hint@10": 0.2565789473684211,
    "eval_Hint@100": 0.8223684210526315,
    "eval_recall@1": 0.11842105263157894,
    "eval_recall@10": 0.2565789473684211,
    "eval_recall@100": 0.8223684210526315,
    "eval_precision@1": 0.11842105263157894,
    "eval_precision@10": 0.02565789473684211,
    "eval_precision@100": 0.008223684210526315,
    "eval_runtime": 1.4798,
    "eval_samples_per_second": 102.714,
    "eval_steps_per_second": 12.839,
    "epoch": 10.0
}


Step,Training Loss,Validation Loss,Accuracy,Map,Hint@1,Hint@10,Hint@100,Recall@1,Recall@10,Recall@100,Precision@1,Precision@10,Precision@100
10,1.352,5.723165,0.142857,0.289856,0.142857,0.5,0.928571,0.142857,0.5,0.928571,0.142857,0.05,0.009286
20,1.3615,5.54329,0.214286,0.356389,0.214286,0.714286,0.928571,0.214286,0.714286,0.928571,0.214286,0.071429,0.009286
30,1.5028,4.913435,0.214286,0.417297,0.214286,0.785714,1.0,0.214286,0.785714,1.0,0.214286,0.078571,0.01
40,1.3649,4.625036,0.428571,0.565763,0.428571,0.857143,1.0,0.428571,0.857143,1.0,0.428571,0.085714,0.01
50,1.475,4.650359,0.214286,0.411499,0.214286,0.714286,1.0,0.214286,0.714286,1.0,0.214286,0.071429,0.01
60,1.4277,4.597027,0.214286,0.404134,0.214286,0.785714,1.0,0.214286,0.785714,1.0,0.214286,0.078571,0.01
70,1.3829,4.800975,0.142857,0.375443,0.142857,0.857143,1.0,0.142857,0.857143,1.0,0.142857,0.085714,0.01
80,1.355,4.492213,0.285714,0.501208,0.285714,0.857143,1.0,0.285714,0.857143,1.0,0.285714,0.085714,0.01
90,1.3597,4.796838,0.285714,0.428521,0.285714,0.714286,1.0,0.285714,0.714286,1.0,0.285714,0.071429,0.01
100,1.3464,4.877101,0.214286,0.395971,0.214286,0.785714,1.0,0.214286,0.785714,1.0,0.214286,0.078571,0.01


Query Dataset Test Metrics: {
    "eval_loss": 8.309011459350586,
    "eval_accuracy": 0.10526315789473684,
    "eval_mAP": 0.12073520468278305,
    "eval_Hint@1": 0.10526315789473684,
    "eval_Hint@10": 0.13157894736842105,
    "eval_Hint@100": 0.26973684210526316,
    "eval_recall@1": 0.10526315789473684,
    "eval_recall@10": 0.13157894736842105,
    "eval_recall@100": 0.26973684210526316,
    "eval_precision@1": 0.10526315789473684,
    "eval_precision@10": 0.013157894736842105,
    "eval_precision@100": 0.0026973684210526315,
    "eval_runtime": 1.5373,
    "eval_samples_per_second": 98.873,
    "eval_steps_per_second": 12.359,
    "epoch": 50.0
}


In [21]:
torch.cuda.empty_cache()

In [20]:
# test final
from json import dumps
print("Evaluating on test set for query dataset...")
metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")

Evaluating on test set for query dataset...
Query Dataset Test Metrics: {
    "eval_loss": 8.309011459350586,
    "eval_accuracy": 0.10526315789473684,
    "eval_mAP": 0.12073520468278305,
    "eval_Hint@1": 0.10526315789473684,
    "eval_Hint@10": 0.13157894736842105,
    "eval_Hint@100": 0.26973684210526316,
    "eval_recall@1": 0.10526315789473684,
    "eval_recall@10": 0.13157894736842105,
    "eval_recall@100": 0.26973684210526316,
    "eval_precision@1": 0.10526315789473684,
    "eval_precision@10": 0.013157894736842105,
    "eval_precision@100": 0.0026973684210526315,
    "eval_runtime": 2.0998,
    "eval_samples_per_second": 72.387,
    "eval_steps_per_second": 9.048,
    "epoch": 50.0
}


In [None]:
# testea la salida de una pregunta, me refiero a obtener la predicción

print("Evaluating on test set for query dataset...")
index = 100
query = tokenized_datasets_query["test"][index]["text"]
real_label_id = tokenized_datasets_query["test"][index]["label"]
id_to_label = {v: k for k, v in label_to_id.items()}
real_label = id_to_label[real_label_id]
print(f"Sample query: {query}")
print(f"Real label for the first test query: {real_label}")
inputs = tokenizer(query, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
inputs = {k: v.to(trainer_query.model.device) for k, v in inputs.items()}
with torch.no_grad():
    outputs = trainer_query.model(**inputs)
logits = outputs.logits
predicted_class_id = logits.argmax().item()
id_to_label = {v: k for k, v in label_to_id.items()}
predicted_label = id_to_label[predicted_class_id]
print(f"Predicted label for query '{query}': {predicted_label}")
# extra los top k prediciones con mayor probabilidad
probs = torch.softmax(logits, dim=-1)
top_k = 20
top_k_probs, top_k_indices = torch.topk(probs, top_k)
print(f"Top {top_k} predictions:")
for prob, idx in zip(top_k_probs[0], top_k_indices[0]):
    label = id_to_label[idx.item()]
    print(f"Label: {label}, Probability: {prob.item():.4f}")
print("Training completed.")

Evaluating on test set for query dataset...
Sample query: ¿Cómo puedo contactar con Hugo Castro?
Real label for the first test query: 7557_4
Predicted label for query '¿Cómo puedo contactar con Hugo Castro?': 7557_4
Top 20 predictions:
Label: 7557_4, Probability: 0.9956
Label: 7542_3, Probability: 0.0013
Label: 7579_1, Probability: 0.0008
Label: 7517_5, Probability: 0.0002
Label: 7513_3, Probability: 0.0002
Label: 7502_4, Probability: 0.0001
Label: 7550_2, Probability: 0.0001
Label: 7568_1, Probability: 0.0001
Label: 7552_5, Probability: 0.0001
Label: 7511_5, Probability: 0.0001
Label: 7503_5, Probability: 0.0001
Label: 7540_2, Probability: 0.0001
Label: 7534_3, Probability: 0.0001
Label: 7504_4, Probability: 0.0001
Label: 7515_5, Probability: 0.0001
Label: 7539_4, Probability: 0.0000
Label: 7519_3, Probability: 0.0000
Label: 7546_5, Probability: 0.0000
Label: 7552_4, Probability: 0.0000
Label: 7538_5, Probability: 0.0000
Training completed.


In [21]:
# me gustaria que me hicieras un bucle con la logica de la celda anterior para ver si no acierta alguna muestra me diga que posicion esta en la lista de logits
from tqdm import tqdm

labels_predicted = []
labels_real = []

for index in tqdm(range(len(tokenized_datasets_query["test"]))):
    query = tokenized_datasets_query["test"][index]["text"]
    real_label_id = tokenized_datasets_query["test"][index]["label"]
    id_to_label = {v: k for k, v in label_to_id.items()}
    real_label = id_to_label[real_label_id]
    inputs = tokenizer(query, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    inputs = {k: v.to(trainer_query.model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = trainer_query.model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)

    # get list of labels sorted by probability
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    sorted_labels = [id_to_label[idx.item()] for idx in sorted_indices[0]]

    labels_predicted.append(sorted_labels)
    labels_real.append([real_label])

100%|██████████| 152/152 [01:05<00:00,  2.32it/s]


In [22]:
from ranking_metrics import calc_ranking_metrics

metrics = calc_ranking_metrics(labels_predicted, labels_real, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.1207
  mAP: 0.1207
  AvgRank: 2822.1118
  CMC@1: 0.1053
  Recall@k (macro)@1: 0.1053
  Precision@k (macro)@1: 0.1053
  Accuracy@1: 0.1053
  F1@k (macro)@1: 0.1053
  CMC@5: 0.1250
  Recall@k (macro)@5: 0.1250
  Precision@k (macro)@5: 0.0250
  Accuracy@5: 0.1250
  F1@k (macro)@5: 0.0417
  CMC@10: 0.1316
  Recall@k (macro)@10: 0.1316
  Precision@k (macro)@10: 0.0132
  Accuracy@10: 0.1316
  F1@k (macro)@10: 0.0239
  CMC@20: 0.2237
  Recall@k (macro)@20: 0.2237
  Precision@k (macro)@20: 0.0112
  Accuracy@20: 0.2237
  F1@k (macro)@20: 0.0213
  CMC@100: 0.2697
  Recall@k (macro)@100: 0.2697
  Precision@k (macro)@100: 0.0027
  Accuracy@100: 0.2697
  F1@k (macro)@100: 0.0053
