In [1]:
import argparse, os, numpy as np, torch
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import BitsAndBytesConfig
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_indexing_csv = pd.read_csv("../notebooks/data/contacts_docs.csv")

# to dataset huggingface
from datasets import Dataset

dataset_indexing = Dataset.from_pandas(dataset_indexing_csv)

# create a new column 'text' that concatenates 'name', 'phone'
def concatenate_columns(example):
    return {
        "text": f"Nombre: {example['name']}\nTeléfono: {example['phone']}"
    }
dataset_indexing = dataset_indexing.map(concatenate_columns)
# rename column 'id' to 'label'
dataset_indexing = dataset_indexing.rename_column("id", "label")

num_labels = len(dataset_indexing['label'])
print(f"Number of labels: {num_labels}")
labels_list = dataset_indexing.unique('label')
print(f"Labels: {labels_list}")

# map labels to integers
label_to_id = {label: i for i, label in enumerate(labels_list)}
def map_labels(example):
    return {
        "label": label_to_id[example['label']]
    }
dataset_indexing = dataset_indexing.map(map_labels)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map: 100%|██████████| 400/400 [00:00<00:00, 34054.35 examples/s]


Number of labels: 400
Labels: ['7500_1', '7500_2', '7500_3', '7500_4', '7500_5', '7501_1', '7501_2', '7501_3', '7501_4', '7501_5', '7502_1', '7502_2', '7502_3', '7502_4', '7502_5', '7503_1', '7503_2', '7503_3', '7503_4', '7503_5', '7504_1', '7504_2', '7504_3', '7504_4', '7504_5', '7505_1', '7505_2', '7505_3', '7505_4', '7505_5', '7506_1', '7506_2', '7506_3', '7506_4', '7506_5', '7507_1', '7507_2', '7507_3', '7507_4', '7507_5', '7508_1', '7508_2', '7508_3', '7508_4', '7508_5', '7509_1', '7509_2', '7509_3', '7509_4', '7509_5', '7510_1', '7510_2', '7510_3', '7510_4', '7510_5', '7511_1', '7511_2', '7511_3', '7511_4', '7511_5', '7512_1', '7512_2', '7512_3', '7512_4', '7512_5', '7513_1', '7513_2', '7513_3', '7513_4', '7513_5', '7514_1', '7514_2', '7514_3', '7514_4', '7514_5', '7515_1', '7515_2', '7515_3', '7515_4', '7515_5', '7516_1', '7516_2', '7516_3', '7516_4', '7516_5', '7517_1', '7517_2', '7517_3', '7517_4', '7517_5', '7518_1', '7518_2', '7518_3', '7518_4', '7518_5', '7519_1', '7519_2',

Map: 100%|██████████| 400/400 [00:00<00:00, 48209.01 examples/s]


In [3]:
query_dataset_train = pd.read_csv("../notebooks/data/contacts_queries_train.csv")
query_dataset_val = pd.read_csv("../notebooks/data/contacts_queries_val.csv")
query_dataset_test = pd.read_csv("../notebooks/data/contacts_queries_test.csv")

In [4]:
dataset_for_queries = {
    "train": Dataset.from_pandas(query_dataset_train),
    "validation": Dataset.from_pandas(query_dataset_val),
    "test": Dataset.from_pandas(query_dataset_test)
}

for split in dataset_for_queries:
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("question", "text")
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("id", "label")
    # map labels to integers
    dataset_for_queries[split] = dataset_for_queries[split].map(map_labels)

# to dataset huggingface
from datasets import DatasetDict
dataset_for_queries = DatasetDict(dataset_for_queries)
print(dataset_for_queries["train"][1])


Map: 100%|██████████| 1400/1400 [00:00<00:00, 53955.45 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 55827.29 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 32043.68 examples/s]

{'text': '¿Cómo puedo contactar con Antonio Alonso?', 'label': 194}





## Embeddings

In [5]:
from langchain.schema import Document
from langchain.retrievers import BM25Retriever

def create_documents_from_datasets(datasets):
    documents = []
    for dataset in datasets:
        for item in dataset:
            doc = Document(
                page_content=item["text"],
                metadata={"label": item["label"]}
            )
            documents.append(doc)
    return documents
documents_indexing = create_documents_from_datasets([dataset_indexing])
print(f"Number of documents created for indexing: {len(documents_indexing)}")

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"device": "cuda"},
)

vectorstore = FAISS.from_documents(documents_indexing, embedding_model)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 100})

retriever_sparse = BM25Retriever.from_documents(documents_indexing)

Number of documents created for indexing: 400


In [6]:
from tqdm import tqdm

list_of_labels_embeddings = []
list_of_labels_sparse = []
list_of_real_labels_embeddings = []
for index in tqdm(range(len(dataset_for_queries["test"]))):
    query = dataset_for_queries["test"][index]["text"]
    real_label_id = dataset_for_queries["test"][index]["label"]
    id_to_label = {v: k for k, v in label_to_id.items()}
    real_label = id_to_label[real_label_id]

    # retrieve documents
    docs = retriever.get_relevant_documents(query)
    # get labels from retrieved documents
    retrieved_labels = [doc.metadata["label"] for doc in docs]
    # to names
    retrieved_labels = [id_to_label[label_id] for label_id in retrieved_labels]
    list_of_labels_embeddings.append(retrieved_labels)
    list_of_real_labels_embeddings.append([real_label])

    # retrieve documents (sparse)
    docs_sparse = retriever_sparse.get_relevant_documents(query)
    # get labels from retrieved documents
    retrieved_labels_sparse = [doc.metadata["label"] for doc in docs_sparse]
    # to names
    retrieved_labels_sparse = [id_to_label[label_id] for label_id in retrieved_labels_sparse]
    list_of_labels_sparse.append(retrieved_labels_sparse)


  docs = retriever.get_relevant_documents(query)
100%|██████████| 300/300 [00:08<00:00, 35.31it/s]


In [7]:
from ranking_metrics import calc_ranking_metrics

metrics = calc_ranking_metrics(list_of_labels_embeddings, list_of_real_labels_embeddings, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.7261
  mAP: 0.7261
  AvgRank: 7.6983
  CMC@1: 0.6800
  Recall@k (macro)@1: 0.6800
  Precision@k (macro)@1: 0.6800
  Accuracy@1: 0.6800
  F1@k (macro)@1: 0.6800
  CMC@5: 0.7600
  Recall@k (macro)@5: 0.7600
  Precision@k (macro)@5: 0.1520
  Accuracy@5: 0.7600
  F1@k (macro)@5: 0.2533
  CMC@10: 0.8300
  Recall@k (macro)@10: 0.8300
  Precision@k (macro)@10: 0.0830
  Accuracy@10: 0.8300
  F1@k (macro)@10: 0.1509
  CMC@20: 0.8833
  Recall@k (macro)@20: 0.8833
  Precision@k (macro)@20: 0.0442
  Accuracy@20: 0.8833
  F1@k (macro)@20: 0.0841
  CMC@100: 0.9833
  Recall@k (macro)@100: 0.9833
  Precision@k (macro)@100: 0.0098
  Accuracy@100: 0.9833
  F1@k (macro)@100: 0.0195


In [8]:
metrics_sparse = calc_ranking_metrics(list_of_labels_sparse, list_of_real_labels_embeddings, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Sparse Ranking Metrics:")
for k, v in metrics_sparse.items():
    print(f"  {k}: {v:.4f}")

Sparse Ranking Metrics:
  MRR: 0.5539
  mAP: 0.5539
  AvgRank: 1.4158
  CMC@1: 0.5133
  Recall@k (macro)@1: 0.5133
  Precision@k (macro)@1: 0.5133
  Accuracy@1: 0.5133
  F1@k (macro)@1: 0.5133
  CMC@5: 0.6333
  Recall@k (macro)@5: 0.6333
  Precision@k (macro)@5: 0.1267
  Accuracy@5: 0.6333
  F1@k (macro)@5: 0.2111
  CMC@10: 0.6333
  Recall@k (macro)@10: 0.6333
  Precision@k (macro)@10: 0.0633
  Accuracy@10: 0.6333
  F1@k (macro)@10: 0.1152
  CMC@20: 0.6333
  Recall@k (macro)@20: 0.6333
  Precision@k (macro)@20: 0.0317
  Accuracy@20: 0.6333
  F1@k (macro)@20: 0.0603
  CMC@100: 0.6333
  Recall@k (macro)@100: 0.6333
  Precision@k (macro)@100: 0.0063
  Accuracy@100: 0.6333
  F1@k (macro)@100: 0.0125


## Model

In [9]:
model_name = "Qwen/Qwen3-0.6B"
MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    low_cpu_mem_usage=True,
    device_map={"": 0}
)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
lora_r = 256
lora_alpha = lora_r * 2
lora_dropout = 0.0
lora_bias = "none"
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj"]

In [12]:
config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    target_modules=target_modules
)
model = get_peft_model(model, config)

In [13]:
for p in model.base_model.model.score.parameters():
    p.requires_grad_(True)

In [14]:
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding=False)

# tokenize test from dataset
tokenized_datasets_indexing = dataset_indexing.map(preprocess, batched=True)
tokenized_datasets_query = dataset_for_queries.map(preprocess, batched=True)


Map: 100%|██████████| 400/400 [00:00<00:00, 36311.96 examples/s]
Map: 100%|██████████| 1400/1400 [00:00<00:00, 87684.05 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 50442.62 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 48046.55 examples/s]


In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}

In [16]:
# Entrenamiento
SEED = 42
EPOCHS = 10
BATCH_SIZE = 32
training_args_indexing = TrainingArguments(
    output_dir=f"models/contacts_clf_{model_name.replace('/', '_')}_indexing",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    eval_strategy="steps",     # o "no" si no vas a evaluar
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="none",
    seed=SEED,
)

training_args_query = TrainingArguments(
    output_dir=f"models/contacts_clf_{model_name.replace('/', '_')}_query",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="none",
    seed=SEED,
)

# IDs de tokens
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id

trainer_indexing = Trainer(
    model=model,
    args=training_args_indexing,
    train_dataset=tokenized_datasets_indexing,
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_query = Trainer(
    model=model,
    args=training_args_query,
    train_dataset=tokenized_datasets_query["train"],
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:
model.print_trainable_parameters()  # Verificar parámetros entrenables

trainable params: 132,939,776 || all params: 728,989,696 || trainable%: 18.2362


In [18]:
from json import dumps

CICLES = 2
sep = "#" * 10
for ci in range(CICLES):
    print(f"{sep}Starting training cycle {ci + 1}{sep}")
    trainer_indexing.train()
    metrics_indexing = trainer_indexing.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Indexing Dataset Test Metrics: {dumps(metrics_indexing, indent=4)}")
    trainer_query.train()
    metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")

##########Starting training cycle 1##########


Step,Training Loss,Validation Loss,Accuracy
10,7.4153,7.40722,0.0
20,6.2651,6.572969,0.0
30,6.1476,6.442155,0.003333
40,6.0639,6.405957,0.0
50,5.9927,6.35944,0.0
60,5.8305,6.451211,0.0
70,5.4372,6.518744,0.006667
80,4.7046,6.593138,0.0
90,3.3636,6.655827,0.006667
100,1.9824,6.702018,0.006667


Indexing Dataset Test Metrics: {
    "eval_loss": 6.8842644691467285,
    "eval_accuracy": 0.0033333333333333335,
    "eval_runtime": 0.5222,
    "eval_samples_per_second": 574.479,
    "eval_steps_per_second": 19.149,
    "epoch": 10.0
}


Step,Training Loss,Validation Loss,Accuracy
10,6.4115,6.158587,0.01
20,5.9247,5.87735,0.023333
30,5.761,5.57307,0.06
40,5.5952,5.407591,0.073333
50,4.9247,5.317563,0.1
60,4.6532,5.359495,0.116667
70,4.5769,5.1436,0.12
80,4.6137,4.974955,0.18
90,4.1831,4.741476,0.193333
100,3.4674,4.674329,0.236667


Query Dataset Test Metrics: {
    "eval_loss": 2.086529016494751,
    "eval_accuracy": 0.72,
    "eval_runtime": 0.6617,
    "eval_samples_per_second": 453.399,
    "eval_steps_per_second": 15.113,
    "epoch": 10.0
}
##########Starting training cycle 2##########


Step,Training Loss,Validation Loss,Accuracy
10,0.3852,1.826211,0.703333
20,0.1308,1.642709,0.73
30,0.0329,1.663232,0.703333
40,0.0259,1.765351,0.69
50,0.0358,1.740225,0.676667
60,0.0082,1.718524,0.683333
70,0.0044,1.635458,0.693333
80,0.0015,1.611056,0.7
90,0.0004,1.597606,0.7
100,0.0003,1.592664,0.693333


Indexing Dataset Test Metrics: {
    "eval_loss": 1.767072319984436,
    "eval_accuracy": 0.67,
    "eval_runtime": 0.5504,
    "eval_samples_per_second": 545.024,
    "eval_steps_per_second": 18.167,
    "epoch": 10.0
}


Step,Training Loss,Validation Loss,Accuracy
10,0.1063,1.517927,0.723333
20,0.0508,1.377827,0.76
30,0.0703,1.412158,0.746667
40,0.0438,1.414697,0.76
50,0.038,1.509931,0.76
60,0.0922,1.604105,0.713333
70,0.1716,1.66518,0.72
80,0.2243,1.737814,0.693333
90,0.1913,1.675216,0.703333
100,0.1244,1.525567,0.723333


Query Dataset Test Metrics: {
    "eval_loss": 1.2202895879745483,
    "eval_accuracy": 0.8333333333333334,
    "eval_runtime": 0.5245,
    "eval_samples_per_second": 571.959,
    "eval_steps_per_second": 19.065,
    "epoch": 10.0
}


In [19]:
# test final
from json import dumps
print("Evaluating on test set for query dataset...")
metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")

Evaluating on test set for query dataset...
Query Dataset Test Metrics: {
    "eval_loss": 1.2202895879745483,
    "eval_accuracy": 0.8333333333333334,
    "eval_runtime": 0.5681,
    "eval_samples_per_second": 528.035,
    "eval_steps_per_second": 17.601,
    "epoch": 10.0
}


In [20]:
# testea la salida de una pregunta, me refiero a obtener la predicción

print("Evaluating on test set for query dataset...")
index = 100
query = tokenized_datasets_query["test"][index]["text"]
real_label_id = tokenized_datasets_query["test"][index]["label"]
id_to_label = {v: k for k, v in label_to_id.items()}
real_label = id_to_label[real_label_id]
print(f"Sample query: {query}")
print(f"Real label for the first test query: {real_label}")
inputs = tokenizer(query, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
inputs = {k: v.to(trainer_query.model.device) for k, v in inputs.items()}
with torch.no_grad():
    outputs = trainer_query.model(**inputs)
logits = outputs.logits
predicted_class_id = logits.argmax().item()
id_to_label = {v: k for k, v in label_to_id.items()}
predicted_label = id_to_label[predicted_class_id]
print(f"Predicted label for query '{query}': {predicted_label}")
# extra los top k prediciones con mayor probabilidad
probs = torch.softmax(logits, dim=-1)
top_k = 20
top_k_probs, top_k_indices = torch.topk(probs, top_k)
print(f"Top {top_k} predictions:")
for prob, idx in zip(top_k_probs[0], top_k_indices[0]):
    label = id_to_label[idx.item()]
    print(f"Label: {label}, Probability: {prob.item():.4f}")
print("Training completed.")

Evaluating on test set for query dataset...
Sample query: ¿Cómo puedo contactar con Hugo Castro?
Real label for the first test query: 7557_4
Predicted label for query '¿Cómo puedo contactar con Hugo Castro?': 7557_4
Top 20 predictions:
Label: 7557_4, Probability: 0.9890
Label: 7502_4, Probability: 0.0026
Label: 7507_5, Probability: 0.0017
Label: 7536_1, Probability: 0.0012
Label: 7577_2, Probability: 0.0012
Label: 7552_4, Probability: 0.0008
Label: 7571_2, Probability: 0.0004
Label: 7504_1, Probability: 0.0004
Label: 7553_1, Probability: 0.0004
Label: 7558_1, Probability: 0.0003
Label: 7507_4, Probability: 0.0002
Label: 7552_5, Probability: 0.0002
Label: 7539_4, Probability: 0.0002
Label: 7574_4, Probability: 0.0002
Label: 7571_5, Probability: 0.0002
Label: 7505_3, Probability: 0.0001
Label: 7541_5, Probability: 0.0001
Label: 7530_5, Probability: 0.0001
Label: 7547_3, Probability: 0.0000
Label: 7540_2, Probability: 0.0000
Training completed.


In [21]:
# me gustaria que me hicieras un bucle con la logica de la celda anterior para ver si no acierta alguna muestra me diga que posicion esta en la lista de logits
from tqdm import tqdm

labels_predicted = []
labels_real = []

for index in tqdm(range(len(tokenized_datasets_query["test"]))):
    query = tokenized_datasets_query["test"][index]["text"]
    real_label_id = tokenized_datasets_query["test"][index]["label"]
    id_to_label = {v: k for k, v in label_to_id.items()}
    real_label = id_to_label[real_label_id]
    inputs = tokenizer(query, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    inputs = {k: v.to(trainer_query.model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = trainer_query.model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)

    # get list of labels sorted by probability
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    sorted_labels = [id_to_label[idx.item()] for idx in sorted_indices[0]]

    labels_predicted.append(sorted_labels)
    labels_real.append([real_label])

100%|██████████| 300/300 [00:17<00:00, 17.18it/s]


In [22]:
from ranking_metrics import calc_ranking_metrics

metrics = calc_ranking_metrics(labels_predicted, labels_real, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.8557
  mAP: 0.8557
  AvgRank: 16.0200
  CMC@1: 0.8333
  Recall@k (macro)@1: 0.8333
  Precision@k (macro)@1: 0.8333
  Accuracy@1: 0.8333
  F1@k (macro)@1: 0.8333
  CMC@5: 0.8800
  Recall@k (macro)@5: 0.8800
  Precision@k (macro)@5: 0.1760
  Accuracy@5: 0.8800
  F1@k (macro)@5: 0.2933
  CMC@10: 0.8900
  Recall@k (macro)@10: 0.8900
  Precision@k (macro)@10: 0.0890
  Accuracy@10: 0.8900
  F1@k (macro)@10: 0.1618
  CMC@20: 0.9000
  Recall@k (macro)@20: 0.9000
  Precision@k (macro)@20: 0.0450
  Accuracy@20: 0.9000
  F1@k (macro)@20: 0.0857
  CMC@100: 0.9467
  Recall@k (macro)@100: 0.9467
  Precision@k (macro)@100: 0.0095
  Accuracy@100: 0.9467
  F1@k (macro)@100: 0.0187
