In [1]:
import argparse, os, numpy as np, torch
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import BitsAndBytesConfig
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_indexing_csv = pd.read_csv("../notebooks/data/contacts_docs.csv")

# to dataset huggingface
from datasets import Dataset

dataset_indexing = Dataset.from_pandas(dataset_indexing_csv)

# create a new column 'text' that concatenates 'name', 'phone'
def concatenate_columns(example):
    return {
        "text": f"Nombre: {example['name']}\nTeléfono: {example['phone']}"
    }
dataset_indexing = dataset_indexing.map(concatenate_columns)
# rename column 'id' to 'label'
dataset_indexing = dataset_indexing.rename_column("id", "label")

num_labels = len(dataset_indexing['label'])
print(f"Number of labels: {num_labels}")
labels_list = dataset_indexing.unique('label')
print(f"Labels: {labels_list}")

# map labels to integers
label_to_id = {label: i for i, label in enumerate(labels_list)}
def map_labels(example):
    return {
        "label": label_to_id[example['label']]
    }
dataset_indexing = dataset_indexing.map(map_labels)

Map: 100%|██████████| 400/400 [00:00<00:00, 35681.78 examples/s]


Number of labels: 400
Labels: ['7500_1', '7500_2', '7500_3', '7500_4', '7500_5', '7501_1', '7501_2', '7501_3', '7501_4', '7501_5', '7502_1', '7502_2', '7502_3', '7502_4', '7502_5', '7503_1', '7503_2', '7503_3', '7503_4', '7503_5', '7504_1', '7504_2', '7504_3', '7504_4', '7504_5', '7505_1', '7505_2', '7505_3', '7505_4', '7505_5', '7506_1', '7506_2', '7506_3', '7506_4', '7506_5', '7507_1', '7507_2', '7507_3', '7507_4', '7507_5', '7508_1', '7508_2', '7508_3', '7508_4', '7508_5', '7509_1', '7509_2', '7509_3', '7509_4', '7509_5', '7510_1', '7510_2', '7510_3', '7510_4', '7510_5', '7511_1', '7511_2', '7511_3', '7511_4', '7511_5', '7512_1', '7512_2', '7512_3', '7512_4', '7512_5', '7513_1', '7513_2', '7513_3', '7513_4', '7513_5', '7514_1', '7514_2', '7514_3', '7514_4', '7514_5', '7515_1', '7515_2', '7515_3', '7515_4', '7515_5', '7516_1', '7516_2', '7516_3', '7516_4', '7516_5', '7517_1', '7517_2', '7517_3', '7517_4', '7517_5', '7518_1', '7518_2', '7518_3', '7518_4', '7518_5', '7519_1', '7519_2',

Map: 100%|██████████| 400/400 [00:00<00:00, 49950.03 examples/s]


In [3]:
query_dataset_train = pd.read_csv("../notebooks/data/contacts_queries_train.csv")
query_dataset_val = pd.read_csv("../notebooks/data/contacts_queries_val.csv")
query_dataset_test = pd.read_csv("../notebooks/data/contacts_queries_test.csv")

In [4]:
dataset_for_queries = {
    "train": Dataset.from_pandas(query_dataset_train),
    "validation": Dataset.from_pandas(query_dataset_val),
    "test": Dataset.from_pandas(query_dataset_test)
}

for split in dataset_for_queries:
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("question", "text")
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("id", "label")
    # map labels to integers
    dataset_for_queries[split] = dataset_for_queries[split].map(map_labels)

# to dataset huggingface
from datasets import DatasetDict
dataset_for_queries = DatasetDict(dataset_for_queries)
print(dataset_for_queries["train"][1])


Map: 100%|██████████| 1400/1400 [00:00<00:00, 68056.21 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 61401.02 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 58240.74 examples/s]

{'text': '¿Cómo puedo contactar con Antonio Alonso?', 'label': 194}





## Embeddings

In [5]:
from langchain.schema import Document
from langchain.retrievers import BM25Retriever

def create_documents_from_datasets(datasets):
    documents = []
    for dataset in datasets:
        for item in dataset:
            doc = Document(
                page_content=item["text"],
                metadata={"label": item["label"]}
            )
            documents.append(doc)
    return documents
documents_indexing = create_documents_from_datasets([dataset_indexing])
print(f"Number of documents created for indexing: {len(documents_indexing)}")

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"device": "cuda"},
)

vectorstore = FAISS.from_documents(documents_indexing, embedding_model)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 100})

retriever_sparse = BM25Retriever.from_documents(documents_indexing)

Number of documents created for indexing: 400


In [6]:
from tqdm import tqdm

list_of_labels_embeddings = []
list_of_labels_sparse = []
list_of_real_labels_embeddings = []
for index in tqdm(range(len(dataset_for_queries["test"]))):
    query = dataset_for_queries["test"][index]["text"]
    real_label_id = dataset_for_queries["test"][index]["label"]
    id_to_label = {v: k for k, v in label_to_id.items()}
    real_label = id_to_label[real_label_id]

    # retrieve documents
    docs = retriever.get_relevant_documents(query)
    # get labels from retrieved documents
    retrieved_labels = [doc.metadata["label"] for doc in docs]
    # to names
    retrieved_labels = [id_to_label[label_id] for label_id in retrieved_labels]
    list_of_labels_embeddings.append(retrieved_labels)
    list_of_real_labels_embeddings.append([real_label])

    # retrieve documents (sparse)
    docs_sparse = retriever_sparse.get_relevant_documents(query)
    # get labels from retrieved documents
    retrieved_labels_sparse = [doc.metadata["label"] for doc in docs_sparse]
    # to names
    retrieved_labels_sparse = [id_to_label[label_id] for label_id in retrieved_labels_sparse]
    list_of_labels_sparse.append(retrieved_labels_sparse)


  docs = retriever.get_relevant_documents(query)
100%|██████████| 300/300 [00:07<00:00, 39.63it/s]


In [7]:
from ranking_metrics import calc_ranking_metrics

metrics = calc_ranking_metrics(list_of_labels_embeddings, list_of_real_labels_embeddings, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.7261
  mAP: 0.7261
  AvgRank: 7.6983
  CMC@1: 0.6800
  Recall@k (macro)@1: 0.6800
  Precision@k (macro)@1: 0.6800
  Accuracy@1: 0.6800
  F1@k (macro)@1: 0.6800
  CMC@5: 0.7600
  Recall@k (macro)@5: 0.7600
  Precision@k (macro)@5: 0.1520
  Accuracy@5: 0.7600
  F1@k (macro)@5: 0.2533
  CMC@10: 0.8300
  Recall@k (macro)@10: 0.8300
  Precision@k (macro)@10: 0.0830
  Accuracy@10: 0.8300
  F1@k (macro)@10: 0.1509
  CMC@20: 0.8833
  Recall@k (macro)@20: 0.8833
  Precision@k (macro)@20: 0.0442
  Accuracy@20: 0.8833
  F1@k (macro)@20: 0.0841
  CMC@100: 0.9833
  Recall@k (macro)@100: 0.9833
  Precision@k (macro)@100: 0.0098
  Accuracy@100: 0.9833
  F1@k (macro)@100: 0.0195


In [8]:
metrics_sparse = calc_ranking_metrics(list_of_labels_sparse, list_of_real_labels_embeddings, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Sparse Ranking Metrics:")
for k, v in metrics_sparse.items():
    print(f"  {k}: {v:.4f}")

Sparse Ranking Metrics:
  MRR: 0.5539
  mAP: 0.5539
  AvgRank: 1.4158
  CMC@1: 0.5133
  Recall@k (macro)@1: 0.5133
  Precision@k (macro)@1: 0.5133
  Accuracy@1: 0.5133
  F1@k (macro)@1: 0.5133
  CMC@5: 0.6333
  Recall@k (macro)@5: 0.6333
  Precision@k (macro)@5: 0.1267
  Accuracy@5: 0.6333
  F1@k (macro)@5: 0.2111
  CMC@10: 0.6333
  Recall@k (macro)@10: 0.6333
  Precision@k (macro)@10: 0.0633
  Accuracy@10: 0.6333
  F1@k (macro)@10: 0.1152
  CMC@20: 0.6333
  Recall@k (macro)@20: 0.6333
  Precision@k (macro)@20: 0.0317
  Accuracy@20: 0.6333
  F1@k (macro)@20: 0.0603
  CMC@100: 0.6333
  Recall@k (macro)@100: 0.6333
  Precision@k (macro)@100: 0.0063
  Accuracy@100: 0.6333
  F1@k (macro)@100: 0.0125


## Model

In [9]:
model_name = "Qwen/Qwen3-4B"
MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    low_cpu_mem_usage=True,
    device_map={"": 0}
)
model.config.pad_token_id = tokenizer.pad_token_id

Fetching 3 files: 100%|██████████| 3/3 [01:05<00:00, 21.76s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.13it/s]
Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-4B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
lora_r = 256
lora_alpha = lora_r * 2
lora_dropout = 0.0
lora_bias = "none"
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj"]

In [12]:
config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    target_modules=target_modules
)
model = get_peft_model(model, config)

In [13]:
for p in model.base_model.model.score.parameters():
    p.requires_grad_(True)

In [14]:
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding=False)

# tokenize test from dataset
tokenized_datasets_indexing = dataset_indexing.map(preprocess, batched=True)
tokenized_datasets_query = dataset_for_queries.map(preprocess, batched=True)


Map: 100%|██████████| 400/400 [00:00<00:00, 56970.41 examples/s]
Map: 100%|██████████| 1400/1400 [00:00<00:00, 87000.71 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 57943.05 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 50337.69 examples/s]


In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}

In [16]:
# Entrenamiento
SEED = 42
EPOCHS = 10
training_args_indexing = TrainingArguments(
    output_dir=f"models/contacts_clf_{model_name.replace('/', '_')}_indexing",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    eval_strategy="steps",     # o "no" si no vas a evaluar
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="none",
    seed=SEED,
)

training_args_query = TrainingArguments(
    output_dir=f"models/contacts_clf_{model_name.replace('/', '_')}_query",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="none",
    seed=SEED,
)

# IDs de tokens
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id

trainer_indexing = Trainer(
    model=model,
    args=training_args_indexing,
    train_dataset=tokenized_datasets_indexing,
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_query = Trainer(
    model=model,
    args=training_args_query,
    train_dataset=tokenized_datasets_query["train"],
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:
model.print_trainable_parameters()  # Verificar parámetros entrenables

trainable params: 417,284,096 || all params: 4,439,752,192 || trainable%: 9.3988


In [18]:
from json import dumps

CICLES = 2
sep = "#" * 10
for ci in range(CICLES):
    print(f"{sep}Starting training cycle {ci + 1}{sep}")
    trainer_indexing.train()
    metrics_indexing = trainer_indexing.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Indexing Dataset Test Metrics: {dumps(metrics_indexing, indent=4)}")
    trainer_query.train()
    metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")

##########Starting training cycle 1##########


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 23.99 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 22.88 GiB is allocated by PyTorch, and 129.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# test final
from json import dumps
print("Evaluating on test set for query dataset...")
metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")

Evaluating on test set for query dataset...
Query Dataset Test Metrics: {
    "eval_loss": 1.3078064918518066,
    "eval_accuracy": 0.8566666666666667,
    "eval_runtime": 0.5268,
    "eval_samples_per_second": 569.497,
    "eval_steps_per_second": 18.983,
    "epoch": 10.0
}


In [None]:
# testea la salida de una pregunta, me refiero a obtener la predicción

print("Evaluating on test set for query dataset...")
index = 100
query = tokenized_datasets_query["test"][index]["text"]
real_label_id = tokenized_datasets_query["test"][index]["label"]
id_to_label = {v: k for k, v in label_to_id.items()}
real_label = id_to_label[real_label_id]
print(f"Sample query: {query}")
print(f"Real label for the first test query: {real_label}")
inputs = tokenizer(query, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
inputs = {k: v.to(trainer_query.model.device) for k, v in inputs.items()}
with torch.no_grad():
    outputs = trainer_query.model(**inputs)
logits = outputs.logits
predicted_class_id = logits.argmax().item()
id_to_label = {v: k for k, v in label_to_id.items()}
predicted_label = id_to_label[predicted_class_id]
print(f"Predicted label for query '{query}': {predicted_label}")
# extra los top k prediciones con mayor probabilidad
probs = torch.softmax(logits, dim=-1)
top_k = 20
top_k_probs, top_k_indices = torch.topk(probs, top_k)
print(f"Top {top_k} predictions:")
for prob, idx in zip(top_k_probs[0], top_k_indices[0]):
    label = id_to_label[idx.item()]
    print(f"Label: {label}, Probability: {prob.item():.4f}")
print("Training completed.")

Evaluating on test set for query dataset...
Sample query: ¿Cómo puedo contactar con Hugo Castro?
Real label for the first test query: 7557_4
Predicted label for query '¿Cómo puedo contactar con Hugo Castro?': 7557_4
Top 20 predictions:
Label: 7557_4, Probability: 0.9944
Label: 7504_1, Probability: 0.0022
Label: 7538_3, Probability: 0.0015
Label: 7529_3, Probability: 0.0003
Label: 7534_3, Probability: 0.0002
Label: 7552_5, Probability: 0.0001
Label: 7539_4, Probability: 0.0001
Label: 7552_4, Probability: 0.0001
Label: 7545_1, Probability: 0.0001
Label: 7511_4, Probability: 0.0001
Label: 7574_1, Probability: 0.0001
Label: 7569_3, Probability: 0.0001
Label: 7556_1, Probability: 0.0000
Label: 7511_2, Probability: 0.0000
Label: 7507_5, Probability: 0.0000
Label: 7555_3, Probability: 0.0000
Label: 7554_1, Probability: 0.0000
Label: 7507_4, Probability: 0.0000
Label: 7505_5, Probability: 0.0000
Label: 7537_5, Probability: 0.0000
Training completed.


In [None]:
# me gustaria que me hicieras un bucle con la logica de la celda anterior para ver si no acierta alguna muestra me diga que posicion esta en la lista de logits
from tqdm import tqdm

labels_predicted = []
labels_real = []

for index in tqdm(range(len(tokenized_datasets_query["test"]))):
    query = tokenized_datasets_query["test"][index]["text"]
    real_label_id = tokenized_datasets_query["test"][index]["label"]
    id_to_label = {v: k for k, v in label_to_id.items()}
    real_label = id_to_label[real_label_id]
    inputs = tokenizer(query, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    inputs = {k: v.to(trainer_query.model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = trainer_query.model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)

    # get list of labels sorted by probability
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    sorted_labels = [id_to_label[idx.item()] for idx in sorted_indices[0]]

    labels_predicted.append(sorted_labels)
    labels_real.append([real_label])

100%|██████████| 300/300 [00:16<00:00, 18.66it/s]


In [None]:
from ranking_metrics import calc_ranking_metrics

metrics = calc_ranking_metrics(labels_predicted, labels_real, ks=[1, 5, 10, 20, 100], one_relevant_per_query=True)

print("Ranking Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

Ranking Metrics:
  MRR: 0.8734
  mAP: 0.8734
  AvgRank: 25.1667
  CMC@1: 0.8567
  Recall@k (macro)@1: 0.8567
  Precision@k (macro)@1: 0.8567
  Accuracy@1: 0.8567
  F1@k (macro)@1: 0.8567
  CMC@5: 0.8933
  Recall@k (macro)@5: 0.8933
  Precision@k (macro)@5: 0.1787
  Accuracy@5: 0.8933
  F1@k (macro)@5: 0.2978
  CMC@10: 0.9067
  Recall@k (macro)@10: 0.9067
  Precision@k (macro)@10: 0.0907
  Accuracy@10: 0.9067
  F1@k (macro)@10: 0.1648
  CMC@20: 0.9100
  Recall@k (macro)@20: 0.9100
  Precision@k (macro)@20: 0.0455
  Accuracy@20: 0.9100
  F1@k (macro)@20: 0.0867
  CMC@100: 0.9167
  Recall@k (macro)@100: 0.9167
  Precision@k (macro)@100: 0.0092
  Accuracy@100: 0.9167
  F1@k (macro)@100: 0.0182
