In [85]:
import argparse, os, numpy as np, torch
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import BitsAndBytesConfig
import pandas as pd

In [86]:
dataset_indexing_csv = pd.read_csv("../notebooks/data/contacts_docs.csv")

# to dataset huggingface
from datasets import Dataset

dataset_indexing = Dataset.from_pandas(dataset_indexing_csv)

# create a new column 'text' that concatenates 'name', 'phone'
def concatenate_columns(example):
    return {
        "text": f"Nombre: {example['name']}\nTeléfono: {example['phone']}"
    }
dataset_indexing = dataset_indexing.map(concatenate_columns)
# rename column 'id' to 'label'
dataset_indexing = dataset_indexing.rename_column("id", "label")

num_labels = len(dataset_indexing['label'])
print(f"Number of labels: {num_labels}")
labels_list = dataset_indexing.unique('label')
print(f"Labels: {labels_list}")

# map labels to integers
label_to_id = {label: i for i, label in enumerate(labels_list)}
def map_labels(example):
    return {
        "label": label_to_id[example['label']]
    }
dataset_indexing = dataset_indexing.map(map_labels)

Map: 100%|██████████| 400/400 [00:00<00:00, 41216.60 examples/s]


Number of labels: 400
Labels: ['7500_1', '7500_2', '7500_3', '7500_4', '7500_5', '7501_1', '7501_2', '7501_3', '7501_4', '7501_5', '7502_1', '7502_2', '7502_3', '7502_4', '7502_5', '7503_1', '7503_2', '7503_3', '7503_4', '7503_5', '7504_1', '7504_2', '7504_3', '7504_4', '7504_5', '7505_1', '7505_2', '7505_3', '7505_4', '7505_5', '7506_1', '7506_2', '7506_3', '7506_4', '7506_5', '7507_1', '7507_2', '7507_3', '7507_4', '7507_5', '7508_1', '7508_2', '7508_3', '7508_4', '7508_5', '7509_1', '7509_2', '7509_3', '7509_4', '7509_5', '7510_1', '7510_2', '7510_3', '7510_4', '7510_5', '7511_1', '7511_2', '7511_3', '7511_4', '7511_5', '7512_1', '7512_2', '7512_3', '7512_4', '7512_5', '7513_1', '7513_2', '7513_3', '7513_4', '7513_5', '7514_1', '7514_2', '7514_3', '7514_4', '7514_5', '7515_1', '7515_2', '7515_3', '7515_4', '7515_5', '7516_1', '7516_2', '7516_3', '7516_4', '7516_5', '7517_1', '7517_2', '7517_3', '7517_4', '7517_5', '7518_1', '7518_2', '7518_3', '7518_4', '7518_5', '7519_1', '7519_2',

Map: 100%|██████████| 400/400 [00:00<00:00, 41410.91 examples/s]


In [87]:
query_dataset_train = pd.read_csv("../notebooks/data/contacts_queries_train.csv")
query_dataset_val = pd.read_csv("../notebooks/data/contacts_queries_val.csv")
query_dataset_test = pd.read_csv("../notebooks/data/contacts_queries_test.csv")

In [88]:
dataset_for_queries = {
    "train": Dataset.from_pandas(query_dataset_train),
    "validation": Dataset.from_pandas(query_dataset_val),
    "test": Dataset.from_pandas(query_dataset_test)
}

for split in dataset_for_queries:
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("question", "text")
    dataset_for_queries[split] = dataset_for_queries[split].rename_column("id", "label")
    # map labels to integers
    dataset_for_queries[split] = dataset_for_queries[split].map(map_labels)

# to dataset huggingface
from datasets import DatasetDict
dataset_for_queries = DatasetDict(dataset_for_queries)
print(dataset_for_queries["train"][1])


Map: 100%|██████████| 1400/1400 [00:00<00:00, 60355.90 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 60787.01 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 52006.25 examples/s]

{'text': '¿Cómo puedo contactar con Antonio Alonso?', 'label': 194}





## Model

In [89]:
model_name = "Qwen/Qwen3-0.6B"
MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [90]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    low_cpu_mem_usage=True,
    device_map={"": 0}
)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [91]:
lora_r = 256
lora_alpha = lora_r * 2
lora_dropout = 0.0
lora_bias = "none"
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj"]

In [92]:
config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    target_modules=target_modules
)
model = get_peft_model(model, config)

In [93]:
for p in model.base_model.model.score.parameters():
    p.requires_grad_(True)

In [94]:
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH, padding=False)

# tokenize test from dataset
tokenized_datasets_indexing = dataset_indexing.map(preprocess, batched=True)
tokenized_datasets_query = dataset_for_queries.map(preprocess, batched=True)


Map: 100%|██████████| 400/400 [00:00<00:00, 49456.76 examples/s]
Map: 100%|██████████| 1400/1400 [00:00<00:00, 84155.38 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 45981.77 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 46804.46 examples/s]


In [95]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}

In [96]:
# Entrenamiento
SEED = 42
EPOCHS = 10
training_args_indexing = TrainingArguments(
    output_dir=f"models/contacts_clf_{model_name.replace('/', '_')}_indexing",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    eval_strategy="steps",     # o "no" si no vas a evaluar
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="none",
    seed=SEED,
)

training_args_query = TrainingArguments(
    output_dir=f"models/contacts_clf_{model_name.replace('/', '_')}_query",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    save_strategy="no",        # <-- no guarda checkpoints ni el modelo final
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=False,  # <-- desactivado porque no hay checkpoints
    fp16=True,
    report_to="none",
    seed=SEED,
)

# IDs de tokens
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id

trainer_indexing = Trainer(
    model=model,
    args=training_args_indexing,
    train_dataset=tokenized_datasets_indexing,
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_query = Trainer(
    model=model,
    args=training_args_query,
    train_dataset=tokenized_datasets_query["train"],
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [97]:
model.print_trainable_parameters()  # Verificar parámetros entrenables

trainable params: 132,939,776 || all params: 728,989,696 || trainable%: 18.2362


In [98]:
from json import dumps

CICLES = 2
sep = "#" * 10
for ci in range(CICLES):
    print(f"{sep}Starting training cycle {ci + 1}{sep}")
    trainer_indexing.train()
    metrics_indexing = trainer_indexing.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Indexing Dataset Test Metrics: {dumps(metrics_indexing, indent=4)}")
    trainer_query.train()
    metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
    print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")

##########Starting training cycle 1##########


Step,Training Loss,Validation Loss,Accuracy
10,6.82,7.252311,0.0
20,6.229,6.664512,0.003333
30,6.1405,6.520859,0.006667
40,6.0677,6.372051,0.003333
50,5.9447,6.377181,0.013333
60,5.6537,6.398425,0.003333
70,5.0307,6.477132,0.01
80,4.011,6.465443,0.01
90,2.2693,6.500814,0.006667
100,1.0978,6.513555,0.003333


Indexing Dataset Test Metrics: {
    "eval_loss": 6.562037944793701,
    "eval_accuracy": 0.01,
    "eval_runtime": 0.4657,
    "eval_samples_per_second": 644.145,
    "eval_steps_per_second": 21.471,
    "epoch": 10.0
}


Step,Training Loss,Validation Loss,Accuracy
10,6.3657,6.036188,0.033333
20,5.6984,5.464395,0.06
30,5.156,5.172052,0.106667
40,4.9117,5.07949,0.143333
50,4.6222,4.880845,0.153333
60,4.3541,4.778047,0.186667
70,4.2499,4.662904,0.193333
80,4.1639,4.533704,0.223333
90,3.7566,4.360244,0.243333
100,3.3002,4.203339,0.28


Query Dataset Test Metrics: {
    "eval_loss": 2.2265539169311523,
    "eval_accuracy": 0.6966666666666667,
    "eval_runtime": 0.5194,
    "eval_samples_per_second": 577.64,
    "eval_steps_per_second": 19.255,
    "epoch": 10.0
}
##########Starting training cycle 2##########


Step,Training Loss,Validation Loss,Accuracy
10,0.328,1.922221,0.72
20,0.2462,1.97111,0.686667
30,0.0414,1.954503,0.706667
40,0.0637,2.012406,0.7
50,0.0199,1.960125,0.706667
60,0.006,1.998994,0.716667
70,0.017,1.972437,0.706667
80,0.0012,1.954031,0.713333
90,0.0005,1.948961,0.716667
100,0.0004,1.943881,0.716667


Indexing Dataset Test Metrics: {
    "eval_loss": 1.9097065925598145,
    "eval_accuracy": 0.7166666666666667,
    "eval_runtime": 0.4852,
    "eval_samples_per_second": 618.322,
    "eval_steps_per_second": 20.611,
    "epoch": 10.0
}


Step,Training Loss,Validation Loss,Accuracy
10,0.0141,1.861472,0.73
20,0.007,1.747326,0.74
30,0.0058,1.779034,0.736667
40,0.0236,1.808843,0.736667
50,0.0168,1.986957,0.713333
60,0.1366,2.198146,0.663333
70,0.2747,2.150442,0.673333
80,0.3398,1.931464,0.7
90,0.2235,1.939975,0.666667
100,0.1348,1.974955,0.703333


Query Dataset Test Metrics: {
    "eval_loss": 1.410375714302063,
    "eval_accuracy": 0.8433333333333334,
    "eval_runtime": 0.441,
    "eval_samples_per_second": 680.332,
    "eval_steps_per_second": 22.678,
    "epoch": 10.0
}


In [99]:
# test final
from json import dumps
print("Evaluating on test set for query dataset...")
metrics_query = trainer_query.evaluate(eval_dataset=tokenized_datasets_query["test"])
print(f"Query Dataset Test Metrics: {dumps(metrics_query, indent=4)}")

Evaluating on test set for query dataset...
Query Dataset Test Metrics: {
    "eval_loss": 1.410375714302063,
    "eval_accuracy": 0.8433333333333334,
    "eval_runtime": 0.4568,
    "eval_samples_per_second": 656.671,
    "eval_steps_per_second": 21.889,
    "epoch": 10.0
}
