In [1]:
import argparse, os, numpy as np, torch
from datasets import load_dataset, ClassLabel
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import BitsAndBytesConfig
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from datasets import Dataset

# Read the contact documents that the classifier will be trained to index.
dataset_indexing_csv = pd.read_csv("../notebooks/data/contacts_docs.csv")

# Wrap the DataFrame in a Hugging Face Dataset so it can be mapped and
# tokenized further down.
dataset_indexing = Dataset.from_pandas(dataset_indexing_csv)

# create a new column 'text' that concatenates 'name', 'phone'
def concatenate_columns(example):
    """Build the model input text for one contact record.

    Args:
        example: Mapping with at least the keys ``name`` and ``phone``.

    Returns:
        A dict with a single ``text`` entry holding the name and the phone
        number on two labelled lines.
    """
    name = example['name']
    phone = example['phone']
    text = f"Nombre: {name}\nTeléfono: {phone}"
    return {"text": text}
# Add the 'text' column built from 'name' and 'phone'.
dataset_indexing = dataset_indexing.map(concatenate_columns)
# The contact id is the classification target: expose it as 'label'.
dataset_indexing = dataset_indexing.rename_column("id", "label")

# Distinct labels define the class vocabulary. The number of classes is taken
# from this list rather than from the number of rows, so the classifier head
# size and the label -> integer mapping below always agree, even if the CSV
# ever contains a repeated id.
labels_list = dataset_indexing.unique('label')
num_labels = len(labels_list)
print(f"Number of labels: {num_labels}")
print(f"Labels: {labels_list}")

# Map each string label to an integer class index (order of first appearance).
label_to_id = {label: i for i, label in enumerate(labels_list)}
def map_labels(example):
    """Translate an example's string label into its integer class index.

    Args:
        example: Mapping with a ``label`` key whose value is a key of the
            module-level ``label_to_id`` dictionary.

    Returns:
        A dict with a single ``label`` entry holding the integer index.

    Raises:
        KeyError: If the label is not present in ``label_to_id``.
    """
    label_id = label_to_id[example['label']]
    return {"label": label_id}
# Overwrite the string 'label' column with the integer class index returned by map_labels.
dataset_indexing = dataset_indexing.map(map_labels)

Map: 100%|██████████| 400/400 [00:00<00:00, 45092.77 examples/s]


Number of labels: 400
Labels: ['7500_1', '7500_2', '7500_3', '7500_4', '7500_5', '7501_1', '7501_2', '7501_3', '7501_4', '7501_5', '7502_1', '7502_2', '7502_3', '7502_4', '7502_5', '7503_1', '7503_2', '7503_3', '7503_4', '7503_5', '7504_1', '7504_2', '7504_3', '7504_4', '7504_5', '7505_1', '7505_2', '7505_3', '7505_4', '7505_5', '7506_1', '7506_2', '7506_3', '7506_4', '7506_5', '7507_1', '7507_2', '7507_3', '7507_4', '7507_5', '7508_1', '7508_2', '7508_3', '7508_4', '7508_5', '7509_1', '7509_2', '7509_3', '7509_4', '7509_5', '7510_1', '7510_2', '7510_3', '7510_4', '7510_5', '7511_1', '7511_2', '7511_3', '7511_4', '7511_5', '7512_1', '7512_2', '7512_3', '7512_4', '7512_5', '7513_1', '7513_2', '7513_3', '7513_4', '7513_5', '7514_1', '7514_2', '7514_3', '7514_4', '7514_5', '7515_1', '7515_2', '7515_3', '7515_4', '7515_5', '7516_1', '7516_2', '7516_3', '7516_4', '7516_5', '7517_1', '7517_2', '7517_3', '7517_4', '7517_5', '7518_1', '7518_2', '7518_3', '7518_4', '7518_5', '7519_1', '7519_2',

Map: 100%|██████████| 400/400 [00:00<00:00, 43054.93 examples/s]


In [None]:
# Load the train / validation / test query splits, one DataFrame per CSV.
query_dataset_train, query_dataset_val, query_dataset_test = map(
    pd.read_csv,
    (
        "../notebooks/data/contacts_queries_train.csv",
        "../notebooks/data/contacts_queries_val.csv",
        "../notebooks/data/contacts_queries_test.csv",
    ),
)

In [None]:
from datasets import DatasetDict

# Gather the three query splits into one DatasetDict so the column renames and
# the label mapping are applied to every split in a single chained expression.
dataset_for_queries = DatasetDict(
    {
        "train": Dataset.from_pandas(query_dataset_train),
        "validation": Dataset.from_pandas(query_dataset_val),
        "test": Dataset.from_pandas(query_dataset_test),
    }
)
dataset_for_queries = (
    dataset_for_queries
    .rename_column("question", "text")  # the query text is the model input
    .rename_column("id", "label")       # the contact id is the target
    .map(map_labels)                    # string label -> integer class index
)
print(dataset_for_queries["train"][1])


Map: 100%|██████████| 1400/1400 [00:00<00:00, 59512.37 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 25925.97 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 53094.70 examples/s]

{'text': '¿Cómo puedo contactar con Antonio Alonso?', 'label': 194}





## Model

In [None]:
# Maximum number of tokens kept per example when tokenizing with truncation.
MAX_LENGTH = 512
# Checkpoint used for both the tokenizer and the classifier.
model_name = "Qwen/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Batching needs a pad token; reuse EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the checkpoint with a classification head of num_labels outputs.
# The empty-string key in device_map assigns the whole model to device 0.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    num_labels=num_labels,
)

# Keep the model's padding id in sync with the tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LoRA hyperparameters consumed by the LoraConfig built in the next cell.
lora_r = 256                 # adapter rank
lora_alpha = 2 * lora_r      # scaling factor, kept at twice the rank
lora_dropout = 0.0           # no dropout on the adapter path
lora_bias = "none"           # bias setting passed to LoraConfig
# Names of the projection modules that receive adapters.
# NOTE(review): "gate_proj" is not listed — confirm that this is intentional.
target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "up_proj",
    "down_proj",
]

In [None]:
# Describe the adapters for a sequence-classification model in training mode.
config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=lora_bias,
    target_modules=target_modules,
    task_type="SEQ_CLS",
    inference_mode=False,
)

# Rebinds `model` to its PEFT-wrapped version. Re-running this cell without
# reloading the base model would pass the already-wrapped model in again.
model = get_peft_model(model, config)

In [None]:
# Mark every parameter of the classification head ("score") as trainable.
# The trailing semicolon stops the notebook from echoing the returned module.
model.base_model.model.score.requires_grad_(True);

In [None]:
def preprocess(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Inputs are truncated to ``MAX_LENGTH`` tokens and left unpadded here.

    Args:
        examples: Mapping with a ``text`` entry to tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": MAX_LENGTH,
        "padding": False,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize the document (indexing) dataset and every query split in batched mode.
# The tokenizer output columns are added next to the existing 'text' and 'label' columns.
tokenized_datasets_indexing = dataset_indexing.map(preprocess, batched=True)
tokenized_datasets_query = dataset_for_queries.map(preprocess, batched=True)


Map: 100%|██████████| 400/400 [00:00<00:00, 43406.94 examples/s]
Map: 100%|██████████| 1400/1400 [00:00<00:00, 71951.40 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 48215.93 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 54344.44 examples/s]


In [None]:
# Pads each batch at collation time (examples were tokenized with padding=False);
# pad_to_multiple_of=8 asks for padded lengths that are multiples of 8.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        A dict with a single ``accuracy`` entry: the fraction of predictions
        equal to their label.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    hits = predicted == gold
    return {"accuracy": hits.mean()}

In [None]:
# Training setup
SEED = 42


def make_training_args(phase, evaluate):
    """Build the TrainingArguments for one training phase.

    Both phases share the same optimisation hyperparameters and differ only in
    their output directory and in whether they evaluate during training.

    Args:
        phase: Suffix of the output directory ("indexing" or "query").
        evaluate: True when the phase's Trainer receives an ``eval_dataset``.
            Only then are periodic evaluation and best-checkpoint selection
            enabled; requesting them without an evaluation set cannot work.

    Returns:
        The configured ``TrainingArguments`` instance.
    """
    eval_kwargs = {}
    if evaluate:
        eval_kwargs = dict(
            eval_strategy="steps",
            eval_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            greater_is_better=True,
        )
    return TrainingArguments(
        output_dir=f"models/contacts_clf_{model_name.replace('/', '_')}_{phase}",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=1,
        num_train_epochs=10,
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        save_strategy="steps",
        save_steps=10,
        # A checkpoint is written every 10 steps; without a cap they pile up
        # until the disk is full and saving fails. When best-model loading is
        # enabled the best checkpoint is kept in addition to the latest ones.
        save_total_limit=2,
        logging_steps=10,
        fp16=True,
        report_to="none",
        seed=SEED,
        **eval_kwargs,
    )


# The indexing phase has no evaluation set, so it trains without evaluation.
training_args_indexing = make_training_args("indexing", evaluate=False)
# The query phase is evaluated on the validation split every 10 steps.
training_args_query = make_training_args("query", evaluate=True)

# Token IDs: keep the model config aligned with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id

# Both trainers share the same model object, so each phase continues from the
# weights left by the previous one.
trainer_indexing = Trainer(
    model=model,
    args=training_args_indexing,
    train_dataset=tokenized_datasets_indexing,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_query = Trainer(
    model=model,
    args=training_args_query,
    train_dataset=tokenized_datasets_query["train"],
    eval_dataset=tokenized_datasets_query["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Number of indexing -> query rounds to run.
CICLES = 1
for _ in range(CICLES):
    # Each round trains on the documents first, then on the queries.
    for _phase_trainer in (trainer_indexing, trainer_query):
        _phase_trainer.train()

Step,Training Loss
10,7.142
20,6.2649
30,6.1391
40,6.0651
50,5.9473
60,5.5548
70,4.8539
80,3.566
90,1.2424
100,0.2574


Step,Training Loss,Validation Loss,Accuracy
10,6.4298,5.777628,0.06
20,5.0274,4.590471,0.26
30,4.0402,4.295148,0.28
40,4.0327,3.883223,0.323333
50,3.0838,3.697811,0.36
60,2.7614,3.644974,0.35
70,2.7631,3.176554,0.42
80,2.7414,2.926815,0.483333
90,1.9981,2.770937,0.503333
100,1.08,2.669616,0.553333


RuntimeError: [enforce fail at inline_container.cc:664] . unexpected pos 680704576 vs 680704464

In [None]:
# NOTE(review): this message is printed unconditionally; it does not check
# whether the training cell above finished without raising.
print("Training completed.")

Training completed.
