In [1]:
import sys
from pathlib import Path
# ROOT_DIR is the parent of the current working directory; it is appended to
# sys.path before `config` is imported below.
# NOTE(review): presumably `config.py` lives in that parent directory — confirm.
ROOT_DIR = Path().resolve().parents[0]
sys.path.append(str(ROOT_DIR))
import config as cfg

from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    TrainerCallback,
)
import numpy as np
from sklearn.metrics import roc_auc_score
from datasets import load_from_disk
import os
import json

N_RUN = 2               # Run index: becomes the `run_<N_RUN>` checkpoint sub-directory, keeping separate experiments apart

In [2]:
def get_fold_datasets(ds, fold):
    """Split `ds` into the training and validation parts of one fold.

    Args:
        ds: Object exposing `filter(fn)`, whose rows can be indexed with the
            key "fold".
        fold: Fold identifier to hold out for validation.

    Returns:
        Tuple `(ds_train, ds_val)`: rows whose "fold" differs from `fold`,
        and rows whose "fold" equals `fold`.
    """
    def belongs_to_training(row):
        return row["fold"] != fold

    def belongs_to_validation(row):
        return row["fold"] == fold

    training_split = ds.filter(belongs_to_training)
    validation_split = ds.filter(belongs_to_validation)
    return training_split, validation_split

In [3]:
def model_init(model_name=cfg.MODEL_BASE):
    """Load a pretrained checkpoint with a 6-label multi-label classification head.

    Args:
        model_name: Checkpoint name or path forwarded to `from_pretrained`.
            Defaults to `cfg.MODEL_BASE` (read once, when the function is defined).

    Returns:
        The model returned by `AutoModelForSequenceClassification.from_pretrained`.
    """
    head_options = {
        "num_labels": 6,
        "problem_type": "multi_label_classification",
    }
    return AutoModelForSequenceClassification.from_pretrained(model_name, **head_options)

In [4]:
def compute_metrics(eval_pred):
    """Compute the macro-averaged ROC AUC from raw logits.

    Args:
        eval_pred: Pair `(logits, labels)`. `logits` may also be a tuple of
            arrays, in which case the first element is used.

    Returns:
        Dict with the single key "roc_auc_macro".

    Raises:
        Whatever `roc_auc_score` raises for invalid input.
    """
    logits, labels = eval_pred
    # A tuple of outputs cannot be negated/exponentiated; keep the first entry.
    # NOTE(review): assumes the classification logits come first — confirm.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Upcast so half/single-precision logits do not saturate early.
    logits = np.asarray(logits, dtype=np.float64)

    # sigmoid(x) == 0.5 * (1 + tanh(x / 2)); unlike 1 / (1 + exp(-x)) this form
    # never overflows for large-magnitude logits.
    probs = 0.5 * (1.0 + np.tanh(0.5 * logits))

    auc = roc_auc_score(labels, probs, average="macro")
    return {"roc_auc_macro": auc}

In [5]:
def print_highlighted_box(text, wide_chars):
    """Print `text` centred inside a five-line ASCII box.

    Args:
        text: Message shown on the middle row.
        wide_chars: Requested total box width, borders included. If the text
            does not fit (`len(text) > wide_chars - 2`) the box is widened to
            `len(text) + 2` so every row keeps the same length.

    Returns:
        None. The box is written to standard output.
    """
    # Widen the box when needed; otherwise the padding would go negative and
    # the text row would stick out past the borders.
    box_width = max(wide_chars, len(text) + 2)
    inner_width = box_width - 2

    padding = inner_width - len(text)
    left_spaces = padding // 2
    # Any odd leftover space goes to the right-hand side.
    right_spaces = padding - left_spaces

    border = "-" * box_width
    blank_row = f"|{' ' * inner_width}|"

    print(border)
    print(blank_row)
    print(f"|{' ' * left_spaces}{text}{' ' * right_spaces}|")
    print(blank_row)
    print(border)

In [7]:
class TrainerWithTrainMetrics(Trainer):
    """
    `Trainer` whose `evaluate()` also scores the training set.

    Every call first runs the regular evaluation and then a second evaluation
    on `self.train_dataset` with the key prefix "train"; both metric dicts are
    merged into one.
    """
    def evaluate(
        self,
        eval_dataset=None,
        ignore_keys=None,
        metric_key_prefix: str = "eval",
    ):
        """Run the parent evaluation twice and merge the results.

        Args:
            eval_dataset: Forwarded unchanged to the first (validation) pass.
            ignore_keys: Forwarded unchanged to both passes.
            metric_key_prefix: Key prefix for the first pass only.

        Returns:
            The dict returned by the first pass, updated in place with the
            entries of the training-set pass (same-named keys are overwritten).
        """
        # Pass 1: the usual validation metrics.
        combined = super().evaluate(
            eval_dataset=eval_dataset,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        # Pass 2: same evaluation on the training set, merged straight in.
        # Keys are expected to come back prefixed with "train_"
        # (e.g. train_loss, train_roc_auc_macro).
        combined.update(
            super().evaluate(
                eval_dataset=self.train_dataset,
                ignore_keys=ignore_keys,
                metric_key_prefix="train",
            )
        )
        return combined

In [8]:
# Load the dataset stored at cfg.PATH_DS_TRAIN_TOKENIZED and display its summary.
ds_train_tokenized = load_from_disk(cfg.PATH_DS_TRAIN_TOKENIZED)
ds_train_tokenized

Dataset({
    features: ['id', 'fold', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 159571
})

In [9]:
# Debug aid: uncomment to run the rest of the notebook on the first 1000 rows only.
# ds_train_tokenized = ds_train_tokenized.select(range(1000))
# ds_train_tokenized

In [10]:
# Train one model per fold. Each fold writes its checkpoints, its logged
# history and its final model under
# <cfg.PATH_CHECKPOINTS>/<cfg.MODEL_BASE>/run_<N_RUN>/fold_<fold>.
for fold in range(cfg.N_FOLDS):

    path_checkpoint_dir = os.path.join(cfg.PATH_CHECKPOINTS, cfg.MODEL_BASE, f"run_{N_RUN}", f"fold_{fold}")
    path_model_final = os.path.join(path_checkpoint_dir, "model_final")

    ds_train, ds_val = get_fold_datasets(ds_train_tokenized, fold)
    n_total = len(ds_train_tokenized)
    print_highlighted_box(
        f"FOLD {fold}: TRAIN SIZE: {len(ds_train)} ({len(ds_train)/n_total:.2%}), "
        f"VAL SIZE: {len(ds_val)} ({len(ds_val)/n_total:.2%})",
        80,
    )

    args = TrainingArguments(
        # Training structure parameters
        num_train_epochs=cfg.EPOCHS,
        per_device_train_batch_size=cfg.BATCH_SIZE,
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=cfg.BATCH_SIZE,
        # Optimization parameters
        learning_rate=1e-5,
        weight_decay=0.01,
        optim="adamw_torch_fused",
        # Evaluation and saving parameters
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=cfg.EVAL_STEPS,
        save_steps=cfg.SAVE_STEPS,
        load_best_model_at_end=True,
        save_only_model=True,
        save_total_limit=cfg.SAVE_TOTAL_LIMIT,
        metric_for_best_model="roc_auc_macro",
        # Precision and memory parameters
        fp16=True,
        gradient_checkpointing=False,
        dataloader_num_workers=2,
        dataloader_pin_memory=True,
        # Logging and reproducibility parameters
        logging_steps=cfg.LOGGING_STEPS,
        seed=cfg.RANDOM_SEED,
        output_dir=path_checkpoint_dir,
    )

    # A fresh model is created for every fold.
    # TrainerWithTrainMetrics.evaluate also scores the training split on each
    # evaluation; use the plain `Trainer` with the same arguments to skip that.
    trainer = TrainerWithTrainMetrics(
        model=model_init(cfg.MODEL_BASE),
        args=args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=cfg.EARLY_STOP_PATIENCE)],
    )

    train_results = trainer.train()

    # Make sure the fold directory exists before writing into it: the file
    # below is created by this loop, not by the trainer.
    os.makedirs(path_checkpoint_dir, exist_ok=True)

    # Persist everything the trainer logged so the curves can be rebuilt later.
    path_hist = os.path.join(path_checkpoint_dir, "log_history.json")
    with open(path_hist, "w", encoding="utf-8") as f:
        json.dump(trainer.state.log_history, f, indent=2)

    trainer.save_model(path_model_final)

--------------------------------------------------------------------------------
|                                                                              |
|        FOLD 0: TRAIN SIZE: 127656 (80.00%), VAL SIZE: 31915 (20.00%)         |
|                                                                              |
--------------------------------------------------------------------------------


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Roc Auc Macro
250,0.0694,0.061226,0.968253


KeyboardInterrupt: 