In [None]:
import gc

import numpy as np
import torch
from datasets import load_from_disk
from sklearn.metrics import roc_auc_score
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

# Seed handed to TrainingArguments(seed=...) for every fold.
RANDOM_SEED  = 31415
# Checkpoint identifier passed to model_init(); the "large" variant is kept
# below as a commented-out alternative.
# MODEL_NAME = "microsoft/deberta-v3-large"
MODEL_NAME = "microsoft/deberta-v3-base"

In [2]:
def get_fold_datasets(ds, fold):
    """Split ``ds`` into the training and validation subsets of one fold.

    Rows whose ``"fold"`` entry equals ``fold`` make up the validation subset;
    every other row goes to the training subset.

    Args:
        ds: Object exposing ``filter(predicate)``, where the predicate is
            called with a single row that supports ``row["fold"]``.
        fold: Fold identifier selecting the validation rows.

    Returns:
        Tuple ``(ds_train, ds_val)`` holding the two results of ``ds.filter``.
    """

    def is_training_row(row):
        return row["fold"] != fold

    def is_validation_row(row):
        return row["fold"] == fold

    # The training subset is filtered first, then the validation subset.
    return ds.filter(is_training_row), ds.filter(is_validation_row)

In [3]:
def model_init(model_name="microsoft/deberta-v3-base"):
    """Load a pretrained checkpoint with a sequence-classification head.

    Args:
        model_name: Checkpoint identifier forwarded unchanged to
            ``AutoModelForSequenceClassification.from_pretrained``.

    Returns:
        Whatever ``from_pretrained`` returns for ``model_name`` when asked
        for 6 labels and the ``"multi_label_classification"`` problem type.
    """
    head_options = {
        "num_labels": 6,
        "problem_type": "multi_label_classification",
    }
    return AutoModelForSequenceClassification.from_pretrained(model_name, **head_options)

In [4]:
def compute_metrics(eval_pred):
    """Compute the macro-averaged ROC AUC for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. Both are handed to
            ``roc_auc_score`` after the logits are mapped through a sigmoid;
            presumably they are arrays of shape (n_samples, n_labels) with
            binary labels -- confirm against the tokenized dataset.

    Returns:
        ``{"roc_auc_macro": auc}`` where ``auc`` is the value returned by
        ``roc_auc_score(labels, probs, average="macro")``.
    """
    logits, labels = eval_pred

    # The run trains with fp16=True, so logits may arrive in half precision
    # (assumption -- verify the dtype Trainer hands over). In float16,
    # exp() overflows for logits below about -11 and probabilities saturate
    # to exactly 1.0 for moderately large logits, which creates artificial
    # ties and distorts a rank-based metric such as AUC. Work in float64.
    logits = np.asarray(logits, dtype=np.float64)

    # Overflow-free sigmoid: 1 / (1 + exp(-x)) == exp(-log(1 + exp(-x))),
    # with log(1 + exp(-x)) evaluated stably by logaddexp(0, -x).
    probs = np.exp(-np.logaddexp(0.0, -logits))

    auc = roc_auc_score(labels, probs, average="macro")
    return {"roc_auc_macro": auc}

In [None]:
# Load the pre-tokenized dataset from disk and show its summary. A bare
# expression is only rendered when it is the last statement of a cell, so the
# summary is printed explicitly here.
ds_tokenized = load_from_disk("data/processed_data/ds_tokenized")
print(ds_tokenized)

# Train one model per cross-validation fold. Fold ids 0..4 are selected through
# the "fold" column that get_fold_datasets() filters on.
for fold in range(5):

    train, val = get_fold_datasets(ds_tokenized, fold)
    print(f"Fold {fold}: Train size: {len(train)} ({len(train)/len(ds_tokenized):.2%}), Val size: {len(val)} ({len(val)/len(ds_tokenized):.2%})")

    args = TrainingArguments(
        # Training structure parameters
        num_train_epochs=3,
        per_device_train_batch_size=32,     # Try to use multiples of 8 for maximum GPU efficiency
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=32,      # Try to use multiples of 8 for maximum GPU efficiency
        # Optimization parameters
        learning_rate=1e-5,
        weight_decay=0.01,
        optim="adamw_torch_fused",
        # Evaluation and saving parameters
        # NOTE(review): newer transformers releases rename `evaluation_strategy`
        # to `eval_strategy` -- confirm against the installed version.
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=1024,                    # Run a full evaluation every 1024 optimizer steps (independent of batch size)
        save_steps=1024,                    # Must be a round multiple of eval_steps because load_best_model_at_end=True
        load_best_model_at_end=True,
        metric_for_best_model="roc_auc_macro",
        # Precision and memory parameters
        fp16=True,
        gradient_checkpointing=False,
        dataloader_num_workers=2,
        dataloader_pin_memory=True,
        # Logging and reproducibility parameters
        logging_steps=50,
        seed=RANDOM_SEED,
        output_dir=f"checkpoints/deberta_fold{fold}",
    )

    trainer = Trainer(
        # Hand Trainer a zero-argument factory instead of a ready-made model:
        # the classification head is randomly initialized, and Trainer only
        # applies `seed` before instantiating the model when it builds the
        # model itself. With `model=model_init(...)` the head was created
        # before seeding, making runs non-reproducible. The factory must take
        # no arguments, otherwise Trainer would pass its `trial` object as
        # `model_name`. Side effect: the model is instantiated once when the
        # Trainer is built and again when train() starts.
        model_init=lambda: model_init(MODEL_NAME),
        args=args,
        train_dataset=train,
        eval_dataset=val,
        compute_metrics=compute_metrics,
        # Stop when roc_auc_macro has not improved for 3 consecutive evaluations.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()

    # With load_best_model_at_end=True the weights saved here are those of the
    # best checkpoint according to metric_for_best_model.
    trainer.save_model(f"checkpoints/deberta_fold{fold}/model_final")

    # Release this fold's model and optimizer state before the next fold builds
    # its own, so two sets of weights never sit on the GPU together.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

Fold 1: Train size: 127656 (80.00%), Val size: 31915 (20.00%)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
