In [None]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path so the
# `import config` below can be resolved (presumably the project root - this
# depends on where the notebook kernel was started; confirm).
ROOT_DIR = Path().resolve().parents[0]
sys.path.append(str(ROOT_DIR))
import config as cfg

from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import numpy as np
from sklearn.metrics import roc_auc_score
from datasets import load_from_disk
import os
import json

N_RUN = 2               # Run number, used to keep the outputs of different experiments apart

In [2]:
def get_fold_datasets(ds, fold):
    """Split ``ds`` into a training part and a validation part for one fold.

    Args:
        ds: Object exposing ``filter(fn)``; each row passed to ``fn`` must
            provide a ``"fold"`` entry.
        fold: Fold identifier selecting the validation rows.

    Returns:
        Tuple ``(train, val)``: rows whose ``"fold"`` differs from ``fold``,
        and rows whose ``"fold"`` equals ``fold``.
    """

    def is_training_row(row):
        return row["fold"] != fold

    def is_validation_row(row):
        return row["fold"] == fold

    # Same order as before: the training filter runs first, then validation.
    train_part = ds.filter(is_training_row)
    val_part = ds.filter(is_validation_row)
    return train_part, val_part

In [3]:
def model_init(model_name=cfg.MODEL_BASE):
    """Load a sequence-classification model from a pretrained checkpoint.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained``;
            defaults to ``cfg.MODEL_BASE`` (read once, at definition time).

    Returns:
        The model configured with 6 labels and the
        ``"multi_label_classification"`` problem type.
    """
    head_options = {
        "num_labels": 6,
        "problem_type": "multi_label_classification",
    }
    return AutoModelForSequenceClassification.from_pretrained(model_name, **head_options)

In [4]:
def compute_metrics(eval_pred):
    """Compute the macro-averaged ROC AUC for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the single key ``"roc_auc_macro"``.
    """
    raw_scores, targets = eval_pred
    # Element-wise logistic sigmoid turns the raw scores into values in (0, 1).
    sigmoid_scores = 1 / (np.exp(-raw_scores) + 1)
    return {"roc_auc_macro": roc_auc_score(targets, sigmoid_scores, average="macro")}

In [5]:
def print_highlighted_box(text, wide_chars):
    """Print ``text`` centred inside a five-line ASCII box.

    The box has a dashed top and bottom border, an empty padded row above and
    below the text, and ``|`` as side walls.

    Args:
        text: String shown on the middle row.
        wide_chars: Total width of the box in characters, borders included.
            When the free space is odd, the extra blank goes to the right.
    """
    free_space = wide_chars - len(text)
    half, odd = divmod(free_space, 2)
    pad_left = half - 1          # one column is taken by the "|" wall
    pad_right = half - 1 + odd   # odd is 0 or 1

    border = "-" * wide_chars
    blank_row = "|" + " " * (wide_chars - 2) + "|"
    text_row = "|" + " " * pad_left + text + " " * pad_right + "|"

    print("\n".join([border, blank_row, text_row, blank_row, border]))

In [None]:
class TrainerWithTrainMetrics(Trainer):
    """Trainer whose ``evaluate`` also reports metrics on the training set.

    Every call to ``evaluate`` runs the parent evaluation twice: once on the
    requested evaluation data and once on ``self.train_dataset`` with the
    ``"train"`` key prefix. The two metric dicts are merged and returned.

    NOTE(review): the second pass goes over the whole training set, so each
    evaluation costs roughly an extra inference pass over it. Each parent call
    presumably also logs its metrics and fires ``on_evaluate`` callbacks, so
    callbacks would see the train-only metrics as a separate event - confirm
    against the Trainer documentation.
    """

    def evaluate(
        self,
        eval_dataset=None,
        ignore_keys=None,
        metric_key_prefix: str = "eval",
    ):
        """Evaluate on the validation data and, when available, the training data.

        Args:
            eval_dataset: Forwarded to ``Trainer.evaluate`` for the first pass.
            ignore_keys: Forwarded to both passes.
            metric_key_prefix: Key prefix for the first pass only; the second
                pass always uses ``"train"``.

        Returns:
            The first pass's metrics dict, updated in place with the
            ``train``-prefixed metrics when a training dataset is set.
        """
        # Validation metrics (what is usually returned by Trainer.evaluate)
        metrics = super().evaluate(
            eval_dataset=eval_dataset,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        # Without a training dataset, passing eval_dataset=None below would make
        # the parent fall back to its default evaluation dataset and report
        # those numbers under the "train" prefix. Skip the second pass instead.
        if self.train_dataset is None:
            return metrics

        # Training metrics (added in this custom Trainer)
        train_metrics = super().evaluate(
            eval_dataset=self.train_dataset,
            ignore_keys=ignore_keys,
            metric_key_prefix="train",
        )

        # Combine metrics
        metrics.update(train_metrics)
        return metrics

In [None]:
# Load the training dataset saved at the path given by the project config
# (presumably already tokenized, per its name - confirm).
ds_train_tokenized = load_from_disk(cfg.PATH_DS_TRAIN_TOKENIZED)
# Bare expression: shows the dataset's repr as the cell output.
ds_train_tokenized

In [None]:
# Cross-validation training: one full training run per fold index. Each
# iteration builds a fresh model, trains it with evaluation on the held-out
# fold, then writes the log history and the final model under a directory
# specific to the model, run number and fold.
for fold in range(cfg.N_FOLDS):

    # Output layout: <PATH_CHECKPOINTS>/<MODEL_BASE>/run_<N_RUN>/fold_<fold>/
    path_checkpoint_dir = os.path.join(cfg.PATH_CHECKPOINTS, cfg.MODEL_BASE, f"run_{N_RUN}", f"fold_{fold}")
    path_model_final = os.path.join(path_checkpoint_dir, "model_final")

    # Rows tagged with the current fold are validation data; the rest is training data.
    ds_train, ds_val = get_fold_datasets(ds_train_tokenized, fold)
    print_highlighted_box(f"FOLD {fold}: TRAIN SIZE: {len(ds_train)} ({len(ds_train)/len(ds_train_tokenized):.2%}), VAL SIZE: {len(ds_val)} ({len(ds_val)/len(ds_train_tokenized):.2%})", 80)

    args = TrainingArguments(
        # Training structure parameters
        num_train_epochs=cfg.EPOCHS,
        per_device_train_batch_size=cfg.BATCH_SIZE,
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=cfg.BATCH_SIZE,
        # Optimization parameters
        learning_rate=1e-5,
        weight_decay=0.01,
        optim="adamw_torch_fused",
        # Evaluation and saving parameters
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=cfg.EVAL_STEPS,
        save_steps=cfg.SAVE_STEPS,
        load_best_model_at_end=True,
        save_only_model=True,
        save_total_limit=cfg.SAVE_TOTAL_LIMIT,
        # Same name as the key returned by compute_metrics.
        metric_for_best_model="roc_auc_macro",
        # Precision and memory parameters
        fp16=True,
        gradient_checkpointing=False,
        dataloader_num_workers=2,
        dataloader_pin_memory=True,
        # Logging and reproducibility parameters
        logging_steps=cfg.LOGGING_STEPS,
        seed=cfg.RANDOM_SEED,
        output_dir= path_checkpoint_dir,
    )

    # A new model is created for every fold; early stopping uses the patience from the config.
    trainer = TrainerWithTrainMetrics(
        model=model_init(cfg.MODEL_BASE),
		args=args,
		train_dataset=ds_train,
		eval_dataset=ds_val,
		compute_metrics=compute_metrics,
		callbacks=[EarlyStoppingCallback(early_stopping_patience=cfg.EARLY_STOP_PATIENCE)]
	)

    # NOTE(review): train_results is not used again in this cell.
    train_results = trainer.train()
    
    # Save log_history of the last training to easily access it later
    path_hist = os.path.join(path_checkpoint_dir, "log_history.json")
    with open(path_hist, "w") as f:
        json.dump(trainer.state.log_history, f, indent=2)

    # Save the final model
    # (with load_best_model_at_end=True this is presumably the best checkpoint - confirm)
    trainer.save_model(path_model_final)

--------------------------------------------------------------------------------
|                                                                              |
|        FOLD 0: TRAIN SIZE: 127656 (80.00%), VAL SIZE: 31915 (20.00%)         |
|                                                                              |
--------------------------------------------------------------------------------


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Roc Auc Macro
500,0.0528,0.048624,0.969358
1000,0.0535,0.049696,0.978421
1500,0.046,0.043548,0.981134
2000,0.0487,0.043338,0.982133
2500,0.0447,0.041547,0.98271
3000,0.0427,0.043287,0.983866
3500,0.0491,0.040779,0.985112
4000,0.0403,0.043475,0.986343
4500,0.0359,0.042086,0.986918
5000,0.0389,0.040812,0.987791


***** train metrics *****
  epoch                    =        3.0
  total_flos               = 46924008GF
  train_loss               =     0.0425
  train_runtime            = 3:30:25.01
  train_samples_per_second =     30.334
  train_steps_per_second   =      0.948


Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

--------------------------------------------------------------------------------
|                                                                              |
|        FOLD 1: TRAIN SIZE: 127656 (80.00%), VAL SIZE: 31915 (20.00%)         |
|                                                                              |
--------------------------------------------------------------------------------


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Roc Auc Macro
500,0.0573,0.049473,0.966328
1000,0.0474,0.046109,0.974565
1500,0.0501,0.047193,0.968266
2000,0.0455,0.046377,0.980477
2500,0.0445,0.044242,0.980249
3000,0.0475,0.039939,0.982596
3500,0.037,0.040651,0.981643
4000,0.0444,0.04336,0.985797
4500,0.0377,0.038643,0.985345
5000,0.0384,0.039234,0.987011


***** train metrics *****
  epoch                    =      2.005
  total_flos               = 31361089GF
  train_loss               =     0.0453
  train_runtime            = 2:23:56.46
  train_samples_per_second =     44.343
  train_steps_per_second   =      1.386


Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

--------------------------------------------------------------------------------
|                                                                              |
|        FOLD 2: TRAIN SIZE: 127660 (80.00%), VAL SIZE: 31911 (20.00%)         |
|                                                                              |
--------------------------------------------------------------------------------


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Roc Auc Macro
500,0.049,0.047469,0.962836
1000,0.0472,0.051378,0.977459
1500,0.0458,0.044072,0.978127
2000,0.0433,0.047796,0.983207
2500,0.0429,0.041797,0.983433
3000,0.041,0.03973,0.984964
3500,0.0426,0.041252,0.986545
4000,0.0394,0.039073,0.986953
4500,0.0379,0.03955,0.986855
5000,0.0349,0.039563,0.987534


***** train metrics *****
  epoch                    =      2.381
  total_flos               = 37243376GF
  train_loss               =     0.0416
  train_runtime            = 2:51:02.18
  train_samples_per_second =      37.32
  train_steps_per_second   =      1.166


Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

--------------------------------------------------------------------------------
|                                                                              |
|        FOLD 3: TRAIN SIZE: 127656 (80.00%), VAL SIZE: 31915 (20.00%)         |
|                                                                              |
--------------------------------------------------------------------------------


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Roc Auc Macro
500,0.0538,0.052645,0.964609
1000,0.0484,0.048125,0.978098
1500,0.0481,0.046779,0.979042
2000,0.0432,0.044819,0.979306
2500,0.0444,0.053608,0.979975
3000,0.0406,0.043576,0.981464
3500,0.0458,0.042798,0.982657
4000,0.0426,0.041963,0.984126
4500,0.0382,0.043687,0.984807
5000,0.0382,0.041644,0.985346


***** train metrics *****
  epoch                    =        3.0
  total_flos               = 46924008GF
  train_loss               =     0.0411
  train_runtime            = 3:31:32.78
  train_samples_per_second =     30.172
  train_steps_per_second   =      0.943


Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

Filter:   0%|          | 0/159571 [00:00<?, ? examples/s]

--------------------------------------------------------------------------------
|                                                                              |
|        FOLD 4: TRAIN SIZE: 127656 (80.00%), VAL SIZE: 31915 (20.00%)         |
|                                                                              |
--------------------------------------------------------------------------------


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Roc Auc Macro
500,0.0529,0.053372,0.972072
1000,0.0515,0.054084,0.95145
1500,0.0413,0.043965,0.978631
2000,0.0481,0.041661,0.979267
2500,0.0457,0.041958,0.981492
3000,0.0404,0.040516,0.982477
3500,0.0412,0.041125,0.98352
4000,0.0394,0.039192,0.984265
4500,0.04,0.039804,0.986269
5000,0.0336,0.041429,0.986575


***** train metrics *****
  epoch                    =        3.0
  total_flos               = 46924008GF
  train_loss               =     0.0407
  train_runtime            = 3:30:43.44
  train_samples_per_second =      30.29
  train_steps_per_second   =      0.947
