In [None]:
import sys
from pathlib import Path
ROOT_DIR = Path().resolve().parents[0]
sys.path.append(str(ROOT_DIR))
import config as cfg

from datasets import load_from_disk
import os
import json
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import numpy as np
from sklearn.metrics import roc_auc_score

TOTAL_RUNS = 2

In [2]:
# Load dataset
ds_train_tokenized = load_from_disk(cfg.PATH_DS_TRAIN_TOKENIZED)

# Pick best checkpoint

In [3]:
def _get_checkpoint_steps_from_fold(n_run, fold_id):
    path_fold_dir = os.path.join(cfg.PATH_CHECKPOINTS, cfg.MODEL_BASE, f"run_{n_run}", f"fold_{fold_id}")
    checkpoints = [int(folder.split("checkpoint-")[-1]) for folder in os.listdir(path_fold_dir) if folder.startswith("checkpoint")]
    return sorted(checkpoints)

In [4]:
def get_best_checkpoint_from_fold(n_run, fold_id):
    last_checkpoint = max(_get_checkpoint_steps_from_fold(n_run, fold_id))
    path_trainer_state = os.path.join(
        cfg.PATH_CHECKPOINTS, cfg.MODEL_BASE, f"run_{n_run}", f"fold_{fold_id}",
        f"checkpoint-{last_checkpoint}", "trainer_state.json"
    )
    try:
        with open(path_trainer_state, "r") as f:
            trainer_state = json.load(f)
            global_step = trainer_state.get("global_step", None)
            best_checkpoint = trainer_state.get("best_model_checkpoint", None)
            best_metric = trainer_state.get("best_metric", None)
        return {"best_metric": best_metric, "best_checkpoint": best_checkpoint, "global_step": global_step}
    except FileNotFoundError:
        print(f"File not found: {path_trainer_state}")
        return None

In [11]:
def get_best_checkpoint(best_checkpoints):
	best_checkpoint = {
		"best_run": None,
		"best_fold": None,
		"best_step": None,
		"best_checkpoint": None,
		"best_metric": 0,
	}
	
	for run_id, folds in best_checkpoints.items():
		for fold_id, metrics in folds.items():
			if metrics["best_metric"] > best_checkpoint["best_metric"]:
				best_checkpoint["best_run"] = run_id
				best_checkpoint["best_fold"] = fold_id
				best_checkpoint["best_step"] = metrics["global_step"]
				best_checkpoint["best_checkpoint"] = metrics["best_checkpoint"]
				best_checkpoint["best_metric"] = metrics["best_metric"]
	
	return best_checkpoint

In [None]:
best_checkpoints = {}
for n_run in range(TOTAL_RUNS):
	best_checkpoints[f"run_{n_run}"] = {}
	for fold_id in range(cfg.N_FOLDS):
		best_checkpoints[f"run_{n_run}"][f"fold_{fold_id}"] = get_best_checkpoint_from_fold(n_run, fold_id)
best_checkpoints

FileNotFoundError: [Errno 2] No such file or directory: '/home/azureuser/ruben/toxicity_classificator/checkpoints/microsoft/deberta-v3-base/run_1/fold_4'

In [None]:
best_checkpoint = get_best_checkpoint(best_checkpoints)
best_checkpoint

{'best_run': 'run_0',
 'best_fold': 'fold_2',
 'best_step': 10240,
 'best_checkpoint': 'checkpoints/microsoft/deberta-v3-base/fold_2/checkpoint-10240',
 'best_metric': 0.991930075119802}

In [None]:
best_model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint["best_checkpoint"]).to("cuda")

In [None]:
args = TrainingArguments(
	# Training structure parameters
	num_train_epochs=cfg.EPOCHS,
	per_device_train_batch_size=cfg.BATCH_SIZE,
	gradient_accumulation_steps=1,
	per_device_eval_batch_size=cfg.BATCH_SIZE,
	# Optimization parameters
	learning_rate=1e-5,
	weight_decay=0.01,
	optim="adamw_torch_fused",
	# Evaluation and saving parameters
	eval_strategy="steps",
	save_strategy="steps",
	eval_steps=cfg.EVAL_STEPS,
	save_steps=cfg.SAVE_STEPS,
	load_best_model_at_end=True,
	save_only_model=True,
	save_total_limit=cfg.EARLY_STOP_PATIENCE,
	metric_for_best_model="roc_auc_macro",
	# Precision and memory parameters
	fp16=True,
	gradient_checkpointing=False,
	dataloader_num_workers=2,
	dataloader_pin_memory=True,
	# Logging and reproducibility parameters
	logging_steps=100,
	seed=cfg.RANDOM_SEED,
	output_dir= os.path.join(cfg.PATH_CHECKPOINTS, cfg.MODEL_BASE, "run_full_dataset")
)

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))
    auc = roc_auc_score(labels, probs, average="macro")
    return {"roc_auc_macro": auc}

In [None]:
trainer = Trainer(
	model=best_model,
	args=args,
	train_dataset=ds_train_tokenized,
	eval_dataset=,
	compute_metrics=compute_metrics,
	callbacks=[EarlyStoppingCallback(early_stopping_patience=cfg.EARLY_STOP_PATIENCE)],
)

We should define a small validation dataset to check the results, but this dataset needs to be stratified to guarantee that it takes same proportion of all labels than the full dataset.