# Cross Validation

1. Import dependencies

In [1]:
import random
import time

import numpy as np
import torch
import os

from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
)

import sys

sys.path.append('..')
from src.util.torch_device import resolve_torch_device
from src.data.span_detection_ds import ManipulationDetectionDataset
from src.definitions import (
    MODELS_FOLDER,
    RAW_DATA_FOLDER,
    PROCESSED_DATA_FOLDER,
)
from src.model.span_detection_metrics import compute_metrics

2. Prepare environment

In [2]:
# Seed every random number generator used in this notebook so runs are repeatable.
random_seed = 42

random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# Device selection is delegated to the project helper.
device = resolve_torch_device()

# Checkpoint that every fold is initialised from.
model_checkpoint = 'models/train-test-fine-tuned-models-ru-fine-tuned-FacebookAI-xlm-roberta-base-3-checkpoint/checkpoint-500'
# Name the output after the last path component of the checkpoint.
# Single quotes are used inside the replacement field on purpose: reusing the
# outer quote character inside an f-string is a SyntaxError before Python 3.12.
result_model = f"kfold-span-detection-{str(model_checkpoint).split('/')[-1]}"

# Unix timestamp in whole seconds, captured once when this cell runs.
epoch_time = int(time.time())

In [3]:
# --- Cross-validation layout ---
k = 5  # number of folds
num_train_epochs = 5  # training epochs per fold

# --- Optimisation / regularisation ---
learning_rate = 2e-5
weight_decay = 0.01
classifier_dropout = 0.1  # passed to the model config when the model is loaded

3. Load dataset

In [4]:
# The tokenizer is loaded from the same checkpoint as the model, capped at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)

# Options for the project dataset wrapper, collected in one place.
# NOTE(review): `do_split=False` presumably keeps the corpus as a single split
# (the recorded output is one `Dataset`) — confirm in ManipulationDetectionDataset.
blueprint_options = dict(
    tokenizer=tokenizer,
    raw_path=RAW_DATA_FOLDER / "trnslt_span_v2.parquet",
    processed_path=PROCESSED_DATA_FOLDER / "span-detection",
    seed=random_seed,
    do_split=False,
)
dataset_blueprint = ManipulationDetectionDataset(**blueprint_options)

dataset = dataset_blueprint.read()
dataset

Saving the dataset (0/1 shards):   0%|          | 0/7596 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'content', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 7596
})

4. Prepare model

In [None]:
def get_model(model_checkpoint, dataset_blueprint, device):
    """Load a token-classification model from a checkpoint and move it to `device`.

    The label mappings come from `dataset_blueprint`; the classifier dropout
    comes from the module-level `classifier_dropout` setting.

    Args:
        model_checkpoint: Path or identifier passed to `from_pretrained`.
        dataset_blueprint: Object exposing `label2id` and `id2label`.
        device: Target passed to the model's `.to(...)`.

    Returns:
        The loaded model after `.to(device)`.
    """
    config_overrides = {
        "num_labels": len(dataset_blueprint.label2id),
        "id2label": dataset_blueprint.id2label,
        "label2id": dataset_blueprint.label2id,
        "classifier_dropout": classifier_dropout,
    }
    token_classifier = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint, **config_overrides
    )
    return token_classifier.to(device)

def get_trainer(model, tokenizer, dataset, run_name):
    """Build a `Trainer` for one train/test split.

    Hyperparameters (`learning_rate`, `weight_decay`, `num_train_epochs`,
    `random_seed`), the output location (`MODELS_FOLDER`, `result_model`) and
    `dataset_blueprint` are read from module-level names rather than passed in.

    Note: `output_dir` does not depend on `run_name`, so every call writes its
    checkpoints to the same directory.

    Args:
        model: Model to train.
        tokenizer: Tokenizer used for the data collator and as `processing_class`.
        dataset: Mapping with a "train" and a "test" entry.
        run_name: Run name forwarded to `TrainingArguments`.

    Returns:
        The configured `Trainer` (not yet trained).
    """
    batch_size = 16

    # Evaluate and checkpoint once per epoch; at the end reload the checkpoint
    # with the highest "token_f1".
    argument_values = dict(
        output_dir=MODELS_FOLDER / f"{result_model}-checkpoint",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        seed=random_seed,
        metric_for_best_model="token_f1",
        greater_is_better=True,
        bf16=True,
        run_name=run_name,
    )
    arguments = TrainingArguments(**argument_values)

    collator = DataCollatorForTokenClassification(tokenizer)

    return Trainer(
        model=model,
        args=arguments,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        processing_class=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics(dataset_blueprint),
    )

5. Train model

In [6]:
# k-fold cross-validation: each fold is held out once for evaluation while a
# fresh model is trained on the remaining rows.
length = len(dataset)
if not 2 <= k <= length:
    # With k > length the integer division below yields empty folds;
    # with k < 2 there is no training data left.
    raise ValueError(f"k must be between 2 and the dataset size ({length}), got {k}")

fold_length = length // k
eval_metrics = []  # one `trainer.evaluate()` result per fold

# Defined up front so the per-fold cleanup below also works on the first pass.
model = trainer = None

# NOTE(review): folds are contiguous index ranges, which assumes the rows of
# `dataset` are already in random order — confirm, or shuffle before splitting.
for i in range(k):
    start = i * fold_length
    end = (i + 1) * fold_length
    if i == k - 1:
        # The last fold absorbs the remainder when length is not divisible by k.
        end = length

    train_idx = list(range(0, start)) + list(range(end, length))
    test_idx = list(range(start, end))

    train_dataset = dataset.select(train_idx)
    test_dataset = dataset.select(test_idx)

    # Drop references to the previous fold's model/trainer before loading the
    # next model, so their GPU memory can be released instead of coexisting.
    model = trainer = None
    torch.cuda.empty_cache()

    # Name the run after the checkpoint that is actually being trained.
    run_name = f"{result_model}-{epoch_time}-fold-{i}"

    model = get_model(model_checkpoint, dataset_blueprint, device)
    trainer = get_trainer(model, tokenizer, {"train": train_dataset, "test": test_dataset}, run_name)

    trainer.train()
    evaluation_feedback = trainer.evaluate()

    eval_metrics.append(evaluation_feedback)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at models/train-test-fine-tuned-models-ru-fine-tuned-FacebookAI-xlm-roberta-base-3-checkpoint/checkpoint-500 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,No log,0.469517,0.0,0.0,0.0,0.0,0.0,0.0,0.830609
2,0.492000,0.452181,0.0,0.0,0.0,0.0,0.0,0.0,0.830609
3,0.441600,0.448834,0.0,0.0,0.0,0.0,0.0,0.0,0.830609
4,0.425900,0.447822,0.0,0.0,0.0,0.0,0.0,0.0,0.830609
5,0.425900,0.447471,0.0,0.0,0.0,0.0,0.0,0.0,0.830609


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at models/train-test-fine-tuned-models-ru-fine-tuned-FacebookAI-xlm-roberta-base-3-checkpoint/checkpoint-500 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,No log,0.483978,0.0,0.0,0.0,0.0,0.0,0.0,0.842963
2,0.542200,0.443737,0.0,0.0,0.0,0.0,0.0,0.0,0.842963
3,0.453500,0.434736,0.0,0.0,0.0,0.0,0.0,0.0,0.842963
4,0.437400,0.431987,0.0,0.0,0.0,0.0,0.0,0.0,0.842963
5,0.437400,0.431247,0.0,0.0,0.0,0.0,0.0,0.0,0.842963


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at models/train-test-fine-tuned-models-ru-fine-tuned-FacebookAI-xlm-roberta-base-3-checkpoint/checkpoint-500 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,No log,0.474837,0.0,0.0,0.0,0.0,0.0,0.0,0.85486
2,0.542500,0.431105,0.0,0.0,0.0,0.0,0.0,0.0,0.85486
3,0.455700,0.419446,0.0,0.0,0.0,0.0,0.0,0.0,0.85486
4,0.441900,0.416055,0.0,0.0,0.0,0.0,0.0,0.0,0.85486
5,0.441900,0.415258,0.0,0.0,0.0,0.0,0.0,0.0,0.85486


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at models/train-test-fine-tuned-models-ru-fine-tuned-FacebookAI-xlm-roberta-base-3-checkpoint/checkpoint-500 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,No log,0.481116,0.0,0.0,0.0,0.0,0.0,0.0,0.847561
2,0.541500,0.438828,0.0,0.0,0.0,0.0,0.0,0.0,0.847561
3,0.452400,0.429207,0.0,0.0,0.0,0.0,0.0,0.0,0.847561
4,0.442700,0.426173,0.0,0.0,0.0,0.0,0.0,0.0,0.847561
5,0.442700,0.425413,0.0,0.0,0.0,0.0,0.0,0.0,0.847561


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at models/train-test-fine-tuned-models-ru-fine-tuned-FacebookAI-xlm-roberta-base-3-checkpoint/checkpoint-500 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,No log,0.482129,0.0,0.0,0.0,0.0,0.0,0.0,0.848636
2,0.543800,0.438966,0.0,0.0,0.0,0.0,0.0,0.0,0.848636
3,0.451700,0.42895,0.0,0.0,0.0,0.0,0.0,0.0,0.848636
4,0.440600,0.425605,0.0,0.0,0.0,0.0,0.0,0.0,0.848636
5,0.440600,0.424819,0.0,0.0,0.0,0.0,0.0,0.0,0.848636


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
import pandas as pd

# Keys of each per-fold evaluation result that are averaged across folds.
averaged_keys = [
    "eval_loss",
    "eval_token_f1",
    "eval_token_precision",
    "eval_token_recall",
    "eval_span_f1",
    "eval_span_precision",
    "eval_span_recall",
    "eval_runtime",
    "eval_samples_per_second",
    "eval_steps_per_second",
    "epoch",
]

# Divide by the number of folds actually collected rather than the configured
# `k`, so the mean stays correct if the training loop was interrupted or `k`
# was edited after training.
n_folds = len(eval_metrics)
if n_folds == 0:
    raise ValueError("eval_metrics is empty; run the cross-validation cell first")

# NOTE(review): the recorded run shows token/span F1 of 0.0 with constant
# accuracy in every fold, which suggests no positive tokens were predicted —
# investigate before relying on these averages.
eval_metrics_avg = {
    key: sum(fold_result[key] for fold_result in eval_metrics) / n_folds
    for key in averaged_keys
}
pd.DataFrame(eval_metrics_avg, index=[0])

Unnamed: 0,eval_loss,eval_token_f1,eval_token_precision,eval_token_recall,eval_span_f1,eval_span_precision,eval_span_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.478315,0.0,0.0,0.0,0.0,0.0,0.0,5.8694,281.7674,17.619,5.0
