# Masked-LM fine-tuning of XLM-RoBERTa on Ukrainian news

1. Import dependencies

In [1]:
import random

import numpy as np
import torch
import os

from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
    TrainingArguments,
    Trainer,
)

from src.util.torch_device import resolve_torch_device
from src.data.ukrainian_news import load_ukrainian_news_dataset
from src.definitions import (
    MODELS_FOLDER,
    PROCESSED_DATA_FOLDER,
    EXTERNAL_DATA_FOLDER
)

2. Prepare Env

In [2]:
# Single seed shared by every RNG this notebook touches.
random_seed = 42

# Seed Python, PyTorch and NumPy (in that order) with the same value.
for seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    seed_fn(random_seed)

device = resolve_torch_device()

# Base checkpoint to adapt, and the name used for the fine-tuned artefacts
# ("/" in the hub id is replaced so the name is usable as a folder name).
model_checkpoint = "FacebookAI/xlm-roberta-base"
fine_tune_name = "ua-fine-tuned-" + model_checkpoint.replace("/", "-")

# Per the PyTorch MPS documentation, a ratio of 0.0 removes the allocator's
# upper memory limit on Apple-silicon devices.
# NOTE(review): this is set after the device has been resolved; if
# resolve_torch_device() already initialises the MPS allocator the setting may
# come too late - confirm.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Show the selected device as the cell output.
device

device(type='cuda')

3. Load dataset

In [3]:
# Load the tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# XLM-RoBERTa already defines a dedicated `<pad>` token. Overwriting it with the
# EOS token (a GPT-2 idiom) makes padded positions indistinguishable from real
# `</s>` tokens, interferes with RoBERTa's pad-id-based position ids, and gets
# persisted when the tokenizer is saved. Only fall back to EOS when the
# checkpoint genuinely has no pad token.
# NOTE(review): if load_ukrainian_news_dataset caches tokenized rows that were
# padded with the old setting, that cache should be regenerated - confirm.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Build the training dataset from the processed/external data folders,
# limited to `rows_count` rows.
dataset = load_ukrainian_news_dataset(PROCESSED_DATA_FOLDER, EXTERNAL_DATA_FOLDER, tokenizer, rows_count=130000)

3. Prepare model

In [4]:
# Load the base checkpoint with a masked-language-modelling head and move it to
# the device selected earlier.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint).to(device)

# No evaluation is run during training; a checkpoint is written after every
# epoch into "<fine_tune_name>-checkpoint" under MODELS_FOLDER.
training_args = TrainingArguments(
    output_dir=MODELS_FOLDER / f"{fine_tune_name}-checkpoint",
    eval_strategy="no",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Trainer re-seeds every RNG from `args.seed` when it is constructed, so the
    # notebook-level seed must be passed explicitly; otherwise changing
    # `random_seed` would silently have no effect on training.
    seed=random_seed,
)

# Dynamic masking: 15% of the tokens in each batch are selected for the MLM
# objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Wire model, arguments, data and collator together; the tokenizer is passed as
# the processing class.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


4. Train model

In [5]:
# Drop cached, currently unused CUDA allocator blocks before the long run.
torch.cuda.empty_cache()

# Run the fine-tuning loop; the bare name on the last line renders the returned
# training summary as the cell output.
train_output = trainer.train()
train_output

Step,Training Loss
500,1.1544
1000,1.1481
1500,1.1161
2000,1.0995
2500,1.0873
3000,1.0752
3500,1.0588
4000,1.0577
4500,1.0405
5000,1.0448


TrainOutput(global_step=48750, training_loss=0.9375628749749599, metrics={'train_runtime': 186523.6223, 'train_samples_per_second': 2.091, 'train_steps_per_second': 0.261, 'total_flos': 9.938750636543616e+16, 'train_loss': 0.9375628749749599, 'epoch': 3.0})

5. Save weights

In [None]:
# Write the final model and the tokenizer files into the same folder.
fine_tuned_dir = MODELS_FOLDER / fine_tune_name

trainer.save_model(fine_tuned_dir)
# Last expression: the paths of the written tokenizer files are shown as the
# cell output.
tokenizer.save_pretrained(fine_tuned_dir)

('/home/melal/Workspace/unlp-2025-manipulation-detector/models/ua-fine-tuned-xlm-roberta-large/tokenizer_config.json',
 '/home/melal/Workspace/unlp-2025-manipulation-detector/models/ua-fine-tuned-xlm-roberta-large/special_tokens_map.json',
 '/home/melal/Workspace/unlp-2025-manipulation-detector/models/ua-fine-tuned-xlm-roberta-large/tokenizer.json')