# Span Detection: base model fine-tuning (masked language modeling on Ukrainian news)

1. Import dependencies

In [1]:
import random
import time

import numpy as np
import torch
import os

from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
    TrainingArguments,
    Trainer,
)

from src.util.torch_device import resolve_torch_device
from src.data.ukrainian_news import load_ukrainian_news_dataset
from src.definitions import (
    MODELS_FOLDER,
    PROCESSED_DATA_FOLDER,
    EXTERNAL_DATA_FOLDER
)
from src.model.span_detection_metrics import compute_metrics

2. Prepare Env

In [2]:
# Environment setup: allocator configuration, RNG seeding, device selection
# and the checkpoint to fine-tune.

# Export the MPS allocator setting before any torch call that might touch the
# MPS backend. "0.0" disables the allocator's upper memory limit (per the
# PyTorch MPS docs this can destabilise the system on a genuine OOM).
# NOTE(review): the allocator presumably reads this once when it is first
# initialised, so setting it after seeding / device resolution may be too late.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

random_seed = 42

# Seed Python, PyTorch and NumPy RNGs with the same value.
random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

device = resolve_torch_device()

model_checkpoint = "FacebookAI/xlm-roberta-large"

# Unix timestamp (seconds) of this run; not referenced by the visible cells.
epoch_time = int(time.time())

# Display the resolved device as the cell output.
device

device(type='mps')

3. Load dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Fall back to EOS-as-padding only for checkpoints that ship without a pad
# token. XLM-RoBERTa already defines its own "<pad>" token; overwriting it
# would make padded positions use the "</s>" id, disagree with the model
# config's pad id, and be persisted when the tokenizer is saved later on.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): the loader receives the tokenizer, so it presumably tokenizes
# the corpus itself -- confirm whether it caches results in PROCESSED_DATA_FOLDER,
# in which case a stale cache built with the old pad token should be rebuilt.
dataset = load_ukrainian_news_dataset(PROCESSED_DATA_FOLDER, EXTERNAL_DATA_FOLDER, tokenizer)

README.md:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

ukrainian-news.py:   0%|          | 0.00/1.86k [00:00<?, ?B/s]



3. Prepare model

In [None]:
# Load the pretrained checkpoint with a masked-language-modelling head and
# place it on the resolved device.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint).to(device)

# No evaluation strategy is set on purpose: the Trainer below receives no
# eval_dataset, so requesting per-epoch evaluation would raise (at Trainer
# construction or at the end of the first epoch, depending on the installed
# transformers version). The default strategy is "no".
# NOTE(review): to get a per-epoch eval loss, hold out a validation split,
# pass it as `eval_dataset=` to the Trainer and re-enable the strategy here.
training_args = TrainingArguments(
    output_dir=MODELS_FOLDER / "ua-fine-tuned-base-model-checkpoint",
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",  # relative to the kernel's working directory
    logging_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints
    learning_rate=5e-5,
    warmup_steps=1000,
)

# Dynamic MLM collator: selects 15% of the tokens in each batch as
# prediction targets.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

4. Train model

In [None]:
# Run the fine-tuning loop and show the object it returns as the cell output.
train_result = trainer.train()
train_result

5. Save weights

In [None]:
# Write the fine-tuned model and the tokenizer side by side into one directory.
fine_tuned_dir = MODELS_FOLDER / "ua-fine-tuned-base-model"

trainer.save_model(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)