# Base Span Detection implementation

1. Import dependencies

In [None]:
import random
import time
import torch
import os

import pandas as pd
import numpy as np

from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

import sys

sys.path.append('..')
sys.path.append('../src/data')

from src.util.torch_device import resolve_torch_device
from src.data.span_detection_ds import ManipulationDetectionDataset
from src.definitions import (
    MODELS_FOLDER,
    RAW_DATA_FOLDER,
    SUBMISSIONS_FOLDER,
    PROCESSED_DATA_FOLDER,
)
from src.data.kaggle import submit_competition
from src.model.span_detection_metrics import compute_metrics

2. Prepare Env

In [20]:
random_seed = 42

random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

device = resolve_torch_device()

epoch_time = int(time.time())

#os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

In [21]:
model_checkpoint_ukr = 'FrinzTheCoder/bert-base-multilingual-cased-ukr'#MODELS_FOLDER / "ua-fine-tuned-xlm-roberta-large"
model_checkpoint_ru = 'Gherman/bert-base-NER-Russian'
result_model_ukr = f"span-detection-{str(model_checkpoint_ukr).split('/')[-1]}"

classifier_dropout = 0.1
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 5

submission_desc = (
    "Use ensemble of two models: EvanD/xlm-roberta-base-ukrainian-ner-ukrner and yqelz/xml-roberta-large-ner-russian"
)

3. Load dataset

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_ukr)

dataset_blueprint = ManipulationDetectionDataset(
    tokenizer=tokenizer,
    raw_path=RAW_DATA_FOLDER / "span-detection.parquet",
    processed_path=PROCESSED_DATA_FOLDER / "span-detection" / model_checkpoint_ukr,
    seed=random_seed,
    do_split=False
)

dataset = dataset_blueprint.read()

Saving the dataset (0/1 shards):   0%|          | 0/3822 [00:00<?, ? examples/s]

3. Prepare model

In [None]:
from transformers import AutoModel
from transformers import PretrainedConfig, PreTrainedModel

class EnsembleConfig(PretrainedConfig):
    def __init__(self, model_ukr = None, model_ru = None, **kwargs):
        self.model_ukr = model_ukr
        self.model_ru = model_ru
        super().__init__(**kwargs)

class BertForTokenClassification(PreTrainedModel):
    config_class = EnsembleConfig

    def __init__(self, config):
        super().__init__(config)
        self.model_ukr = AutoModel.from_pretrained(config.model_ukr)
        self.model_ru = AutoModel.from_pretrained(config.model_ru)

        self.dropout = torch.nn.Dropout(config.classifier_dropout)
        self.fusion = torch.nn.Linear(self.model_ukr.config.hidden_size+self.model_ru.config.hidden_size, config.num_labels)

        self.id2label = config.id2label
        self.label2id = config.label2id


    def forward(self, input_ids, attention_mask=None, labels=None, token_type_ids=None):
        outputs_ukr = self.model_ukr(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        outputs_ru = self.model_ru(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        pooled_output = torch.cat((outputs_ukr.last_hidden_state, outputs_ru.last_hidden_state), dim=-1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fusion(pooled_output)

        loss = None

        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))

        return {"loss": loss, "logits": logits}


In [None]:
model = BertForTokenClassification(
    EnsembleConfig(
        model_ukr=model_checkpoint_ukr, 
        model_ru=model_checkpoint_ru, 
        num_labels=len(dataset_blueprint.label2id), 
        classifier_dropout=classifier_dropout, 
        id2label=dataset_blueprint.id2label, 
        label2id=dataset_blueprint.label2id, 
    )
).to(device)

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir=MODELS_FOLDER / f"{result_model_ukr}-checkpoint",
    learning_rate=learning_rate,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=2,
    weight_decay=weight_decay,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    seed=random_seed,
    logging_steps=200,
    #auto_find_batch_size=True,
    #torch_empty_cache_steps=1000,
    metric_for_best_model="token_f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics(dataset_blueprint),
)

Some weights of BertModel were not initialized from the model checkpoint at Gherman/bert-base-NER-Russian and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4. Train model

In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,0.4529,0.38946,0.598528,0.655339,0.55078,0.076611,0.058319,0.111623,0.817184
2,0.3898,0.340581,0.636095,0.764098,0.544825,0.097438,0.084538,0.114983,0.845764


TrainOutput(global_step=1274, training_loss=0.4318441184386728, metrics={'train_runtime': 440.1952, 'train_samples_per_second': 17.365, 'train_steps_per_second': 2.894, 'total_flos': 2975397501464136.0, 'train_loss': 0.4318441184386728, 'epoch': 2.0})

5. Save weights

In [8]:
trainer.save_model(MODELS_FOLDER / result_model_ukr)
tokenizer.save_pretrained(MODELS_FOLDER / result_model_ukr)

('c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\span-detection-bert-base-multilingual-cased-ukr\\tokenizer_config.json',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\span-detection-bert-base-multilingual-cased-ukr\\special_tokens_map.json',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\span-detection-bert-base-multilingual-cased-ukr\\vocab.txt',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\span-detection-bert-base-multilingual-cased-ukr\\added_tokens.json',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\span-detection-bert-base-multilingual-cased-ukr\\tokenizer.json')

6. Test

In [None]:
model = BertForTokenClassification.from_pretrained(MODELS_FOLDER / result_model_ukr).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODELS_FOLDER / result_model_ukr)

Some weights of BertModel were not initialized from the model checkpoint at Gherman/bert-base-NER-Russian and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
submission_df = pd.read_csv(RAW_DATA_FOLDER / "test.csv")

In [13]:
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

Device set to use cuda:0


In [14]:
test_spans = {}
for i, row in submission_df.iterrows():
    res = nlp(row["content"])
    res_manipulation = [r for r in res if r["entity_group"] == "MANIPULATION"]
    res_manipulation_spans = [(r["start"], r["end"]) for r in res_manipulation]
    test_spans[row["id"]] = res_manipulation_spans

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [15]:
test_spans_df = pd.DataFrame(test_spans.items(), columns=["id", "trigger_words"])

In [22]:
submission_path = SUBMISSIONS_FOLDER / "span-detection" / f"{result_model_ukr}_1epoch.csv"

submission_path.parent.mkdir(parents=True, exist_ok=True)

test_spans_df.to_csv(submission_path, index=False)

7. Submit

In [None]:
submission_params_str = f"classifier_dropout = {classifier_dropout}, learning_rate = {learning_rate}, weight_decay = {weight_decay}, num_train_epochs = {num_train_epochs}"
message = f"[ {submission_params_str} ] {submission_desc}"

submit_competition(
    path=submission_path, competition="unlp-2025-shared-task-span-identification", 
    message='Ensemble of two models: EvanD/xlm-roberta-base-ukrainian-ner-ukrner and yqelz/xml-roberta-large-ner-russian'
)