# Base BERT implementation

1. Import dependencies

In [9]:
import random
import time

import numpy as np
import torch
import os

from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

from tokenizers import pre_tokenizers
import sys

sys.path.append('..')
from src.util.torch_device import resolve_torch_device
from src.data.span_detection_ds import ManipulationDetectionDataset
from src.visualization.plot import plot_loss, plot_model_progress, plot_eval_loss
from src.definitions import (
    MODELS_FOLDER,
    RAW_DATA_FOLDER,
    REPORTS_FOLDER,
    PROCESSED_DATA_FOLDER,
)
from src.visualization.ner import visualize_as_markdown_and_save
from src.visualization.reporting import EvaluatingReport
from src.model.span_detection_metrics import compute_metrics

2. Prepare Env

In [10]:
random_seed = 42

random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

device = resolve_torch_device()

model_checkpoint = 'EvanD/xlm-roberta-base-ukrainian-ner-ukrner'

epoch_time = int(time.time())

3. Load dataset

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)

dataset_blueprint = ManipulationDetectionDataset(
    tokenizer=tokenizer,
    raw_path=RAW_DATA_FOLDER / "span-detection.parquet",
    processed_path=PROCESSED_DATA_FOLDER / "span-detection",
    seed=random_seed,
    do_split=False,
)

dataset = dataset_blueprint.read()
dataset

Saving the dataset (0/1 shards):   0%|          | 0/3822 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'content', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3822
})

3. Prepare model

In [12]:
def get_model(model_checkpoint, dataset_blueprint, device):
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(dataset_blueprint.label2id),
        id2label=dataset_blueprint.id2label,
        label2id=dataset_blueprint.label2id,
        #dropout=0.2,
        #hidden_dropout_prob=0.2,
        #attention_probs_dropout_prob=0.2,
        ignore_mismatched_sizes=True
    ).to(device)

    return model

def get_trainer(model, tokenizer, dataset, run_name):
    training_args = TrainingArguments(
        output_dir=MODELS_FOLDER / "manipulation-detector-bert-ner-checkpoint",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=4,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="no",
        seed=random_seed,
        logging_strategy="epoch",
        run_name=run_name,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics(dataset_blueprint),
    )

    return trainer

4. Train model

In [46]:
k = 5
length = len(dataset)
fold_length = length // k
eval_metrics = []
for i in range(k):
    start = i * fold_length
    end = (i + 1) * fold_length
    if i == k - 1:
        end = length

    train_idx = list(range(0, start)) + list(range(end, length))
    test_idx = list(range(start, end))

    train_dataset = dataset.select(train_idx)
    test_dataset = dataset.select(test_idx)

    model = get_model(model_checkpoint, dataset_blueprint, device)
    trainer = get_trainer(model, tokenizer, {"train": train_dataset, "test": test_dataset}, f"EvanD/xlm-roberta-base-ukrainian-ner-ukrner-{epoch_time}-fold-{i}")

    trainer.train()
    evaluation_feedback = trainer.evaluate()

    eval_metrics.append(evaluation_feedback)

    # trainer.save_model(MODELS_FOLDER / f"manipulation-detector-bert-ner-{epoch_time}-fold-{i}")
    # tokenizer.save_pretrained(MODELS_FOLDER / f"manipulation-detector-bert-ner-{epoch_time}-fold-{i}")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at EvanD/xlm-roberta-base-ukrainian-ner-ukrner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Token Accuracy,Span F1,Span Precision,Span Recall,Span Accuracy
1,0.4877,0.420891,0.428558,0.731452,0.303061,0.796253,0.065616,0.061372,0.070491,0.796253
2,0.4098,0.413646,0.553891,0.642718,0.486635,0.802386,0.098701,0.084737,0.118176,0.802386
3,0.3492,0.43089,0.560523,0.65528,0.489709,0.806412,0.110147,0.10652,0.114029,0.806412
4,0.2959,0.456074,0.559771,0.651807,0.490511,0.805502,0.115528,0.113636,0.117484,0.805502


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at EvanD/xlm-roberta-base-ukrainian-ner-ukrner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Token Accuracy,Span F1,Span Precision,Span Recall,Span Accuracy
1,0.5005,0.433704,0.456241,0.686597,0.341625,0.795154,0.045997,0.039531,0.05499,0.795154
2,0.4287,0.413642,0.589257,0.611731,0.568375,0.800673,0.085565,0.074885,0.099796,0.800673
3,0.3666,0.435633,0.529038,0.675702,0.434688,0.805311,0.093101,0.09187,0.094365,0.805311
4,0.3152,0.46273,0.554818,0.647225,0.4855,0.804006,0.103842,0.105932,0.101833,0.804006


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at EvanD/xlm-roberta-base-ukrainian-ner-ukrner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Token Accuracy,Span F1,Span Precision,Span Recall,Span Accuracy
1,0.4797,0.515314,0.213322,0.736304,0.124729,0.753121,0.032258,0.053512,0.023088,0.753121
2,0.4068,0.476391,0.59934,0.558795,0.646229,0.768131,0.10078,0.086242,0.121212,0.768131
3,0.3511,0.491521,0.582158,0.587029,0.577368,0.777579,0.101638,0.094704,0.109668,0.777579
4,0.3021,0.546326,0.534814,0.635026,0.461919,0.784353,0.098981,0.099853,0.098124,0.784353


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at EvanD/xlm-roberta-base-ukrainian-ner-ukrner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Token Accuracy,Span F1,Span Precision,Span Recall,Span Accuracy
1,0.4969,0.482712,0.004693,0.975,0.002352,0.725756,0.0,0.0,0.0,0.725756
2,0.4202,0.450879,0.610255,0.581983,0.641414,0.774799,0.097301,0.083716,0.116147,0.774799
3,0.3652,0.447574,0.608991,0.600857,0.617347,0.782094,0.120747,0.114124,0.128187,0.782094
4,0.3165,0.49909,0.569583,0.631761,0.518548,0.784581,0.11351,0.111644,0.115439,0.784581


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at EvanD/xlm-roberta-base-ukrainian-ner-ukrner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Token Accuracy,Span F1,Span Precision,Span Recall,Span Accuracy
1,0.4901,0.41072,0.575988,0.620139,0.537707,0.80233,0.08125,0.072102,0.093057,0.80233
2,0.4137,0.412751,0.58829,0.640061,0.544267,0.809786,0.100492,0.098689,0.102362,0.809786
3,0.3517,0.410468,0.57969,0.649232,0.523605,0.810413,0.109429,0.105368,0.113815,0.810413
4,0.3002,0.442643,0.570713,0.637092,0.516861,0.805851,0.112817,0.112536,0.113099,0.805851


In [53]:
import pandas as pd

eval_metrics_avg = {
    "eval_loss": sum([m["eval_loss"] for m in eval_metrics]) / k,
    "eval_token_f1": sum([m["eval_token_f1"] for m in eval_metrics]) / k,
    "eval_token_precision": sum([m["eval_token_precision"] for m in eval_metrics]) / k,
    "eval_token_recall": sum([m["eval_token_recall"] for m in eval_metrics]) / k,
    "eval_token_accuracy": sum([m["eval_token_accuracy"] for m in eval_metrics]) / k,
    "eval_span_f1": sum([m["eval_span_f1"] for m in eval_metrics]) / k,
    "eval_span_precision": sum([m["eval_span_precision"] for m in eval_metrics]) / k,
    "eval_span_recall": sum([m["eval_span_recall"] for m in eval_metrics]) / k,
    "eval_span_accuracy": sum([m["eval_span_accuracy"] for m in eval_metrics]) / k,
    "eval_runtime": sum([m["eval_runtime"] for m in eval_metrics]) / k,
    "eval_samples_per_second": sum([m["eval_samples_per_second"] for m in eval_metrics]) / k,
    "eval_steps_per_second": sum([m["eval_steps_per_second"] for m in eval_metrics]) / k,
    "epoch": sum([m["epoch"] for m in eval_metrics]) / k,
}
pd.DataFrame(eval_metrics_avg, index=[0])

Unnamed: 0,eval_loss,eval_token_f1,eval_token_precision,eval_token_recall,eval_token_accuracy,eval_span_f1,eval_span_precision,eval_span_recall,eval_span_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.481373,0.55794,0.640582,0.494668,0.796859,0.108936,0.10872,0.109196,0.796859,13.88224,80.5992,5.0592,4.0


5. Save weights

In [16]:
trainer.save_model(MODELS_FOLDER / "manipulation-detector-xlm-roberta-base-ukrainian-ner-ukrner_exclude_tail_2e-5_5epochs")
tokenizer.save_pretrained(MODELS_FOLDER / "manipulation-detector-xlm-roberta-base-ukrainian-ner-ukrner_exclude_tail_2e-5_5epochs")

('c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\manipulation-detector-xlm-roberta-base-ukrainian-ner-ukrner_exclude_tail_2e-5_5epochs\\tokenizer_config.json',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\manipulation-detector-xlm-roberta-base-ukrainian-ner-ukrner_exclude_tail_2e-5_5epochs\\special_tokens_map.json',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\manipulation-detector-xlm-roberta-base-ukrainian-ner-ukrner_exclude_tail_2e-5_5epochs\\sentencepiece.bpe.model',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\manipulation-detector-xlm-roberta-base-ukrainian-ner-ukrner_exclude_tail_2e-5_5epochs\\added_tokens.json',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\manipulation-detector-xlm-roberta-base-uk

6. Test

In [3]:
model = AutoModelForTokenClassification.from_pretrained(
    MODELS_FOLDER / "manipulation-detector-xlm-roberta-base-ukrainian-ner-ukrner_exclude_tail"
)
tokenizer = AutoTokenizer.from_pretrained(
    MODELS_FOLDER / "manipulation-detector-xlm-roberta-base-ukrainian-ner-ukrner_exclude_tail"
)

In [17]:
test_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

test_result = test_pipeline(dataset["test"]["content"])

Device set to use cuda:0


In [None]:
visualize_as_markdown_and_save(
    dataset["test"],
    test_result,
    tokenizer,
    REPORTS_FOLDER / "span-detection" / "test-visualization" / f"test-{epoch_time}.md",
)

Dataset({
    features: ['id', 'content', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 383
})
