# Base BERT-style implementation (XLM-RoBERTa token classifier)

1. Import dependencies

In [1]:
import os
import random
import sys
import time

import numpy as np
import pandas as pd
import torch

from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

sys.path.append('..')
from src.util.torch_device import resolve_torch_device
from src.data.span_detection_ds import ManipulationDetectionDataset
from src.definitions import (
    MODELS_FOLDER,
    RAW_DATA_FOLDER,
    REPORTS_FOLDER,
    PROCESSED_DATA_FOLDER,
)
from src.model.span_detection_metrics import compute_metrics

2. Prepare environment

In [2]:
# One seed shared by every RNG this notebook touches (stdlib, NumPy, PyTorch).
random_seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(random_seed)

# Hugging Face Hub checkpoint that gets fine-tuned below.
model_checkpoint = 'EvanD/xlm-roberta-base-ukrainian-ner-ukrner'

# Device picked by the project helper; its selection logic lives in src.util.torch_device.
device = resolve_torch_device()

# Local wall-clock timestamp of this session (strftime defaults to localtime()).
epoch_time = time.strftime("%Y-%m-%d_%H-%M-%S")

3. Load dataset

In [None]:
# Tokenizer that matches the checkpoint, with model_max_length set to 512.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)

# Dataset configuration; do_split / exclude_tail semantics are defined by
# ManipulationDetectionDataset (presumably: no train/val split, keep the tail — confirm).
blueprint_options = {
    "tokenizer": tokenizer,
    "raw_path": RAW_DATA_FOLDER / "span-detection.parquet",
    "processed_path": PROCESSED_DATA_FOLDER / "span-detection",
    "seed": random_seed,
    "do_split": False,
    "exclude_tail": False,
}
dataset_blueprint = ManipulationDetectionDataset(**blueprint_options)

# Materialise the dataset and show it as the cell output.
dataset = dataset_blueprint.read()
dataset

3. Prepare model

In [4]:
# Run name = checkpoint name without the Hub namespace + session timestamp.
checkpoint_name = model_checkpoint.rsplit('/', 1)[-1]
run_name = f"{checkpoint_name}-{epoch_time}"

# Output directory for this run; created up front, no error if it already exists.
model_path = MODELS_FOLDER / run_name
model_path.mkdir(parents=True, exist_ok=True)

In [5]:
# Label mapping comes from the dataset blueprint; its size fixes the classifier width.
label2id = dataset_blueprint.label2id

# ignore_mismatched_sizes=True lets the checkpoint's classifier head be replaced by a
# freshly initialised one when its shape differs from the requested label count.
# Dropout settings are not overridden here.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),
    id2label=dataset_blueprint.id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at EvanD/xlm-roberta-base-ukrainian-ner-ukrner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4. Train model

Freeze main layers and train only the classifier

In [6]:
# Stage 1 configuration: a single epoch at learning rate 3e-3, evaluated and logged
# once per epoch, with no checkpoints written (save_strategy="no").
training_args = TrainingArguments(
    run_name=run_name,
    output_dir=model_path,
    seed=random_seed,
    num_train_epochs=1,
    learning_rate=3e-3,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
)

# Collator and metric callback used by the trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)
metrics_fn = compute_metrics(dataset_blueprint)

# NOTE(review): the same `dataset` object is passed for training and evaluation,
# so the reported validation metrics are computed on the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=metrics_fn,
    train_dataset=dataset,
    eval_dataset=dataset,
)

In [7]:
# Freeze every parameter whose name does not contain 'classifier';
# parameters that do contain it are left untouched.
backbone_params = (
    param for param_name, param in model.named_parameters()
    if 'classifier' not in param_name
)
for param in backbone_params:
    param.requires_grad = False

In [8]:
# Stage 1 training run: uses the trainer configured above while only the parameters
# with 'classifier' in their name are still trainable.
trainer.train()

Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,0.5463,0.499967,0.045851,0.680233,0.023725,0.009222,0.018028,0.006196,0.752977


TrainOutput(global_step=239, training_loss=0.5463099778945476, metrics={'train_runtime': 54.1954, 'train_samples_per_second': 70.523, 'train_steps_per_second': 4.41, 'total_flos': 849874846117032.0, 'train_loss': 0.5463099778945476, 'epoch': 1.0})

Unfreeze all layers and train the whole model

In [9]:
# Make every model parameter trainable again for the full fine-tuning stage.
for param in model.parameters():
    param.requires_grad = True

In [10]:
# Stage 2 configuration: three epochs at learning rate 2e-5, evaluated and logged once
# per epoch. Checkpointing stays off (save_strategy="no"); best-model selection
# (metric_for_best_model / greater_is_better / load_best_model_at_end) is not enabled.
training_args = TrainingArguments(
    run_name=run_name,
    output_dir=model_path,
    seed=random_seed,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
)

# Collator and metric callback used by the trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)
metrics_fn = compute_metrics(dataset_blueprint)

# NOTE(review): training and evaluation use the same `dataset`, so the per-epoch
# validation metrics are measured on the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=metrics_fn,
    train_dataset=dataset,
    eval_dataset=dataset,
)

In [21]:
# Ask PyTorch's CUDA caching allocator to release unused cached memory before the
# full fine-tuning run.
torch.cuda.empty_cache()

In [12]:
# Stage 2 training run: fine-tunes the whole model with the trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,0.4555,0.387208,0.493935,0.770587,0.363451,0.059082,0.050928,0.070344,0.813686
2,0.3898,0.352237,0.579631,0.798526,0.454925,0.076116,0.066461,0.089054,0.834925
3,0.3456,0.320175,0.666839,0.775854,0.584685,0.093991,0.081876,0.110315,0.853844


TrainOutput(global_step=717, training_loss=0.39694259622273254, metrics={'train_runtime': 399.3591, 'train_samples_per_second': 28.711, 'train_steps_per_second': 1.795, 'total_flos': 2545105941702384.0, 'train_loss': 0.39694259622273254, 'epoch': 3.0})

5. Save weights

In [20]:
# Export folder: the run name with the 'no_exclude_tail' suffix appended directly.
export_dir = MODELS_FOLDER / (run_name + 'no_exclude_tail')

# Save the model and the tokenizer side by side in that folder.
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

('c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\xlm-roberta-base-ukrainian-ner-ukrner-2025-03-13_15-43-56no_exclude_tail\\tokenizer_config.json',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\xlm-roberta-base-ukrainian-ner-ukrner-2025-03-13_15-43-56no_exclude_tail\\special_tokens_map.json',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\xlm-roberta-base-ukrainian-ner-ukrner-2025-03-13_15-43-56no_exclude_tail\\sentencepiece.bpe.model',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\xlm-roberta-base-ukrainian-ner-ukrner-2025-03-13_15-43-56no_exclude_tail\\added_tokens.json',
 'c:\\Users\\Vitalii\\Desktop\\5_course\\UNLP\\unlp-2025-manipulation-detector\\notebooks\\..\\models\\xlm-roberta-base-ukrainian-ner-ukrner-2025-03-13_15-43-56no_exclude_tail\\tokenizer.json')

6. Submission generation

In [3]:
# NOTE(review): this folder name embeds the timestamp of one specific past run; a fresh
# run produces a different run name, so update this path when re-training.
saved_model_dir = MODELS_FOLDER / 'xlm-roberta-base-ukrainian-ner-ukrner-2025-03-13_15-43-56no_exclude_tail'

# Reload tokenizer and fine-tuned model from disk and move the model to the device.
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForTokenClassification.from_pretrained(saved_model_dir).to(device)

In [23]:
# Read the test set to predict on (pandas is imported in the top import cell)
# and preview its first rows as the cell output.
submission_df = pd.read_csv(RAW_DATA_FOLDER / 'test.csv')
submission_df.head()

Unnamed: 0,id,content
0,521cd2e8-dd9f-42c4-98ba-c0c8890ff1ba,"Они просрали нашу технику, положили кучу людей..."
1,9b2a61e4-d14e-4ff7-b304-e73d720319bf,❗️\nКитай предлагает отдать оккупированные тер...
2,f0f1c236-80a8-4d25-b30c-a420a39be632,Сегодня будет ровно 6 месяцев с этого обещания...
3,31ea05ba-2c2b-4b84-aba7-f3cf6841b204,⚡️\nІзраїль вперше у світі збив балістичну рак...
4,a79e13ec-6d9a-40b5-b54c-7f4f743a7525,Склав невелику навчально-методичну таблицю на ...


In [24]:
# Token-classification ("ner") pipeline around the reloaded model and tokenizer.
# With aggregation_strategy="simple" the results carry 'entity_group', 'start' and
# 'end' fields, which the inference loop reads.
nlp = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

Device set to use cuda:0


In [25]:
# Map each test document id to the (start, end) character offsets of every predicted
# group labelled 'MANIPULATION'. Documents are sent through the pipeline one at a time.
test_spans = {}
for doc_id, text in zip(submission_df['id'], submission_df['content']):
    test_spans[doc_id] = [
        (entity['start'], entity['end'])
        for entity in nlp(text)
        if entity['entity_group'] == 'MANIPULATION'
    ]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [26]:
# One row per document: its id and the list of predicted (start, end) spans.
span_rows = list(test_spans.items())
test_spans_df = pd.DataFrame(span_rows, columns=['id', 'trigger_words'])

In [28]:
# Preview the first rows of the predictions table before writing it to disk.
test_spans_df.head()

Unnamed: 0,id,trigger_words
0,521cd2e8-dd9f-42c4-98ba-c0c8890ff1ba,"[(0, 253)]"
1,9b2a61e4-d14e-4ff7-b304-e73d720319bf,"[(374, 425)]"
2,f0f1c236-80a8-4d25-b30c-a420a39be632,"[(48, 75), (77, 126)]"
3,31ea05ba-2c2b-4b84-aba7-f3cf6841b204,[]
4,a79e13ec-6d9a-40b5-b54c-7f4f743a7525,"[(87, 102), (128, 133), (259, 289), (296, 308)]"


In [29]:
# Submission file named after the saved model folder; written without the index column.
submission_csv = REPORTS_FOLDER / 'xlm-roberta-base-ukrainian-ner-ukrner-2025-03-13_15-43-56no_exclude_tail.csv'
test_spans_df.to_csv(submission_csv, index=False)

7. Evaluating per language (Ukrainian and Russian)

In [None]:
# Same dataset configuration as the training data, restricted with lang='uk'.
# NOTE: this rebinds `dataset_blueprint` and `dataset` from the earlier cells.
blueprint_options = {
    "tokenizer": tokenizer,
    "raw_path": RAW_DATA_FOLDER / "span-detection.parquet",
    "processed_path": PROCESSED_DATA_FOLDER / "span-detection",
    "seed": random_seed,
    "do_split": False,
    "exclude_tail": False,
    "lang": 'uk',
}
dataset_blueprint = ManipulationDetectionDataset(**blueprint_options)

# Materialise the subset and show it as the cell output.
dataset = dataset_blueprint.read()
dataset

In [None]:
#uk
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics(dataset_blueprint),
)

trainer.evaluate()

{'eval_loss': 0.2686629891395569,
 'eval_model_preparation_time': 0.002,
 'eval_token_f1': 0.6598962938182513,
 'eval_token_precision': 0.7716553034536716,
 'eval_token_recall': 0.5764141097891881,
 'eval_span_f1': 0.09599177800616648,
 'eval_span_precision': 0.08348230246692885,
 'eval_span_recall': 0.11291102514506769,
 'eval_accuracy': 0.8786870621125424,
 'eval_runtime': 15.711,
 'eval_samples_per_second': 136.656,
 'eval_steps_per_second': 17.122}

In [None]:
# Same dataset configuration as the training data, restricted with lang='ru'.
# NOTE: this rebinds `dataset_blueprint` and `dataset` from the earlier cells.
blueprint_options = {
    "tokenizer": tokenizer,
    "raw_path": RAW_DATA_FOLDER / "span-detection.parquet",
    "processed_path": PROCESSED_DATA_FOLDER / "span-detection",
    "seed": random_seed,
    "do_split": False,
    "exclude_tail": False,
    "lang": 'ru',
}
dataset_blueprint = ManipulationDetectionDataset(**blueprint_options)

# Materialise the subset and show it as the cell output.
dataset = dataset_blueprint.read()
dataset

In [None]:
#ru
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics(dataset_blueprint),
)

trainer.evaluate()

{'eval_loss': 0.3925739824771881,
 'eval_model_preparation_time': 0.002,
 'eval_token_f1': 0.673348968525462,
 'eval_token_precision': 0.7797536673124827,
 'eval_token_recall': 0.5924971739530482,
 'eval_span_f1': 0.09196121363778544,
 'eval_span_precision': 0.08024017467248909,
 'eval_span_recall': 0.1076923076923077,
 'eval_accuracy': 0.8173175326546894,
 'eval_runtime': 11.727,
 'eval_samples_per_second': 142.833,
 'eval_steps_per_second': 17.907}