# Base BERT implementation

1. Import dependencies

In [1]:
import random
import time

import numpy as np
import torch
import os

from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

import sys

sys.path.append('..')
from src.util.torch_device import resolve_torch_device
from src.data.span_detection_ds import ManipulationDetectionDataset
from src.definitions import (
    MODELS_FOLDER,
    RAW_DATA_FOLDER,
    REPORTS_FOLDER,
    PROCESSED_DATA_FOLDER,
)
from src.model.span_detection_metrics import compute_metrics

2. Prepare Env

In [2]:
random_seed = 42

random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

device = resolve_torch_device()

model_checkpoint = MODELS_FOLDER / "ua-fine-tuned-xlm-roberta-large"

epoch_time = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())

3. Load dataset

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)

dataset_blueprint = ManipulationDetectionDataset(
    tokenizer=tokenizer,
    raw_path=RAW_DATA_FOLDER / "span-detection.parquet",
    processed_path=PROCESSED_DATA_FOLDER / "span-detection",
    seed=random_seed,
    do_split=False,
    exclude_tail=False,
)

dataset = dataset_blueprint.read()
dataset

Saving the dataset (0/1 shards):   0%|          | 0/3822 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'content', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3822
})

3. Prepare model

In [4]:
run_name = f"{str(model_checkpoint).replace('/', '-')[-1]}-{epoch_time}"
model_path = MODELS_FOLDER / run_name
model_path.mkdir(parents=True, exist_ok=True)

In [5]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(dataset_blueprint.label2id),
    id2label=dataset_blueprint.id2label,
    label2id=dataset_blueprint.label2id,
    #dropout=0.2,
    #hidden_dropout_prob=0.15,
    #attention_probs_dropout_prob=0.15,
    ignore_mismatched_sizes=True
).to(device)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at /home/melal/Workspace/unlp-2025-manipulation-detector/models/ua-fine-tuned-xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4. Train model

Freeze main layers and train only the classifier

In [6]:
training_args = TrainingArguments(
    output_dir=model_path,
    learning_rate=3e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",
    seed=random_seed,
    logging_strategy="epoch",
    run_name=run_name,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics(dataset_blueprint),
)

In [7]:
for name, layer in model.named_parameters():
    if 'classifier' not in name:
        layer.requires_grad = False

In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,0.4906,0.459792,0.242563,0.72572,0.145617,0.012245,0.009605,0.016887,0.772492


TrainOutput(global_step=239, training_loss=0.4905975852551321, metrics={'train_runtime': 40.1595, 'train_samples_per_second': 95.17, 'train_steps_per_second': 5.951, 'total_flos': 849874846117032.0, 'train_loss': 0.4905975852551321, 'epoch': 1.0})

Unfreeze all layers and train the whole model

In [9]:
for name, layer in model.named_parameters():
    layer.requires_grad = True

In [10]:
training_args = TrainingArguments(
    output_dir=model_path,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    #save_strategy="epoch",
    save_strategy="no",
    seed=random_seed,
    logging_strategy="epoch",
    run_name=run_name,
    #metric_for_best_model="token_f1",
    #greater_is_better=True,
    #load_best_model_at_end=True,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics(dataset_blueprint),
)

In [11]:
torch.cuda.empty_cache()

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,0.4453,0.383412,0.482204,0.802215,0.3447,0.052706,0.041512,0.072166,0.814804
2,0.3836,0.342259,0.604763,0.815612,0.480536,0.068496,0.056036,0.088082,0.842869
3,0.3379,0.304045,0.693574,0.795777,0.614635,0.08417,0.068184,0.10995,0.864133


TrainOutput(global_step=717, training_loss=0.38892795451016604, metrics={'train_runtime': 242.2442, 'train_samples_per_second': 47.332, 'train_steps_per_second': 2.96, 'total_flos': 2545105941702384.0, 'train_loss': 0.38892795451016604, 'epoch': 3.0})

5. Save weights

In [13]:
trainer.save_model(MODELS_FOLDER / (run_name+'no_exclude_tail'))
tokenizer.save_pretrained(MODELS_FOLDER / (run_name+'no_exclude_tail'))

('/home/melal/Workspace/unlp-2025-manipulation-detector/models/e-2025-03-18_21-51-54no_exclude_tail/tokenizer_config.json',
 '/home/melal/Workspace/unlp-2025-manipulation-detector/models/e-2025-03-18_21-51-54no_exclude_tail/special_tokens_map.json',
 '/home/melal/Workspace/unlp-2025-manipulation-detector/models/e-2025-03-18_21-51-54no_exclude_tail/tokenizer.json')

6. Submission generation

In [15]:
tokenizer = AutoTokenizer.from_pretrained(
    MODELS_FOLDER / (run_name + "no_exclude_tail"),
)
model = AutoModelForTokenClassification.from_pretrained(
    MODELS_FOLDER / (run_name + "no_exclude_tail"),
).to(device)

In [16]:
import pandas as pd
submission_df = pd.read_csv(RAW_DATA_FOLDER / 'test.csv')
submission_df.head()

Unnamed: 0,id,content
0,521cd2e8-dd9f-42c4-98ba-c0c8890ff1ba,"Они просрали нашу технику, положили кучу людей..."
1,9b2a61e4-d14e-4ff7-b304-e73d720319bf,❗️\nКитай предлагает отдать оккупированные тер...
2,f0f1c236-80a8-4d25-b30c-a420a39be632,Сегодня будет ровно 6 месяцев с этого обещания...
3,31ea05ba-2c2b-4b84-aba7-f3cf6841b204,⚡️\nІзраїль вперше у світі збив балістичну рак...
4,a79e13ec-6d9a-40b5-b54c-7f4f743a7525,Склав невелику навчально-методичну таблицю на ...


In [17]:
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

Device set to use cuda:0


In [18]:
test_spans = {}
for i, row in submission_df.iterrows():
    res = nlp(row['content'])
    res_manipulation = [r for r in res if r['entity_group'] == 'MANIPULATION']
    res_manipulation_spans = [(r['start'], r['end']) for r in res_manipulation]
    test_spans[row['id']] = res_manipulation_spans

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [19]:
test_spans_df = pd.DataFrame(test_spans.items(), columns=['id', 'trigger_words'])

In [20]:
test_spans_df.head()

Unnamed: 0,id,trigger_words
0,521cd2e8-dd9f-42c4-98ba-c0c8890ff1ba,"[(0, 253)]"
1,9b2a61e4-d14e-4ff7-b304-e73d720319bf,"[(374, 428)]"
2,f0f1c236-80a8-4d25-b30c-a420a39be632,"[(48, 126)]"
3,31ea05ba-2c2b-4b84-aba7-f3cf6841b204,[]
4,a79e13ec-6d9a-40b5-b54c-7f4f743a7525,"[(87, 103), (127, 136), (142, 162), (170, 255)..."


In [22]:
test_spans_df.to_csv(REPORTS_FOLDER / (run_name + "no_exclude_tail" + ".csv"), index=False)

#### Additional: checking metrics on uk\ru parts

In [None]:
dataset_blueprint = ManipulationDetectionDataset(
    tokenizer=tokenizer,
    raw_path=RAW_DATA_FOLDER / "span-detection.parquet",
    processed_path=PROCESSED_DATA_FOLDER / "span-detection",
    seed=random_seed,
    do_split=False,
    exclude_tail=False,
    lang='ru',
)

dataset = dataset_blueprint.read()
dataset

In [None]:
#uk
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics(dataset_blueprint),
)

trainer.evaluate()

In [None]:
#ru
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics(dataset_blueprint),
)

trainer.evaluate()