# Base Span Detection implementation

1. Import dependencies

In [1]:
import random
import time
import torch
import os

import pandas as pd
import numpy as np

from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

from src.util.torch_device import resolve_torch_device
from src.data.span_detection_ds import ManipulationDetectionDataset
from src.definitions import (
    MODELS_FOLDER,
    RAW_DATA_FOLDER,
    SUBMISSIONS_FOLDER,
    PROCESSED_DATA_FOLDER,
)
from src.data.kaggle import submit_df_competition
from src.model.span_detection_metrics import compute_metrics

2. Prepare environment

In [2]:
# Seed every RNG used in this notebook (Python, NumPy, PyTorch) with one value
# so that a fresh "Restart & Run All" repeats the same run.
random_seed = 42

random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Set the MPS allocator option before the device is resolved, so the value is
# already in the environment whenever the backend gets initialised.
# "0.0" disables the MPS high-watermark memory limit (see the PyTorch MPS
# environment-variable docs); it has no effect on non-MPS devices.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

device = resolve_torch_device()

# Unix timestamp (whole seconds) taken when this cell runs.
epoch_time = int(time.time())

In [3]:
# Base checkpoint that gets a token-classification head in this notebook.
model_checkpoint = MODELS_FOLDER / "ru-fine-tuned-FacebookAI-xlm-roberta-base"

# Name of the resulting model: "span-detection-" + the checkpoint folder name.
# `.name` replaces `str(...).split("/")[-1]`: it does not hard-code the path
# separator and avoids nesting double quotes inside a double-quoted f-string,
# which is a syntax error before Python 3.12.
# NOTE(review): assumes MODELS_FOLDER is a pathlib.Path — confirm in src.definitions.
result_model = f"span-detection-{model_checkpoint.name}"

# Hyperparameters; they are also reported in the submission message.
classifier_dropout = 0.1
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 5

# Free-text description attached to the submission
# (fixed model-name typo: "xml" -> "xlm").
submission_desc = (
    "Use xlm-roberta-base fine tuned on lenta-ru dataset as base model"
)

3. Load dataset

In [4]:
# Tokenizer that ships with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Locations of the raw parquet file and of the processed dataset.
raw_span_path = RAW_DATA_FOLDER / "span-detection.parquet"
processed_span_path = PROCESSED_DATA_FOLDER / "span-detection"

# do_split=False: a single dataset is read back, without a train/validation split.
dataset_blueprint = ManipulationDetectionDataset(
    tokenizer=tokenizer,
    raw_path=raw_span_path,
    processed_path=processed_span_path,
    seed=random_seed,
    do_split=False,
)

dataset = dataset_blueprint.read()

Saving the dataset (0/1 shards):   0%|          | 0/3822 [00:00<?, ? examples/s]

3. Prepare model

In [5]:
# Label vocabulary comes from the dataset blueprint.
label2id = dataset_blueprint.label2id
id2label = dataset_blueprint.id2label

# Base checkpoint plus a token-classification head, moved to the resolved device.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    classifier_dropout=classifier_dropout,
).to(device)

data_collator = DataCollatorForTokenClassification(tokenizer)

# Intermediate checkpoints are written next to the final model folder.
checkpoint_dir = MODELS_FOLDER / f"{result_model}-checkpoint"

# Evaluate and save once per epoch; the best checkpoint by "token_f1"
# (higher is better) is reloaded at the end of training.
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    seed=random_seed,
    # optimisation
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=num_train_epochs,
    # batching / memory
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    auto_find_batch_size=True,
    torch_empty_cache_steps=1000,
    # evaluation, checkpointing and logging
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="token_f1",
    greater_is_better=True,
)

# NOTE(review): eval_dataset is the same object as train_dataset, so the
# validation metrics (and the "best" checkpoint picked from them) are measured
# on training data, not on held-out data — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics(dataset_blueprint),
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at /home/melal/Workspace/unlp-2025-manipulation-detector/models/ru-fine-tuned-FacebookAI-xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4. Train model

In [6]:
# Release cached, unused CUDA memory before training starts.
torch.cuda.empty_cache()

# Keep the training summary in a variable and show it as the cell output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,0.4654,0.407834,0.441347,0.785422,0.306901,0.067712,0.062359,0.074069,0.798696
2,0.4068,0.357435,0.584508,0.80515,0.458783,0.104261,0.099051,0.110049,0.831005
3,0.3668,0.3192,0.65761,0.850615,0.535993,0.133477,0.136211,0.13085,0.855388
4,0.334,0.248065,0.79651,0.801836,0.791254,0.194329,0.186172,0.203233,0.895248
5,0.2977,0.238307,0.792555,0.848946,0.743189,0.200099,0.201338,0.198876,0.899199


TrainOutput(global_step=1195, training_loss=0.35663402349879053, metrics={'train_runtime': 432.0701, 'train_samples_per_second': 44.229, 'train_steps_per_second': 2.766, 'total_flos': 4242022197230592.0, 'train_loss': 0.35663402349879053, 'epoch': 5.0})

5. Save weights

In [7]:
# Model weights and tokenizer files go into the same output folder.
result_model_dir = MODELS_FOLDER / result_model

trainer.save_model(result_model_dir)
tokenizer.save_pretrained(result_model_dir)

('/home/melal/Workspace/unlp-2025-manipulation-detector/models/span-detection-ru-fine-tuned-FacebookAI-xlm-roberta-base/tokenizer_config.json',
 '/home/melal/Workspace/unlp-2025-manipulation-detector/models/span-detection-ru-fine-tuned-FacebookAI-xlm-roberta-base/special_tokens_map.json',
 '/home/melal/Workspace/unlp-2025-manipulation-detector/models/span-detection-ru-fine-tuned-FacebookAI-xlm-roberta-base/tokenizer.json')

6. Test

In [8]:
# Reload model and tokenizer from the saved folder so inference uses the files on disk.
saved_model_dir = MODELS_FOLDER / result_model

model = AutoModelForTokenClassification.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

In [9]:
# Test set to predict on; later cells read its "id" and "content" columns.
test_csv_path = RAW_DATA_FOLDER / "test.csv"
submission_df = pd.read_csv(test_csv_path)

In [10]:
# Token-classification ("ner") pipeline over the reloaded model.
# aggregation_strategy="simple" makes the pipeline return grouped entities
# (with an "entity_group" key and character offsets) instead of single tokens.
nlp = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

Device set to use cuda:0


In [11]:
# Run the pipeline once over all texts in batches, instead of one call per
# DataFrame row: this removes the slow `iterrows()` loop and the
# "pipelines sequentially on GPU" warning.
inference_batch_size = 16

test_ids = submission_df["id"].tolist()
test_contents = submission_df["content"].tolist()

# For a list of texts the pipeline returns one list of entities per text, in
# input order. An empty test set skips the pipeline call.
test_predictions = (
    nlp(test_contents, batch_size=inference_batch_size) if test_contents else []
)

# id -> list of (start, end) character offsets of every MANIPULATION entity.
test_spans = {
    test_id: [
        (entity["start"], entity["end"])
        for entity in entities
        if entity["entity_group"] == "MANIPULATION"
    ]
    for test_id, entities in zip(test_ids, test_predictions)
}

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [12]:
# One row per test id; "trigger_words" holds that id's list of (start, end) spans.
span_records = list(test_spans.items())
test_spans_df = pd.DataFrame(span_records, columns=["id", "trigger_words"])

7. Submit

In [13]:
# Hyperparameters reported in the submission message, in display order.
reported_params = {
    "classifier_dropout": classifier_dropout,
    "learning_rate": learning_rate,
    "weight_decay": weight_decay,
    "num_train_epochs": num_train_epochs,
}
submission_params_str = ", ".join(
    f"{param_name} = {param_value}"
    for param_name, param_value in reported_params.items()
)
message = f"[ {submission_params_str} ] {submission_desc}"

submission_path = SUBMISSIONS_FOLDER / "span-detection" / f"{result_model}.csv"

# NOTE(review): the recorded run of this cell ended with a Kaggle
# ApiException (400): the team's daily submission allowance (5) was used up.
submit_df_competition(
    test_spans_df,
    submission_path,
    message,
    "unlp-2025-shared-task-span-identification",
)



100%|██████████| 425k/425k [00:00<00:00, 445kB/s] 
  """Creates (aka \&quot;drops\&quot;) a new file into the inbox.  # noqa: E501
  """Creates (aka \&quot;drops\&quot;) a new file into the inbox.  # noqa: E501
  sub_kls = re.match('list\[(.*)\]', klass).group(1)
  sub_kls = re.match('dict\(([^,]*), (.*)\)', klass).group(2)


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'Date': 'Thu, 27 Mar 2025 12:11:32 GMT', 'Access-Control-Allow-Credentials': 'true', 'Access-Control-Allow-Origin': '*', 'Set-Cookie': 'ka_sessionid=7a3bb3c7a87b7c2faefe8268cf9e75f7; max-age=2626560; path=/, GCLB=CIGomP6zuYr5XxAD; path=/; HttpOnly', 'Vary': 'Accept-Encoding', 'X-Kaggle-MillisecondsElapsed': '289', 'X-Kaggle-RequestId': '2fdc9861d718e1d156a2fff6db965a89', 'X-Kaggle-ApiVersion': '1.7.4.2', 'X-Kaggle-HubVersion': '0.3.10', 'X-Frame-Options': 'SAMEORIGIN', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload', 'Content-Security-Policy': "object-src 'none'; script-src 'nonce-nArfxsZODovMGtJBCnp4wg==' 'report-sample' 'unsafe-inline' 'unsafe-eval' 'strict-dynamic' https: http:; base-uri 'none'; report-uri https://csp.withgoogle.com/csp/kaggle/20201130; frame-src 'self' https://www.kaggleusercontent.com https://www.youtube.com/embed/ https://polygraph-cool.github.io https://www.google.com/recaptcha/ https://www.docdroid.com https://www.docdroid.net https://kaggle-static.storage.googleapis.com https://kkb-production.jupyter-proxy.kaggle.net https://kkb-production.firebaseapp.com https://kaggle-metastore.firebaseapp.com https://apis.google.com https://content-sheets.googleapis.com/ https://accounts.google.com/ https://storage.googleapis.com https://docs.google.com https://drive.google.com https://calendar.google.com/ https://google.qualtrics.com/ ;", 'X-Content-Type-Options': 'nosniff', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'Transfer-Encoding': 'chunked'})
HTTP response body: {"code":400,"message":"Submission not allowed:  Your team has used its daily Submission allowance (5) today, please try again tomorrow UTC (11 hours from now)."}
