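# Base Span Detection implementation

Fine-tunes a token-classification model to detect manipulation spans, predicts spans for the competition test set, and submits the result to the Kaggle competition.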

1. Import dependencies

In [1]:
import random
import time
import torch
import os

import pandas as pd
import numpy as np

from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

from src.util.torch_device import resolve_torch_device
from src.data.span_detection_ds import ManipulationDetectionDataset
from src.definitions import (
    MODELS_FOLDER,
    RAW_DATA_FOLDER,
    SUBMISSIONS_FOLDER,
    PROCESSED_DATA_FOLDER,
)
from src.data.kaggle import submit_df_competition
from src.model.span_detection_metrics import compute_metrics

2. Prepare Env

In [2]:
# One seed for every RNG this notebook touches (stdlib, PyTorch, NumPy), so a
# fresh-kernel run starts from the same random state each time.
random_seed = 42
for seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    seed_rng(random_seed)

# Device the model is moved to before training; chosen by the project helper.
device = resolve_torch_device()

# Wall-clock time at setup, in whole seconds since the Unix epoch.
epoch_time = int(time.time())

In [3]:
# Checkpoint the span detector is fine-tuned from.
# NOTE(review): this is a relative path, so it resolves against the kernel's
# working directory — confirm the notebook is launched from the repo root.
model_checkpoint = "models/train-test-fine-tuned-models-ru-fine-tuned-FacebookAI-xlm-roberta-base-3-checkpoint/checkpoint-500"

# Name used for the saved model folder and the submission file: the last path
# component of the checkpoint, prefixed with "span-detection-".
# Single quotes inside the replacement field keep this line valid on Python
# versions before 3.12, which reject reusing the outer quote inside an f-string.
result_model = f"span-detection-{model_checkpoint.rsplit('/', 1)[-1]}"

# Hyperparameters a reader is expected to tune; they are also echoed into the
# submission message so each submission records how it was produced.
classifier_dropout = 0.1
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 5

# Free-text description attached to the submission ("xlm", not "xml": the
# base model is XLM-RoBERTa, as the checkpoint name shows).
submission_desc = "Use xlm-roberta-base fine tuned on train-test dataset"

3. Load dataset

In [4]:
# The tokenizer is loaded from the same checkpoint the model starts from.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Input parquet and the location handed to the dataset as its processed path
# (presumably where the tokenized dataset is cached — see the
# "Saving the dataset" output; verify in ManipulationDetectionDataset).
span_raw_path = RAW_DATA_FOLDER / "span-detection.parquet"
span_processed_path = PROCESSED_DATA_FOLDER / "span-detection"

# do_split=False: no split is requested here; the single object returned by
# read() is what the Trainer receives later.
dataset_blueprint = ManipulationDetectionDataset(
    tokenizer=tokenizer,
    raw_path=span_raw_path,
    processed_path=span_processed_path,
    seed=random_seed,
    do_split=False,
)
dataset = dataset_blueprint.read()

Map:   0%|          | 0/3822 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3822 [00:00<?, ? examples/s]

3. Prepare model

In [5]:
# Label mappings come from the dataset blueprint so the classification head
# and the dataset agree on label ids.
label2id = dataset_blueprint.label2id
id2label = dataset_blueprint.id2label

# Load the checkpoint with a token-classification head sized to the label set.
# The head weights are newly initialised (see the warning in the cell output),
# so the model must be trained before use.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    classifier_dropout=classifier_dropout,
)
model = model.to(device)

# Collator that batches tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# bfloat16 mixed precision only works on CUDA devices that support it.
# Enabling it unconditionally makes TrainingArguments fail on other hardware,
# so turn it on only when the current device can actually use it.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the highest token-level F1 when training finishes.
training_args = TrainingArguments(
    output_dir=MODELS_FOLDER / f"{result_model}-checkpoint",
    learning_rate=learning_rate,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    # eval and save strategies must match for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    seed=random_seed,
    metric_for_best_model="token_f1",
    greater_is_better=True,
    bf16=use_bf16,
)

# Wire the model, data, collator and metric function into a Trainer.
# NOTE(review): train_dataset and eval_dataset are the same object, so the
# per-epoch metrics and the best-checkpoint selection (load_best_model_at_end
# with metric_for_best_model="token_f1") are measured on the training data.
# Confirm this is intended for a final "train on everything" run rather than
# a missing hold-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    # compute_metrics is called here with the dataset blueprint; whatever it
    # returns is what the Trainer receives as its metrics callback.
    compute_metrics=compute_metrics(dataset_blueprint),
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at models/train-test-fine-tuned-models-ru-fine-tuned-FacebookAI-xlm-roberta-base-3-checkpoint/checkpoint-500 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4. Train model

In [6]:
# Release cached CUDA memory held by PyTorch's allocator before training starts.
torch.cuda.empty_cache()

# Run fine-tuning. As the cell's last expression, the returned TrainOutput
# (global step, training loss, runtime metrics) is displayed below the cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,No log,0.406484,0.440205,0.794185,0.304489,0.06778,0.06105,0.076177,0.79935
2,No log,0.353827,0.580792,0.815605,0.45096,0.099691,0.093673,0.106535,0.831329
3,0.422900,0.316817,0.658575,0.85538,0.535393,0.135411,0.138412,0.132537,0.856168
4,0.422900,0.24381,0.794531,0.823727,0.767334,0.195642,0.189207,0.20253,0.897172
5,0.318100,0.235146,0.791281,0.860366,0.732466,0.194058,0.19487,0.193254,0.899882


TrainOutput(global_step=1195, training_loss=0.353197648435457, metrics={'train_runtime': 293.3885, 'train_samples_per_second': 65.135, 'train_steps_per_second': 4.073, 'total_flos': 4242022197230592.0, 'train_loss': 0.353197648435457, 'epoch': 5.0})

5. Save weights

In [7]:
# Write the trained weights and the tokenizer into the same folder so that
# folder can later be loaded with from_pretrained.
final_model_dir = MODELS_FOLDER / result_model
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('/home/melal/Workspace/unlp-2025-manipulation-detector/models/span-detection-checkpoint-500/tokenizer_config.json',
 '/home/melal/Workspace/unlp-2025-manipulation-detector/models/span-detection-checkpoint-500/special_tokens_map.json',
 '/home/melal/Workspace/unlp-2025-manipulation-detector/models/span-detection-checkpoint-500/tokenizer.json')

6. Test

In [8]:
# Reload model and tokenizer from the saved model folder, so inference runs
# on exactly what was persisted rather than on the in-memory training objects.
saved_model_dir = MODELS_FOLDER / result_model
model = AutoModelForTokenClassification.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

In [9]:
# Competition test set; the inference cell reads its "id" and "content" columns.
test_csv_path = RAW_DATA_FOLDER / "test.csv"
submission_df = pd.read_csv(test_csv_path)

In [10]:
# Token-classification ("ner") pipeline around the reloaded model.
# With aggregation_strategy="simple" each prediction is a grouped entity; the
# inference cell relies on its "entity_group", "start" and "end" fields.
ner_pipeline_options = {
    "model": model,
    "tokenizer": tokenizer,
    "aggregation_strategy": "simple",
}
nlp = pipeline("ner", **ner_pipeline_options)

Device set to use cuda:0


In [11]:
# Predict manipulation spans for every test document.
#
# The whole column is handed to the pipeline in a single call instead of one
# call per row: the pipeline then iterates the inputs itself, which is the
# usage the "pipelines sequentially on GPU" warning asks for. Pass
# batch_size=... to nlp() if larger batches are wanted.
row_ids = submission_df["id"].tolist()
texts = submission_df["content"].tolist()

# Guard the empty case so an empty test file yields an empty mapping instead
# of depending on how the pipeline treats an empty list.
predictions = nlp(texts) if texts else []

# id -> list of (start, end) character offsets, keeping only entities the
# model grouped under the MANIPULATION label. If an id repeats, the last
# occurrence wins (same as the previous per-row assignment).
test_spans = {
    row_id: [
        (entity["start"], entity["end"])
        for entity in entities
        if entity["entity_group"] == "MANIPULATION"
    ]
    for row_id, entities in zip(row_ids, predictions)
}

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [12]:
# One row per test document: its id and the list of predicted (start, end) spans.
span_records = list(test_spans.items())
test_spans_df = pd.DataFrame(span_records, columns=["id", "trigger_words"])

7. Submit

In [13]:
# Hyperparameters echoed into the submission message, in display order.
hyperparameters = {
    "classifier_dropout": classifier_dropout,
    "learning_rate": learning_rate,
    "weight_decay": weight_decay,
    "num_train_epochs": num_train_epochs,
}
submission_params_str = ", ".join(
    f"{name} = {value}" for name, value in hyperparameters.items()
)

# Message format: "[ <hyperparameters> ] <free-text description>".
message = f"[ {submission_params_str} ] {submission_desc}"

# The submission file is named after the trained model.
submission_path = SUBMISSIONS_FOLDER / "span-detection" / f"{result_model}.csv"

# NOTE(review): this submits to the competition every time the cell runs —
# presumably it also writes the CSV to submission_path; verify in
# submit_df_competition.
submit_df_competition(
    test_spans_df, submission_path, message, "unlp-2025-shared-task-span-identification"
)



100%|██████████| 420k/420k [00:01<00:00, 373kB/s] 
