# Base Span Detection implementation

1. Import dependencies

In [1]:
import random
import time

import numpy as np
import torch
import os

from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

from src.util.torch_device import resolve_torch_device
from src.data.span_detection_ds import ManipulationDetectionDataset
from src.visualization.plot import plot_loss, plot_model_progress, plot_eval_loss
from src.definitions import (
    MODELS_FOLDER,
    RAW_DATA_FOLDER,
    REPORTS_FOLDER,
    PROCESSED_DATA_FOLDER,
)
from src.visualization.ner import MarkdownVisualizer, VisualizationMode
from src.visualization.reporting import EvaluatingReport
from src.model.span_detection_metrics import compute_metrics

2. Prepare Env

In [None]:
random_seed = 42

random.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

device = resolve_torch_device()

model_checkpoint = MODELS_FOLDER / "ua-fine-tuned-xlm-roberta-large"

epoch_time = int(time.time())

os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

3. Load dataset

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

dataset_blueprint = ManipulationDetectionDataset(
    tokenizer=tokenizer,
    raw_path=RAW_DATA_FOLDER / "span-detection.parquet",
    processed_path=PROCESSED_DATA_FOLDER / "span-detection" / model_checkpoint,
    seed=random_seed,
)

dataset = dataset_blueprint.read()

Saving the dataset (0/1 shards):   0%|          | 0/3439 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/383 [00:00<?, ? examples/s]

3. Prepare model

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(dataset_blueprint.label2id),
    id2label=dataset_blueprint.id2label,
    label2id=dataset_blueprint.label2id,
    classifier_dropout=0.1,
).to(device)

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir=MODELS_FOLDER / "span-detection-checkpoint",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    seed=random_seed,
    logging_steps=200,
    auto_find_batch_size=True,
    torch_empty_cache_steps=1000,
    metric_for_best_model="token_f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics(dataset_blueprint),
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at EvanD/xlm-roberta-base-ukrainian-ner-ukrner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4. Train model

In [5]:
trainer.train()

Epoch,Training Loss,Validation Loss,Token F1,Token Precision,Token Recall,Span F1,Span Precision,Span Recall,Accuracy
1,0.4797,0.419363,0.4732,0.694106,0.358959,0.063702,0.054415,0.076812,0.785138
2,0.4244,0.404542,0.558379,0.692744,0.46767,0.083444,0.076829,0.091304,0.801128
3,0.3827,0.407523,0.584613,0.673151,0.516658,0.116556,0.107317,0.127536,0.802621
4,0.3396,0.441903,0.605496,0.636055,0.577739,0.121715,0.116402,0.127536,0.797612
5,0.307,0.457713,0.59353,0.657224,0.541091,0.13,0.128169,0.131884,0.800763


TrainOutput(global_step=1075, training_loss=0.37985066081202307, metrics={'train_runtime': 326.3123, 'train_samples_per_second': 52.695, 'train_steps_per_second': 3.294, 'total_flos': 3837166858893876.0, 'train_loss': 0.37985066081202307, 'epoch': 5.0})

In [6]:
evaluation_feedback = trainer.evaluate()

In [7]:
report_file = REPORTS_FOLDER / "span-detection" / "train-report.csv"

report = EvaluatingReport(report_file)

report.write_to_report(evaluation_feedback, epoch_time)

full_report = report.read_report()

full_report.tail(1)

KeyError: 'eval_precision'

In [None]:
plot_loss(trainer)

In [None]:
plot_eval_loss(trainer)

In [None]:
plot_model_progress(full_report)

5. Save weights

In [None]:
trainer.save_model(MODELS_FOLDER / "span-detection-model")
tokenizer.save_pretrained(MODELS_FOLDER / "span-detection-model")

6. Test

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    MODELS_FOLDER / "span-detection-model"
)
tokenizer = AutoTokenizer.from_pretrained(
    MODELS_FOLDER / "span-detection-model"
)

In [None]:
test_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

test_result = test_pipeline(dataset["test"]["content"])

In [None]:
visualizer = MarkdownVisualizer(
    tokenizer=tokenizer,
    path=REPORTS_FOLDER
    / "span-detection"
    / "test-visualization"
    / f"test-{epoch_time}.md",
    visualization_mode=VisualizationMode.ROBERTA,
)

visualizer.visualize_as_markdown_and_save(dataset["test"], test_result)

In [None]:
# [it for it in test_result if len(it) > 0]

In [None]:
# test = dataset["test"][1]

# print(test["content"][0:133])
# print(test["content"][135:250])
# print()
# print(test["content"])
# print(test["labels"])

In [None]:
# dataset["train"][1]

In [None]:
# content = dataset["train"][1]["content"]
# trigger_words = dataset["train"][1]["trigger_words"]

# for it in trigger_words:
#     print(content[it[0]: it[1]])

In [None]:
# labels = dataset["train"][1]['labels']
# input_ids = dataset["train"][1]["input_ids"]

# sub_str = []

# for i in range(len(labels)):
#     if labels[i] == 1:
#         sub_str.append(input_ids[i])

# tokenizer.decode(sub_str)

In [None]:
# tokenizer.convert_ids_to_tokens(dataset["test"][1]["input_ids"])

In [None]:
# tokenizer.decode(dataset["train"][1]["input_ids"])

In [None]:
# from datasets import ClassLabel, Sequence

# dataset["train"].features["labels"] = Sequence(feature=ClassLabel(names=["O", "I-MANIPULATION"]), length=-1, id=None)

# dataset["train"].features["labels"]

In [None]:
# len(dataset["train"]) / 16