# Base BERT implementation

1. Import dependencies

In [1]:
import os
import random
import sys
import time

import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
)

# Put the parent directory on sys.path so the `src.*` imports below resolve.
sys.path.append('..')
from src.util.torch_device import resolve_torch_device
from src.data.span_detection_ds import ManipulationDetectionDataset
from src.definitions import (
    MODELS_FOLDER,
    RAW_DATA_FOLDER,
    PROCESSED_DATA_FOLDER,
)
from src.model.span_detection_metrics import compute_metrics

Using the latest cached version of the module from C:\Users\Vitalii\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--seqeval\541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Sat Feb  8 15:26:43 2025) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


2. Prepare Env

In [None]:
# Single seed value reused for the Python, PyTorch and NumPy RNGs below.
random_seed = 42

# Seed each library in turn so stochastic steps start from the same state.
for seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    seed_rng(random_seed)

# Device handle is resolved by the project helper.
device = resolve_torch_device()

# Identifier of the pretrained checkpoint used by the tokenizer and the models.
model_checkpoint = "distilbert/distilbert-base-multilingual-cased"

# Wall-clock timestamp (whole seconds) captured when this cell runs.
epoch_time = int(time.time())

# Disabled environment tweak, kept for reference:
#os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

3. Load dataset

In [None]:
# Loads the checkpoint without a custom config (no label mapping is passed here).
# NOTE(review): the Trainer further down is constructed with `model_init` and no
# `model=` argument, so this object looks unused in the visible notebook — confirm
# and consider dropping the cell.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

In [7]:
# Tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Locations handed to the dataset helper as `raw_path` / `processed_path`.
raw_parquet_path = RAW_DATA_FOLDER / "span-detection.parquet"
processed_dataset_path = PROCESSED_DATA_FOLDER / "span-detection"

# The blueprint also exposes `label2id` / `id2label`, which the model config uses later.
dataset_blueprint = ManipulationDetectionDataset(
    tokenizer=tokenizer,
    raw_path=raw_parquet_path,
    processed_path=processed_dataset_path,
    seed=random_seed,
)

# Materialise the dataset and display its summary as the cell output.
dataset = dataset_blueprint.read()
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3439
    })
    test: Dataset({
        features: ['id', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 383
    })
})

3.1 Testing encoding

In [8]:
def encode_labels(data, verbose=True):
    """Tokenize a batch of texts and derive per-token labels from character spans.

    Uses the module-level ``tokenizer``. A token is labelled 1 when its character
    offsets lie entirely inside one of the example's ``trigger_words`` spans and
    0 otherwise. Afterwards, every position whose word id is ``None`` or equal to
    the previous position's word id is overwritten with -100, so only the first
    sub-token of each word keeps a 0/1 label.

    Args:
        data: Batch-style mapping with a ``"content"`` list of strings and a
            parallel ``"trigger_words"`` list; each entry is a sequence of
            ``(start, end)`` character offsets, or ``None`` for "no spans".
        verbose: When true (the default), print ``token_id offset label token``
            for every token of every example, for manual inspection.

    Returns:
        The tokenizer output with a ``"labels"`` entry added and the
        ``"offset_mapping"`` entry removed.
    """
    tokenized_inputs = tokenizer(
        data["content"],
        truncation=True,
        return_offsets_mapping=True,
    )

    labels = []
    for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
        spans = data["trigger_words"][i]
        if spans is None:
            spans = []

        # 1 when the token sits fully inside any annotated span, else 0.
        example_labels = [
            int(any(start <= tok_start and tok_end <= end for start, end in spans))
            for tok_start, tok_end in offsets
        ]

        # Mask positions without a word id and continuation sub-tokens of a word.
        previous_word_id = None
        for j, word_id in enumerate(tokenized_inputs.word_ids(i)):
            if word_id is None or word_id == previous_word_id:
                example_labels[j] = -100
            previous_word_id = word_id

        labels.append(example_labels)

    if verbose:
        for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
            rows = zip(
                tokenized_inputs["input_ids"][i],
                offsets,
                labels[i],
                tokenized_inputs.tokens(i),
            )
            for token_id, offset, label, token in rows:
                print(token_id, offset, label, token)

    tokenized_inputs["labels"] = labels

    # Offsets were only needed to align spans with tokens.
    del tokenized_inputs["offset_mapping"]

    return tokenized_inputs


In [9]:
# Load the raw parquet file directly (as a single "train" split) so the untouched
# records can be inspected. `load_dataset` is imported with the other dependencies
# in the first cell.
original = load_dataset(
    "parquet",
    split="train",
    data_files=str(RAW_DATA_FOLDER / "span-detection.parquet"),
)
original

Dataset({
    features: ['id', 'content', 'lang', 'manipulative', 'techniques', 'trigger_words'],
    num_rows: 3822
})

In [10]:
# Display the first raw record, including its `trigger_words` character spans.
original[0]

{'id': '0bb0c7fa-101b-4583-a5f9-9d503339141c',
 'content': 'Новий огляд мапи DeepState від російського військового експерта, кухара путіна 2 розряду, спеціаліста по снарядному голоду та ректора музичної академії міноборони рф Євгєнія Пригожина. \nПригожин прогнозує, що невдовзі настане день звільнення Криму і день розпаду росії. Каже, що передумови цього вже створені. \n*Відео взяли з каналу \nФД\n. \n@informnapalm',
 'lang': 'uk',
 'manipulative': True,
 'techniques': ['euphoria', 'loaded_language'],
 'trigger_words': [[27, 63], [65, 88], [90, 183], [186, 308]]}

In [38]:
# Run the encoder on a one-record slice and print its token/label alignment.
# Presumably the slice yields a dict of lists, which is what `encode_labels`
# indexes (`data["trigger_words"][i]`) — TODO confirm.
encoded = encode_labels(original[:1])

101 (0, 0) -100 [CLS]
100325 (0, 5) 0 Новий
555 (6, 7) 0 о
41824 (7, 11) -100 ##гляд
97744 (12, 14) 0 ма
20785 (14, 16) -100 ##пи
18891 (17, 21) 0 Deep
10731 (21, 22) -100 ##S
20359 (22, 26) -100 ##tate
11141 (27, 30) 1 від
28171 (31, 34) 1 рос
44033 (34, 42) -100 ##ійського
90602 (43, 54) 1 військового
546 (55, 56) 1 е
18705 (56, 58) -100 ##кс
29633 (58, 61) -100 ##пер
10367 (61, 63) -100 ##та
117 (63, 64) 0 ,
551 (65, 66) 1 к
88081 (66, 69) -100 ##уха
11079 (69, 71) -100 ##ра
38675 (72, 75) 1 пут
30487 (75, 78) -100 ##іна
123 (79, 80) 1 2
557 (81, 82) 1 р
44666 (82, 84) -100 ##оз
80367 (84, 88) -100 ##ряду
117 (88, 89) 0 ,
558 (90, 91) 1 с
19820 (91, 93) -100 ##пе
12167 (93, 95) -100 ##ці
26983 (95, 98) -100 ##алі
15535 (98, 101) -100 ##ста
10297 (102, 104) 1 по
558 (105, 106) 1 с
37235 (106, 109) -100 ##нар
35528 (109, 111) -100 ##яд
15575 (111, 115) -100 ##ному
92178 (116, 120) 1 голо
15986 (120, 122) -100 ##ду
10475 (123, 125) 1 та
70158 (126, 129) 1 рек
24425 (129, 133) -100 ##то

3. Prepare model

In [None]:
# Directory passed to the Trainer as `output_dir`.
checkpoint_dir = MODELS_FOLDER / "manipulation-detector-bert-ner-checkpoint"

# `learning_rate` and `weight_decay` are starting values only: `model_init`
# overwrites both on this object whenever it receives a trial.
training_args = TrainingArguments(
    output_dir=str(checkpoint_dir),
    seed=random_seed,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=20,
    # Batching / memory
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    auto_find_batch_size=True,
    torch_empty_cache_steps=1000,
    # Evaluation, checkpointing and logging cadence
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=200,
    # Best-checkpoint selection: higher "f1" wins
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Model configuration for the checkpoint, with the label vocabulary taken from
# `dataset_blueprint`. `model_init` overwrites "dropout" on this same object for
# every hyperparameter-search trial, so 0.6 only applies when no trial is passed.
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=len(dataset_blueprint.label2id),
    id2label=dataset_blueprint.id2label,
    label2id=dataset_blueprint.label2id,
    dropout=0.6,
)


def optuna_hp_space(trial):
    """Return an empty search space for the Optuna backend.

    ``trial`` is ignored here; the hyperparameters are sampled inside
    ``model_init`` instead.
    """
    search_space = dict()
    return search_space


def model_init(trial):
    """Build a fresh token-classification model, sampling hyperparameters on request.

    When ``trial`` is truthy, three values are drawn from it in this order:
    ``dropout`` is written into the shared ``config``, then ``learning_rate`` and
    ``weight_decay`` are written onto the shared ``training_args``. With a falsy
    ``trial`` nothing is sampled and both objects are left as they are.

    Returns:
        The result of ``AutoModelForTokenClassification.from_pretrained`` for
        ``model_checkpoint`` with the (possibly updated) ``config``.
    """
    if trial:
        sampled_dropout = trial.suggest_float("dropout", 0.1, 0.8, log=True)
        config.update({"dropout": sampled_dropout})

        sampled_lr = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
        training_args.learning_rate = sampled_lr

        sampled_decay = trial.suggest_float("weight_decay", 0.01, 0.9, log=True)
        training_args.weight_decay = sampled_decay

    fresh_model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        config=config,
    )
    return fresh_model


# Collator built from the tokenizer and handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

# `compute_metrics` is called with the blueprint; its return value is what the
# Trainer receives as the metrics callback.
metrics_fn = compute_metrics(dataset_blueprint)

# No `model=` is passed: the Trainer obtains its model from `model_init`.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=metrics_fn,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4. Run HP search

In [5]:
def compute_objective(metrics):
    """Return the ``"eval_f1"`` entry of ``metrics`` as the value to optimise."""
    objective_value = metrics["eval_f1"]
    return objective_value


# Search with the Optuna backend, maximising the value returned by
# `compute_objective`. `optuna_hp_space` returns an empty dict; the actual
# sampling happens in `model_init`.
best_trial = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    compute_objective=compute_objective,
    backend="optuna",
    direction="maximize",
    n_trials=30,
)

[I 2025-02-06 23:50:40,349] A new study created in memory with name: no-name-63e3659c-47c3-4d85-bce4-76c69c303672
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.53,0.495145,0.012308,0.010165,0.011134,0.749054


[I 2025-02-06 23:52:55,821] Trial 0 finished with value: 0.011134307585247043 and parameters: {'dropout': 0.2153605042391882, 'learning_rate': 3.668086206434241e-06, 'weight_decay': 0.1757935022534685}. Best is trial 0 with value: 0.011134307585247043.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4902,0.458914,0.035126,0.040661,0.037691,0.763192


[I 2025-02-06 23:55:06,627] Trial 1 finished with value: 0.037691401648998826 and parameters: {'dropout': 0.3148451788551006, 'learning_rate': 3.3284573907694685e-05, 'weight_decay': 0.1813842407190132}. Best is trial 1 with value: 0.037691401648998826.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4892,0.458164,0.034126,0.041931,0.037628,0.762888


[I 2025-02-06 23:57:14,248] Trial 2 finished with value: 0.03762827822120867 and parameters: {'dropout': 0.3027022659633645, 'learning_rate': 2.9222558191565536e-05, 'weight_decay': 0.7519791605815216}. Best is trial 1 with value: 0.037691401648998826.


In [6]:
# Display the best run returned by the hyperparameter search.
best_trial

BestRun(run_id='1', objective=0.037691401648998826, hyperparameters={'dropout': 0.3148451788551006, 'learning_rate': 3.3284573907694685e-05, 'weight_decay': 0.1813842407190132}, run_summary=None)