# Evaluacija prepoznavanja imenovanih entiteta

**Zadatak:** fine-tuning Transformer modela za prepoznavanje imenovanih entiteta (NER) na srpskom jeziku.  
**Skup podataka:** [COMtext.SR.legal](https://raw.githubusercontent.com/ICEF-NLP/COMtext.SR/ee8c2432fb4229012a3cb396b7823639216fc3da/data/comtext.sr.legal.ijekavica.conllu)  
**Modeli:** BERTić i SrBERTa

In [1]:
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from datasets import Dataset
from datetime import datetime
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np

import torch
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)

print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("WARNING: CUDA not available, will use CPU!")

Transformers version: 4.57.3
PyTorch version: 2.9.1+cu128
CUDA available: False


## Učitavanje i parsiranje CoNLL-U formata

In [2]:
def parse_conllu(file_path):
    """
    Parse CoNLL-U format file.
    Returns: (sentences, labels) where each is a list of lists.
    """
    sentences = []
    labels = []
    current_tokens = []
    current_labels = []
    
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            
            # Skip comments and blank lines (end of sentence)
            if line.startswith("#") or not line:
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens = []
                    current_labels = []
                continue
            
            # Parse token line: ID FORM LEMMA POS NER
            parts = line.split("\t")
            if len(parts) >= 5 and parts[0].isdigit():
                token = parts[1]       # Column 2: word form
                ner_tag = parts[4]     # Column 5: NER tag
                current_tokens.append(token)
                current_labels.append(ner_tag)
        
        # Don't forget last sentence
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_labels)
    
    return sentences, labels

# Load data
data_path = Path("../data/comtext.sr.legal.ijekavica.conllu")
sentences, labels = parse_conllu(data_path)

print(f"Loaded {len(sentences)} sentences")
print(f"Total tokens: {sum(len(s) for s in sentences)}")
print("\nExample sentence 1:")
print(f"Tokens: {sentences[0][:10]}...")
print(f"Labels: {labels[0][:10]}...")

Loaded 4762 sentences
Total tokens: 105470

Example sentence 1:
Tokens: ['Trans', 'Impex', 'Trade', 'd.o.o.', 'Bul.', 'Vojvode', 'Stepe', '123/2', '21000', 'Novi']...
Labels: ['B-COM', 'I-COM', 'I-COM', 'I-COM', 'B-ADR', 'I-ADR', 'I-ADR', 'I-ADR', 'I-ADR', 'I-ADR']...


## Analiza distribucije labela

In [3]:
all_labels = set()
for label_seq in labels:
    all_labels.update(label_seq)

unique_labels = sorted(list(all_labels))
print(f"Total unique labels: {len(unique_labels)}")
print(f"\nAll labels:\n{unique_labels}")

# Count occurrences
label_counts = {}
for label_seq in labels:
    for label in label_seq:
        label_counts[label] = label_counts.get(label, 0) + 1

# Show top 10 most frequent
print("\nTop 10 most frequent labels:")
for label, count in sorted(label_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"  {label:8}: {count:6,d}")

Total unique labels: 35

All labels:
['B-ADR', 'B-COM', 'B-CONTACT', 'B-COURT', 'B-DATE', 'B-IDCOM', 'B-IDOTH', 'B-IDPER', 'B-IDTAX', 'B-INST', 'B-LAW', 'B-MISC', 'B-MONEY', 'B-NUMACC', 'B-NUMCAR', 'B-NUMDOC', 'B-NUMPLOT', 'B-ORGOTH', 'B-PER', 'B-REF', 'B-TOP', 'I-ADR', 'I-COM', 'I-COURT', 'I-DATE', 'I-INST', 'I-LAW', 'I-MISC', 'I-MONEY', 'I-NUMCAR', 'I-ORGOTH', 'I-PER', 'I-REF', 'I-TOP', 'O']

Top 10 most frequent labels:
  O       : 91,357
  I-LAW   :  3,229
  I-REF   :  2,161
  I-ADR   :  1,211
  I-DATE  :  1,090
  B-PER   :    694
  I-INST  :    693
  I-PER   :    617
  I-COM   :    432
  B-LAW   :    395


## Kreiranje mapiranja labela za model

In [4]:
# Label mappings
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"Created mappings for {len(label2id)} labels")
print("\nFirst 10 label mappings:")
for label, idx in list(label2id.items())[:10]:
    print(f"  {label:10s} -> {idx}")

print(f"\nTest mapping: 'B-PER' -> {label2id['B-PER']} -> '{id2label[label2id['B-PER']]}'")

Created mappings for 35 labels

First 10 label mappings:
  B-ADR      -> 0
  B-COM      -> 1
  B-CONTACT  -> 2
  B-COURT    -> 3
  B-DATE     -> 4
  B-IDCOM    -> 5
  B-IDOTH    -> 6
  B-IDPER    -> 7
  B-IDTAX    -> 8
  B-INST     -> 9

Test mapping: 'B-PER' -> 18 -> 'B-PER'


## Učitavanje modela i tokenizatora

In [5]:
model_name = "classla/bcms-bertic"

print(f"Loading tokenizer from {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Loading model from {model_name}...")
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True 
)

print(f"Model type: {type(model).__name__}")
print(f"Number of labels: {model.num_labels}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

Loading tokenizer from classla/bcms-bertic...
Loading model from classla/bcms-bertic...


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at classla/bcms-bertic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model type: ElectraForTokenClassification
Number of labels: 35
Model parameters: 110,053,667


In [6]:
test_sentence = sentences[0][:5]
test_tokens = tokenizer(test_sentence, is_split_into_words=True, truncation=True)
print("\nTest tokenization:")
print(f"Original tokens: {test_sentence}")
print(f"Tokenized IDs: {test_tokens['input_ids'][:10]}...")


Test tokenization:
Original tokens: ['Trans', 'Impex', 'Trade', 'd.o.o.', 'Bul.']
Tokenized IDs: [2, 21006, 12906, 2042, 1032, 18278, 72, 18, 83, 18]...


## Tokenizacija i poravnanje labela

In [7]:
def tokenize_and_align_labels(examples, tokenizer, label2id):
    """
    Tokenize text and align labels with subword tokens.
    
    Args:
        examples: Dict with 'tokens' and 'ner_tags' keys
        tokenizer: HuggingFace tokenizer
        label2id: Label to ID mapping
    
    Returns:
        Tokenized inputs with aligned labels
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=256,
    )
    
    labels = []
    for i, label_seq in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        
        for word_idx in word_ids:
            # Special tokens (CLS, SEP, PAD) get -100
            if word_idx is None:
                label_ids.append(-100)
            # First subword of a word gets the label
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label_seq[word_idx]])
            # Subsequent subwords get -100 (ignored)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        
        labels.append(label_ids)
    
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Test on first sentence
test_example = {
    'tokens': [sentences[0]],
    'ner_tags': [labels[0]]
}

test_result = tokenize_and_align_labels(test_example, tokenizer, label2id)

print("\nTest alignment on sentence 1:")
print(f"Original tokens ({len(sentences[0])}): {sentences[0][:8]}...")
print(f"Original labels ({len(labels[0])}): {labels[0][:8]}...")
print(f"Tokenized IDs ({len(test_result['input_ids'][0])}): {test_result['input_ids'][0][:12]}...")
print(f"Aligned labels ({len(test_result['labels'][0])}): {test_result['labels'][0][:12]}...")


Test alignment on sentence 1:
Original tokens (11): ['Trans', 'Impex', 'Trade', 'd.o.o.', 'Bul.', 'Vojvode', 'Stepe', '123/2']...
Original labels (11): ['B-COM', 'I-COM', 'I-COM', 'I-COM', 'B-ADR', 'I-ADR', 'I-ADR', 'I-ADR']...
Tokenized IDs (27): [2, 21006, 12906, 2042, 1032, 18278, 72, 18, 83, 18, 83, 18]...
Aligned labels (27): [-100, 1, 22, -100, -100, 22, 22, -100, -100, -100, -100, -100]...


## Priprema podataka

In [8]:
data_dict = {
    "tokens": sentences,
    "ner_tags": labels
}

dataset = Dataset.from_dict(data_dict)

print(f"Created dataset with {len(dataset)} sentences")
print("\nDataset structure:")
print(dataset)
print("\nFirst example:")
print(dataset[0])

Created dataset with 4762 sentences

Dataset structure:
Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 4762
})

First example:
{'tokens': ['Trans', 'Impex', 'Trade', 'd.o.o.', 'Bul.', 'Vojvode', 'Stepe', '123/2', '21000', 'Novi', 'Sad'], 'ner_tags': ['B-COM', 'I-COM', 'I-COM', 'I-COM', 'B-ADR', 'I-ADR', 'I-ADR', 'I-ADR', 'I-ADR', 'I-ADR', 'I-ADR']}


In [9]:
# Split indices
train_indices, eval_indices = train_test_split(
    range(len(dataset)),
    test_size=0.1,
    random_state=42,
    shuffle=True
)

# Create train and eval datasets
train_dataset = dataset.select(train_indices)
eval_dataset = dataset.select(eval_indices)

print(f"Train set: {len(train_dataset)} sentences")
print(f"Eval set:  {len(eval_dataset)} sentences")
print(f"\nSplit ratio: {len(train_dataset)/len(dataset)*100:.1f}% train / {len(eval_dataset)/len(dataset)*100:.1f}% eval")

Train set: 4285 sentences
Eval set:  477 sentences

Split ratio: 90.0% train / 10.0% eval


In [10]:
print("Tokenizing training data...")
tokenized_train = train_dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
    batched=True,
    remove_columns=train_dataset.column_names
)

print("Tokenizing evaluation data...")
tokenized_eval = eval_dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
    batched=True,
    remove_columns=eval_dataset.column_names
)

print(f"\nTokenized train dataset: {len(tokenized_train)} examples")
print(f"Tokenized eval dataset: {len(tokenized_eval)} examples")
print("\nTokenized example:")
print(f"  Input IDs length: {len(tokenized_train[0]['input_ids'])}")
print(f"  Labels length: {len(tokenized_train[0]['labels'])}")
print(f"  First 15 labels: {tokenized_train[0]['labels'][:15]}")

Tokenizing training data...


Map: 100%|██████████| 4285/4285 [00:00<00:00, 11085.85 examples/s]


Tokenizing evaluation data...


Map: 100%|██████████| 477/477 [00:00<00:00, 10447.59 examples/s]


Tokenized train dataset: 4285 examples
Tokenized eval dataset: 477 examples

Tokenized example:
  Input IDs length: 35
  Labels length: 35
  First 15 labels: [-100, 34, 34, 34, -100, -100, 34, 34, 34, 34, -100, 34, 34, 34, 34]





In [11]:
# Create data collator (batching and padding)
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt"
)

print("Data collator created - will pad sequences to batch max length")

Data collator created - will pad sequences to batch max length


## Metrike za evaluaciju

In [12]:
def strip_bio_prefix(labels):
    """Convert B-PER, I-PER → PER (entity type only)"""
    stripped = []
    for label in labels:
        if label == 'O':
            stripped.append('O')
        else:
            # Remove B- or I- prefix
            entity_type = label.split('-', 1)[1] if '-' in label else label
            stripped.append(entity_type)
    return stripped

def compute_metrics(pred):
    """
    Compute metrics for model predictions.
    This gets called automatically during evaluation.
    """
    predictions, labels = pred
    
    # Get predicted label IDs (argmax over logits)
    predictions = np.argmax(predictions, axis=2)
    
    # Flatten and remove ignored indices (-100)
    true_labels = []
    pred_labels = []
    
    for i in range(len(labels)):
        for j in range(len(labels[i])):
            if labels[i][j] != -100:
                true_labels.append(id2label[labels[i][j]])
                pred_labels.append(id2label[predictions[i][j]])
    
    # Convert to arrays
    y_true = np.array(true_labels)
    y_pred = np.array(pred_labels)
    
    # DEFAULT EVALUATION (entity type only)
    y_true_default = strip_bio_prefix(y_true)
    y_pred_default = strip_bio_prefix(y_pred)
    
    default_acc = accuracy_score(y_true_default, y_pred_default)
    
    unique_labels_default = sorted(set(y_true_default) | set(y_pred_default))
    entity_labels_default = [l for l in unique_labels_default if l != 'O']
    
    default_f1_with_o = f1_score(y_true_default, y_pred_default, labels=unique_labels_default, average='macro', zero_division=0)
    default_f1_without_o = f1_score(y_true_default, y_pred_default, labels=entity_labels_default, average='macro', zero_division=0)
    
    # STRICT EVALUATION (full BIO tags)
    strict_acc = accuracy_score(y_true, y_pred)
    
    unique_labels = sorted(set(y_true) | set(y_pred))
    entity_labels = [l for l in unique_labels if l != 'O']
    
    strict_f1_with_o = f1_score(y_true, y_pred, labels=unique_labels, average='macro', zero_division=0)
    strict_f1_without_o = f1_score(y_true, y_pred, labels=entity_labels, average='macro', zero_division=0)
    
    return {
        # Default mode
        'default_accuracy': default_acc,
        'default_f1_with_o': default_f1_with_o,
        'default_f1_without_o': default_f1_without_o,

        # Strict mode
        'strict_accuracy': strict_acc,
        'strict_f1_with_o': strict_f1_with_o,
        'strict_f1_without_o': strict_f1_without_o,
    }

print("Evaluation metrics function created")
print("  - Default mode: Entity type only")
print("  - Strict mode: Full BIO tag matching")
print("  - Metrics: Accuracy, F1-Macro (with/without O)")

Evaluation metrics function created
  - Default mode: Entity type only
  - Strict mode: Full BIO tag matching
  - Metrics: Accuracy, F1-Macro (with/without O)


## Konfiguracija za treniranje

In [13]:
# Create output directory with timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_dir = f"../outputs/models/bertic_ner_{timestamp}"

training_args = TrainingArguments(
    output_dir=output_dir,
    
    # Training schedule
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    
    # Optimization
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    
    # Evaluation
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="strict_f1_without_o",
    greater_is_better=True,
    
    # Performance
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    
    # Logging
    logging_dir=f"../outputs/logs/{timestamp}",
    logging_strategy="steps",
    logging_steps=50,
    report_to="none",
    
    # Checkpointing
    save_total_limit=2,
    
    # Reproducibility
    seed=42,
)

print("Training arguments configured")
print("\nKey settings:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Train batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  FP16: {training_args.fp16}")
print(f"  Output: {output_dir}")

Training arguments configured

Key settings:
  Epochs: 20
  Train batch size: 16
  Learning rate: 5e-05
  FP16: False
  Output: ../outputs/models/bertic_ner_20251222_145927


In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

print(f"Model on device: {next(model.parameters()).device}")
print(f"Ready to train on {len(tokenized_train)} training examples")
print(f"Will evaluate on {len(tokenized_eval)} eval examples")

Model on device: cpu
Ready to train on 4285 training examples
Will evaluate on 477 eval examples


## Treniranje modela

In [15]:
print("Starting training...")
print("=" * 70)

train_result = trainer.train()

print("\n" + "=" * 70)
print("Training completed!")
print("=" * 70)

print("\nTraining metrics:")
for key, value in train_result.metrics.items():
    print(f"  {key}: {value}")

print("\nRunning final evaluation...")
eval_metrics = trainer.evaluate()

print("\nFinal Evaluation Results:")
print("-" * 70)
print("DEFAULT EVALUATION (entity type only):")
print(f"  Accuracy:           {eval_metrics['eval_default_accuracy']:.4f}")
print(f"  F1-Macro (with O):  {eval_metrics['eval_default_f1_with_o']:.4f}")
print(f"  F1-Macro (no O):    {eval_metrics['eval_default_f1_without_o']:.4f}")
print()
print("STRICT EVALUATION (full BIO tags):")
print(f"  Accuracy:           {eval_metrics['eval_strict_accuracy']:.4f}")
print(f"  F1-Macro (with O):  {eval_metrics['eval_strict_f1_with_o']:.4f}")
print(f"  F1-Macro (no O):    {eval_metrics['eval_strict_f1_without_o']:.4f}")

Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 