In [None]:
# For use in Google Colab: mount Google Drive and make the thesis folder the
# working directory, so that relative paths used by later cells (e.g. the
# `modules` package import and the `data/...` test files) resolve against it.
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/thesis/')

# Install the evaluation-related packages into the Colab runtime.
!pip install seqeval
!pip install nervaluate
!pip install datasets
!pip install evaluate

In [2]:
from modules.eval_functions import get_supported_entities, predict_tags, write_to_tsv, evaluation_to_json
from nervaluate import Evaluator
from datasets import Dataset
import json

In [3]:
def load_test_data(file_path):
  """Read a JSON test set from disk and wrap it in a `Dataset`.

  Args:
    file_path: Path to a UTF-8 encoded JSON file. The parsed content is
      handed to `Dataset.from_list` unchanged.

  Returns:
    The object returned by `Dataset.from_list` for the parsed JSON content.
  """
  with open(file_path, mode='r', encoding='utf-8') as handle:
    records = json.loads(handle.read())
  return Dataset.from_list(records)

In [4]:
def convert_tags_to_supported(data, supported_entities):
  """Map gold tags onto the entity types a model supports.

  Each tag of every item is converted as follows:
    * 'O' is kept as is;
    * a tag whose entity type is in `supported_entities` is kept as is;
    * 'GEP' and 'EVE' are renamed to 'GPE' and 'EVENT' respectively when the
      target type is in `supported_entities`; a 'B-'/'I-' prefix, if present,
      is preserved;
    * every other tag becomes 'O'.

  Args:
    data: Iterable of dict-like items, each with a 'tags' sequence of strings.
    supported_entities: Container of entity type names without the 'B-'/'I-'
      prefix; only membership tests are performed on it.

  Returns:
    `Dataset.from_list` applied to shallow copies of the items in which
    'tags' holds the converted tags and 'original_tags' the untouched ones.
  """
  # Gold-data entity names that differ from the names used by the models.
  aliases = {'GEP': 'GPE', 'EVE': 'EVENT'}
  bio_prefixes = ('B-', 'I-')

  converted = []
  for item in data:
    tags = item['tags']

    new_tags = []
    for tag in tags:
      if tag == 'O':
        new_tags.append(tag)
        continue

      # A tag without a BIO prefix gets an empty prefix. Previously the prefix
      # was None, so a renamed tag came out as e.g. 'NoneGPE'.
      if tag.startswith(bio_prefixes):
        prefix, entity_type = tag[:2], tag[2:]
      else:
        prefix, entity_type = '', tag

      target = aliases.get(entity_type)
      if entity_type in supported_entities:
        new_tags.append(tag)
      elif target is not None and target in supported_entities:
        new_tags.append(f'{prefix}{target}')
      else:
        new_tags.append('O')

    # Shallow copy so the caller's item is not modified.
    converted_item = item.copy()
    converted_item['tags'] = new_tags
    converted_item['original_tags'] = tags
    converted.append(converted_item)

  return Dataset.from_list(converted)

In [5]:
def clean_predictions(predictions, tags_to_remove=None):
  """Replace predicted tags of unwanted entity types with 'O'.

  Args:
    predictions: Iterable of tag sequences, one sequence per sentence.
    tags_to_remove: Optional iterable of entity type names (without the
      'B-'/'I-' prefix) to blank out. Defaults to TIME, PERCENT, TITLE,
      MONEY and DATE.

  Returns:
    A new list of lists with the same shape as `predictions`, in which every
    tag whose entity type is in `tags_to_remove` is replaced by 'O'. The
    input is not modified.
  """
  if tags_to_remove is None:
    tags_to_remove = {'TIME', 'PERCENT', 'TITLE', 'MONEY', 'DATE'}
  else:
    tags_to_remove = set(tags_to_remove)

  def _clean(tag):
    # Strip the BIO prefix, if any, before looking the entity type up.
    entity_type = tag[2:] if tag.startswith(('B-', 'I-')) else tag
    return 'O' if entity_type in tags_to_remove else tag

  return [[_clean(tag) for tag in prediction] for prediction in predictions]

In [6]:
def evaluate_model(data_path, model_path, model_name, trained_on, evaluated_on):
  """Evaluate one model on one test set with nervaluate and store the results.

  Steps:
    1. Load the test data and convert its gold tags to the entity types the
       model supports.
    2. Predict tags and blank out predictions of the excluded entity types.
    3. Run nervaluate on the remaining entity types.
    4. Pass the strict overall scores and the per-tag scores to
       `evaluation_to_json`, and the tokens with gold and predicted tags to
       `write_to_tsv`.

  Args:
    data_path: Path to the JSON test set.
    model_path: Model identifier passed to `get_supported_entities` and
      `predict_tags`.
    model_name: Model name forwarded to the two output helpers.
    trained_on: Training-data label forwarded to `evaluation_to_json`.
    evaluated_on: Test-set label forwarded to the two output helpers.

  Raises:
    AssertionError: If the number of sentences, or the number of tags in any
      sentence, differs between gold data and predictions.
  """
  # Entity types left out of the evaluation. `clean_predictions` blanks the
  # same types in the predictions by default.
  excluded_entities = {'TIME', 'PERCENT', 'TITLE', 'MONEY', 'DATE'}

  data = load_test_data(data_path)
  supported_entities = get_supported_entities(model_path)
  converted_data = convert_tags_to_supported(data, supported_entities)

  true_tags = [item['tags'] for item in converted_data]
  pred_tags = clean_predictions(predict_tags(model_path, converted_data))

  assert len(true_tags) == len(pred_tags), (
    f'sentence count mismatch: {len(true_tags)} gold vs {len(pred_tags)} predicted')
  # Misaligned sentences would otherwise be scored silently, so fail loudly.
  for index, (gold, predicted) in enumerate(zip(true_tags, pred_tags)):
    assert len(gold) == len(predicted), (
      f'tag count mismatch in sentence {index}: {len(gold)} gold vs {len(predicted)} predicted')

  # Sorted so the tag order does not depend on set iteration order.
  evaluated_entities = sorted(set(supported_entities) - excluded_entities)
  evaluator = Evaluator(true_tags, pred_tags, tags=evaluated_entities, loader='list')
  results, results_by_tag, _, _ = evaluator.evaluate()

  evaluation_to_json(results['strict'], results_by_tag, model_name, trained_on, evaluated_on)

  sents = [item['tokens'] for item in converted_data]
  write_to_tsv(sents, true_tags, pred_tags, model_name, evaluated_on)

In [7]:
# Evaluate both EstBERT NER checkpoints on the EWT and EDT test sets.
# Each row: (data_path, model_path, model_name, trained_on, evaluated_on).
_evaluation_runs = (
  ('data/ewt/test.json', 'tartuNLP/EstBERT_NER', 'EstBERT_NER', 'EstNER', 'ewt'),
  ('data/edt/test.json', 'tartuNLP/EstBERT_NER', 'EstBERT_NER', 'EstNER', 'edt'),
  ('data/ewt/test.json', 'tartuNLP/EstBERT_NER_V2', 'EstBERT_NER_V2', 'EstNER_V1+EstNER_new', 'ewt'),
  ('data/edt/test.json', 'tartuNLP/EstBERT_NER_V2', 'EstBERT_NER_V2', 'EstNER_V1+EstNER_new', 'edt'),
)
for _run_args in _evaluation_runs:
  evaluate_model(*_run_args)

config.json:   0%|          | 0.00/814 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at tartuNLP/EstBERT_NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/410k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Salvestasin: results/EstBERT_NER/eval_on_ewt_test_2025-17-04_13-33.json


Some weights of the model checkpoint at tartuNLP/EstBERT_NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Salvestasin: results/EstBERT_NER/eval_on_edt_test_2025-17-04_13-33.json


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/410k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Salvestasin: results/EstBERT_NER_V2/eval_on_ewt_test_2025-17-04_13-34.json


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Salvestasin: results/EstBERT_NER_V2/eval_on_edt_test_2025-17-04_13-35.json
