In [None]:
# Colabis kasutamiseks
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/thesis/')

!pip install seqeval
!pip install nervaluate
!pip install datasets
!pip install evaluate

In [2]:
import json
import datetime
import csv
from transformers import pipeline, AutoConfig
from nervaluate import Evaluator
from datasets import Dataset
from modules.bert_evaluator import BERTEvaluator

In [3]:
def load_test_data(file_path):
  with open(file_path, 'r', encoding='utf-8') as file:
    return Dataset.from_list(json.load(file))

In [4]:
def get_supported_entities(model_name):
  config = AutoConfig.from_pretrained(model_name)
  id2label = config.id2label

  supported_entities = set()
  for label in id2label.values():
    if label != 'O':
      entity_type = label[2:] if label.startswith(('B-', 'I-')) else label
      supported_entities.add(entity_type)

  return supported_entities

In [5]:
def convert_tags_to_supported(data, supported_entities):
  converted = []

  for item in data:
    converted_item = item.copy()
    tags = item['tags']

    new_tags = []
    for tag in tags:
      if tag == 'O':
        new_tags.append(tag)
        continue

      entity_type = tag[2:] if tag.startswith(('B-', 'I-')) else tag
      prefix = tag[:2] if tag.startswith(('B-', 'I-')) else None
      if entity_type in supported_entities:
        new_tags.append(tag)
      elif entity_type == 'GEP' and 'GPE' in supported_entities:
        new_tags.append(f'{prefix}GPE')
      elif entity_type == 'EVE' and 'EVENT' in supported_entities:
        new_tags.append(f'{prefix}EVENT')
      else:
        new_tags.append('O')

    converted_item['tags'] = new_tags
    converted_item['original_tags'] = tags
    converted.append(converted_item)

  return Dataset.from_list(converted)

In [7]:
def predict_tags(model_name, data):
  ner = pipeline('ner', model=model_name, aggregation_strategy=None)
  predictions = []

  for item in data:
    sentence = " ".join(item['tokens'])

    prediction = ner(sentence)

    spans = []
    current = 0
    for token in item['tokens']:
      spans.append((current, current + len(token)))
      current += len(token) + 1

    prediction_tags = ['O'] * len(item['tokens'])

    entities = []
    current_entity = None

    for pred in prediction:
      tag = pred['entity']
      word = pred['word']
      start = pred['start']
      end = pred['end']

      if word.startswith('##') or word in ['[CLS]', '[SEP]', '[PAD]']:
        continue

      if tag.startswith('B-') or (tag == 'O' and current_entity is not None):
        if current_entity is not None:
          entities.append(current_entity)
          current_entity = None

      if tag.startswith('B-'):
        entity_type = tag[2:]
        current_entity = {'type': entity_type, 'start': start, 'end': end}

      elif tag.startswith('I-'):
        entity_type = tag[2:]
        if current_entity is not None and current_entity['type'] == entity_type:
          current_entity['end'] = end

    if current_entity is not None:
      entities.append(current_entity)

    for entity in entities:
      entity_start = entity['start']
      entity_end = entity['end']
      entity_type = entity['type']

      first = True
      for i, (start, end) in enumerate(spans):
        if start <= entity_end and end >= entity_start:
          if first:
            prediction_tags[i] = f'B-{entity_type}'
            first = False
          else:
            prediction_tags[i] = f'I-{entity_type}'

    predictions.append(prediction_tags)

  return predictions

In [8]:
def clean_predictions(predictions):
  tags_to_remove = set(['TIME', 'PERCENT', 'TITLE', 'MONEY', 'DATE'])
  cleaned_predictions = []
  for prediction in predictions:
    cleaned_prediction = []
    for tag in prediction:
      if tag == 'O':
        cleaned_prediction.append(tag)
        continue
      entity_type = tag[2:] if tag.startswith(('B-', 'I-')) else tag
      if entity_type in tags_to_remove:
        cleaned_prediction.append('O')
      else:
        cleaned_prediction.append(tag)
    cleaned_predictions.append(cleaned_prediction)
  return cleaned_predictions

In [9]:
def write_to_tsv(sents, true_tags, pred_tags, model_name, evaluated_on):
  folder_name = f"results/{model_name}"
  os.makedirs(folder_name, exist_ok=True)
  timestamp = datetime.datetime.now().strftime("%Y-%d-%m_%H-%M")
  file_name = f"predictions_{evaluated_on}_test_{timestamp}.tsv"
  file_path = os.path.join(folder_name, file_name)

  with open(file_path, 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    #file.write('word\ttrue_tag\tpred_tag\n')
    writer.writerow(['word', 'true_tag', 'pred_tag'])
    for sent, true, pred in zip(sents, true_tags, pred_tags):
      for word, true_tag, pred_tag in zip(sent, true, pred):
        writer.writerow([word, true_tag, pred_tag])
        #file.write(f'{word}\t{true_tag}\t{pred_tag}\n')
      #file.write('\n')
      writer.writerow([])

In [10]:
def evaluate_model(data_path, model_name):
  data = load_test_data(data_path)

  supported_entities = get_supported_entities(model_name)
  converted_data = convert_tags_to_supported(data, supported_entities)

  true_tags = [item['tags'] for item in converted_data]
  pred_tags = predict_tags(model_name, converted_data)
  pred_tags = clean_predictions(pred_tags)
  assert len(true_tags) == len(pred_tags)

  bert_name = model_name.split('/')[-1]
  data_name = data_path.split('/')[1].upper()
  sents = [item['tokens'] for item in converted_data]
  write_to_tsv(sents, true_tags, pred_tags, bert_name, data_name)

  evaluator = Evaluator(true_tags, pred_tags, tags=list(supported_entities - {'TIME', 'PERCENT', 'TITLE', 'MONEY', 'DATE'}), loader='list')
  results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()

  bertevaluator = BERTEvaluator(all_tags=[], ner_tags=list(supported_entities - {'TIME', 'PERCENT', 'TITLE', 'MONEY', 'DATE'}))
  bertevaluator.evaluation_to_json(nervaluate_strict_overall=results['strict'], nervaluate_by_tag=results_by_tag, model_name=bert_name, trained_on=None, evaluated_on=data_name)

  return results, results_by_tag

In [11]:
def main():
  data_paths = ['data/ewt/test.json', 'data/edt/test.json']
  model_names = ['tartuNLP/EstBERT_NER', 'tartuNLP/EstBERT_NER_V2']

  for model_name in model_names:
    for data_path in data_paths:
      results, results_by_tag = evaluate_model(data_path, model_name)

      print(f'{model_name} {data_path}')
      print(results['strict'])
      for tag in results_by_tag:
        print(tag, results_by_tag[tag]['strict'])
      print()

In [12]:
main()

config.json:   0%|          | 0.00/814 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at tartuNLP/EstBERT_NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/410k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Salvestasin: results/EstBERT_NER/eval_on_EWT_test_2025-12-03_13-56.json
tartuNLP/EstBERT_NER data/ewt/test.json
{'correct': 288, 'incorrect': 43, 'partial': 0, 'missed': 220, 'spurious': 91, 'possible': 551, 'actual': 422, 'precision': 0.6824644549763034, 'recall': 0.5226860254083484, 'f1': 0.591983556012333}
LOC {'correct': 14, 'incorrect': 5, 'partial': 0, 'missed': 9, 'spurious': 42, 'possible': 28, 'actual': 61, 'precision': 0.22950819672131148, 'recall': 0.5, 'f1': 0.3146067415730337}
ORG {'correct': 50, 'incorrect': 21, 'partial': 0, 'missed': 21, 'spurious': 27, 'possible': 92, 'actual': 98, 'precision': 0.5102040816326531, 'recall': 0.5434782608695652, 'f1': 0.5263157894736842}
PER {'correct': 224, 'incorrect': 17, 'partial': 0, 'missed': 190, 'spurious': 22, 'possible': 431, 'actual': 263, 'precision': 0.8517110266159695, 'recall': 0.5197215777262181, 'f1': 0.6455331412103746}



Some weights of the model checkpoint at tartuNLP/EstBERT_NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Salvestasin: results/EstBERT_NER/eval_on_EDT_test_2025-12-03_13-56.json
tartuNLP/EstBERT_NER data/edt/test.json
{'correct': 1491, 'incorrect': 161, 'partial': 0, 'missed': 123, 'spurious': 517, 'possible': 1775, 'actual': 2169, 'precision': 0.6874135546334716, 'recall': 0.84, 'f1': 0.7560851926977687}
LOC {'correct': 226, 'incorrect': 46, 'partial': 0, 'missed': 21, 'spurious': 317, 'possible': 293, 'actual': 589, 'precision': 0.3837011884550085, 'recall': 0.7713310580204779, 'f1': 0.5124716553287982}
ORG {'correct': 272, 'incorrect': 56, 'partial': 0, 'missed': 37, 'spurious': 117, 'possible': 365, 'actual': 445, 'precision': 0.6112359550561798, 'recall': 0.7452054794520548, 'f1': 0.6716049382716051}
PER {'correct': 993, 'incorrect': 59, 'partial': 0, 'missed': 65, 'spurious': 83, 'possible': 1117, 'actual': 1135, 'precision': 0.8748898678414097, 'recall': 0.8889883616830797, 'f1': 0.8818827708703375}



config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/410k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Salvestasin: results/EstBERT_NER_V2/eval_on_EWT_test_2025-12-03_13-57.json
tartuNLP/EstBERT_NER_V2 data/ewt/test.json
{'correct': 414, 'incorrect': 125, 'partial': 0, 'missed': 181, 'spurious': 64, 'possible': 720, 'actual': 603, 'precision': 0.6865671641791045, 'recall': 0.575, 'f1': 0.6258503401360543}
LOC {'correct': 4, 'incorrect': 15, 'partial': 0, 'missed': 9, 'spurious': 1, 'possible': 28, 'actual': 20, 'precision': 0.2, 'recall': 0.14285714285714285, 'f1': 0.16666666666666666}
ORG {'correct': 38, 'incorrect': 31, 'partial': 0, 'missed': 23, 'spurious': 13, 'possible': 92, 'actual': 82, 'precision': 0.4634146341463415, 'recall': 0.41304347826086957, 'f1': 0.4367816091954023}
EVENT {'correct': 8, 'incorrect': 11, 'partial': 0, 'missed': 3, 'spurious': 8, 'possible': 22, 'actual': 27, 'precision': 0.2962962962962963, 'recall': 0.36363636363636365, 'f1': 0.32653061224489793}
PROD {'correct': 24, 'incorrect': 39, 'partial': 0, 'missed': 43, 'spurious': 15, 'possible': 106, 'actual':

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Salvestasin: results/EstBERT_NER_V2/eval_on_EDT_test_2025-12-03_13-57.json
tartuNLP/EstBERT_NER_V2 data/edt/test.json
{'correct': 1524, 'incorrect': 611, 'partial': 0, 'missed': 288, 'spurious': 246, 'possible': 2423, 'actual': 2381, 'precision': 0.6400671986560269, 'recall': 0.6289723483285183, 'f1': 0.6344712739383847}
LOC {'correct': 65, 'incorrect': 194, 'partial': 0, 'missed': 34, 'spurious': 6, 'possible': 293, 'actual': 265, 'precision': 0.24528301886792453, 'recall': 0.22184300341296928, 'f1': 0.2329749103942652}
ORG {'correct': 206, 'incorrect': 105, 'partial': 0, 'missed': 54, 'spurious': 105, 'possible': 365, 'actual': 416, 'precision': 0.4951923076923077, 'recall': 0.5643835616438356, 'f1': 0.5275288092189502}
EVENT {'correct': 7, 'incorrect': 19, 'partial': 0, 'missed': 12, 'spurious': 2, 'possible': 38, 'actual': 28, 'precision': 0.25, 'recall': 0.18421052631578946, 'f1': 0.2121212121212121}
PROD {'correct': 61, 'incorrect': 173, 'partial': 0, 'missed': 75, 'spurious': 29