In [None]:
# For use in Colab: mount Google Drive and make the thesis folder the working
# directory, so the relative paths used later (modules/, data/, results/) resolve.
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/thesis/')

!pip install evaluate
!pip install seqeval
!pip install nervaluate

In [2]:
from modules.eval_functions import get_supported_entities, predict_tags, write_to_tsv, evaluation_to_json, ALL_TAGS
from nervaluate import Evaluator
from datasets import Dataset
import json

In [3]:
def load_test_data(path):
  """Read an EstNER-style JSON file and return it as a sentence-level Dataset.

  The JSON is traversed as ``documents`` -> ``sentences`` -> ``words``; every
  word supplies its surface form under ``word`` and its label under ``ner_1``.

  Per-word label handling, in this order:
    1. ``B-/I-GPE`` is renamed to ``B-/I-GEP`` and ``B-/I-EVENT`` to ``B-/I-EVE``.
    2. A label that is not in ``ALL_TAGS`` is replaced by ``'O'``.
    3. The label is upper-cased.

  Args:
    path: Path of the JSON file; it is opened as UTF-8 text.

  Returns:
    A ``Dataset`` with one row per sentence and the columns ``id`` (running
    sentence index starting at 0), ``tokens`` and ``tags``.
  """
  # Labels whose spelling in the file differs from the one used downstream.
  renamed_labels = {
    'B-GPE': 'B-GEP',
    'I-GPE': 'I-GEP',
    'B-EVENT': 'B-EVE',
    'I-EVENT': 'I-EVE',
  }

  with open(path, 'r', encoding='utf-8') as f:
    corpus = json.load(f)

  tokens, tags = [], []
  for document in corpus['documents']:
    for sentence in document['sentences']:
      words, labels = [], []
      for entry in sentence['words']:
        surface = entry['word']
        label = entry['ner_1']
        label = renamed_labels.get(label, label)

        # NOTE(review): the ALL_TAGS check runs before upper-casing, so a
        # lower-case label becomes 'O' unless ALL_TAGS itself contains it -
        # confirm this order is intended.
        if label not in ALL_TAGS:
          label = 'O'

        words.append(surface)
        labels.append(label.upper())

      tokens.append(words)
      tags.append(labels)

  # One id per collected sentence, numbered in reading order.
  ids = list(range(len(tokens)))
  return Dataset.from_dict({'id': ids, 'tokens': tokens, 'tags': tags})

In [4]:
def evaluate_model(data_path, model_path, model_name, trained_on, evaluated_on, is_roberta_model=False, epochs=3):
  """Evaluate one model checkpoint on one test file and store the results.

  The test file is loaded with ``load_test_data``, tags are predicted with
  ``predict_tags`` and scored with nervaluate, restricted to the entity types
  returned by ``get_supported_entities`` for the checkpoint. The strict
  overall scores and the per-tag scores are handed to ``evaluation_to_json``;
  tokens, gold tags and predicted tags are handed to ``write_to_tsv``.

  Args:
    data_path: Path of the JSON test file.
    model_path: Path of the model checkpoint.
    model_name: Model label passed on to the two output helpers.
    trained_on: Training-data label passed on to ``evaluation_to_json``.
    evaluated_on: Test-data label passed on to the two output helpers.
    is_roberta_model: Forwarded unchanged to ``predict_tags``.
    epochs: Forwarded to ``evaluation_to_json``. Defaults to 3, the value that
      used to be hard-coded here, so existing calls behave as before.

  Raises:
    AssertionError: If the number of predicted sentences differs from the
      number of gold sentences.
  """
  data = load_test_data(data_path)

  supported_entities = get_supported_entities(model_path)

  true_tags = [item['tags'] for item in data]
  pred_tags = predict_tags(model_path, data, is_roberta_model)
  assert len(true_tags) == len(pred_tags), (
    f'sentence count mismatch: {len(true_tags)} gold vs {len(pred_tags)} predicted'
  )

  evaluator = Evaluator(true_tags, pred_tags, tags=list(supported_entities), loader='list')
  # evaluate() yields four values; the two index structures are not used here.
  results, results_by_tag, _, _ = evaluator.evaluate()

  evaluation_to_json(results['strict'], results_by_tag, model_name, trained_on, evaluated_on, epochs=epochs)

  sents = [item['tokens'] for item in data]
  write_to_tsv(sents, true_tags, pred_tags, model_name, evaluated_on)

In [5]:
# Evaluate the two "Combined" checkpoints (EstBERT first, then est-roberta) on
# both EstNER test files. Each row: data path, checkpoint path, model name,
# evaluated_on label, is_roberta_model flag.
evaluation_runs = [
  ('data/estner/EstNER_new_test.json', 'results/models/EstBERT/Combined/trained_for_3/checkpoint-5634', 'EstBERT_Combined', 'EstNER_new', False),
  ('data/estner/EstNER_v1_test.json', 'results/models/EstBERT/Combined/trained_for_3/checkpoint-5634', 'EstBERT_Combined', 'EstNER_v1', False),
  ('data/estner/EstNER_new_test.json', 'results/models/est-roberta/Combined/trained_for_3/checkpoint-5634', 'est-roberta_Combined', 'EstNER_new', True),
  ('data/estner/EstNER_v1_test.json', 'results/models/est-roberta/Combined/trained_for_3/checkpoint-5634', 'est-roberta_Combined', 'EstNER_v1', True),
]
for run_data, run_checkpoint, run_name, run_target, run_is_roberta in evaluation_runs:
  evaluate_model(run_data, run_checkpoint, model_name=run_name, trained_on='combined', evaluated_on=run_target, is_roberta_model=run_is_roberta)

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Salvestasin: results/EstBERT_Combined/eval_on_EstNER_new_test_2025-17-04_12-31.json


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Salvestasin: results/EstBERT_Combined/eval_on_EstNER_v1_test_2025-17-04_12-32.json


Device set to use cuda:0


Salvestasin: results/est-roberta_Combined/eval_on_EstNER_new_test_2025-17-04_12-32.json


Device set to use cuda:0


Salvestasin: results/est-roberta_Combined/eval_on_EstNER_v1_test_2025-17-04_12-33.json


**Vana kood**

In [None]:
def test_model(model_path, model_name=None, trained_on=None):
  """Legacy evaluation: score one checkpoint on both EstNER test files.

  Both test files are loaded, their string tags are converted to the model's
  label ids, and the data is tokenized with a ``BERTDataProcessor`` built
  from the checkpoint. A ``Trainer`` is then used by
  ``BERTEvaluator.evaluate_and_print`` for each dataset.

  Args:
    model_path: Checkpoint path used for both the model and the processor.
    model_name: Optional label; printed before the results when truthy.
    trained_on: Optional training-data label. Only when both ``model_name``
      and ``trained_on`` are truthy are the results passed to
      ``evaluator.evaluation_to_json``.
  """
  test_files = {
    'EstNER_new': 'data/estner/EstNER_new_test.json',
    'EstNER_v1': 'data/estner/EstNER_v1_test.json',
  }
  corpora = {name: load_test_data(path) for name, path in test_files.items()}

  model = AutoModelForTokenClassification.from_pretrained(model_path)
  bert_processor = BERTDataProcessor(model_path)

  tag_to_id = model.config.label2id
  def to_label_ids(example):
    # dict.get: a tag the model does not know is turned into None.
    example['tags'] = [tag_to_id.get(tag) for tag in example['tags']]
    return example

  # Convert tags for every dataset first, then tokenize every dataset.
  corpora = {name: corpus.map(to_label_ids) for name, corpus in corpora.items()}
  corpora = {name: bert_processor.tokenize_dataset(corpus) for name, corpus in corpora.items()}

  # Label names ordered by their integer id.
  id_to_tag = model.config.id2label
  evaluator = BERTEvaluator(all_tags=[id_to_tag[i] for i in range(len(id_to_tag))])

  trainer = Trainer(
    model=model,
    processing_class=bert_processor.tokenizer,
    data_collator=bert_processor.data_collator,
    compute_metrics=evaluator.compute_metrics,
    args=TrainingArguments(report_to='none', output_dir='./results'),
  )

  if model_name:
    print(f'{model_name}')

  outcomes = {}
  for name, corpus in corpora.items():
    print(name)
    outcomes[name] = evaluator.evaluate_and_print(corpus, trainer)

  if model_name and trained_on:
    for name, outcome in outcomes.items():
      evaluator.evaluation_to_json(outcome[0]['strict'], outcome[1], model_name, trained_on, name)

In [None]:
# Legacy evaluation of the EstBERT "Combined" checkpoint. model_name and
# trained_on are both given, so test_model also calls evaluation_to_json.
test_model('results/models/EstBERT/Combined/trained_for_3/checkpoint-5634', model_name='EstBERT', trained_on='Combined')

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

EstBERT
EstNER_new
Hindan testandmestikul..



Nervaluate tulemused
Strict {'correct': 947, 'incorrect': 394, 'partial': 0, 'missed': 384, 'spurious': 127, 'possible': 1725, 'actual': 1468, 'precision': 0.6450953678474114, 'recall': 0.5489855072463768, 'f1': 0.5931725649859068}
precision 0.6450953678474114
recall 0.5489855072463768
f1 0.5931725649859068
EVE {'correct': 10, 'incorrect': 6, 'partial': 0, 'missed': 30, 'spurious': 6, 'possible': 46, 'actual': 22, 'precision': 0.45454545454545453, 'recall': 0.21739130434782608, 'f1': 0.29411764705882354}
GEP {'correct': 168, 'incorrect': 161, 'partial': 0, 'missed': 18, 'spurious': 5, 'possible': 347, 'actual': 334, 'precision': 0.5029940119760479, 'recall': 0.484149855907781, 'f1': 0.49339207048458145}
LOC {'correct': 44, 'incorrect': 23, 'partial': 0, 'missed': 3, 'spurious': 8, 'possible': 70, 'actual': 75, 'precision': 0.5866666666666667, 'recall': 0.6285714285714286, 'f1': 0.6068965517241379}
MUU {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible':


Nervaluate tulemused
Strict {'correct': 2450, 'incorrect': 630, 'partial': 0, 'missed': 296, 'spurious': 411, 'possible': 3376, 'actual': 3491, 'precision': 0.7018046405041536, 'recall': 0.7257109004739336, 'f1': 0.7135575942915392}
precision 0.7018046405041536
recall 0.7257109004739336
f1 0.7135575942915392
EVE {'correct': 29, 'incorrect': 6, 'partial': 0, 'missed': 8, 'spurious': 10, 'possible': 43, 'actual': 45, 'precision': 0.6444444444444445, 'recall': 0.6744186046511628, 'f1': 0.6590909090909091}
GEP {'correct': 399, 'incorrect': 241, 'partial': 0, 'missed': 7, 'spurious': 94, 'possible': 647, 'actual': 734, 'precision': 0.5435967302452316, 'recall': 0.616692426584235, 'f1': 0.5778421433743665}
LOC {'correct': 80, 'incorrect': 20, 'partial': 0, 'missed': 0, 'spurious': 25, 'possible': 100, 'actual': 125, 'precision': 0.64, 'recall': 0.8, 'f1': 0.7111111111111111}
MUU {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 0, 'actual': 0, 'precision':

In [None]:
# Legacy evaluation of the est-roberta "Combined" checkpoint. model_name and
# trained_on are both given, so test_model also calls evaluation_to_json.
test_model('results/models/est-roberta/Combined/trained_for_3/checkpoint-5634', model_name='est-roberta', trained_on='Combined')

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

est-roberta
EstNER_new
Hindan testandmestikul..



Nervaluate tulemused
Strict {'correct': 1058, 'incorrect': 365, 'partial': 0, 'missed': 301, 'spurious': 74, 'possible': 1724, 'actual': 1497, 'precision': 0.706746826987308, 'recall': 0.6136890951276102, 'f1': 0.6569388388699162}
precision 0.706746826987308
recall 0.6136890951276102
f1 0.6569388388699162
EVE {'correct': 14, 'incorrect': 10, 'partial': 0, 'missed': 31, 'spurious': 5, 'possible': 55, 'actual': 29, 'precision': 0.4827586206896552, 'recall': 0.2545454545454545, 'f1': 0.33333333333333337}
GEP {'correct': 185, 'incorrect': 175, 'partial': 0, 'missed': 4, 'spurious': 2, 'possible': 364, 'actual': 362, 'precision': 0.511049723756906, 'recall': 0.5082417582417582, 'f1': 0.509641873278237}
LOC {'correct': 48, 'incorrect': 30, 'partial': 0, 'missed': 4, 'spurious': 4, 'possible': 82, 'actual': 82, 'precision': 0.5853658536585366, 'recall': 0.5853658536585366, 'f1': 0.5853658536585366}
MUU {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 0, 'a


Nervaluate tulemused
Strict {'correct': 2481, 'incorrect': 583, 'partial': 0, 'missed': 286, 'spurious': 340, 'possible': 3350, 'actual': 3404, 'precision': 0.7288484136310224, 'recall': 0.7405970149253731, 'f1': 0.7346757477050636}
precision 0.7288484136310224
recall 0.7405970149253731
f1 0.7346757477050636
EVE {'correct': 30, 'incorrect': 2, 'partial': 0, 'missed': 6, 'spurious': 13, 'possible': 38, 'actual': 45, 'precision': 0.6666666666666666, 'recall': 0.7894736842105263, 'f1': 0.7228915662650601}
GEP {'correct': 423, 'incorrect': 253, 'partial': 0, 'missed': 2, 'spurious': 58, 'possible': 678, 'actual': 734, 'precision': 0.5762942779291553, 'recall': 0.6238938053097345, 'f1': 0.5991501416430595}
LOC {'correct': 107, 'incorrect': 16, 'partial': 0, 'missed': 2, 'spurious': 27, 'possible': 125, 'actual': 150, 'precision': 0.7133333333333334, 'recall': 0.856, 'f1': 0.7781818181818182}
MUU {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'possible': 0, 'actual

**Algne testimine**

In [None]:
# Load the EstNER_new test file as a sentence-level Dataset (id, tokens, tags).
estner_new = load_test_data('data/estner/EstNER_new_test.json')

In [None]:
# Load the EstNER_v1 test file as a sentence-level Dataset (id, tokens, tags).
estner_v1 = load_test_data('data/estner/EstNER_v1_test.json')

In [None]:
# Load the token-classification model from the EstRoBERTa "combined" checkpoint
# and build a BERTDataProcessor from the same path.
# NOTE(review): AutoModelForTokenClassification and BERTDataProcessor are not
# imported in the cells shown here - confirm they are in scope on a fresh kernel.
model = AutoModelForTokenClassification.from_pretrained('results/EstRoBERTa/combined/checkpoint-11268')
bert_processor = BERTDataProcessor('results/EstRoBERTa/combined/checkpoint-11268')

# Alternative: the "edt" checkpoint; uncomment to evaluate it instead.
# model = AutoModelForTokenClassification.from_pretrained('results/EstRoBERTa/edt/checkpoint-9228')
# bert_processor = BERTDataProcessor('results/EstRoBERTa/edt/checkpoint-9228')

In [None]:
# Mapping from tag string to the label id used by the loaded model.
label2id = model.config.label2id
def convert_to_model_id(example):
  """Replace the string tags of one example with the model's label ids.

  Tags that are missing from ``label2id`` become ``None`` (``dict.get``).
  """
  #example['tags'] = [IDX2TAG[tag] for tag in example['tags']]
  example['tags'] = [label2id.get(tag) for tag in example['tags']]
  return example

# NOTE: these lines rebind estner_new / estner_v1, so re-running this cell
# applies the conversion again to data that has already been converted.
estner_new = estner_new.map(convert_to_model_id)
estner_v1 = estner_v1.map(convert_to_model_id)

# Tokenize both datasets with the processor built from the checkpoint.
estner_new = bert_processor.tokenize_dataset(estner_new)
estner_v1 = bert_processor.tokenize_dataset(estner_v1)

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

In [None]:
# Label names ordered by their integer id, as stored in the model config.
tags = [model.config.id2label[i] for i in range(len(model.config.id2label))]
evaluator = BERTEvaluator(all_tags=tags)

# The Trainer is handed to evaluator.evaluate_and_print in the following cells;
# no train() call appears in the cells shown here. report_to='none' is passed
# so that no reporting integration is requested.
training_args = TrainingArguments(
            report_to='none',
            output_dir='./results',
            #per_device_eval_batch_size=16
        )
trainer = Trainer(
            model=model,
            processing_class=bert_processor.tokenizer,
            data_collator=bert_processor.data_collator,
            compute_metrics=evaluator.compute_metrics,
            args=training_args
        )

In [None]:
# Evaluate on the tokenized EstNER_new data; the scores are printed and kept in estner_new_results.
estner_new_results = evaluator.evaluate_and_print(estner_new, trainer)

Hindan testandmestikul..


Seqeval tulemused
EVE {'precision': 0.40625, 'recall': 0.21666666666666667, 'f1': 0.28260869565217395, 'number': 60}
GEP {'precision': 0.8587786259541985, 'recall': 0.5421686746987951, 'f1': 0.6646971935007386, 'number': 415}
LOC {'precision': 0.27014218009478674, 'recall': 0.5876288659793815, 'f1': 0.37012987012987014, 'number': 97}
ORG {'precision': 0.7635135135135135, 'recall': 0.6260387811634349, 'f1': 0.6879756468797565, 'number': 361}
PER {'precision': 0.9116541353383458, 'recall': 0.9133709981167608, 'f1': 0.9125117591721543, 'number': 531}
PROD {'precision': 0.46534653465346537, 'recall': 0.2596685082872928, 'f1': 0.3333333333333333, 'number': 362}
overall_precision 0.7166123778501629
overall_recall 0.6024096385542169
overall_f1 0.6545670931270455
overall_accuracy 0.9601488602884168

Nervaluate tulemused
Strict {'correct': 1100, 'incorrect': 384, 'partial': 0, 'missed': 342, 'spurious': 93, 'possible': 1826, 'actual': 1577, 'precision': 0.6975269499048827, 'recall': 0.602409638

In [None]:
# Evaluate on the tokenized EstNER_v1 data; the scores are printed and kept in estner_v1_results.
estner_v1_results = evaluator.evaluate_and_print(estner_v1, trainer)

Hindan testandmestikul..


  _warn_prf(average, modifier, msg_start, len(result))


Seqeval tulemused
EVE {'precision': 0.5079365079365079, 'recall': 0.7441860465116279, 'f1': 0.6037735849056604, 'number': 43}
GEP {'precision': 0.7767441860465116, 'recall': 0.6065375302663438, 'f1': 0.681169272603671, 'number': 826}
LOC {'precision': 0.2528473804100228, 'recall': 0.7602739726027398, 'f1': 0.3794871794871796, 'number': 146}
ORG {'precision': 0.7531486146095718, 'recall': 0.4991652754590985, 'f1': 0.6004016064257028, 'number': 1198}
PER {'precision': 0.8932360742705571, 'recall': 0.9459269662921348, 'f1': 0.9188267394270122, 'number': 1424}
PROD {'precision': 0.1509433962264151, 'recall': 0.27586206896551724, 'f1': 0.1951219512195122, 'number': 145}
UNK {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
overall_precision 0.7071005917159763
overall_recall 0.695134849286092
overall_f1 0.7010666666666667
overall_accuracy 0.9600015273879756

Nervaluate tulemused
Strict {'correct': 2629, 'incorrect': 696, 'partial': 0, 'missed': 457, 'spurious': 446, 'possible': 3782,