In [47]:
from datasets import load_dataset, Dataset
try:
    from modules.eval_functions import get_supported_entities, predict_tags, write_to_tsv, evaluation_to_json
except ImportError:
    # Only a failed import triggers the fallback; a bare `except:` would also
    # swallow KeyboardInterrupt and unrelated errors and silently change the
    # working directory.
    # Presumably the notebook was started from a subfolder of the project, so
    # the `modules` package lives one directory up — step out and retry.
    import os
    print(f"Ei leidnud modules kausta, liigun {os.getcwd()} kaustast välja")
    os.chdir('..')
    from modules.eval_functions import get_supported_entities, predict_tags, write_to_tsv, evaluation_to_json
from nervaluate import Evaluator

In [34]:
def load_test_data(data_path):
    """Load the test split of a supported ``tartuNLP/EstNER`` configuration.

    Args:
        data_path: Configuration name, ``'estner-new'`` or
            ``'estner-reannotated'``. Matching is case-insensitive.

    Returns:
        The ``'test'`` split of the loaded dataset, or ``None`` (after
        printing a hint) when ``data_path`` is not a supported configuration.
    """
    supported_configs = ("estner-new", "estner-reannotated")

    # Normalise once and reuse the normalised value: the membership check is
    # case-insensitive, so the loader must receive the canonical lowercase
    # name rather than the caller's raw spelling.
    config_name = data_path.lower()
    if config_name not in supported_configs:
        print("load_test_data toetatud sisendid on 'estner-new' ja 'estner-reannotated'")
        return None

    return load_dataset("tartuNLP/EstNER", config_name)['test']

In [67]:
def _convert_tag(tag, supported_entities):
    """Map a single gold tag onto the entity types in ``supported_entities``."""
    if tag == 'O':
        return tag

    # Tags may or may not carry a BIO prefix; an empty prefix keeps a bare
    # tag bare instead of producing the literal string "NoneLOC".
    has_prefix = tag.startswith(('B-', 'I-'))
    prefix = tag[:2] if has_prefix else ''
    entity_type = tag[2:] if has_prefix else tag

    if entity_type in supported_entities:
        return tag
    # An unsupported GPE is folded into LOC, but only when LOC itself is
    # supported; otherwise the result would again be an unsupported tag.
    if entity_type == 'GPE' and 'LOC' in supported_entities:
        return f"{prefix}LOC"
    return 'O'


def convert_tags_to_supported(data, supported_entities):
    """Add a ``'tags'`` column restricted to the supported entity types.

    Each item's ``'ner_tags'`` are converted tag by tag: ``'O'`` and tags whose
    entity type is in ``supported_entities`` are kept, an unsupported ``GPE``
    becomes ``LOC`` (keeping its ``B-``/``I-`` prefix) when ``LOC`` is
    supported, and every other tag becomes ``'O'``.

    Args:
        data: Iterable of dict-like items that each have a ``'ner_tags'``
            sequence and support ``.copy()``.
        supported_entities: Container of entity type names (without BIO
            prefix) used for membership tests.

    Returns:
        ``Dataset.from_list`` of shallow copies of the items, each with the
        converted tags stored under ``'tags'``. The input items are not
        modified.
    """
    converted = []

    for item in data:
        converted_item = item.copy()
        converted_item['tags'] = [
            _convert_tag(tag, supported_entities) for tag in item['ner_tags']
        ]
        converted.append(converted_item)

    return Dataset.from_list(converted)

In [85]:
def evaluate_model(data_path, model_path, model_name, trained_on, evaluated_on):
    """Evaluate a NER model on a test split and hand the results to the writers.

    The test data is loaded with ``load_test_data``, its gold tags are mapped
    onto the entity types returned by ``get_supported_entities``, predictions
    come from ``predict_tags`` and scoring is done with nervaluate's
    ``Evaluator``. The strict overall results and the per-tag results go to
    ``evaluation_to_json``; tokens, gold tags and predicted tags go to
    ``write_to_tsv``.

    Args:
        data_path: Dataset configuration name passed to ``load_test_data``.
        model_path: Model identifier passed to ``get_supported_entities`` and
            ``predict_tags``.
        model_name: Forwarded to ``evaluation_to_json`` and ``write_to_tsv``.
        trained_on: Forwarded to ``evaluation_to_json``.
        evaluated_on: Forwarded to ``evaluation_to_json`` and ``write_to_tsv``.

    Raises:
        AssertionError: If the number of predicted tag sequences differs from
            the number of gold tag sequences.
    """
    test_split = load_test_data(data_path)
    entity_types = get_supported_entities(model_path)
    prepared = convert_tags_to_supported(test_split, entity_types)

    gold_tags = [example['tags'] for example in prepared]
    predicted_tags = predict_tags(model_path, prepared)
    assert len(gold_tags) == len(predicted_tags)

    scorer = Evaluator(gold_tags, predicted_tags, tags=list(entity_types), loader='list')
    # evaluate() yields four values; the two index structures are not needed here.
    overall, per_tag, _, _ = scorer.evaluate()

    evaluation_to_json(overall['strict'], per_tag, model_name, trained_on, evaluated_on)

    token_rows = [example['tokens'] for example in prepared]
    write_to_tsv(token_rows, gold_tags, predicted_tags, model_name, evaluated_on)

In [89]:
# Evaluate EstBERT_NER_V2 on the 'estner-reannotated' test split.
evaluate_model(
    data_path='estner-reannotated',
    model_path='tartuNLP/EstBERT_NER_V2',
    model_name='EstBERT_NER_V2',
    trained_on='EstNER_V1+EstNER_new',
    evaluated_on='estner-reannotated',
)

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Salvestasin: results/EstBERT_NER_V2/eval_on_estner-reannotated_test_2025-07-05_15-27.json


In [90]:
# Evaluate EstBERT_NER_V2 on the 'estner-new' test split.
evaluate_model(
    data_path='estner-new',
    model_path='tartuNLP/EstBERT_NER_V2',
    model_name='EstBERT_NER_V2',
    trained_on='EstNER_V1+EstNER_new',
    evaluated_on='estner-new',
)

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Salvestasin: results/EstBERT_NER_V2/eval_on_estner-new_test_2025-07-05_15-28.json


Testimine

In [74]:
# Step-by-step run of the evaluation pipeline for manual inspection.
# NOTE(review): despite the `v1_` prefix, this loads the 'estner-reannotated' configuration.
v1_test = load_test_data("estner-reannotated")
# Entity types returned by get_supported_entities for this model; printed for inspection.
supported = get_supported_entities('tartuNLP/EstBERT_NER_V2')
print(supported)
# Adds a 'tags' column holding the gold tags mapped onto the supported entity types.
v1_converted = convert_tags_to_supported(v1_test, supported)

{'MONEY', 'DATE', 'PER', 'ORG', 'TIME', 'TITLE', 'EVENT', 'PERCENT', 'PROD', 'LOC', 'GPE'}


In [83]:
# Show the first converted example; 'tags' is the column added by convert_tags_to_supported.
v1_converted[0]

{'doc_id': '2babd3b4-e24c-4867-914a-5f8ae455be90',
 'sent_id': 0,
 'tokens': ['Slovakkia', 'peab', 'Ungari', 'argumente', 'alusetuks'],
 'ner_tags': ['B-GPE', 'O', 'B-GPE', 'O', 'O'],
 'ner_tags_2': ['O', 'O', 'O', 'O', 'O'],
 'ner_tags_3': ['O', 'O', 'O', 'O', 'O'],
 'tags': ['B-GPE', 'O', 'B-GPE', 'O', 'O']}

In [77]:
# Get the model's predicted tag sequences for the converted test set.
pred_tags = predict_tags('tartuNLP/EstBERT_NER_V2', v1_converted)

model.safetensors:  13%|#2        | 62.9M/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/410k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [53]:
# Show the predicted tags for the first sentence.
# NOTE(review): this cell's execution count (53) is lower than that of the
# prediction cell (77), so the stored output may come from an earlier run —
# re-run after predicting to confirm.
pred_tags[0]

['B-LOC', 'O', 'B-LOC', 'O', 'O']

In [78]:
# Gold tag sequences, one list per example, taken from the converted 'tags' column.
true_tags = [item['tags'] for item in v1_converted]
# Sanity check: there must be one predicted sequence per gold sequence.
assert len(true_tags) == len(pred_tags)

In [79]:
# Score the predictions against the gold tags with nervaluate, restricted to the supported entity types.
evaluator = Evaluator(true_tags, pred_tags, tags=list(supported), loader='list')
# evaluate() is unpacked into overall results, per-tag results and two index structures.
results, results_by_tag, result_indices, result_indices_by_tag = evaluator.evaluate()

In [82]:
# Report the strict-match F1 per entity type (each followed by a blank line),
# then the overall strict-match summary.
for tag, scores in results_by_tag.items():
    print(tag, scores['strict']['f1'], end='\n\n')

print(results['strict'])

MONEY 0.7058823529411764

DATE 0.706199460916442

PER 0.9011432414256892

ORG 0.704119850187266

TIME 0.5573770491803278

TITLE 0.7954022988505747

EVENT 0.28571428571428575

PERCENT 0.94017094017094

PROD 0.5285714285714286

LOC 0.5528455284552846

GPE 0.8680000000000001

{'correct': 2028, 'incorrect': 219, 'partial': 0, 'missed': 275, 'spurious': 340, 'possible': 2522, 'actual': 2587, 'precision': 0.7839195979899497, 'recall': 0.8041237113402062, 'f1': 0.7938931297709924}
