In [None]:
# For use in Google Colab: mount Google Drive and make the thesis folder the
# working directory, so the relative paths used later in the notebook
# ('data/...', 'results/...', the local 'modules' package) resolve.
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/thesis/')

# Install the notebook's dependencies into the Colab runtime.
# NOTE(review): only estnltk is version-pinned; evaluate, seqeval and nervaluate
# install whatever version is current, which may affect reproducibility.
!pip install estnltk==1.7.4
!pip install evaluate
!pip install seqeval
!pip install nervaluate

In [None]:
from modules.data_processing import DatasetProcessor
from modules.bert_data_processing import BERTDataProcessor
from modules.bert_evaluator import BERTEvaluator
# Notebook-level aliases for the tag tables exposed by DatasetProcessor.
# ALL_TAGS is used by load_test_data to decide which gold tags are kept;
# TAG2IDX / IDX2TAG are presumably the tag<->index lookups (only referenced
# from commented-out code in this notebook) -- TODO confirm.
ALL_TAGS = DatasetProcessor.ALL_TAGS
TAG2IDX = DatasetProcessor.TAG2IDX
IDX2TAG = DatasetProcessor.IDX2TAG
from transformers import BertForTokenClassification, Trainer, TrainingArguments, AutoModelForTokenClassification
import json
from datasets import Dataset

In [3]:
def _normalize_tag(raw_tag, known_tags):
  """Return the upper-cased tag to use for ``raw_tag``, or ``'O'`` if unknown.

  The tag is upper-cased first, then the EstNER spellings ``GPE`` / ``EVENT``
  are renamed to ``GEP`` / ``EVE``, and only then is it checked against
  ``known_tags`` (an upper-cased set). Non-string values (e.g. a JSON null)
  are treated as ``'O'``.
  """
  if not isinstance(raw_tag, str):
    return 'O'

  aliases = {
    'B-GPE': 'B-GEP',
    'I-GPE': 'I-GEP',
    'B-EVENT': 'B-EVE',
    'I-EVENT': 'I-EVE',
  }
  tag = raw_tag.upper()
  tag = aliases.get(tag, tag)
  return tag if tag in known_tags else 'O'


def load_test_data(path):
  """Read an EstNER-style JSON file into a sentence-level ``Dataset``.

  The file is expected to have the structure
  ``{'documents': [{'sentences': [{'words': [{'word': ..., 'ner_1': ...}]}]}]}``.

  Args:
    path: Path of the UTF-8 encoded JSON file.

  Returns:
    A ``Dataset`` with one row per sentence and the columns ``id`` (running
    sentence index starting at 0), ``tokens`` (list of word strings) and
    ``tags`` (list of upper-cased tag strings; tags not present in
    ``ALL_TAGS`` are replaced by ``'O'``).

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
    KeyError: If an expected key is missing from the JSON structure.
  """
  # Only the parsing needs the file handle; release it before transforming.
  with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  # Compare case-insensitively: the original code upper-cased tags only after
  # the membership check, so differently-cased tags were always turned into 'O'.
  known_tags = {tag.upper() for tag in ALL_TAGS}

  ids, tokens, tags = [], [], []
  for document in data['documents']:
    for sentence in document['sentences']:
      words = sentence['words']
      ids.append(len(ids))  # running sentence index across all documents
      tokens.append([word['word'] for word in words])
      tags.append([_normalize_tag(word['ner_1'], known_tags) for word in words])

  return Dataset.from_dict({'id': ids, 'tokens': tokens, 'tags': tags})

In [4]:
def test_model(model_path, model_name=None, trained_on=None):
  """Evaluate a token-classification checkpoint on both EstNER test files.

  Loads ``data/estner/EstNER_new_test.json`` and
  ``data/estner/EstNER_v1_test.json``, converts the string tags to the label
  ids of the model found at ``model_path``, tokenizes the data with
  ``BERTDataProcessor`` and runs ``BERTEvaluator.evaluate_and_print`` on each
  set through a ``Trainer``.

  Args:
    model_path: Checkpoint directory (or model id) passed to
      ``AutoModelForTokenClassification.from_pretrained`` and
      ``BERTDataProcessor``.
    model_name: Optional display name; printed before the results when given.
    trained_on: Optional name of the training data. Results are exported via
      ``evaluator.evaluation_to_json`` only when both ``model_name`` and
      ``trained_on`` are truthy.

  Returns:
    None. Results are printed and, optionally, exported by the evaluator.
  """
  test_sets = {
    'EstNER_new': load_test_data('data/estner/EstNER_new_test.json'),
    'EstNER_v1': load_test_data('data/estner/EstNER_v1_test.json'),
  }

  model = AutoModelForTokenClassification.from_pretrained(model_path)
  bert_processor = BERTDataProcessor(model_path)

  label2id = model.config.label2id
  # Gold tags the model has no label for fall back to the model's 'O' id
  # instead of silently becoming None (same policy load_test_data applies to
  # tags outside ALL_TAGS). If the model has no 'O' label the lookup still
  # yields None, exactly as before.
  outside_id = label2id.get('O')

  def convert_to_model_id(example):
    """Replace the string tags of one example with the model's label ids."""
    example['tags'] = [label2id.get(tag, outside_id) for tag in example['tags']]
    return example

  # Each stage is applied to every set before the next stage starts, keeping
  # the original processing (and progress-bar) order.
  test_sets = {name: ds.map(convert_to_model_id) for name, ds in test_sets.items()}
  test_sets = {name: bert_processor.tokenize_dataset(ds) for name, ds in test_sets.items()}

  # Label names ordered by their id in the model config.
  tags = [model.config.id2label[i] for i in range(len(model.config.id2label))]
  evaluator = BERTEvaluator(all_tags=tags)

  training_args = TrainingArguments(
    report_to='none',
    output_dir='./results',
  )
  trainer = Trainer(
    model=model,
    processing_class=bert_processor.tokenizer,
    data_collator=bert_processor.data_collator,
    compute_metrics=evaluator.compute_metrics,
    args=training_args,
  )

  if model_name:
    print(f'{model_name}')

  results = {}
  for name, dataset in test_sets.items():
    print(name)
    results[name] = evaluator.evaluate_and_print(dataset, trainer)

  # Export only after every evaluation has finished, as in the original flow.
  if model_name and trained_on:
    for name, result in results.items():
      # result[1]['strict'] / result[2] are presumably the nervaluate strict
      # scores and a second result table -- TODO confirm against BERTEvaluator.
      evaluator.evaluation_to_json(result[1]['strict'], result[2], model_name, trained_on, name)

In [5]:
# Evaluate the EstBERT checkpoint; because model_name and trained_on are both
# given, test_model also exports the results through evaluation_to_json.
test_model('results/EstBERT/combined/checkpoint-9390', model_name='EstBERT', trained_on='Combined')

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

EstBERT
EstNER_new
Hindan testandmestikul..


  _warn_prf(average, modifier, msg_start, len(result))


Seqeval tulemused
EVE {'precision': 0.3333333333333333, 'recall': 0.32608695652173914, 'f1': 0.32967032967032966, 'number': 46}
GEP {'precision': 0.7801047120418848, 'recall': 0.42939481268011526, 'f1': 0.553903345724907, 'number': 347}
LOC {'precision': 0.2100456621004566, 'recall': 0.6571428571428571, 'f1': 0.31833910034602075, 'number': 70}
MUU {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
ORG {'precision': 0.7841726618705036, 'recall': 0.6038781163434903, 'f1': 0.6823161189358372, 'number': 361}
PER {'precision': 0.8384458077709611, 'recall': 0.8471074380165289, 'f1': 0.8427543679342241, 'number': 484}
PROD {'precision': 0.4585365853658537, 'recall': 0.22541966426858512, 'f1': 0.30225080385852093, 'number': 417}
UNK {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
overall_precision 0.6512928022361985
overall_recall 0.5402898550724637
overall_f1 0.5906210392902409
overall_accuracy 0.9410975861357541

Nervaluate tulemused
Strict {'correct': 932, 'incorrect': 423,

Seqeval tulemused
EVE {'precision': 0.52, 'recall': 0.6046511627906976, 'f1': 0.5591397849462365, 'number': 43}
GEP {'precision': 0.7365145228215768, 'recall': 0.5486862442040186, 'f1': 0.6288751107174491, 'number': 647}
LOC {'precision': 0.20221606648199447, 'recall': 0.73, 'f1': 0.31670281995661603, 'number': 100}
ORG {'precision': 0.7249647390691114, 'recall': 0.5181451612903226, 'f1': 0.60435038212816, 'number': 992}
PER {'precision': 0.8679599499374218, 'recall': 0.9506511309115833, 'f1': 0.907425580634609, 'number': 1459}
PROD {'precision': 0.17857142857142858, 'recall': 0.25925925925925924, 'f1': 0.21148036253776437, 'number': 135}
UNK {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
overall_precision 0.7033549146556798
overall_recall 0.7079383886255924
overall_f1 0.7056392087392973
overall_accuracy 0.9526782171670414

Nervaluate tulemused
Strict {'correct': 2390, 'incorrect': 667, 'partial': 0, 'missed': 319, 'spurious': 411, 'possible': 3376, 'actual': 3468, 'precisio

In [6]:
# Evaluate the EstRoBERTa checkpoint; because model_name and trained_on are
# both given, test_model also exports the results through evaluation_to_json.
test_model('results/EstRoBERTa/combined/checkpoint-11268', model_name='EstRoBERTa', trained_on='Combined')

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

EstRoBERTa
EstNER_new
Hindan testandmestikul..


Seqeval tulemused
EVE {'precision': 0.40625, 'recall': 0.21666666666666667, 'f1': 0.28260869565217395, 'number': 60}
GEP {'precision': 0.8587786259541985, 'recall': 0.5421686746987951, 'f1': 0.6646971935007386, 'number': 415}
LOC {'precision': 0.27014218009478674, 'recall': 0.5876288659793815, 'f1': 0.37012987012987014, 'number': 97}
ORG {'precision': 0.7635135135135135, 'recall': 0.6260387811634349, 'f1': 0.6879756468797565, 'number': 361}
PER {'precision': 0.9116541353383458, 'recall': 0.9133709981167608, 'f1': 0.9125117591721543, 'number': 531}
PROD {'precision': 0.46534653465346537, 'recall': 0.2596685082872928, 'f1': 0.3333333333333333, 'number': 362}
overall_precision 0.7166123778501629
overall_recall 0.6024096385542169
overall_f1 0.6545670931270455
overall_accuracy 0.9601488602884168

Nervaluate tulemused
Strict {'correct': 1100, 'incorrect': 384, 'partial': 0, 'missed': 342, 'spurious': 93, 'possible': 1826, 'actual': 1577, 'precision': 0.6975269499048827, 'recall': 0.602409638

  _warn_prf(average, modifier, msg_start, len(result))


Seqeval tulemused
EVE {'precision': 0.5079365079365079, 'recall': 0.7441860465116279, 'f1': 0.6037735849056604, 'number': 43}
GEP {'precision': 0.7767441860465116, 'recall': 0.6065375302663438, 'f1': 0.681169272603671, 'number': 826}
LOC {'precision': 0.2528473804100228, 'recall': 0.7602739726027398, 'f1': 0.3794871794871796, 'number': 146}
ORG {'precision': 0.7531486146095718, 'recall': 0.4991652754590985, 'f1': 0.6004016064257028, 'number': 1198}
PER {'precision': 0.8932360742705571, 'recall': 0.9459269662921348, 'f1': 0.9188267394270122, 'number': 1424}
PROD {'precision': 0.1509433962264151, 'recall': 0.27586206896551724, 'f1': 0.1951219512195122, 'number': 145}
UNK {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
overall_precision 0.7071005917159763
overall_recall 0.695134849286092
overall_f1 0.7010666666666667
overall_accuracy 0.9600015273879756

Nervaluate tulemused
Strict {'correct': 2629, 'incorrect': 696, 'partial': 0, 'missed': 457, 'spurious': 446, 'possible': 3782,

**Algne testimine**

In [15]:
# Step-by-step variant of test_model: load the EstNER_new test file as a
# Dataset with 'id', 'tokens' and 'tags' columns.
estner_new = load_test_data('data/estner/EstNER_new_test.json')

In [16]:
# Load the EstNER_v1 test file as a Dataset with 'id', 'tokens' and 'tags' columns.
estner_v1 = load_test_data('data/estner/EstNER_v1_test.json')

In [17]:
# Load the token-classification model from the checkpoint directory and build
# a BERTDataProcessor from the same path, so both refer to the same checkpoint.
model = AutoModelForTokenClassification.from_pretrained('results/EstRoBERTa/combined/checkpoint-11268')
bert_processor = BERTDataProcessor('results/EstRoBERTa/combined/checkpoint-11268')

# Alternative checkpoint (disabled); swap with the two lines above to evaluate it.
# model = AutoModelForTokenClassification.from_pretrained('results/EstRoBERTa/edt/checkpoint-9228')
# bert_processor = BERTDataProcessor('results/EstRoBERTa/edt/checkpoint-9228')

In [18]:
# Translate the string tags into the label ids of the loaded model, then
# tokenize both test sets.
# NOTE: this cell rebinds estner_new / estner_v1 from themselves, so it is
# meant to run once after the loading cells; a second run would look up the
# already-converted ids in label2id.
label2id = model.config.label2id
def convert_to_model_id(example):
  """Swap each string tag of the example for its id in the model's label2id.

  Tags missing from label2id become None (dict.get semantics).
  """
  example['tags'] = list(map(label2id.get, example['tags']))
  return example

# Convert both sets first, then tokenize both (same order as before).
estner_new, estner_v1 = (ds.map(convert_to_model_id) for ds in (estner_new, estner_v1))

estner_new, estner_v1 = (bert_processor.tokenize_dataset(ds) for ds in (estner_new, estner_v1))

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Map:   0%|          | 0/1907 [00:00<?, ? examples/s]

In [19]:
# Label names ordered by their id in the model config; the evaluator is built
# from this list.
tags = [model.config.id2label[label_id] for label_id in range(len(model.config.id2label))]
evaluator = BERTEvaluator(all_tags=tags)

# The Trainer is only used for evaluation here: no external reporting, and
# './results' as its output directory.
training_args = TrainingArguments(output_dir='./results', report_to='none')
trainer = Trainer(
  model=model,
  args=training_args,
  data_collator=bert_processor.data_collator,
  processing_class=bert_processor.tokenizer,
  compute_metrics=evaluator.compute_metrics,
)

In [20]:
# Evaluate on the tokenized EstNER_new set; the evaluator prints the metrics
# and the returned value is kept in estner_new_results.
estner_new_results = evaluator.evaluate_and_print(estner_new, trainer)

Hindan testandmestikul..


Seqeval tulemused
EVE {'precision': 0.40625, 'recall': 0.21666666666666667, 'f1': 0.28260869565217395, 'number': 60}
GEP {'precision': 0.8587786259541985, 'recall': 0.5421686746987951, 'f1': 0.6646971935007386, 'number': 415}
LOC {'precision': 0.27014218009478674, 'recall': 0.5876288659793815, 'f1': 0.37012987012987014, 'number': 97}
ORG {'precision': 0.7635135135135135, 'recall': 0.6260387811634349, 'f1': 0.6879756468797565, 'number': 361}
PER {'precision': 0.9116541353383458, 'recall': 0.9133709981167608, 'f1': 0.9125117591721543, 'number': 531}
PROD {'precision': 0.46534653465346537, 'recall': 0.2596685082872928, 'f1': 0.3333333333333333, 'number': 362}
overall_precision 0.7166123778501629
overall_recall 0.6024096385542169
overall_f1 0.6545670931270455
overall_accuracy 0.9601488602884168

Nervaluate tulemused
Strict {'correct': 1100, 'incorrect': 384, 'partial': 0, 'missed': 342, 'spurious': 93, 'possible': 1826, 'actual': 1577, 'precision': 0.6975269499048827, 'recall': 0.602409638

In [21]:
# Evaluate on the tokenized EstNER_v1 set; the evaluator prints the metrics
# and the returned value is kept in estner_v1_results.
estner_v1_results = evaluator.evaluate_and_print(estner_v1, trainer)

Hindan testandmestikul..


  _warn_prf(average, modifier, msg_start, len(result))


Seqeval tulemused
EVE {'precision': 0.5079365079365079, 'recall': 0.7441860465116279, 'f1': 0.6037735849056604, 'number': 43}
GEP {'precision': 0.7767441860465116, 'recall': 0.6065375302663438, 'f1': 0.681169272603671, 'number': 826}
LOC {'precision': 0.2528473804100228, 'recall': 0.7602739726027398, 'f1': 0.3794871794871796, 'number': 146}
ORG {'precision': 0.7531486146095718, 'recall': 0.4991652754590985, 'f1': 0.6004016064257028, 'number': 1198}
PER {'precision': 0.8932360742705571, 'recall': 0.9459269662921348, 'f1': 0.9188267394270122, 'number': 1424}
PROD {'precision': 0.1509433962264151, 'recall': 0.27586206896551724, 'f1': 0.1951219512195122, 'number': 145}
UNK {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
overall_precision 0.7071005917159763
overall_recall 0.695134849286092
overall_f1 0.7010666666666667
overall_accuracy 0.9600015273879756

Nervaluate tulemused
Strict {'correct': 2629, 'incorrect': 696, 'partial': 0, 'missed': 457, 'spurious': 446, 'possible': 3782,