In [None]:
# Colabis kasutamiseks
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/thesis/')

!pip install estnltk==1.7.4
!pip install evaluate
!pip install seqeval
!pip install nervaluate

In [None]:
from modules.data_processing import DatasetProcessor
from modules.bert_data_processing import BERTDataProcessor
from modules.bert_evaluator import BERTEvaluator
# Short module-level aliases for the tag constants stored on the DatasetProcessor class.
IDX2TAG = DatasetProcessor.IDX2TAG
TAG2IDX = DatasetProcessor.TAG2IDX
ALL_TAGS = DatasetProcessor.ALL_TAGS
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import concatenate_datasets

In [3]:
def create_combined_test(ewt_test, edt_test):
  """Concatenate the EWT and EDT test sets under one continuous 'id' numbering.

  EWT examples receive their position as 'id'; EDT examples receive their
  position shifted by len(ewt_test), so ids do not collide after concatenation.

  Args:
    ewt_test: EWT test dataset (must support .map and len()).
    edt_test: EDT test dataset (must support .map).

  Returns:
    The result of concatenate_datasets on the two renumbered datasets, EWT first.
  """
  def renumber(dataset, start):
    # Overwrite 'id' with the example's index offset by `start`.
    return dataset.map(lambda example, idx: {'id': idx + start}, with_indices=True)

  renumbered_parts = [renumber(ewt_test, 0), renumber(edt_test, len(ewt_test))]
  return concatenate_datasets(renumbered_parts)

In [None]:
def evaluate_pretrained(model_name):
  """Evaluate a pretrained token-classification checkpoint on EWT, EDT and their combination.

  The gold tags of each test set are translated from dataset tag ids to the
  checkpoint's own label ids (labels the checkpoint does not know fall back to
  'O'), the data is tokenized, and each set is scored through a Trainer. For
  every set the results are printed and handed to evaluator.evaluation_to_json.

  Args:
    model_name: Identifier passed to AutoModelForTokenClassification.from_pretrained
      and BERTDataProcessor, e.g. 'tartuNLP/EstBERT_NER'.

  Returns:
    None.
  """
  model = AutoModelForTokenClassification.from_pretrained(model_name)
  bert_processor = BERTDataProcessor(model_name)
  label2id = model.config.label2id

  # Use the last path component: identical to the old `[1]` for 'org/name' ids,
  # but does not raise IndexError when there is no '/' in the identifier.
  short_name = model_name.rstrip('/').split('/')[-1]

  ewt_test = DatasetProcessor('ewt', from_json=True).test
  edt_test = DatasetProcessor('edt', from_json=True).test
  combined_test = create_combined_test(ewt_test, edt_test)

  # Work on a private copy so the V2 overrides below never leak into the
  # module-level IDX2TAG and change the outcome of later calls in the same kernel.
  idx2tag = IDX2TAG.copy()
  if short_name.lower() == 'estbert_ner_v2':
    # For the V2 checkpoint, dataset tag ids 15/7 and 16/8 are scored as GPE and EVENT.
    idx2tag[15] = 'B-GPE'
    idx2tag[7] = 'I-GPE'
    idx2tag[16] = 'B-EVENT'
    idx2tag[8] = 'I-EVENT'

  outside_id = label2id['O']

  def convert_to_model_id(example):
    # Dataset tag id -> tag string -> checkpoint label id ('O' if the label is unknown).
    example['tags'] = [label2id.get(idx2tag[tag], outside_id) for tag in example['tags']]
    return example

  # (evaluated_on value, printed heading, dataset)
  splits = [
      ('EWT', 'EWT testandmestikul', ewt_test),
      ('EDT', 'EDT testandmestikul', edt_test),
      ('Combined', 'Kombineeritud testandmestikul', combined_test),
  ]
  # Convert every split first and tokenize afterwards, as two separate passes.
  splits = [(name, heading, dataset.map(convert_to_model_id)) for name, heading, dataset in splits]
  splits = [(name, heading, bert_processor.tokenize_dataset(dataset)) for name, heading, dataset in splits]

  id2label = model.config.id2label
  tags = [id2label[i] for i in range(len(id2label))]
  # Strip the two-character 'B-'/'I-' prefix; sorted so the entity order is the
  # same on every run (plain set iteration order of strings is not).
  ner_tags = sorted({label[2:] for label in tags if label != 'O'})
  evaluator = BERTEvaluator(all_tags=tags, ner_tags=ner_tags)

  training_args = TrainingArguments(
      report_to='none',
      output_dir='./results',
  )
  trainer = Trainer(
      model=model,
      processing_class=bert_processor.tokenizer,
      data_collator=bert_processor.data_collator,
      compute_metrics=evaluator.compute_metrics,
      args=training_args,
  )

  for position, (evaluated_on, heading, dataset) in enumerate(splits):
    if position:
      # Blank line between the reports of consecutive splits.
      print()
    print(f'{model_name} {heading}')
    results = evaluator.evaluate_and_print(dataset, trainer)
    evaluator.evaluation_to_json(
        nervaluate_strict_overall=results[1]['strict'],
        nervaluate_by_tag=results[2],
        model_name=short_name,
        trained_on=None,
        evaluated_on=evaluated_on,
    )


In [5]:
# Evaluate the tartuNLP/EstBERT_NER checkpoint on the EWT, EDT and combined test sets.
evaluate_pretrained('tartuNLP/EstBERT_NER')

config.json:   0%|          | 0.00/814 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at tartuNLP/EstBERT_NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/410k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/913 [00:00<?, ? examples/s]

Map:   0%|          | 0/3207 [00:00<?, ? examples/s]

Map:   0%|          | 0/913 [00:00<?, ? examples/s]

Map:   0%|          | 0/3207 [00:00<?, ? examples/s]

Map:   0%|          | 0/4120 [00:00<?, ? examples/s]

Map:   0%|          | 0/913 [00:00<?, ? examples/s]

Map:   0%|          | 0/3207 [00:00<?, ? examples/s]

Map:   0%|          | 0/4120 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

tartuNLP/EstBERT_NER EWT testandmestikul
Hindan testandmestikul..


Seqeval tulemused
LOC {'precision': 0.20689655172413793, 'recall': 0.32727272727272727, 'f1': 0.2535211267605634, 'number': 55}
ORG {'precision': 0.27884615384615385, 'recall': 0.12554112554112554, 'f1': 0.17313432835820897, 'number': 231}
PER {'precision': 0.7008086253369272, 'recall': 0.2374429223744292, 'f1': 0.35470668485675305, 'number': 1095}
overall_precision 0.5462633451957295
overall_recall 0.222302679217958
overall_f1 0.31600617601646935
overall_accuracy 0.9305341331048272

Nervaluate tulemused
Strict {'correct': 307, 'incorrect': 253, 'partial': 0, 'missed': 821, 'spurious': 118, 'possible': 1381, 'actual': 678, 'precision': 0.4528023598820059, 'recall': 0.222302679217958, 'f1': 0.2982030111704711}
precision 0.4528023598820059
recall 0.222302679217958
f1 0.2982030111704711
LOC {'correct': 18, 'incorrect': 15, 'partial': 0, 'missed': 22, 'spurious': 47, 'possible': 55, 'actual': 80, 'precision': 0.225, 'recall': 0.32727272727272727, 'f1': 0.26666666666666666}
ORG {'correct': 

Seqeval tulemused
LOC {'precision': 0.3459715639810427, 'recall': 0.4965986394557823, 'f1': 0.40782122905027934, 'number': 441}
ORG {'precision': 0.3333333333333333, 'recall': 0.23593964334705075, 'f1': 0.2763052208835341, 'number': 729}
PER {'precision': 0.6501672240802676, 'recall': 0.41770519982810483, 'f1': 0.5086342229199372, 'number': 2327}
overall_precision 0.5155068078668684
overall_recall 0.38976265370317414
overall_f1 0.44390164468327636
overall_accuracy 0.954740330103491

Nervaluate tulemused
Strict {'correct': 1363, 'incorrect': 1136, 'partial': 0, 'missed': 998, 'spurious': 698, 'possible': 3497, 'actual': 3197, 'precision': 0.4263371911166719, 'recall': 0.38976265370317414, 'f1': 0.4072303555422767}
precision 0.4263371911166719
recall 0.38976265370317414
f1 0.4072303555422767
LOC {'correct': 219, 'incorrect': 139, 'partial': 0, 'missed': 83, 'spurious': 330, 'possible': 441, 'actual': 688, 'precision': 0.3183139534883721, 'recall': 0.4965986394557823, 'f1': 0.387953941541

Seqeval tulemused
LOC {'precision': 0.32916666666666666, 'recall': 0.4778225806451613, 'f1': 0.38980263157894735, 'number': 496}
ORG {'precision': 0.3241935483870968, 'recall': 0.209375, 'f1': 0.25443037974683547, 'number': 960}
PER {'precision': 0.6602357984994641, 'recall': 0.36002337814143776, 'f1': 0.4659606656580938, 'number': 3422}
overall_precision 0.5208983156581409
overall_recall 0.34235342353423537
overall_f1 0.41316180108857004
overall_accuracy 0.9495208297405829

Nervaluate tulemused
Strict {'correct': 1670, 'incorrect': 1389, 'partial': 0, 'missed': 1819, 'spurious': 816, 'possible': 4878, 'actual': 3875, 'precision': 0.4309677419354839, 'recall': 0.34235342353423537, 'f1': 0.38158345710042274}
precision 0.4309677419354839
recall 0.34235342353423537
f1 0.38158345710042274
LOC {'correct': 237, 'incorrect': 154, 'partial': 0, 'missed': 105, 'spurious': 377, 'possible': 496, 'actual': 768, 'precision': 0.30859375, 'recall': 0.4778225806451613, 'f1': 0.37500000000000006}
ORG {

In [6]:
# Evaluate the tartuNLP/EstBERT_NER_V2 checkpoint on the same three test sets;
# for this checkpoint evaluate_pretrained applies its GPE/EVENT tag remapping.
evaluate_pretrained('tartuNLP/EstBERT_NER_V2')

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/410k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/913 [00:00<?, ? examples/s]

Map:   0%|          | 0/3207 [00:00<?, ? examples/s]

Map:   0%|          | 0/913 [00:00<?, ? examples/s]

Map:   0%|          | 0/3207 [00:00<?, ? examples/s]

Map:   0%|          | 0/4120 [00:00<?, ? examples/s]

Map:   0%|          | 0/913 [00:00<?, ? examples/s]

Map:   0%|          | 0/3207 [00:00<?, ? examples/s]

Map:   0%|          | 0/4120 [00:00<?, ? examples/s]

tartuNLP/EstBERT_NER_V2 EWT testandmestikul
Hindan testandmestikul..


  _warn_prf(average, modifier, msg_start, len(result))


Seqeval tulemused
DATE {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
EVENT {'precision': 0.11764705882352941, 'recall': 0.07017543859649122, 'f1': 0.0879120879120879, 'number': 57}
GPE {'precision': 0.22105263157894736, 'recall': 0.38181818181818183, 'f1': 0.27999999999999997, 'number': 55}
LOC {'precision': 0.14285714285714285, 'recall': 0.09090909090909091, 'f1': 0.1111111111111111, 'number': 55}
MONEY {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
ORG {'precision': 0.13690476190476192, 'recall': 0.09956709956709957, 'f1': 0.11528822055137845, 'number': 231}
PER {'precision': 0.5123809523809524, 'recall': 0.245662100456621, 'f1': 0.33209876543209876, 'number': 1095}
PERCENT {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
PROD {'precision': 0.22123893805309736, 'recall': 0.06906077348066299, 'f1': 0.10526315789473685, 'number': 362}
TIME {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
TITLE {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number

Seqeval tulemused
DATE {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
EVENT {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 84}
GPE {'precision': 0.25, 'recall': 0.19010416666666666, 'f1': 0.21597633136094677, 'number': 384}
LOC {'precision': 0.36619718309859156, 'recall': 0.11791383219954649, 'f1': 0.1783876500857633, 'number': 441}
MONEY {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
ORG {'precision': 0.23769100169779286, 'recall': 0.19204389574759945, 'f1': 0.21244309559939303, 'number': 729}
PER {'precision': 0.48569612205975843, 'recall': 0.3283197249677697, 'f1': 0.3917948717948718, 'number': 2327}
PERCENT {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
PROD {'precision': 0.1884498480243161, 'recall': 0.10508474576271186, 'f1': 0.13492927094668117, 'number': 590}
TIME {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
TITLE {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
overall_precision 0.2868787799105969
overall_recall 0.

Seqeval tulemused
DATE {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
EVENT {'precision': 0.05194805194805195, 'recall': 0.028368794326241134, 'f1': 0.03669724770642202, 'number': 141}
GPE {'precision': 0.24289405684754523, 'recall': 0.214123006833713, 'f1': 0.22760290556900728, 'number': 439}
LOC {'precision': 0.3220338983050847, 'recall': 0.11491935483870967, 'f1': 0.16939078751857353, 'number': 496}
MONEY {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
ORG {'precision': 0.21532364597093792, 'recall': 0.16979166666666667, 'f1': 0.18986604542807223, 'number': 960}
PER {'precision': 0.49237368922783603, 'recall': 0.30187025131502043, 'f1': 0.3742753623188405, 'number': 3422}
PERCENT {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
PROD {'precision': 0.19683257918552036, 'recall': 0.09138655462184873, 'f1': 0.12482065997130559, 'number': 952}
TIME {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}
TITLE {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'n