# Evaluation

In [2]:
from data_handling import write_igt, load_data_file
import random

# Load the four GenBench text categories.
story = load_data_file("../data/GenBench/categories/story")
advice = load_data_file("../data/GenBench/categories/advice")
history = load_data_file("../data/GenBench/categories/history")
personal = load_data_file("../data/GenBench/categories/personal")

# story + history form the in-distribution (ID) pool,
# advice + personal form the out-of-distribution (OOD) pool.
id_data = story + history
ood_data = advice + personal

# Fixed seed so the shuffles (and therefore the splits) are repeatable.
# The ID pool is shuffled first, then the OOD pool.
random.seed(1)
for _pool in (id_data, ood_data):
    random.shuffle(_pool)

# Half of the OOD pool goes to eval, the remainder to test.
count_ood = len(ood_data) // 2
eval_ood, test_ood = ood_data[:count_ood], ood_data[count_ood:]

# The ID eval split takes the same number of examples as the OOD eval split;
# everything left in the ID pool is training data.
eval_id, train = id_data[:count_ood], id_data[count_ood:]

# Write every split to disk.
for _split, _path in (
    (eval_ood, '../data/GenBench/eval_ood.txt'),
    (eval_id, '../data/GenBench/eval_id.txt'),
    (test_ood, '../data/GenBench/test_ood.txt'),
    (train, '../data/GenBench/train.txt'),
):
    write_igt(_split, _path)

In [3]:
from data_handling import create_vocab, prepare_dataset, create_gloss_vocab
from uspanteko_morphology import morphology
from tokenizer import WordLevelTokenizer
from datasets import DatasetDict

import torch

# Maximum sequence length passed to the tokenizer as model_max_length.
MODEL_INPUT_LENGTH = 64

# Pick the compute device instead of hard-coding Apple's Metal backend:
# prefer 'mps' when it is available (the original setting), otherwise fall
# back to CUDA and finally to the CPU so the notebook runs on other machines.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The morpheme vocabulary is built from the training split only.
train_vocab = create_vocab([line.morphemes() for line in train], threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=MODEL_INPUT_LENGTH)
glosses = create_gloss_vocab(morphology)

# Encode every split with the same tokenizer and gloss label set.
# Note that the 'test' split is the held-out OOD test data.
dataset = DatasetDict()
for _split_name, _split_rows in (
    ('train', train),
    ('dev', eval_id),
    ('dev_OOD', eval_ood),
    ('test', test_ood),
):
    dataset[_split_name] = prepare_dataset(data=_split_rows, tokenizer=tokenizer, labels=glosses, device=device)

  0%|          | 0/5049 [00:00<?, ?ex/s]

  0%|          | 0/2128 [00:00<?, ?ex/s]

  0%|          | 0/2128 [00:00<?, ?ex/s]

  0%|          | 0/2128 [00:00<?, ?ex/s]

In [5]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoModelForMaskedLM
import math

# Load the masked language model checkpoint whose perplexity is reported below.
model = AutoModelForMaskedLM.from_pretrained("../models/usp-mlm-absolute-micro")
BATCH_SIZE = 64  # also reused by the token-classification evaluation cell
EPOCHS = 50

# Collator for masked-LM batches, configured with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="pt")

# Only trainer.evaluate() is called in this cell; no training is started here.
args = TrainingArguments(
    output_dir="../training-checkpoints",
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=EPOCHS,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['dev'],
    data_collator=data_collator
)

# Perplexity is exp(eval_loss). Report it for the in-distribution dev split
# first and then for the out-of-distribution dev split.
for _label, _split_name in (("id", "dev"), ("ood", "dev_OOD")):
    eval_results = trainer.evaluate(dataset[_split_name])
    print(f"Perplexity ({_label}): {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: segmentation, transcription, glosses, translation, morphemes. If segmentation, transcription, glosses, translation, morphemes are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2128
  Batch size = 64


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mmichael-ginn[0m. Use [1m`wandb login --relogin`[0m to force relogin


The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: segmentation, transcription, glosses, translation, morphemes. If segmentation, transcription, glosses, translation, morphemes are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2128
  Batch size = 64


Perplexity (id): 77.78
Perplexity (ood): 94.03


In [17]:
from transformers import AutoModelForTokenClassification
from finetune_token_classifier import create_trainer

# Load the token-classification checkpoint from disk.
# This rebinds `model`, which previously held the masked-LM model.
model = AutoModelForTokenClassification.from_pretrained("../models/full-flat-1-1.0wd")

# BATCH_SIZE is the constant defined in the masked-LM evaluation cell.
trainer = create_trainer(
    model,
    dataset=dataset,
    tokenizer=tokenizer,
    labels=glosses,
    batch_size=BATCH_SIZE,
    max_epochs=300,
    weight_decay=0,
    report_to='wandb',
)

# Metrics on the in-distribution dev split; the returned dict is the cell output.
trainer.evaluate(dataset["dev"])

loading configuration file ../models/full-flat-1-1.0wd/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/full-flat-1-1.0wd",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABE

Creating trainer...


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


PREDS [[61  1 46 ... 43 43 43]
 [54  1 25 ... 39 39 39]
 [38  1 53 ... 39 39 39]
 ...
 [30 32  1 ... 43 43 43]
 [38  1 42 ... 43 43 43]
 [61  1 30 ... 43 43 43]]
LABELS [[  61    1   46 ... -100 -100 -100]
 [  54    1   25 ... -100 -100 -100]
 [  38    1   53 ... -100 -100 -100]
 ...
 [  30   32    1 ... -100 -100 -100]
 [  38    1   42 ... -100 -100 -100]
 [  61    1   30 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['CONJ', '[SEP]', 'VOC', '[SEP]', 'CONJ', '[SEP]', 'VOC']
Labels:	 ['CONJ', '[SEP]', 'VOC', '[SEP]', 'CONJ', '[SEP]', 'VOC']


{'eval_loss': 0.6604731678962708,
 'eval_average_accuracy': 0.8340146080410423,
 'eval_accuracy': 0.8446884081369067,
 'eval_runtime': 3.4136,
 'eval_samples_per_second': 623.391,
 'eval_steps_per_second': 9.96}

In [18]:
# Same evaluation on the out-of-distribution dev split (built from eval_ood
# in the dataset preparation cell); the returned metrics dict is the cell output.
trainer.evaluate(dataset["dev_OOD"])

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: segmentation, transcription, glosses, translation, morphemes. If segmentation, transcription, glosses, translation, morphemes are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2128
  Batch size = 64


PREDS [[56  1 65 ... 43 43 43]
 [54  1 26 ... 43 43 43]
 [39  1 25 ... 43 43 43]
 ...
 [26 39 14 ... 43 43 43]
 [25 39  1 ... 43 43 43]
 [41  1 26 ... 43 43 43]]
LABELS [[  56    1   65 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]
 [  39    1   25 ... -100 -100 -100]
 ...
 [  26   39   34 ... -100 -100 -100]
 [  25   39    1 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['NEG', '[SEP]', 'PREP']
Labels:	 ['NEG', '[SEP]', 'PREP']


{'eval_loss': 0.9369460940361023,
 'eval_average_accuracy': 0.7397303418041405,
 'eval_accuracy': 0.7444637429439861,
 'eval_runtime': 3.3151,
 'eval_samples_per_second': 641.915,
 'eval_steps_per_second': 10.256}