# Evalutation

In [1]:
from data_handling import write_igt, load_data_file
import random

story = load_data_file("../data/GenBench/categories/story")
advice = load_data_file("../data/GenBench/categories/advice")
history = load_data_file("../data/GenBench/categories/history")
personal = load_data_file("../data/GenBench/categories/personal")

id_data = story + history
ood_data = advice + personal

random.seed(1)
random.shuffle(id_data)
random.shuffle(ood_data)

count_ood = int(len(ood_data) / 2)

eval_ood = ood_data[:count_ood]
test_ood = ood_data[count_ood:]

eval_id = id_data[:count_ood]
train = id_data[count_ood:]

write_igt(eval_ood, '../data/GenBench/eval_ood.txt')
write_igt(eval_id, '../data/GenBench/eval_id.txt')
write_igt(test_ood, '../data/GenBench/test_ood.txt')
write_igt(train, '../data/GenBench/train.txt')

In [2]:
from data_handling import create_vocab, prepare_dataset, create_gloss_vocab
from uspanteko_morphology import morphology
from tokenizer import WordLevelTokenizer
from datasets import DatasetDict

MODEL_INPUT_LENGTH = 64
device = 'mps'

train_vocab = create_vocab([line.morphemes() for line in train], threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=MODEL_INPUT_LENGTH)
glosses = create_gloss_vocab(morphology)

dataset = DatasetDict()
dataset['train'] = prepare_dataset(data=train, tokenizer=tokenizer, labels=glosses, device=device)
dataset['dev'] = prepare_dataset(data=eval_id, tokenizer=tokenizer, labels=glosses, device=device)
dataset['dev_OOD'] = prepare_dataset(data=eval_ood, tokenizer=tokenizer, labels=glosses, device=device)
dataset['test'] = prepare_dataset(data=test_ood, tokenizer=tokenizer, labels=glosses, device=device)

  0%|          | 0/5049 [00:00<?, ?ex/s]

  0%|          | 0/2128 [00:00<?, ?ex/s]

  0%|          | 0/2128 [00:00<?, ?ex/s]

  0%|          | 0/2128 [00:00<?, ?ex/s]

In [3]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoModelForMaskedLM
import math

model = AutoModelForMaskedLM.from_pretrained("../models/usp-mlm-absolute-micro")
BATCH_SIZE = 64
EPOCHS = 50

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="pt")

args = TrainingArguments(
    output_dir=f"../training-checkpoints",
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=EPOCHS,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['dev'],
    data_collator=data_collator
)


eval_results = trainer.evaluate(dataset['dev'])
print(f"Perplexity (id): {math.exp(eval_results['eval_loss']):.2f}")

eval_results = trainer.evaluate(dataset['dev_OOD'])
print(f"Perplexity (ood): {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, transcription, morphemes, segmentation, glosses. If translation, transcription, morphemes, segmentation, glosses are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2128
  Batch size = 64


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mmichael-ginn[0m. Use [1m`wandb login --relogin`[0m to force relogin


The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, transcription, morphemes, segmentation, glosses. If translation, transcription, morphemes, segmentation, glosses are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2128
  Batch size = 64


Perplexity (id): 77.78
Perplexity (ood): 94.03


In [4]:
from transformers import AutoModelForTokenClassification
from finetune_token_classifier import create_trainer

model = AutoModelForTokenClassification.from_pretrained("../models/full-flat-1-0.75wd")

trainer = create_trainer(model, dataset=dataset, tokenizer=tokenizer, labels=glosses, batch_size=BATCH_SIZE,
                         max_epochs=300, weight_decay=0, report_to='wandb')

trainer.evaluate(dataset["dev"])

loading configuration file https://huggingface.co/michaelginn/usp-gloss-denoiser/resolve/main/config.json from cache at /Users/milesper/.cache/huggingface/transformers/7e2ec61ad7d65e162a400531c9e5863debfee42d2cab561a61445e9f61e5042c.9fd2f43be603a3cd21a989a6243debd527b4c090b1356919b2a349435d1f3ec8
Model config RobertaConfig {
  "_name_or_path": "michaelginn/usp-gloss-denoiser",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 2,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_s

Creating trainer...


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


PREDS [[61  1 46 ... 43 43 43]
 [54  1 25 ... 43 43 43]
 [38  1 53 ... 43 43 43]
 ...
 [30 65  1 ... 43 43 43]
 [38  1 42 ... 43 43 43]
 [61  1 30 ... 43 43 43]]
LABELS [[  61    1   46 ... -100 -100 -100]
 [  54    1   25 ... -100 -100 -100]
 [  38    1   53 ... -100 -100 -100]
 ...
 [  30   32    1 ... -100 -100 -100]
 [  38    1   42 ... -100 -100 -100]
 [  61    1   30 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['CONJ', '[SEP]', 'VOC', '[SEP]', 'CONJ', '[SEP]', 'VOC']
Labels:	 ['CONJ', '[SEP]', 'VOC', '[SEP]', 'CONJ', '[SEP]', 'VOC']


{'eval_loss': 0.6546702980995178,
 'eval_average_accuracy': 0.8352711496406285,
 'eval_accuracy': 0.8458185340652244,
 'eval_runtime': 3.1694,
 'eval_samples_per_second': 671.43,
 'eval_steps_per_second': 10.728}

In [5]:
trainer.evaluate(dataset["dev_OOD"])

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, transcription, morphemes, segmentation, glosses. If translation, transcription, morphemes, segmentation, glosses are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2128
  Batch size = 64


PREDS [[56  1 65 ... 43 43 43]
 [54  1 26 ... 43 43 43]
 [39  1 25 ... 43 43 43]
 ...
 [26 38 14 ... 43 43 43]
 [25 39  1 ... 43 43 43]
 [41  1 26 ... 43 43 43]]
LABELS [[  56    1   65 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]
 [  39    1   25 ... -100 -100 -100]
 ...
 [  26   39   34 ... -100 -100 -100]
 [  25   39    1 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['NEG', '[SEP]', 'PREP']
Labels:	 ['NEG', '[SEP]', 'PREP']


{'eval_loss': 0.9338552355766296,
 'eval_average_accuracy': 0.7497611989743314,
 'eval_accuracy': 0.7505427702996093,
 'eval_runtime': 3.1031,
 'eval_samples_per_second': 685.776,
 'eval_steps_per_second': 10.957}

In [6]:
# Write predictions
from data_handling import write_predictions

write_predictions(eval_ood, tokenizer, trainer, glosses, '../data/GenBench/pred_eval_ood.txt')

  0%|          | 0/2128 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, transcription, morphemes, segmentation, glosses. If translation, transcription, morphemes, segmentation, glosses are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2128
  Batch size = 64


PREDS [[56  1 65 ... 43 43 43]
 [54  1 26 ... 43 43 43]
 [39  1 25 ... 43 43 43]
 ...
 [26 38 14 ... 43 43 43]
 [25 39  1 ... 43 43 43]
 [41  1 26 ... 43 43 43]]
LABELS [[  56    1   65 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]
 [  39    1   25 ... -100 -100 -100]
 ...
 [  26   39   34 ... -100 -100 -100]
 [  25   39    1 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['NEG', '[SEP]', 'PREP']
Labels:	 ['NEG', '[SEP]', 'PREP']


## Denoiser

In [162]:
import torch
from tqdm.notebook import tqdm

target_set = dataset["dev_OOD"]

unknown_tokens = 0
unknown_token_wrong = 0
known_tokens = 0
known_token_wrong = 0

preds = trainer.predict(target_set)

rows_with_unknowns = set()

for row_index, row in enumerate(target_set):
    for pos in range(len(row['input_ids'])):
        if row['input_ids'][pos] == 2:
            break
        if row['input_ids'][pos] == 1:
            continue
        
        if row['input_ids'][pos] == 0:
            unknown_tokens += 1
            rows_with_unknowns.add(row_index)

            if preds.predictions[row_index][pos] != row['labels'][pos]:
                # Incorrect
                unknown_token_wrong += 1
        else:
            known_tokens += 1
            if preds.predictions[row_index][pos] != row['labels'][pos]:
                # Incorrect
                known_token_wrong += 1
                
print(f"{unknown_tokens} total unknown tokens, {unknown_token_wrong} wrong = {unknown_token_wrong / unknown_tokens}")
print(f"{known_tokens} total known tokens, {known_token_wrong} wrong = {known_token_wrong / known_tokens}")
print(f"{unknown_token_wrong + known_token_wrong} total wrong of {known_tokens + unknown_tokens} total tokens")

total_correct_before = 0
total_fixed = 0
both_correct = 0
total = 0

denoiser = AutoModelForMaskedLM.from_pretrained("../models/usp-gloss-denoiser-micro")

for row_id in tqdm(rows_with_unknowns):
    test_row = target_set[row_id]
    test_preds = torch.LongTensor([preds.predictions[row_id]])
    input_ids = torch.LongTensor([test_row['input_ids']])
    labels = torch.LongTensor([test_row['labels']])

    test_preds[test_preds != 1] = test_preds[test_preds != 1] + 4

    # test_preds[input_ids == 0] = 3 # MASK unknown word
    test_preds = test_preds.narrow(-1, 0, 60)

    attention_mask = torch.LongTensor([test_row['attention_mask']])
    attention_mask = attention_mask.narrow(-1, 0, 60)

    denoised_preds = denoiser.forward(input_ids=test_preds, attention_mask=attention_mask).logits.argmax(dim=2)
    
    for pos in range(len(test_row['input_ids'])):
        if test_row['input_ids'][pos] == 2:
            break
        if test_row['input_ids'][pos] == 1:
            continue
        if test_row['input_ids'][pos] == 0:
            total += 1
            prior_correct = preds.predictions[row_id][pos] == test_row['labels'][pos]
            post_correct = denoised_preds[0][pos] - 4 == test_row['labels'][pos]
            if prior_correct:
                total_correct_before += 1
            if post_correct:
                total_fixed += 1
            if prior_correct and post_correct:
                both_correct += 1

    # correct = denoised_preds[input_ids.narrow(-1, 0, 60) == 0] - 4 == labels[input_ids == 0]
    # correct = correct.long()
    # total_fixed += torch.sum(correct)
    
print(f"{total_fixed}/{total} unknown from {total_correct_before}, with {both_correct} shared")
print(f"Perf on unknown: {total_correct_before / total} before, {total_fixed / total} after")

improved_tokens = total_fixed - total_correct_before
acc_improvement = improved_tokens / (known_tokens + unknown_tokens)
print(f"{acc_improvement} improvement in accuracy.")

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: segmentation, transcription, glosses, translation, morphemes. If segmentation, transcription, glosses, translation, morphemes are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2128
  Batch size = 64


PREDS [[56  1 65 ... 43 43 43]
 [54  1 26 ... 43 43 43]
 [39  1 25 ... 43 43 43]
 ...
 [26 38 14 ... 43 43 43]
 [25 39  1 ... 43 43 43]
 [41  1 26 ... 43 43 43]]
LABELS [[  56    1   65 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]
 [  39    1   25 ... -100 -100 -100]
 ...
 [  26   39   34 ... -100 -100 -100]
 [  25   39    1 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['NEG', '[SEP]', 'PREP']
Labels:	 ['NEG', '[SEP]', 'PREP']
1322 total unknown tokens, 854 wrong = 0.6459909228441755

loading configuration file ../models/usp-gloss-denoiser/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/usp-gloss-denoiser",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "roberta",
  "num_attention_heads": 5,
  "num_hidden_layers": 3,
  "pad_token_id": 2,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 70
}

loading weights file ../models/usp-gloss-denoiser/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkp


12496 total known tokens, 2593 wrong = 0.20750640204865556
3447 total wrong of 13818 total tokens


  0%|          | 0/1013 [00:00<?, ?it/s]

505/1322 unknown from 468, with 459 shared
Perf on unknown: 0.3540090771558245 before, 0.38199697428139184 after
0.0026776668114054133 improvement in accuracy.


In [8]:
new_train = load_data_file('../data/GenBench/selfsup_it1')

new_train

[Trnsc:	ta' chi',
 Segm:	ta'n chi
 Gloss:	NEG PREP
 Trnsl:	Ya no hay.
 ,
 Trnsc:	Pores tijb'ij taq li ójor taq tziij.
 Segm:	pores ti-j-b'ij taq li ójor taq tziij
 Gloss:	ADV INC-E3S-VT PL DEM ADV PL S
 Trnsl:	Por eso dicían antiguamente.
 ,
 Trnsc:	il xan jun ra ák'al,
 Segm:	il x-b'an jun ra ák'el
 Gloss:	VT COM-VT NUM DIM S
 Trnsl:	Mire lo que le pasó a un niñito.
 ,
 Trnsc:	i dispwes wi' qapoop tqab'an,
 Segm:	i dispwes wi' qa-poop t-qa-b'an
 Gloss:	CONJ NOM EXS E1P-S INC-E1P-VT
 Trnsl:	Y despues tenemos nuestros petates que hacer.
 ,
 Trnsc:	per oj tijin oj rijeb' li.
 Segm:	peer ooj tijin ooj ri'j-eeb' li
 Gloss:	ADV PRON PRG PRON S-S DEM
 Trnsl:	pero nosotros nos estamos envejeciendo,
 ,
 Trnsc:	wi' k'is ixtuutz',
 Segm:	wi' k'is ixtuutz'
 Gloss:	EXS VT S
 Trnsl:	tenía pescado
 ,
 Trnsc:	i lojori tachi mood inb'ee laq chaak aq'ab'.
 Segm:	i lojori ta-chi modo in-b'ee laq chaak aq'ab'
 Gloss:	CONJ ADV VT-PREP ADV E1S-S PREP S ADV
 Trnsl:	Y ahora ya no hay modo que me baya tempran