# Evaluation

In [1]:
from data_handling import write_igt, load_data_file
import random

# Load the four GenBench categories. Story + history make up the
# in-distribution (ID) pool; advice + personal make up the out-of-distribution
# (OOD) pool.
story = load_data_file("../data/GenBench/categories/story")
advice = load_data_file("../data/GenBench/categories/advice")
history = load_data_file("../data/GenBench/categories/history")
personal = load_data_file("../data/GenBench/categories/personal")

id_data = story + history
ood_data = advice + personal

# Seed once, then shuffle the ID pool before the OOD pool so the resulting
# split is the same on every run.
random.seed(1)
for pool in (id_data, ood_data):
    random.shuffle(pool)

# The first half (rounded down) of the shuffled OOD pool becomes the OOD
# evaluation set and the remainder the OOD test set. The ID evaluation set is
# cut to the same number of rows; the rest of the ID pool is the training set.
count_ood = len(ood_data) // 2

eval_ood, test_ood = ood_data[:count_ood], ood_data[count_ood:]
eval_id, train = id_data[:count_ood], id_data[count_ood:]

# Persist every split next to the category data.
for split_rows, split_path in (
    (eval_ood, '../data/GenBench/eval_ood.txt'),
    (eval_id, '../data/GenBench/eval_id.txt'),
    (test_ood, '../data/GenBench/test_ood.txt'),
    (train, '../data/GenBench/train.txt'),
):
    write_igt(split_rows, split_path)

In [2]:
from data_handling import create_vocab, prepare_dataset, create_gloss_vocab
from uspanteko_morphology import morphology
from tokenizer import WordLevelTokenizer
from datasets import DatasetDict

MODEL_INPUT_LENGTH = 64
device = 'mps'

# The vocabulary is built from the training morphemes only; the tokenizer and
# the gloss label set are shared by every split prepared below.
train_morphemes = [line.morphemes() for line in train]
train_vocab = create_vocab(train_morphemes, threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=MODEL_INPUT_LENGTH)
glosses = create_gloss_vocab(morphology)

# Prepare each split with identical settings, in the order
# train -> dev -> dev_OOD -> test.
dataset = DatasetDict()
for split_name, split_rows in (
    ('train', train),
    ('dev', eval_id),
    ('dev_OOD', eval_ood),
    ('test', test_ood),
):
    dataset[split_name] = prepare_dataset(data=split_rows, tokenizer=tokenizer, labels=glosses, device=device)

  0%|          | 0/5049 [00:00<?, ?ex/s]

  0%|          | 0/2128 [00:00<?, ?ex/s]

  0%|          | 0/2128 [00:00<?, ?ex/s]

  0%|          | 0/2128 [00:00<?, ?ex/s]

In [5]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoModelForMaskedLM
import math

# Masked-LM checkpoint whose perplexity is measured on the ID and OOD dev sets.
model = AutoModelForMaskedLM.from_pretrained("../models/usp-mlm-absolute-micro")
BATCH_SIZE = 64
EPOCHS = 50

# Collator configured with a 15% masking probability, returning PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    return_tensors="pt",
)

# Only `evaluate` is called in this cell; the training-related options are
# still passed so the Trainer is configured exactly as before.
training_options = dict(
    output_dir="../training-checkpoints",
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=EPOCHS,
)
args = TrainingArguments(**training_options)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['dev'],
    data_collator=data_collator,
)

# Perplexity is exp(eval loss); report it for the ID dev split first, then the
# OOD dev split. `eval_results` ends up holding the OOD metrics.
for split_key, split_tag in (('dev', 'id'), ('dev_OOD', 'ood')):
    eval_results = trainer.evaluate(dataset[split_key])
    perplexity = math.exp(eval_results['eval_loss'])
    print(f"Perplexity ({split_tag}): {perplexity:.2f}")

The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, glosses, transcription, morphemes, segmentation. If translation, glosses, transcription, morphemes, segmentation are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2128
  Batch size = 64


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mmichael-ginn[0m. Use [1m`wandb login --relogin`[0m to force relogin


The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, glosses, transcription, morphemes, segmentation. If translation, glosses, transcription, morphemes, segmentation are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2128
  Batch size = 64


Perplexity (id): 77.78
Perplexity (ood): 94.03


In [24]:
from transformers import AutoModelForTokenClassification
from finetune_token_classifier import create_trainer

# Fine-tuned token-classification checkpoint under evaluation. From here on,
# `model` and `trainer` refer to the classifier, not the masked LM.
classifier_checkpoint = "../models/full-flat-1-finetune-0.0wd-0.25itps-it2"
model = AutoModelForTokenClassification.from_pretrained(classifier_checkpoint)

trainer = create_trainer(
    model,
    dataset=dataset,
    tokenizer=tokenizer,
    labels=glosses,
    batch_size=BATCH_SIZE,
    max_epochs=300,
    weight_decay=0,
    report_to='wandb',
)

# Metrics on the in-distribution dev split (shown as the cell output).
trainer.evaluate(dataset["dev"])

loading configuration file ../models/full-flat-1-finetune-0.0wd-0.25itps-it2/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/full-flat-1-finetune-0.0wd-0.25itps-it2",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABE

Creating trainer...


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


PREDS [[61  1 46 ... 39 39 39]
 [54  1 25 ... 43 43 43]
 [38  1 53 ... 43 43 43]
 ...
 [30 32  1 ... 39 39 39]
 [38  1 42 ... 43 43 43]
 [61  1 30 ... 43 43 43]]
LABELS [[  61    1   46 ... -100 -100 -100]
 [  54    1   25 ... -100 -100 -100]
 [  38    1   53 ... -100 -100 -100]
 ...
 [  30   32    1 ... -100 -100 -100]
 [  38    1   42 ... -100 -100 -100]
 [  61    1   30 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['CONJ', '[SEP]', 'VOC', '[SEP]', 'CONJ', '[SEP]', 'VOC']
Labels:	 ['CONJ', '[SEP]', 'VOC', '[SEP]', 'CONJ', '[SEP]', 'VOC']


{'eval_loss': 0.385240763425827,
 'eval_average_accuracy': 0.8478330251606818,
 'eval_accuracy': 0.8601065547303842,
 'eval_runtime': 3.2271,
 'eval_samples_per_second': 659.424,
 'eval_steps_per_second': 10.536}

In [25]:
# Evaluate the current `trainer` on the "test" split, which was prepared from
# the held-out OOD rows (`test_ood`); the returned metrics are the cell output.
trainer.evaluate(dataset["test"])

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, glosses, transcription, morphemes, segmentation. If translation, glosses, transcription, morphemes, segmentation are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2128
  Batch size = 64


PREDS [[46  1 54 ... 39 39 39]
 [54  1 56 ... 39 39 39]
 [54 62  1 ... 39 39 39]
 ...
 [38 43 43 ... 43 43 43]
 [39 32  1 ... 43 43 43]
 [26 38  1 ... 39 39 39]]
LABELS [[  46    1   54 ... -100 -100 -100]
 [  54    1   57 ... -100 -100 -100]
 [  54   34    1 ... -100 -100 -100]
 ...
 [  38 -100 -100 ... -100 -100 -100]
 [  38   61    1 ... -100 -100 -100]
 [  26   39    1 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['VOC', '[SEP]', 'ADV', '[SEP]', 'DEM']
Labels:	 ['VOC', '[SEP]', 'ADV', '[SEP]', 'DEM']


{'eval_loss': 0.7491924166679382,
 'eval_average_accuracy': 0.7587232684322329,
 'eval_accuracy': 0.761578415813014,
 'eval_runtime': 3.2315,
 'eval_samples_per_second': 658.527,
 'eval_steps_per_second': 10.522}

In [7]:
# Write predictions for the OOD evaluation rows (`eval_ood`) to a text file.
# NOTE(review): presumably `write_predictions` runs `trainer.predict` on the
# rows and writes the predicted glosses -- confirm in data_handling.
# NOTE(review): this cell's execution count is out of order; `trainer` must be
# the token-classification trainer (not the masked-LM one) when it runs.
from data_handling import write_predictions

write_predictions(eval_ood, tokenizer, trainer, glosses, '../data/GenBench/pred_eval_ood_0.25_it2.txt')

  0%|          | 0/2128 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, glosses, transcription, morphemes, segmentation. If translation, glosses, transcription, morphemes, segmentation are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2128
  Batch size = 64


PREDS [[56  1 65 ... 43 43 43]
 [54  1 26 ... 39 39 39]
 [39  1 25 ... 43 43 43]
 ...
 [26 38 14 ... 43 43 43]
 [25 39  1 ... 39 39 39]
 [41  1 26 ... 39 39 39]]
LABELS [[  56    1   65 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]
 [  39    1   25 ... -100 -100 -100]
 ...
 [  26   39   34 ... -100 -100 -100]
 [  25   39    1 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['NEG', '[SEP]', 'PREP']
Labels:	 ['NEG', '[SEP]', 'PREP']


## Denoiser

In [26]:
import torch
from tqdm.notebook import tqdm

# Split whose predictions are broken down into known vs. unknown input tokens.
target_set = dataset["test"]

unknown_tokens = 0
unknown_token_wrong = 0
known_tokens = 0
known_token_wrong = 0

preds = trainer.predict(target_set)

# Indices of rows containing at least one unknown input token; the denoiser
# pass below only revisits these rows.
rows_with_unknowns = set()

# Input-id conventions used by the scan:
#   2 -> stop scanning the row, 1 -> skip the position, 0 -> unknown token.
# NOTE(review): presumably these are the tokenizer's padding, separator and
# [UNK] ids respectively -- confirm against WordLevelTokenizer.
for row_index, row in enumerate(target_set):
    row_preds = preds.predictions[row_index]
    row_labels = row['labels']

    for pos, token_id in enumerate(row['input_ids']):
        if token_id == 2:
            break
        if token_id == 1:
            continue

        is_wrong = bool(row_preds[pos] != row_labels[pos])

        if token_id == 0:
            unknown_tokens += 1
            rows_with_unknowns.add(row_index)
            unknown_token_wrong += int(is_wrong)
        else:
            known_tokens += 1
            known_token_wrong += int(is_wrong)

# Guard the ratios so a split without unknown (or known) tokens reports NaN
# instead of raising ZeroDivisionError.
unknown_error_rate = unknown_token_wrong / unknown_tokens if unknown_tokens else float('nan')
known_error_rate = known_token_wrong / known_tokens if known_tokens else float('nan')

print(f"{unknown_tokens} total unknown tokens, {unknown_token_wrong} wrong = {unknown_error_rate}")
print(f"{known_tokens} total known tokens, {known_token_wrong} wrong = {known_error_rate}")
print(f"{unknown_token_wrong + known_token_wrong} total wrong of {known_tokens + unknown_tokens} total tokens")

# Counters over unknown-token positions only:
#   total                - unknown tokens visited
#   total_correct_before - classifier prediction already correct
#   total_fixed          - prediction correct after the denoiser pass
#   both_correct         - correct both before and after
total_correct_before = 0
total_fixed = 0
both_correct = 0
total = 0

# The denoiser only receives the first 60 positions of each row.
# NOTE(review): presumably chosen to stay within the denoiser's 64 position
# embeddings -- confirm.
DENOISER_WINDOW = 60
# Classifier label ids are shifted by 4 to obtain denoiser input ids (id 1 is
# left unshifted) and shifted back before comparing with the gold labels.
# NOTE(review): presumably the denoiser vocabulary reserves extra special
# tokens in front of the gloss labels -- confirm.
GLOSS_ID_OFFSET = 4

denoiser = AutoModelForMaskedLM.from_pretrained("../models/usp-gloss-denoiser-micro")

for row_id in tqdm(rows_with_unknowns):
    test_row = target_set[row_id]
    row_preds = preds.predictions[row_id]

    # Build the denoiser input from the classifier predictions. `torch.where`
    # returns a new tensor, so `preds.predictions` is never modified.
    test_preds = torch.as_tensor(row_preds, dtype=torch.long).unsqueeze(0)
    test_preds = torch.where(test_preds != 1, test_preds + GLOSS_ID_OFFSET, test_preds)
    # An alternative tried earlier (disabled): replace the predictions at
    # unknown input positions with id 3 (mask) before denoising.

    window = min(DENOISER_WINDOW, test_preds.shape[-1])
    test_preds = test_preds.narrow(-1, 0, window)

    attention_mask = torch.as_tensor(test_row['attention_mask'], dtype=torch.long).unsqueeze(0)
    attention_mask = attention_mask.narrow(-1, 0, window)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        denoised_preds = denoiser(input_ids=test_preds, attention_mask=attention_mask).logits.argmax(dim=2)
    denoised_row = denoised_preds[0]

    for pos, token_id in enumerate(test_row['input_ids']):
        if token_id == 2:
            break
        if token_id == 1:
            continue
        if token_id != 0:
            continue

        total += 1
        gold = test_row['labels'][pos]
        prior_correct = bool(row_preds[pos] == gold)
        if pos < denoised_row.shape[0]:
            post_correct = bool(denoised_row[pos] - GLOSS_ID_OFFSET == gold)
        else:
            # Position lies beyond the denoiser window: the denoiser never saw
            # it, so the classifier's prediction stands (previously this
            # indexed past the end of `denoised_preds`).
            post_correct = prior_correct

        if prior_correct:
            total_correct_before += 1
        if post_correct:
            total_fixed += 1
        if prior_correct and post_correct:
            both_correct += 1

# Guard the ratios so an empty `rows_with_unknowns` reports NaN instead of
# raising ZeroDivisionError.
accuracy_before = total_correct_before / total if total else float('nan')
accuracy_after = total_fixed / total if total else float('nan')

print(f"{total_fixed}/{total} unknown from {total_correct_before}, with {both_correct} shared")
print(f"Perf on unknown: {accuracy_before} before, {accuracy_after} after")

# Net change in correct tokens, expressed as a fraction of all scored tokens.
improved_tokens = total_fixed - total_correct_before
all_tokens = known_tokens + unknown_tokens
acc_improvement = improved_tokens / all_tokens if all_tokens else float('nan')
print(f"{acc_improvement} improvement in accuracy.")

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, glosses, transcription, morphemes, segmentation. If translation, glosses, transcription, morphemes, segmentation are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2128
  Batch size = 64


PREDS [[46  1 54 ... 39 39 39]
 [54  1 56 ... 39 39 39]
 [54 62  1 ... 39 39 39]
 ...
 [38 43 43 ... 43 43 43]
 [39 32  1 ... 43 43 43]
 [26 38  1 ... 39 39 39]]
LABELS [[  46    1   54 ... -100 -100 -100]
 [  54    1   57 ... -100 -100 -100]
 [  54   34    1 ... -100 -100 -100]
 ...
 [  38 -100 -100 ... -100 -100 -100]
 [  38   61    1 ... -100 -100 -100]
 [  26   39    1 ... -100 -100 -100]]
(2128, 64)
Preds:	 ['VOC', '[SEP]', 'ADV', '[SEP]', 'DEM']
Labels:	 ['VOC', '[SEP]', 'ADV', '[SEP]', 'DEM']


loading configuration file ../models/usp-gloss-denoiser-micro/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/usp-gloss-denoiser-micro",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "roberta",
  "num_attention_heads": 5,
  "num_hidden_layers": 3,
  "pad_token_id": 2,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 70
}



1327 total unknown tokens, 1056 wrong = 0.7957799547852299
12535 total known tokens, 2249 wrong = 0.17941763063422417
3305 total wrong of 13862 total tokens


loading weights file ../models/usp-gloss-denoiser-micro/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkpoint at ../models/usp-gloss-denoiser-micro.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForMaskedLM for predictions without further training.


  0%|          | 0/990 [00:00<?, ?it/s]

459/1327 unknown from 271, with 228 shared
Perf on unknown: 0.20422004521477016 before, 0.3458929917106255 after
0.013562256528639446 improvement in accuracy.


# Iterative Pseudo-Labeling

In [18]:
import torch.nn.functional as F
import torch

# `target_set` (prepared tensors) and `target_data` (raw rows) must describe
# the same split in the same order: confidences are computed from the former
# and used to select rows from the latter.
target_set = dataset["test"]
target_data = test_ood

# Inference only: skip autograd bookkeeping and feed the inputs on whichever
# device the model currently lives on.
with torch.no_grad():
    logits = model(
        input_ids=torch.LongTensor(target_set['input_ids']).to(model.device),
        attention_mask=torch.LongTensor(target_set['attention_mask']).to(model.device),
    ).logits

# Highest class probability at every position, shape (rows, positions).
top_probs = F.softmax(logits, dim=-1).max(dim=-1).values.cpu()

row_confidences = []

# Row confidence = mean of the top softmax probability over the row's tokens.
for index in range(len(target_set)):
    row = target_set[index]

    # Never read past the model's sequence length for over-long rows.
    num_tokens = min(len(row['morphemes']), top_probs.shape[1])
    if num_tokens == 0:
        # An empty row has no evidence; rank it last instead of dividing by 0.
        row_confidences.append(0.0)
        continue

    row_confidences.append(sum(top_probs[index, :num_tokens].tolist()) / num_tokens)

# Keep the most confident quarter of the rows. Slicing from an explicit start
# index avoids `[-0:]`, which would select every row when there are fewer than
# four rows.
num_selected = len(row_confidences) // 4
ranked_indices = sorted(range(len(row_confidences)), key=lambda x: row_confidences[x])
top_indices = ranked_indices[len(ranked_indices) - num_selected:]
top_index_set = set(top_indices)  # O(1) membership tests

# Select from the SAME rows the confidences were computed on (previously the
# written predictions were taken from `eval_ood` regardless of the target).
hiconf_rows = [row for i, row in enumerate(target_data) if i in top_index_set]

# NOTE(review): the dataset key and the output file name still say "dev" /
# "eval_ood"; rename them if the target split is meant to stay on "test".
dataset['hiconf_dev'] = prepare_dataset(data=hiconf_rows, tokenizer=tokenizer, labels=glosses, device=device)
hiconf_metrics = trainer.evaluate(dataset['hiconf_dev'])
# Not the last expression of the cell, so print explicitly to show the metrics.
print(hiconf_metrics)

write_predictions(hiconf_rows, tokenizer, trainer, glosses, '../data/GenBench/pred_eval_ood_0.25_it3.txt')

  0%|          | 0/532 [00:00<?, ?ex/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, glosses, transcription, morphemes, segmentation. If translation, glosses, transcription, morphemes, segmentation are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 532
  Batch size = 64


PREDS [[56  1 65 ... 43 43 43]
 [54  1 26 ... 39 39 39]
 [54  1 43 ... 39 39 39]
 ...
 [60  1 57 ... 39 39 39]
 [43  1 57 ... 43 43 43]
 [54  1 25 ... 39 39 39]]
LABELS [[  56    1   65 ... -100 -100 -100]
 [  54    1   26 ... -100 -100 -100]
 [  54    1   39 ... -100 -100 -100]
 ...
 [  60    1   57 ... -100 -100 -100]
 [  43    1   57 ... -100 -100 -100]
 [  54    1   25 ... -100 -100 -100]]
(532, 64)
Preds:	 ['NEG', '[SEP]', 'PREP']
Labels:	 ['NEG', '[SEP]', 'PREP']


{'eval_loss': 0.4851396679878235,
 'eval_average_accuracy': 0.8771882305246967,
 'eval_accuracy': 0.8665610142630745,
 'eval_runtime': 0.8268,
 'eval_samples_per_second': 643.456,
 'eval_steps_per_second': 10.886}