In [241]:
from data import load_data_file, create_vocab, prepare_dataset, create_gloss_vocab
from uspanteko_morphology import morphology
from tokenizer import WordLevelTokenizer
from datasets import DatasetDict

# Load every GenBench split. "story" is used for training (and for building
# the vocabulary); the other directories are evaluation splits.
train_data = load_data_file("../data/GenBench/story")
eval_advice_data = load_data_file("../data/GenBench/advice")
eval_history_data = load_data_file("../data/GenBench/history")
eval_personal_data = load_data_file("../data/GenBench/personal")
eval_ood = load_data_file("../data/GenBench/eval_OOD")

# Maximum tokenized sequence length passed to the tokenizer below.
MODEL_INPUT_LENGTH = 64
device = 'cpu'

# The vocabulary is derived from the training lines only.
train_vocab = create_vocab([line.morphemes() for line in train_data], threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=MODEL_INPUT_LENGTH)

glosses = create_gloss_vocab(morphology)

# Split name -> loaded lines. Insertion order is the order in which the
# splits are encoded (and in which the progress bars appear).
raw_splits = {
    'train': train_data,
    'advice': eval_advice_data,
    'history': eval_history_data,
    'personal': eval_personal_data,
    'eval_OOD': eval_ood,
}

# Encode every split with the same tokenizer and gloss label set.
dataset = DatasetDict()
for split_name, split_lines in raw_splits.items():
    dataset[split_name] = prepare_dataset(data=split_lines, tokenizer=tokenizer, labels=glosses, device=device)

  0%|          | 0/5843 [00:00<?, ?ex/s]

  0%|          | 0/612 [00:00<?, ?ex/s]

  0%|          | 0/1334 [00:00<?, ?ex/s]

  0%|          | 0/3644 [00:00<?, ?ex/s]

  0%|          | 0/4256 [00:00<?, ?ex/s]

In [232]:
from transformers import AutoModelForMaskedLM

# Load the masked-language-model checkpoint stored under ../models/usp-lang.
# NOTE: the name `model` is rebound to a token-classification model in a later
# cell; the MLM `trainer` built in the next cell keeps its own reference to
# this object.
model = AutoModelForMaskedLM.from_pretrained("../models/usp-lang")

loading configuration file ../models/usp-lang/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/usp-lang",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "roberta",
  "num_attention_heads": 5,
  "num_hidden_layers": 3,
  "pad_token_id": 2,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 2284
}

loading weights file ../models/usp-lang/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkpoint at ../models/usp-lang.


In [234]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Per-device batch size; also reused by the token-classification cell below.
BATCH_SIZE = 64
EPOCHS = 2000

# Collator for the masked-language-modeling objective: masks tokens with
# probability 0.15 each time a batch is assembled and returns PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="pt")

args = TrainingArguments(
    # Plain string: the original f-prefix had no placeholders to interpolate.
    output_dir="../training-checkpoints",
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Gradients are accumulated over 3 batches before each optimizer step.
    gradient_accumulation_steps=3,
    weight_decay=0.01,
    # Checkpoint every epoch, keeping at most the 3 most recent ones.
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=EPOCHS,
    report_to="wandb",
)

# Trains on the "train" split; "personal" is the default evaluation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['personal'],
    data_collator=data_collator
)

PyTorch: setting up devices


In [237]:
import math

# Report masked-LM perplexity (exp of the mean evaluation loss) on each
# evaluation split, in the same order as before. After the loop,
# `eval_results` holds the metrics of the last split ("advice").
for split_name in ("personal", "history", "advice"):
    eval_results = trainer.evaluate(dataset[split_name])
    print(f"Perplexity ({split_name}): {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3644
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1334
  Batch size = 64


Perplexity (personal): 38.79


The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 612
  Batch size = 64


Perplexity (history): 33.58
Perplexity (advice): 35.00


In [243]:
from transformers import AutoModelForTokenClassification
from finetune_token_classifier import create_trainer

# Swap in the fine-tuned token-classification checkpoint. This rebinds `model`,
# which previously referred to the masked-language model.
model = AutoModelForTokenClassification.from_pretrained(
    "../models/finetuned-flat-micro-absolute"
)

# Expose the "history" split under the key "dev" before building the trainer
# (presumably create_trainer looks up a "dev" split -- TODO confirm).
dataset["dev"] = dataset["history"]

# BATCH_SIZE comes from the MLM trainer cell above.
trainer = create_trainer(
    model,
    dataset=dataset,
    tokenizer=tokenizer,
    labels=glosses,
    batch_size=BATCH_SIZE,
    max_epochs=300,
    report_to='wandb',
)

# Last expression of the cell: the returned metrics dict is displayed.
trainer.evaluate(dataset["history"])

loading configuration file ../models/finetuned-flat-micro-absolute/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/finetuned-flat-micro-absolute",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LA

Creating trainer...


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


PREDS [[36  1 50 ... 57 57 57]
 [38  1 53 ... 56 56 56]
 [40  1 40 ... 57 57 57]
 ...
 [42  1 39 ...  0  0  0]
 [54  1 57 ... 54 54 54]
 [37 44  1 ... 39 39 39]]
LABELS [[  36    1   50 ... -100 -100 -100]
 [  45    1   53 ... -100 -100 -100]
 [  40    1   43 ... -100 -100 -100]
 ...
 [  42    1   39 ... -100 -100 -100]
 [  54    1   57 ... -100 -100 -100]
 [  37   44    1 ... -100 -100 -100]]
(1334, 64)
Preds:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'NOM', '[SEP]', 'NOM']
Labels:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'S', '[SEP]', 'DEM']


{'eval_loss': 0.5819401741027832,
 'eval_average_accuracy': 0.7875679166572319,
 'eval_accuracy': 0.7891765860841821,
 'eval_runtime': 2.0883,
 'eval_samples_per_second': 638.783,
 'eval_steps_per_second': 10.056}

In [244]:
# Evaluate `trainer` on the "eval_OOD" split (presumably the out-of-domain
# set -- confirm against the data layout). As the cell's last expression, the
# returned metrics dict is displayed.
trainer.evaluate(dataset["eval_OOD"])

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4256
  Batch size = 64


PREDS [[54  1 65 ... 56 56 56]
 [ 9 43 13 ... 13 13 13]
 [ 9 43 13 ... 13 13 13]
 ...
 [57  1 50 ... 39 39 39]
 [39  1 53 ... 56 56 56]
 [ 5 53  1 ... 39 39 39]]
LABELS [[  54    1   26 ... -100 -100 -100]
 [   9   43 -100 ... -100 -100 -100]
 [   9   43 -100 ... -100 -100 -100]
 ...
 [  57    1   50 ... -100 -100 -100]
 [  43    1   53 ... -100 -100 -100]
 [   3   53    1 ... -100 -100 -100]]
(4256, 64)
Preds:	 ['ADV', '[SEP]', 'PREP', 'VT', 'SC', '[SEP]', 'INC', 'E1S', 'VT', '[SEP]', 'PRON']
Labels:	 ['ADV', '[SEP]', 'INC', 'VT', 'SC', '[SEP]', 'INC', 'E1S', 'VI', '[SEP]', 'PRON']


{'eval_loss': 0.7411360144615173,
 'eval_average_accuracy': 0.7359588938686968,
 'eval_accuracy': 0.7420881502890173,
 'eval_runtime': 6.4175,
 'eval_samples_per_second': 663.19,
 'eval_steps_per_second': 10.44}