In [231]:
from data import load_data_file, create_vocab, prepare_dataset, create_gloss_vocab
from uspanteko_morphology import morphology
from tokenizer import WordLevelTokenizer
from datasets import DatasetDict

# Load the four GenBench genre corpora. "story" is used as the training data;
# "advice", "history" and "personal" are kept aside as evaluation sets.
train_data = load_data_file("../data/GenBench/story")
eval_advice_data = load_data_file("../data/GenBench/advice")
eval_history_data = load_data_file("../data/GenBench/history")
eval_personal_data = load_data_file("../data/GenBench/personal")

# Split name -> loaded lines. Insertion order is also the order in which the
# splits are tokenized further down.
split_data = {
    'train': train_data,
    'advice': eval_advice_data,
    'history': eval_history_data,
    'personal': eval_personal_data,
}

# Quick sanity check of how much data each split contains.
print("Loaded " + ", ".join(f"{len(lines)} {name} lines" for name, lines in split_data.items()))

# Passed to the tokenizer as `model_max_length`.
MODEL_INPUT_LENGTH = 64
device = 'cpu'

# The vocabulary is built from the training morphemes only; the evaluation
# splits are encoded with this same tokenizer.
train_vocab = create_vocab([line.morphemes() for line in train_data], threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=MODEL_INPUT_LENGTH)

# Gloss label inventory derived from the Uspanteko morphology definition.
glosses = create_gloss_vocab(morphology)

# Prepare every split identically so they differ only in their source data.
dataset = DatasetDict()
for split_name, split_lines in split_data.items():
    dataset[split_name] = prepare_dataset(data=split_lines, tokenizer=tokenizer, labels=glosses, device=device)

  0%|          | 0/5843 [00:00<?, ?ex/s]

  0%|          | 0/612 [00:00<?, ?ex/s]

  0%|          | 0/1334 [00:00<?, ?ex/s]

  0%|          | 0/3644 [00:00<?, ?ex/s]

In [232]:
from transformers import AutoModelForMaskedLM

# Directory containing the pretrained masked-LM checkpoint
# (config.json and pytorch_model.bin are read from here).
PRETRAINED_MODEL_DIR = "../models/usp-lang"
model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_MODEL_DIR)

loading configuration file ../models/usp-lang/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/usp-lang",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "roberta",
  "num_attention_heads": 5,
  "num_hidden_layers": 3,
  "pad_token_id": 2,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 2284
}

loading weights file ../models/usp-lang/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkpoint at ../models/usp-lang.


In [234]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

BATCH_SIZE = 64
EPOCHS = 2000

# Collator for the masked-language-modelling objective; `mlm_probability`
# is the masking probability handed to the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    return_tensors="pt",
)

# Training configuration, collected in one place so it is easy to scan.
# Evaluation and checkpointing both happen once per epoch, and at most
# three checkpoints are kept in the output directory.
training_options = {
    "output_dir": "../training-checkpoints",
    "evaluation_strategy": "epoch",
    "per_device_train_batch_size": BATCH_SIZE,
    "per_device_eval_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": 3,
    "weight_decay": 0.01,
    "save_strategy": "epoch",
    "save_total_limit": 3,
    "num_train_epochs": EPOCHS,
    "report_to": "wandb",
}
args = TrainingArguments(**training_options)

# `personal` is the default evaluation set; other splits can still be
# passed explicitly to `trainer.evaluate(...)`.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=dataset['train'],
    eval_dataset=dataset['personal'],
)

PyTorch: setting up devices


In [236]:
import math

# Report masked-LM perplexity (exp of the evaluation loss) for each held-out
# genre, in the order personal -> history -> advice.
# NOTE(review): the collator samples which tokens are masked, so these numbers
# may shift somewhat between runs -- confirm before relying on small gaps.
for split_name in ('personal', 'history', 'advice'):
    eval_results = trainer.evaluate(dataset[split_name])
    perplexity = math.exp(eval_results['eval_loss'])
    print(f"Perplexity ({split_name}): {perplexity:.2f}")

The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3644
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1334
  Batch size = 64


Perplexity (personal): 39.68


The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 612
  Batch size = 64


Perplexity (history): 33.17
Perplexity (advice): 38.10


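The `personal` (39.68) and `advice` (38.10) sets have noticeably higher perplexity than `history` (33.17), which suggests they are further out-of-distribution (OOD) relative to the `story` training data. We'll therefore split the evaluation data that way: `personal` and `advice` as the OOD sets, and `history` as the set closest to the training distribution.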