In [213]:
from data import load_data_file, create_vocab, prepare_dataset, create_gloss_vocab
from uspanteko_morphology import morphology
from tokenizer import WordLevelTokenizer
from datasets import DatasetDict

# Two separate corpora: one is used for the training split, the other for evaluation.
train_data = load_data_file("../data/GenBench/story_advice_personal")
eval_data = load_data_file("../data/GenBench/history")

print(f"Loaded {len(train_data)} train lines and {len(eval_data)} eval lines")

# Settings shared by the tokenizer and the dataset preparation below.
MODEL_INPUT_LENGTH = 64
device = 'cpu'

# The vocabulary is derived from the training lines' morphemes only; the eval
# corpus does not contribute to it.
# NOTE(review): threshold=1 is presumably a minimum-frequency cutoff - confirm in data.create_vocab.
train_morphemes = [line.morphemes() for line in train_data]
train_vocab = create_vocab(train_morphemes, threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=MODEL_INPUT_LENGTH)

# Label inventory built from the Uspanteko morphology definition.
glosses = create_gloss_vocab(morphology)

# Prepare both splits with identical settings; the eval corpus is stored under 'dev'.
dataset = DatasetDict()
for split_name, split_lines in (('train', train_data), ('dev', eval_data)):
    dataset[split_name] = prepare_dataset(data=split_lines, tokenizer=tokenizer, labels=glosses, device=device)

Loaded 10099 train lines and 1334 eval lines


  0%|          | 0/10099 [00:00<?, ?ex/s]

  0%|          | 0/1334 [00:00<?, ?ex/s]

In [214]:
from transformers import AutoModelForTokenClassification

# Load the token-classification model from the local "./full-flat-1" directory.
# The classification head is sized to the gloss label inventory built earlier.
checkpoint_dir = "./full-flat-1"
label_count = len(glosses)
model = AutoModelForTokenClassification.from_pretrained(checkpoint_dir, num_labels=label_count)

In [215]:
from finetune_token_classifier import create_trainer

# Wrap the loaded model, the prepared splits and the tokenizer in a trainer.
# NOTE(review): max_epochs only matters if training is started; the cells shown
# here only call evaluate(). report_to="none" presumably disables external
# experiment logging - confirm in finetune_token_classifier.create_trainer.
trainer = create_trainer(
    model,
    dataset=dataset,
    tokenizer=tokenizer,
    labels=glosses,
    batch_size=64,
    max_epochs=600,
    report_to="none",
)

Creating trainer...


In [216]:
# Run evaluation with the trainer built above and display the returned metrics
# dict as the cell result (kept as the last expression so it renders).
# NOTE(review): presumably evaluates the 'dev' split - confirm in create_trainer.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1334
  Batch size = 64


PREDS [[36  1 50 ... 65 65 65]
 [38  1 53 ...  9  9  9]
 [40  1 40 ...  9  9  9]
 ...
 [42  1 39 ... 34 34 34]
 [54  1 57 ...  0  0  0]
 [37 44  1 ...  9  9  9]]
LABELS [[  36    1   50 ... -100 -100 -100]
 [  45    1   53 ... -100 -100 -100]
 [  40    1   43 ... -100 -100 -100]
 ...
 [  42    1   39 ... -100 -100 -100]
 [  54    1   57 ... -100 -100 -100]
 [  37   44    1 ... -100 -100 -100]]
(1334, 64)
Preds:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'NOM', '[SEP]', 'NOM']
Labels:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'S', '[SEP]', 'DEM']


{'eval_loss': 0.4575164318084717,
 'eval_average_accuracy': 0.8228964872644873,
 'eval_accuracy': 0.8245183458093017,
 'eval_runtime': 2.0218,
 'eval_samples_per_second': 659.803,
 'eval_steps_per_second': 10.387}