In [19]:
import random

from data_handling import load_data_file, create_vocab, prepare_dataset, create_gloss_vocab
from uspanteko_morphology import morphology
from tokenizer import WordLevelTokenizer
from datasets import DatasetDict

# Raw GenBench lines. The "story" category is the training data; the other
# categories and the eval_OOD / eval_ID files are only used for evaluation.
train_data = load_data_file("../data/GenBench/categories/story")
eval_advice_data = load_data_file("../data/GenBench/categories/advice")
eval_history_data = load_data_file("../data/GenBench/categories/history")
eval_personal_data = load_data_file("../data/GenBench/categories/personal")
eval_ood = load_data_file("../data/GenBench/eval_OOD")
eval_id = load_data_file("../data/GenBench/eval_ID")

# Sequence length handed to the tokenizer as model_max_length.
MODEL_INPUT_LENGTH = 64
# NOTE(review): device is hard-coded; change it when running on hardware
# without an 'mps' backend.
device = 'mps'

# The vocabulary is built from the training morphemes only.
train_vocab = create_vocab([line.morphemes() for line in train_data], threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=MODEL_INPUT_LENGTH)

glosses = create_gloss_vocab(morphology)

# Split name -> raw lines. Insertion order is the order in which the splits
# are processed and stored in the DatasetDict.
raw_splits = {
    'train': train_data,
    'advice': eval_advice_data,
    'history': eval_history_data,
    'personal': eval_personal_data,
    'eval_OOD': eval_ood,
    'eval_ID': eval_id,
}

# Every split goes through the same tokenizer / gloss-label preparation.
dataset = DatasetDict({
    split_name: prepare_dataset(data=split_lines, tokenizer=tokenizer, labels=glosses, device=device)
    for split_name, split_lines in raw_splits.items()
})
# "dev" is an alias for the OOD evaluation split (same Dataset object).
dataset["dev"] = dataset["eval_OOD"]

  0%|          | 0/5843 [00:00<?, ?ex/s]

  0%|          | 0/612 [00:00<?, ?ex/s]

  0%|          | 0/1334 [00:00<?, ?ex/s]

  0%|          | 0/3644 [00:00<?, ?ex/s]

  0%|          | 0/481 [00:00<?, ?ex/s]

  0%|          | 0/2219 [00:00<?, ?ex/s]

In [23]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import AutoModelForMaskedLM

import math

# Masked-LM checkpoint whose perplexity is measured on the held-out categories.
model = AutoModelForMaskedLM.from_pretrained("../models/usp-mlm-absolute-micro")
BATCH_SIZE = 64
EPOCHS = 2000

# Masking is applied at collation time with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="pt")

args = TrainingArguments(
    output_dir="../training-checkpoints",
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=3,
    num_train_epochs=EPOCHS,
    report_to="wandb",
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['personal'],
    data_collator=data_collator
)

# Perplexity is exp(mean eval loss); report it per category, in the same
# order as before (personal, history, advice). `eval_results` keeps the
# metrics of the last evaluated split.
for split_name in ("personal", "history", "advice"):
    eval_results = trainer.evaluate(dataset[split_name])
    print(f"Perplexity ({split_name}): {math.exp(eval_results['eval_loss']):.2f}")

loading configuration file ../models/usp-mlm-absolute-micro/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/usp-mlm-absolute-micro",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 64,
  "model_type": "roberta",
  "num_attention_heads": 5,
  "num_hidden_layers": 3,
  "pad_token_id": 2,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 2284
}

loading weights file ../models/usp-mlm-absolute-micro/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from th

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: morphemes, glosses, translation, transcription, segmentation. If morphemes, glosses, translation, transcription, segmentation are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1334
  Batch size = 64


Perplexity (personal): 74.42


The following columns in the evaluation set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: morphemes, glosses, translation, transcription, segmentation. If morphemes, glosses, translation, transcription, segmentation are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 612
  Batch size = 64


Perplexity (history): 80.33
Perplexity (advice): 72.03


In [21]:
from transformers import AutoModelForTokenClassification
from finetune_token_classifier import create_trainer

# Load a locally saved token-classification checkpoint.
model = AutoModelForTokenClassification.from_pretrained(
    "../models/full-flat-1-0.0wd"
)

# Wrap it in the project's trainer; BATCH_SIZE comes from the MLM cell.
trainer = create_trainer(
    model,
    dataset=dataset,
    tokenizer=tokenizer,
    labels=glosses,
    batch_size=BATCH_SIZE,
    max_epochs=300,
    weight_decay=0,
    report_to='wandb',
)

# The metrics dict for the in-distribution eval split is the cell output.
trainer.evaluate(dataset["eval_ID"])

loading configuration file ../models/full-flat-1-0.0wd/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/full-flat-1-0.0wd",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABE

Creating trainer...


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


PREDS [[54  1 65 ... 43 43 43]
 [ 9 43 43 ... 43 43 43]
 [ 9 43 43 ... 43 43 43]
 ...
 [54 62  1 ... 43 43 43]
 [ 0  1 43 ... 43 43 43]
 [60  1 57 ... 43 43 43]]
LABELS [[  54    1   26 ... -100 -100 -100]
 [   9   43 -100 ... -100 -100 -100]
 [   9   43 -100 ... -100 -100 -100]
 ...
 [  54   62    1 ... -100 -100 -100]
 [  55    1   47 ... -100 -100 -100]
 [  60    1   54 ... -100 -100 -100]]
(2219, 64)
Preds:	 ['ADV', '[SEP]', 'PREP', 'VT', 'SC', '[SEP]', 'INC', 'E1S', 'VT', '[SEP]', 'PRON']
Labels:	 ['ADV', '[SEP]', 'INC', 'VT', 'SC', '[SEP]', 'INC', 'E1S', 'VI', '[SEP]', 'PRON']


{'eval_loss': 0.9371381402015686,
 'eval_average_accuracy': 0.6970855803162448,
 'eval_accuracy': 0.7011510688496461,
 'eval_runtime': 3.3441,
 'eval_samples_per_second': 663.55,
 'eval_steps_per_second': 10.466}

In [22]:
# Evaluate whichever `trainer` is currently live in the kernel on the
# eval_OOD split; the returned metrics dict is shown as the cell output.
trainer.evaluate(dataset["eval_OOD"])

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: morphemes, glosses, translation, transcription, segmentation. If morphemes, glosses, translation, transcription, segmentation are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 481
  Batch size = 64


PREDS [[36  1 50 ... 43 43 43]
 [38  1 53 ... 43 43 43]
 [40  1 40 ... 43 43 43]
 ...
 [54  1 57 ... 43 43 43]
 [39  1 53 ... 54 54 54]
 [26 12 41 ... 39 39 39]]
LABELS [[  36    1   50 ... -100 -100 -100]
 [  45    1   53 ... -100 -100 -100]
 [  40    1   43 ... -100 -100 -100]
 ...
 [  54    1   57 ... -100 -100 -100]
 [  39    1   53 ... -100 -100 -100]
 [  26    4   40 ... -100 -100 -100]]
(481, 64)
Preds:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'NOM', '[SEP]', 'NOM']
Labels:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'S', '[SEP]', 'DEM']


{'eval_loss': 0.7386803030967712,
 'eval_average_accuracy': 0.7611720345883254,
 'eval_accuracy': 0.7661029147175243,
 'eval_runtime': 0.7167,
 'eval_samples_per_second': 671.129,
 'eval_steps_per_second': 11.162}

In [24]:
# Display the eval_OOD split summary (feature names and row count).
dataset["eval_OOD"]

Dataset({
    features: ['transcription', 'translation', 'glosses', 'segmentation', 'morphemes', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 481
})

In [246]:
from taxonomic_loss_model import TaxonomicLossModel

# Load the taxonomic-loss checkpoint with one output per gloss label.
model = TaxonomicLossModel.from_pretrained(
    "../models/finetuned-harmonic-micro-absolute",
    num_labels=len(glosses),
    loss_sum='harmonic',
)
# Attach the morphology tree, limited to depth 5.
# NOTE(review): presumably this defines the per-level losses that show up as
# "LEVEL 0..4" in the output - confirm in taxonomic_loss_model.
model.use_morphology_tree(morphology, max_depth=5)

trainer = create_trainer(
    model,
    dataset=dataset,
    tokenizer=tokenizer,
    labels=glosses,
    batch_size=BATCH_SIZE,
    max_epochs=300,
    report_to='wandb',
)

# The metrics dict for the "history" category is the cell output.
trainer.evaluate(dataset["history"])

loading configuration file ../models/finetuned-harmonic-micro-absolute/config.json
Model config RobertaConfig {
  "_name_or_path": "michaelginn/uspanteko-mlm-large",
  "architectures": [
    "TaxonomicLossModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "2

Creating trainer...
LEVEL 0 tensor(0.1205)
LEVEL 1 tensor(0.1883)
LEVEL 2 tensor(0.1997)
LEVEL 3 tensor(0.3032)
LEVEL 4 tensor(0.6175)


LEVEL 0 tensor(0.1926)
LEVEL 1 tensor(0.2344)
LEVEL 2 tensor(0.2749)
LEVEL 3 tensor(0.4350)
LEVEL 4 tensor(0.8691)
LEVEL 0 tensor(0.1542)
LEVEL 1 tensor(0.1739)
LEVEL 2 tensor(0.1856)
LEVEL 3 tensor(0.2823)
LEVEL 4 tensor(0.5788)
LEVEL 0 tensor(0.1377)
LEVEL 1 tensor(0.1564)
LEVEL 2 tensor(0.1737)
LEVEL 3 tensor(0.2797)
LEVEL 4 tensor(0.5662)
LEVEL 0 tensor(0.1106)
LEVEL 1 tensor(0.1723)
LEVEL 2 tensor(0.2235)
LEVEL 3 tensor(0.3381)
LEVEL 4 tensor(0.6871)
LEVEL 0 tensor(0.1361)
LEVEL 1 tensor(0.1433)
LEVEL 2 tensor(0.1847)
LEVEL 3 tensor(0.2805)
LEVEL 4 tensor(0.5678)
LEVEL 0 tensor(0.0526)
LEVEL 1 tensor(0.1004)
LEVEL 2 tensor(0.1376)
LEVEL 3 tensor(0.2137)
LEVEL 4 tensor(0.4415)
LEVEL 0 tensor(0.0972)
LEVEL 1 tensor(0.1363)
LEVEL 2 tensor(0.1848)
LEVEL 3 tensor(0.2861)
LEVEL 4 tensor(0.5878)
LEVEL 0 tensor(0.1084)
LEVEL 1 tensor(0.1465)
LEVEL 2 tensor(0.1878)
LEVEL 3 tensor(0.3136)
LEVEL 4 tensor(0.6470)
LEVEL 0 tensor(0.0653)
LEVEL 1 tensor(0.0834)
LEVEL 2 tensor(0.1266)
LEVEL 3 ten

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


PREDS [[36  1 50 ...  0  0  0]
 [38  1 53 ...  0  0  0]
 [40  1 40 ...  0  0  0]
 ...
 [42  1 39 ... 54 54 54]
 [54  1 57 ... 54 54 54]
 [37 44  1 ...  0  0  0]]
LABELS [[36  1 50 ... 66 66 66]
 [45  1 53 ... 66 66 66]
 [40  1 43 ... 66 66 66]
 ...
 [42  1 39 ... 66 66 66]
 [54  1 57 ... 66 66 66]
 [37 44  1 ... 66 66 66]]
(1334, 64)
Preds:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'NOM', '[SEP]', 'NOM']
Labels:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'S', '[SEP]', 'DEM']


{'eval_loss': 1.3436601161956787,
 'eval_average_accuracy': 0.786959505925574,
 'eval_accuracy': 0.7890538716406921,
 'eval_runtime': 2.3222,
 'eval_samples_per_second': 574.446,
 'eval_steps_per_second': 9.043}

In [247]:
# Evaluate the currently live `trainer` on the eval_OOD split.
# NOTE(review): the recorded output reports 4256 examples, while the eval_OOD
# split is displayed with 481 rows elsewhere in this notebook - the output was
# likely produced against a different kernel state; re-run from a fresh kernel.
trainer.evaluate(dataset["eval_OOD"])

The following columns in the evaluation set don't have a corresponding argument in `TaxonomicLossModel.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `TaxonomicLossModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4256
  Batch size = 64


LEVEL 0 tensor(0.1608)
LEVEL 1 tensor(0.1406)
LEVEL 2 tensor(0.1928)
LEVEL 3 tensor(0.3055)
LEVEL 4 tensor(0.6462)
LEVEL 0 tensor(0.1818)
LEVEL 1 tensor(0.1839)
LEVEL 2 tensor(0.2246)
LEVEL 3 tensor(0.3512)
LEVEL 4 tensor(0.7232)
LEVEL 0 tensor(0.0829)
LEVEL 1 tensor(0.0974)
LEVEL 2 tensor(0.1585)
LEVEL 3 tensor(0.2417)
LEVEL 4 tensor(0.5008)
LEVEL 0 tensor(0.1212)
LEVEL 1 tensor(0.1372)
LEVEL 2 tensor(0.1655)
LEVEL 3 tensor(0.2376)
LEVEL 4 tensor(0.4712)
LEVEL 0 tensor(0.1481)
LEVEL 1 tensor(0.1825)
LEVEL 2 tensor(0.1983)
LEVEL 3 tensor(0.3041)
LEVEL 4 tensor(0.6210)
LEVEL 0 tensor(0.1221)
LEVEL 1 tensor(0.1408)
LEVEL 2 tensor(0.1824)
LEVEL 3 tensor(0.2885)
LEVEL 4 tensor(0.5907)
LEVEL 0 tensor(0.1431)
LEVEL 1 tensor(0.2021)
LEVEL 2 tensor(0.2381)
LEVEL 3 tensor(0.3809)
LEVEL 4 tensor(0.7718)
LEVEL 0 tensor(0.1279)
LEVEL 1 tensor(0.1303)
LEVEL 2 tensor(0.1568)
LEVEL 3 tensor(0.2659)
LEVEL 4 tensor(0.5416)
LEVEL 0 tensor(0.0886)
LEVEL 1 tensor(0.1007)
LEVEL 2 tensor(0.1562)
LEVEL 3 ten

{'eval_loss': 1.769002079963684,
 'eval_average_accuracy': 0.7375037916324279,
 'eval_accuracy': 0.7433526011560694,
 'eval_runtime': 7.074,
 'eval_samples_per_second': 601.643,
 'eval_steps_per_second': 9.471}

In [253]:
# Rebind the "dev" entry to the "history" category.
# This mutates the shared DatasetDict that other cells also read.
dataset["dev"] = dataset["history"]

# Start from the relative_key_query checkpoint with a classification head
# sized to the gloss vocabulary.
model = AutoModelForTokenClassification.from_pretrained(
    "../models/usp-lang-relative_key_query-micro",
    num_labels=len(glosses),
)
trainer = create_trainer(
    model,
    dataset=dataset,
    tokenizer=tokenizer,
    labels=glosses,
    batch_size=BATCH_SIZE,
    max_epochs=300,
    report_to='wandb',
)

trainer.train()

loading configuration file ../models/usp-lang-relative_key_query-micro/config.json
Model config RobertaConfig {
  "_name_or_path": "../models/usp-lang-relative_key_query-micro",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL

Creating trainer...


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1334
  Batch size = 64


PREDS [[54  1 54 ... 43 43 43]
 [ 1  1 54 ... 43 43 43]
 [43  1 54 ... 43 43 43]
 ...
 [54  1 43 ... 43 43 43]
 [54  1 39 ... 43 43 43]
 [43 43  1 ... 43 43 43]]
LABELS [[  36    1   50 ... -100 -100 -100]
 [  45    1   53 ... -100 -100 -100]
 [  40    1   43 ... -100 -100 -100]
 ...
 [  42    1   39 ... -100 -100 -100]
 [  54    1   57 ... -100 -100 -100]
 [  37   44    1 ... -100 -100 -100]]
(1334, 64)
Preds:	 ['ADV', '[SEP]', 'ADV', '[SEP]', 'ADV', '[SEP]', 'ADV']
Labels:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'S', '[SEP]', 'DEM']


Saving model checkpoint to ../training-checkpoints/checkpoint-30
Configuration saved in ../training-checkpoints/checkpoint-30/config.json
Model weights saved in ../training-checkpoints/checkpoint-30/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1334
  Batch size = 64


PREDS [[54  1 54 ... 43 43 43]
 [38  1 54 ... 43 43 43]
 [43  1 54 ... 43 43 43]
 ...
 [54  1 43 ... 43 43 43]
 [54  1 54 ... 43 43 43]
 [43 43  1 ... 43 43 43]]
LABELS [[  36    1   50 ... -100 -100 -100]
 [  45    1   53 ... -100 -100 -100]
 [  40    1   43 ... -100 -100 -100]
 ...
 [  42    1   39 ... -100 -100 -100]
 [  54    1   57 ... -100 -100 -100]
 [  37   44    1 ... -100 -100 -100]]
(1334, 64)
Preds:	 ['ADV', '[SEP]', 'ADV', '[SEP]', 'ADV', '[SEP]', 'ADV']
Labels:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'S', '[SEP]', 'DEM']


Saving model checkpoint to ../training-checkpoints/checkpoint-60
Configuration saved in ../training-checkpoints/checkpoint-60/config.json
Model weights saved in ../training-checkpoints/checkpoint-60/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1334
  Batch size = 64


PREDS [[54  1 54 ... 43 43 43]
 [38  1 53 ... 43 43 43]
 [43  1 54 ... 43 43 43]
 ...
 [42  1 43 ... 43 43 43]
 [54  1 54 ... 43 43 43]
 [43 43  1 ... 43 43 43]]
LABELS [[  36    1   50 ... -100 -100 -100]
 [  45    1   53 ... -100 -100 -100]
 [  40    1   43 ... -100 -100 -100]
 ...
 [  42    1   39 ... -100 -100 -100]
 [  54    1   57 ... -100 -100 -100]
 [  37   44    1 ... -100 -100 -100]]
(1334, 64)
Preds:	 ['ADV', '[SEP]', 'ADV', '[SEP]', 'ADV', '[SEP]', 'ADV']
Labels:	 ['EXS', '[SEP]', 'NUM', '[SEP]', 'S', '[SEP]', 'DEM']


Saving model checkpoint to ../training-checkpoints/checkpoint-90
Configuration saved in ../training-checkpoints/checkpoint-90/config.json
Model weights saved in ../training-checkpoints/checkpoint-90/pytorch_model.bin


KeyboardInterrupt: 

In [262]:
# Load the hub checkpoint with a token-classification head sized to the gloss
# vocabulary and display its resolved config.
# NOTE(review): this instantiates the whole model only to read `.config`.
AutoModelForTokenClassification.from_pretrained("michaelginn/usp-mlm-genbench",
                                                num_labels=len(glosses)).config

loading configuration file https://huggingface.co/michaelginn/usp-mlm-genbench/resolve/main/config.json from cache at /Users/milesper/.cache/huggingface/transformers/a9afbb4060625af7d7df340d4b3c12529a2ce02389d93941e0a9c0e816028821.6397773dd83107d1040497818bcfcc0e22d450f16335ecd6c34d128812d92083
Model config RobertaConfig {
  "_name_or_path": "michaelginn/usp-mlm-genbench",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL

RobertaConfig {
  "_name_or_path": "michaelginn/usp-mlm-genbench",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",
    "31": "LABEL_

In [257]:
# Same config inspection for the relative_key_query checkpoint on the hub.
# NOTE(review): this instantiates the whole model only to read `.config`.
AutoModelForTokenClassification.from_pretrained("michaelginn/usp-lang-relative_key_query-micro",
                                                num_labels=len(glosses)).config

loading configuration file https://huggingface.co/michaelginn/usp-lang-relative_key_query-micro/resolve/main/config.json from cache at /Users/milesper/.cache/huggingface/transformers/ea8855f043b5a1f8318bf563a18468cfd79684505aa12404edc1df9fc90d1bcc.b852993b5a3265e4f22ced7049b0f3bb4e8e146614e7c4e8bf50c8adb05e8907
Model config RobertaConfig {
  "_name_or_path": "michaelginn/usp-lang-relative_key_query-micro",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    

RobertaConfig {
  "_name_or_path": "michaelginn/usp-lang-relative_key_query-micro",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",


In [256]:
# Number of entries in the vocabulary built from the training morphemes.
# NOTE(review): the MLM checkpoint config reports vocab_size 2284 against 2280
# here; presumably the difference is special tokens - TODO confirm.
len(train_vocab)

2280

In [265]:
# Run the live `trainer` on eval_OOD and keep only the `.predictions` field.
# NOTE(review): the recorded output has shape (4256, 64), while eval_OOD is
# displayed with 481 rows elsewhere - the output likely came from a different
# kernel state.
preds = trainer.predict(dataset['eval_OOD']).predictions

PREDS [[43  1 26 ... 43 43 43]
 [14 43 43 ... 43 43 43]
 [14 43 43 ... 43 43 43]
 ...
 [54  1 43 ... 43 43 43]
 [43  1 53 ... 43 43 43]
 [39 14  1 ... 43 43 43]]
LABELS [[  54    1   26 ... -100 -100 -100]
 [   9   43 -100 ... -100 -100 -100]
 [   9   43 -100 ... -100 -100 -100]
 ...
 [  57    1   50 ... -100 -100 -100]
 [  43    1   53 ... -100 -100 -100]
 [   3   53    1 ... -100 -100 -100]]
(4256, 64)
Preds:	 ['S', '[SEP]', 'INC', 'VT', 'E3S', '[SEP]', 'INC', 'VT', 'VT', '[SEP]', 'PRON']
Labels:	 ['ADV', '[SEP]', 'INC', 'VT', 'SC', '[SEP]', 'INC', 'E1S', 'VI', '[SEP]', 'PRON']


array([[43,  1, 26, ..., 43, 43, 43],
       [14, 43, 43, ..., 43, 43, 43],
       [14, 43, 43, ..., 43, 43, 43],
       ...,
       [54,  1, 43, ..., 43, 43, 43],
       [43,  1, 53, ..., 43, 43, 43],
       [39, 14,  1, ..., 43, 43, 43]])

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: translation, morphemes, transcription, segmentation, glosses. If translation, morphemes, transcription, segmentation, glosses are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4256
  Batch size = 64


PREDS [[43  1 26 ... 43 43 43]
 [14 43 43 ... 43 43 43]
 [14 43 43 ... 43 43 43]
 ...
 [54  1 43 ... 43 43 43]
 [43  1 53 ... 43 43 43]
 [39 14  1 ... 43 43 43]]
LABELS [[  54    1   26 ... -100 -100 -100]
 [   9   43 -100 ... -100 -100 -100]
 [   9   43 -100 ... -100 -100 -100]
 ...
 [  57    1   50 ... -100 -100 -100]
 [  43    1   53 ... -100 -100 -100]
 [   3   53    1 ... -100 -100 -100]]
(4256, 64)
Preds:	 ['S', '[SEP]', 'INC', 'VT', 'E3S', '[SEP]', 'INC', 'VT', 'VT', '[SEP]', 'PRON']
Labels:	 ['ADV', '[SEP]', 'INC', 'VT', 'SC', '[SEP]', 'INC', 'E1S', 'VI', '[SEP]', 'PRON']


In [266]:
# Display the array of predicted label ids.
preds

array([[43,  1, 26, ..., 43, 43, 43],
       [14, 43, 43, ..., 43, 43, 43],
       [14, 43, 43, ..., 43, 43, 43],
       ...,
       [54,  1, 43, ..., 43, 43, 43],
       [43,  1, 53, ..., 43, 43, 43],
       [39, 14,  1, ..., 43, 43, 43]])

In [267]:
# Map every predicted label id back to its gloss string, one list per
# sequence. Only ids outside the gloss vocabulary are dropped; positions that
# correspond to padding are NOT filtered, so their predictions are decoded too.
decoded_preds = [
    [glosses[gloss_id] for gloss_id in predicted_row if 0 <= gloss_id < len(glosses)]
    for predicted_row in preds
]

In [270]:
# Print the decoded gloss sequence of the first example. The long run of
# repeated glosses at the end comes from decoded padding positions.
print(decoded_preds[0])

['S', '[SEP]', 'INC', 'VT', 'E3S', '[SEP]', 'INC', 'VT', 'VT', '[SEP]', 'PRON', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S']


In [275]:
# Print the raw transcription of the first eval_OOD example.
print(dataset['eval_OOD'][0]['transcription'])


antonses chib'aanik tanb'ij iin.


In [283]:
# Morpheme sequence of the first raw eval_ood line; '[SEP]' entries separate words.
eval_ood[0].morphemes()

['antonses',
 '[SEP]',
 'chi',
 "b'an",
 'ik',
 '[SEP]',
 't',
 'an',
 "b'ij",
 '[SEP]',
 'iin']

In [18]:
# Number of lines in the OOD test split.
# NOTE(review): `test_ood` is only defined in cells that appear further down
# in this notebook, so this cell fails on a fresh top-to-bottom run.
len(test_ood)

853

In [16]:
# Load the held-out test splits: OOD first, then ID.
test_ood, test_id = (
    load_data_file("../data/GenBench/test_OOD"),
    load_data_file("../data/GenBench/test_ID"),
)

In [25]:
# Display the full DatasetDict (every split with its features and row count).
dataset

DatasetDict({
    train: Dataset({
        features: ['transcription', 'translation', 'glosses', 'segmentation', 'morphemes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5843
    })
    advice: Dataset({
        features: ['transcription', 'translation', 'glosses', 'segmentation', 'morphemes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 612
    })
    history: Dataset({
        features: ['transcription', 'translation', 'glosses', 'segmentation', 'morphemes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1334
    })
    personal: Dataset({
        features: ['transcription', 'translation', 'glosses', 'segmentation', 'morphemes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3644
    })
    eval_OOD: Dataset({
        features: ['transcription', 'translation', 'glosses', 'segmentation', 'morphemes', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 481
    })
    eval_ID: Dataset({
        features: ['transcription', 

In [1]:
from data_handling import write_igt, load_data_file
import random

# Fixed seed so the generated split files are reproducible across re-runs.
SPLIT_SEED = 42

story = load_data_file("../data/GenBench/categories/story")
advice = load_data_file("../data/GenBench/categories/advice")
history = load_data_file("../data/GenBench/categories/history")
personal = load_data_file("../data/GenBench/categories/personal")

# In-distribution pool: story + personal. Out-of-distribution pool: advice + history.
id_data = story + personal
ood_data = advice + history

# A dedicated generator keeps the shuffle deterministic without touching the
# global `random` state used elsewhere in the kernel.
split_rng = random.Random(SPLIT_SEED)
split_rng.shuffle(id_data)
split_rng.shuffle(ood_data)

# Half of the OOD pool is used for evaluation, the remainder for testing.
count_ood = len(ood_data) // 2

eval_ood = ood_data[:count_ood]
test_ood = ood_data[count_ood:]

# The ID eval split is given the same size as the OOD eval split; everything
# else in the ID pool becomes training data.
eval_id = id_data[:count_ood]
train = id_data[count_ood:]

# NOTE(review): other cells load "../data/GenBench/eval_OOD", "eval_ID",
# "test_OOD" and "test_ID", but this cell writes lower-case "*.txt" names and
# no test_id file - confirm that load_data_file resolves to these outputs.
write_igt(eval_ood, '../data/GenBench/eval_ood.txt')
write_igt(eval_id, '../data/GenBench/eval_id.txt')
write_igt(test_ood, '../data/GenBench/test_ood.txt')
write_igt(train, '../data/GenBench/train.txt')

In [3]:
# Number of lines in the freshly generated OOD eval split (half of the OOD pool).
len(eval_ood)

973