In [3]:
from data import load_data_file, create_vocab

train_data = load_data_file("../data/GenBench/story_advice_personal")
eval_data = load_data_file("../data/GenBench/history")

print(f"Loaded {len(train_data)} train lines and {len(eval_data)} eval lines")

Loaded 10099 train lines and 1334 eval lines


In [4]:
from tokenizer import WordLevelTokenizer

train_vocab = create_vocab([line.morphemes() for line in train_data], threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=64)

In [68]:
from data import prepare_dataset, create_gloss_vocab
from uspanteko_morphology import morphology as morphology_tree

glosses = create_gloss_vocab(morphology_tree)
train = prepare_dataset(data=train_data, tokenizer=tokenizer, labels=glosses, device="cpu")
eval = prepare_dataset(data=eval_data, tokenizer=tokenizer, labels=glosses, device="cpu")

  0%|          | 0/10099 [00:00<?, ?ex/s]

  0%|          | 0/1334 [00:00<?, ?ex/s]

In [6]:
from denoised_model import DenoisedModel

model = DenoisedModel.from_pretrained("michaelginn/uspanteko-mlm-large", num_labels=len(glosses))


Some weights of the model checkpoint at michaelginn/uspanteko-mlm-large were not used when initializing DenoisedModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing DenoisedModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DenoisedModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DenoisedModel were not initialized from the model checkpoint at michaelginn/uspanteko-mlm-large and are newly initialized: ['denoiser.roberta.encoder.layer.0.attention.self.key.weight', 'denoiser.roberta.encoder.layer.0.attention.self.value.weight', 'denoiser.roberta.encoder.layer.2.a

In [86]:
import torch

out = model.forward(input_ids=torch.LongTensor(eval['input_ids']),
                    attention_mask=torch.LongTensor(eval['attention_mask']))

In [114]:
preds = out.logits.max(-1).indices
preds = preds.narrow(1, 0, 60)
preds.size()

In [92]:
from transformers import AutoModelForMaskedLM

denoiser = AutoModelForMaskedLM.from_pretrained("michaelginn/usp-gloss-denoiser")

attention_mask = (preds != 2).long()
denoised_out = denoiser.forward(input_ids=preds, attention_mask=attention_mask)

In [103]:
denoised_out.logits

tensor([[[-3.3836e+00, -5.1023e-01, -1.0617e+01,  ..., -1.3619e-01,
           1.2214e+00,  1.1689e+00],
         [-3.3444e+00,  5.2346e+00, -8.8257e+00,  ...,  5.2330e-01,
          -3.1739e+00,  2.3671e+00],
         [-3.3963e+00,  3.9592e+00, -9.6772e+00,  ..., -1.2468e+00,
          -1.4130e+00,  1.1068e+00],
         ...,
         [-2.7051e+00,  4.3314e+00, -6.7782e+00,  ..., -6.9547e-01,
          -1.8088e+00,  1.3990e+00],
         [-2.4877e+00,  4.3198e+00, -6.3919e+00,  ..., -7.3392e-01,
          -1.9627e+00,  1.5051e+00],
         [-2.8166e+00,  4.1541e+00, -7.2809e+00,  ..., -1.0397e+00,
          -1.8851e+00,  1.4013e+00]],

        [[-2.8004e+00, -1.2937e-01, -7.9537e+00,  ..., -5.4015e-01,
           9.1588e-01,  1.3190e+00],
         [-3.4990e+00,  7.8989e+00, -9.1389e+00,  ..., -8.5067e-01,
          -3.6493e+00,  1.0768e-02],
         [-3.0368e+00,  3.7025e+00, -8.9272e+00,  ..., -1.8987e+00,
          -1.3557e+00,  3.4993e-01],
         ...,
         [-2.7095e+00,  4

MaskedLMOutput(loss=None, logits=tensor([[[-2.6922,  0.4852, -7.1366,  ..., -0.0273,  0.2026,  1.6019],
         [-3.4051,  5.4612, -8.9014,  ...,  0.0311, -2.6203,  1.8816],
         [-3.2442,  4.4054, -8.2776,  ..., -0.4690, -1.5670,  1.8222],
         ...,
         [-2.5246,  4.3873, -6.4352,  ..., -0.7323, -1.9798,  1.5046],
         [-2.8162,  4.2971, -7.1913,  ..., -0.9337, -1.9164,  1.4325],
         [-2.4159,  4.0938, -6.2263,  ..., -0.9810, -1.8853,  1.6195]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [77]:
from datasets import DatasetDict
from data import prepare_dataset_mlm
from transformers import DataCollatorForLanguageModeling

glosses = create_gloss_vocab(morphology_tree)
tokenizer = WordLevelTokenizer(vocab=glosses, model_max_length=64)

dataset = DatasetDict()
dataset['train'] = prepare_dataset_mlm(data=[line.gloss_list(segmented=True) for line in train_data],
                                       tokenizer=tokenizer,
                                       device="cpu")

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="pt")

  0%|          | 0/10099 [00:00<?, ?ex/s]

In [111]:
torch.count_nonzero(torch.LongTensor(eval['input_ids']) == 0)

tensor(572)

In [112]:
torch.LongTensor(eval['input_ids']).size()

torch.Size([1334, 64])

In [119]:
preds[torch.LongTensor(eval['input_ids']) == 0] = 0
preds.count_zero()

AttributeError: 'Tensor' object has no attribute 'count_zero'

In [121]:
(preds == 0).sum()

tensor(665)