In [122]:
from data import load_data_file, create_vocab

# Read both GenBench splits with the same loader: the story_advice_personal
# path supplies the training lines, the history path the evaluation lines.
train_data, eval_data = map(
    load_data_file,
    ("../data/GenBench/story_advice_personal", "../data/GenBench/history"),
)

# Report how many lines each split contains.
print(f"Loaded {len(train_data)} train lines and {len(eval_data)} eval lines")

Loaded 10099 train lines and 1334 eval lines


In [146]:
from tokenizer import WordLevelTokenizer
from data import create_gloss_vocab, prepare_dataset_mlm
from datasets import DatasetDict
from uspanteko_morphology import morphology

# Notebook-wide settings: the value handed to every tokenizer as
# `model_max_length`, and the device string passed to the dataset helpers.
MODEL_INPUT_LENGTH = 64
device = "cpu"

# Gloss vocabulary derived from the imported `morphology` object, and a
# word-level tokenizer over that vocabulary.
glosses = create_gloss_vocab(morphology)
tokenizer = WordLevelTokenizer(model_max_length=MODEL_INPUT_LENGTH, vocab=glosses)

# Build the train split first, then dev, from each line's segmented gloss list.
dataset = DatasetDict({
    split_name: prepare_dataset_mlm(
        data=[line.gloss_list(segmented=True) for line in split_lines],
        tokenizer=tokenizer,
        device=device,
    )
    for split_name, split_lines in (("train", train_data), ("dev", eval_data))
})



  0%|          | 0/10099 [00:00<?, ?ex/s]

  0%|          | 0/1334 [00:00<?, ?ex/s]

In [147]:
# Display the first example of the train split to inspect its fields
# (the output below shows `tokens` and `input_ids`).
dataset['train'][0]

{'tokens': ['ADV',
  '[SEP]',
  'INC',
  'VT',
  'SC',
  '[SEP]',
  'INC',
  'E1S',
  'VI',
  '[SEP]',
  'PRON'],
 'input_ids': [58, 1, 30, 43, 38, 1, 30, 16, 42, 1, 46]}

In [208]:
from transformers import AutoModelForMaskedLM
import torch

# Load the pretrained gloss-denoiser masked-LM checkpoint.
denoiser = AutoModelForMaskedLM.from_pretrained("michaelginn/usp-gloss-denoiser")

# A single hand-built sequence of 60 ids. The first 11 ids are the same as
# dataset['train'][0]['input_ids'] displayed earlier; the attention mask is 1
# for exactly those 11 positions and 0 for the remaining 49.
preds = torch.LongTensor([[58, 1, 30, 43, 38, 1, 30, 16, 42, 1, 46, 16, 19, 58, 33, 58, 52, 58,
                           16, 58, 16, 58, 32, 12, 40, 58, 58, 58, 58, 52, 58, 17, 16, 16, 61, 16,
                           58, 52, 16, 16, 58, 40, 61, 16, 58, 40, 52, 58, 58, 19, 58, 32, 40, 32,
                           58, 52, 52, 16, 52, 58]])
attention_mask = torch.LongTensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

# Inference only: call the module itself (not `.forward`) so registered hooks
# run, and disable autograd so no graph is built for the logits.
with torch.no_grad():
    denoised_logits = denoiser(input_ids=preds, attention_mask=attention_mask).logits
print(denoised_logits)
# Highest-scoring id at every position.
print("Preds after denoising", denoised_logits.argmax(dim=-1))

tensor([[[ -3.7854,  -0.2528, -12.0816,  ...,   0.4677,  -2.5497,   1.0582],
         [ -3.6128,   8.8049,  -9.6641,  ...,  -2.0360,  -2.9220,  -0.2805],
         [ -2.4408,   0.3544,  -7.9838,  ...,  -1.3915,  -2.6055,   0.9511],
         ...,
         [ -2.4071,   0.3933,  -7.6065,  ...,  -1.2953,  -2.1920,  -1.0713],
         [ -2.4872,   0.3304,  -6.5033,  ...,  -1.8509,  -1.7035,   0.0232],
         [ -3.9852,   1.7397, -13.6154,  ...,  -3.0252,  -3.7374,  -0.6126]]],
       grad_fn=<ViewBackward0>)
Preds after denoising tensor([[58,  1, 30, 43, 38,  1, 30, 16, 42,  1, 46, 62, 46, 58, 47, 47, 42, 47,
         47, 42, 47, 47, 47, 47, 62, 58, 47, 58, 58, 42, 58, 17, 16, 16, 61, 16,
         58, 42, 16, 16, 58, 42, 61, 16, 58, 42, 42, 58, 58, 42, 58, 47, 42, 43,
         58, 42, 42, 16, 42, 58]])


In [152]:
from data import prepare_dataset

# Morpheme vocabulary built from the training lines only (threshold=1).
train_vocab = create_vocab([line.morphemes() for line in train_data], threshold=1)

# Rebinds `tokenizer`: from this cell on it is the tokenizer over the morpheme
# vocabulary, not the gloss-vocabulary tokenizer created earlier.
tokenizer = WordLevelTokenizer(model_max_length=MODEL_INPUT_LENGTH, vocab=train_vocab)

# Train split is prepared first, then dev; `glosses` is passed as the label set.
dataset2 = DatasetDict({
    split_name: prepare_dataset(data=split_lines, tokenizer=tokenizer, labels=glosses, device=device)
    for split_name, split_lines in (("train", train_data), ("dev", eval_data))
})

  0%|          | 0/10099 [00:00<?, ?ex/s]

  0%|          | 0/1334 [00:00<?, ?ex/s]

In [182]:
# First training example's token ids, wrapped in a batch dimension of size one.
input_ids = torch.LongTensor([dataset2['train'][0]['input_ids']])

# `preds` and `input_ids` can differ in length (e.g. `preds` narrowed to 60
# positions while the tokenized input has 64). A boolean mask must have the
# same shape as the tensor it indexes, so compare only the leading positions
# both tensors share.
shared_len = min(preds.size(-1), input_ids.size(-1))

# Wherever the input token id is 0, overwrite the prediction with id 3.
# `narrow` returns a view, so the assignment still updates `preds` in place.
# NOTE(review): 0 and 3 look like special-token ids (perhaps unknown -> mask)
# — confirm against the tokenizer vocabulary.
preds.narrow(-1, 0, shared_len)[input_ids.narrow(-1, 0, shared_len) == 0] = 3

preds

tensor([[ 58,   1,  30,  43,  38,   1,  30,  16,  42,   1,  46, -96, -96, -96,
         -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96,
         -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96,
         -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96, -96,
         -96, -96, -96, -96, -96, -96, -96, -96]])

In [183]:
# Overwrite every negative id in `preds` (such as the -96 entries in the
# output above) with id 2, in place, then display the result.
# NOTE(review): 2 is presumably the padding id — confirm against the tokenizer.
preds.masked_fill_(preds < 0, 2)
preds

tensor([[58,  1, 30, 43, 38,  1, 30, 16, 42,  1, 46,  2,  2,  2,  2,  2,  2,  2,
          2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          2,  2,  2,  2,  2,  2,  2,  2,  2,  2]])

In [184]:
# Inference only: call the module itself (not `.forward`) so registered hooks
# run, and skip autograd bookkeeping.
# NOTE(review): no attention mask is passed here, so every position of
# `preds` is attended to — confirm that is intended.
with torch.no_grad():
    denoised_preds = denoiser(preds)

# Highest-scoring id at every position.
denoised_preds.logits.argmax(dim=-1)

tensor([[58,  1, 30, 43, 38,  1, 30, 16, 42,  1, 46,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1]])

In [179]:
# Show the shape of `preds` (the output below reports one row of 60 ids).
preds.size()

torch.Size([1, 60])

In [142]:
from denoised_model import DenoisedModel

# Instantiate DenoisedModel from the "michaelginn/uspanteko-mlm-large"
# checkpoint, with `num_labels` set to the size of the gloss vocabulary.
# NOTE(review): the load warning printed below lists `denoiser.roberta.*`
# weights as newly initialized rather than loaded from this checkpoint —
# confirm the denoiser sub-module gets its trained weights some other way.
model = DenoisedModel.from_pretrained("michaelginn/uspanteko-mlm-large", num_labels=len(glosses))


Some weights of the model checkpoint at michaelginn/uspanteko-mlm-large were not used when initializing DenoisedModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing DenoisedModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DenoisedModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DenoisedModel were not initialized from the model checkpoint at michaelginn/uspanteko-mlm-large and are newly initialized: ['denoiser.roberta.encoder.layer.0.attention.self.key.weight', 'denoiser.roberta.encoder.layer.0.attention.self.value.weight', 'denoiser.roberta.encoder.layer.2.a

In [86]:
import torch

# NOTE(review): no visible cell defines `eval`, and the name shadows the
# Python builtin; on a fresh kernel the subscripts below raise TypeError.
# Presumably it held a tokenized evaluation batch — confirm and rename.
#
# Inference only: call the module itself (not `.forward`) so registered hooks
# run, and disable autograd so no graph is kept for the batch.
with torch.no_grad():
    out = model(input_ids=torch.LongTensor(eval['input_ids']),
                attention_mask=torch.LongTensor(eval['attention_mask']))

In [114]:
# Take the highest-scoring label id at each position, keep only the first 60
# positions along dimension 1, then show the resulting shape.
preds = out.logits.max(dim=-1).indices.narrow(1, 0, 60)
preds.shape

In [92]:
from transformers import AutoModelForMaskedLM

# Load the pretrained gloss-denoiser masked-LM checkpoint.
denoiser = AutoModelForMaskedLM.from_pretrained("michaelginn/usp-gloss-denoiser")

# Attend only to positions whose predicted id is not 2.
# NOTE(review): 2 is presumably the padding id — confirm against the tokenizer.
attention_mask = (preds != 2).long()

# Inference only: call the module itself (not `.forward`) so registered hooks
# run, and skip autograd bookkeeping.
with torch.no_grad():
    denoised_out = denoiser(input_ids=preds, attention_mask=attention_mask)

In [103]:
# Display the raw logits returned by the denoiser for inspection.
denoised_out.logits

tensor([[[-3.3836e+00, -5.1023e-01, -1.0617e+01,  ..., -1.3619e-01,
           1.2214e+00,  1.1689e+00],
         [-3.3444e+00,  5.2346e+00, -8.8257e+00,  ...,  5.2330e-01,
          -3.1739e+00,  2.3671e+00],
         [-3.3963e+00,  3.9592e+00, -9.6772e+00,  ..., -1.2468e+00,
          -1.4130e+00,  1.1068e+00],
         ...,
         [-2.7051e+00,  4.3314e+00, -6.7782e+00,  ..., -6.9547e-01,
          -1.8088e+00,  1.3990e+00],
         [-2.4877e+00,  4.3198e+00, -6.3919e+00,  ..., -7.3392e-01,
          -1.9627e+00,  1.5051e+00],
         [-2.8166e+00,  4.1541e+00, -7.2809e+00,  ..., -1.0397e+00,
          -1.8851e+00,  1.4013e+00]],

        [[-2.8004e+00, -1.2937e-01, -7.9537e+00,  ..., -5.4015e-01,
           9.1588e-01,  1.3190e+00],
         [-3.4990e+00,  7.8989e+00, -9.1389e+00,  ..., -8.5067e-01,
          -3.6493e+00,  1.0768e-02],
         [-3.0368e+00,  3.7025e+00, -8.9272e+00,  ..., -1.8987e+00,
          -1.3557e+00,  3.4993e-01],
         ...,
         [-2.7095e+00,  4

In [77]:
from datasets import DatasetDict
from data import prepare_dataset_mlm
from transformers import DataCollatorForLanguageModeling

# Gloss vocabulary built from `morphology` (imported from uspanteko_morphology),
# the same object the earlier gloss-tokenizer cell uses, with the shared
# MODEL_INPUT_LENGTH / device settings instead of repeated literals.
glosses = create_gloss_vocab(morphology)
tokenizer = WordLevelTokenizer(vocab=glosses, model_max_length=MODEL_INPUT_LENGTH)

# NOTE(review): this rebinds `dataset` to a fresh DatasetDict holding only the
# train split; any previously built 'dev' split is no longer reachable
# through this name.
dataset = DatasetDict()
dataset['train'] = prepare_dataset_mlm(data=[line.gloss_list(segmented=True) for line in train_data],
                                       tokenizer=tokenizer,
                                       device=device)

# Masked-language-modeling collator: masking probability 0.15, PyTorch tensors out.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="pt")

  0%|          | 0/10099 [00:00<?, ?ex/s]

In [211]:
# Dump the denoiser's full state dict (name -> tensor) for inspection.
# NOTE(review): this prints every weight tensor and produces a very large
# output; consider showing only the names and shapes instead.
denoiser.state_dict()

OrderedDict([('roberta.embeddings.position_ids',
              tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
                       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
                       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
                       54, 55, 56, 57, 58, 59, 60, 61, 62, 63]])),
             ('roberta.embeddings.word_embeddings.weight',
              tensor([[-0.0508,  0.0042,  0.1040,  ..., -0.0196,  0.0712, -0.0308],
                      [ 0.1025, -0.0581, -0.0144,  ..., -0.0967, -0.0408,  0.0322],
                      [-0.1701, -0.0950,  0.2806,  ..., -0.1971,  0.1261, -0.1216],
                      ...,
                      [ 0.0093,  0.0255,  0.0605,  ...,  0.0110,  0.0079, -0.0444],
                      [ 0.0033,  0.0424,  0.0091,  ...,  0.0214,  0.0552, -0.0003],
                      [ 0.0682,  0.0908,  0.0248,  ..., -0.0159,  0.0089,  0.0124]])),
      