## Prepare data

In [1]:
# Read the forward corpus into a list of raw lines (newlines are stripped in the next cell).
# Opened read-only: the file is only read here, so it should not require write permission.
# NOTE(review): no encoding is pinned here or in the write cell (In [8]); if UTF-8 is
# intended, set it in both places together so the round trip stays consistent.
with open("./examples/data/text_forward-small.txt", "r") as fr:
    lines = fr.readlines()

In [2]:
# Drop leading/trailing whitespace (including the newline) from every line.
lines = list(map(str.strip, lines))

In [3]:
# Tokenise each line on whitespace; `lines` becomes a list of word lists.
lines = list(map(str.split, lines))

In [4]:
# Reverse the word order inside every line (the characters of each word are untouched).
lines = [list(reversed(words)) for words in lines]

In [5]:
# Glue each reversed word list back into a single space-separated string.
lines = list(map(" ".join, lines))

In [6]:
# Reverse the order of the lines themselves, so the last line of the corpus comes first.
lines = list(reversed(lines))

In [7]:
# Collapse the list into one newline-separated string (no trailing newline is added);
# from this cell on `lines` is a str rather than a list.
lines = "\n".join(lines)

In [8]:
# Write the reversed corpus to the backward file, replacing any previous content.
# NOTE(review): "w+" also opens the handle for reading, which is never used here; plain
# "w" would do. No encoding is given, so the platform default applies, exactly as for the
# read in In [1] — pin both together if UTF-8 is intended.
with open("./examples/data/text_backward-small.txt", "w+") as fw:
    fw.write(lines)

## Sanity Check

In [9]:
import torch
from newlm.lm.elmo.modeling_elmo.elmo_head import ELMOBertLMHeadModel
from newlm.lm.elmo.lm_builder import ELMOLMBuilder
from transformers import BertConfig

Model from scratch

In [10]:
from newlm.utils.file_util import read_from_yaml
# Run configuration loaded from YAML; it is indexed below like a nested dict.
config_file = read_from_yaml('examples/configs/run.1-percent-bert-causal.yaml')

# The builder is used in this notebook for its tokenizer, model_config, data_collator and
# dataset helper (see the following cells).
elmo_lm_builder = ELMOLMBuilder(
    model_config = config_file['lm']['model']['config'], # no pretrained model
    tokenizer="./outputs/en.1-percent.elmo-bert-causal", # use pre-trained tokenizer
    model_type="bert-causal-elmo",
    max_len=128
)

# model: constructed directly from a BertConfig (no from_pretrained call), i.e. the
# "from scratch" model this section refers to.
config = BertConfig(**elmo_lm_builder.model_config)
model = ELMOBertLMHeadModel(config=config)

In [11]:
# Switch the model to eval mode before the loss comparisons below; with it, the same
# batch yields identical losses on repeated passes (compare In [16] and In [20]).
model.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


In [12]:
%%capture

# Build one dataset per corpus file via the builder's underscore-prefixed (private) helper.
# NOTE(review): the recorded output still shows the helper's INFO log lines despite
# %%capture. `train_path` is reused for both files and ends up pointing at the backward one.

# dataset-forward
train_path = "./examples/data/text_forward-small.txt"
ds_f = elmo_lm_builder._get_dataset(train_path)

# dataset-backward
train_path = "./examples/data/text_backward-small.txt"
ds_b = elmo_lm_builder._get_dataset(train_path)

2021-11-12 11:07:27.881 | INFO     | newlm.lm.elmo.lm_builder:_get_dataset:142 - Constructing roBERTa style dataset
2021-11-12 11:07:29.324 | INFO     | newlm.lm.elmo.lm_builder:_get_dataset:142 - Constructing roBERTa style dataset


In [13]:
# The Trainer is used purely as a convenient way to obtain train dataloaders; no training
# is started in this cell.
from transformers import TrainingArguments, Trainer

args = TrainingArguments(output_dir="tmpout", **config_file['lm']['hf_trainer']['args'])


def _build_train_loader(dataset):
    """Wrap ``dataset`` in a helper Trainer and return ``(trainer, train_dataloader)``."""
    helper = Trainer(
        model=model,
        args=args,
        data_collator=elmo_lm_builder.data_collator,
        train_dataset=dataset,
    )
    return helper, helper.get_train_dataloader()


# Forward first, then backward; `trainer` ends up bound to the backward-data Trainer.
trainer, dl_f = _build_train_loader(ds_f)  # Data Loader-forward
trainer, dl_b = _build_train_loader(ds_b)  # Data Loader-backward

max_steps is given, it will override any value given in num_train_epochs
max_steps is given, it will override any value given in num_train_epochs


### Compare text_forward and text_backward

In [14]:
# Take the first batch from each loader (forward loader is consumed first, then backward).
batch_f, batch_b = (next(iter(loader)) for loader in (dl_f, dl_b))

# Cell output: the input_ids shape of both batches, forward first.
tuple(batch['input_ids'].shape for batch in (batch_f, batch_b))

(torch.Size([1, 127]), torch.Size([1, 127]))

In [15]:
import pandas as pd

# Decode the first (only) row of each batch back to token strings.
tokens_f, tokens_b = (
    elmo_lm_builder.tokenizer.convert_ids_to_tokens(batch['input_ids'][0])
    for batch in (batch_f, batch_b)
)

# Side-by-side view: row i holds the i-th token of the forward and backward sequences.
pd.DataFrame({"forward": tokens_f, "backward": tokens_b})

Unnamed: 0,forward,backward
0,[CLS],[CLS]
1,R,.
2,##ø,Jews
3,##d,among
4,##berg,lived
...,...,...
122,lived,R
123,among,##ø
124,Jews,##d
125,.,##berg


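Here we can see that the data is not completely flipped: when the tokenizer splits a single word into several sub-word tokens (e.g. `R ##ø ##d ##berg`), the word order is reversed but the sub-word pieces inside each word keep their original order.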

In [16]:
# Forward pass on the forward-corpus batch; the l2r/r2l losses are printed during the call.
# NOTE(review): the printed losses carry a grad_fn, i.e. autograd is tracking this pass —
# consider torch.no_grad() if gradients are not needed for this check.
res = model(**batch_f)

l2r_loss tensor(10.5425, grad_fn=<NllLossBackward>)
r2l_loss tensor(10.4237, grad_fn=<NllLossBackward>)


In [17]:
# Same pass on the backward-corpus batch, for comparison with the forward-corpus losses.
# `res` is overwritten; the losses of interest are the ones printed during the call.
# NOTE(review): autograd is tracking here as well (grad_fn in the printed losses).
res = model(**batch_b)

l2r_loss tensor(10.5305, grad_fn=<NllLossBackward>)
r2l_loss tensor(10.4198, grad_fn=<NllLossBackward>)


### From batch_forward compare Normal vs Rev

In [18]:
import torch


def _flip_inner_tokens(batch_tensor):
    """Reverse every row of a 2-D (batch, seq_len) tensor except its first and last column.

    The first and last positions (the [CLS] / [SEP] slots in this notebook's batches) stay
    in place; everything in between is flipped along the sequence dimension. All rows are
    processed, so this is not limited to a batch size of 1. A new tensor is returned.

    NOTE(review): assumes rows are full-length, unpadded sequences, as in the batch used
    here; with padding the pad positions would be moved to the front as well.
    """
    return torch.cat(
        (
            batch_tensor[:, :1],
            torch.flip(batch_tensor[:, 1:-1], dims=[1]),
            batch_tensor[:, -1:],
        ),
        dim=1,
    )


# Work on clones so the tensors inside batch_f are never touched.
batch_f_input = torch.clone(batch_f['input_ids'])
batch_f_rev_input = _flip_inner_tokens(batch_f_input)

# Labels get the same flip as the inputs.
batch_f_labels = torch.clone(batch_f['labels'])
batch_f_rev_labels = _flip_inner_tokens(batch_f_labels)

# Shallow copy: every other key of the batch is shared with batch_f unchanged; only
# input_ids and labels are replaced by their reversed versions.
batch_rev = batch_f.copy()
batch_rev['input_ids'] = batch_f_rev_input
batch_rev['labels'] = batch_f_rev_labels

In [19]:
import pandas as pd

# Decode the original batch and its manually reversed copy back to token strings.
tokens_f, tokens_f_rev = (
    elmo_lm_builder.tokenizer.convert_ids_to_tokens(batch['input_ids'][0])
    for batch in (batch_f, batch_rev)
)

# Side-by-side view: row i holds the i-th token of the forward and reversed sequences.
pd.DataFrame({"forward": tokens_f, "reverse": tokens_f_rev})

Unnamed: 0,forward,reverse
0,[CLS],[CLS]
1,R,.
2,##ø,Jews
3,##d,among
4,##berg,lived
...,...,...
122,lived,##berg
123,among,##d
124,Jews,##ø
125,.,R


With the exception of [CLS] and [SEP], the tokens are completely flipped.

In [20]:
# Baseline for this section: the same forward-batch pass as In [16]; the recorded losses
# are identical to that earlier run.
# NOTE(review): autograd is tracking this pass (grad_fn in the printed losses).
res = model(**batch_f)

l2r_loss tensor(10.5425, grad_fn=<NllLossBackward>)
r2l_loss tensor(10.4237, grad_fn=<NllLossBackward>)


In [21]:
# Pass on the manually reversed batch (input_ids and labels flipped between the first and
# last position), for comparison with the baseline losses from the previous cell.
# NOTE(review): autograd is tracking this pass (grad_fn in the printed losses).
res = model(**batch_rev)

l2r_loss tensor(10.5323, grad_fn=<NllLossBackward>)
r2l_loss tensor(10.4094, grad_fn=<NllLossBackward>)
