# Sanity Check: BERT-Causal L2R vs. R2L Using the 1% Models

## Prepare Data + Model

In [1]:
# Show the small forward-order text file that get_dataloader (below) reads as its dataset.
!cat ./examples/data/text_forward-small.txt

Rødberg is located in the Norwegian traditional district and valley of Numedal .
The plants are affiliated with Statkraft , the Norwegian state owned electricity company .
The final passenger service ended in 1988 . The rail line north of Rollag was closed in 1989 .

Persecution of Jews in Europe increased in the High Middle Ages in the context of the Christian Crusades .
In 1394 , 100 , 000 Jews were expelled from France .
Jews were indeed infected in numbers similar to their non - Jewish neighbors Yet they were still made scapegoats .
Jewish daily life was very satisfying .
Jews lived among Jews .


In [2]:
import torch

from newlm.lm.elmo.lm_builder import ELMOLMBuilder

from newlm.lm.elmo.modeling_elmo.elmo_head import ELMOBertLMHeadModel
from transformers import BertConfig, BertLMHeadModel
from newlm.lm.bert.modeling_bert.bert_head import BertLMHeadR2LModel

from newlm.utils.file_util import read_from_yaml

#### Model Bert Causal

In [3]:
# Left-to-right (L2R) causal BERT: checkpoint directory first, then the weights.
pt_l2r = "./outputs/en.1-percent.bert-causal"
model_l2r = BertLMHeadModel.from_pretrained(pt_l2r)  # use pre-trained model

# Run config for this model; the model type is set explicitly after loading.
config_l2r = read_from_yaml("examples/configs/run.1-percent-bert-causal.yaml")
config_l2r["lm"]["model_type"] = "bert-causal"

In [4]:
# Put the L2R model in evaluation mode so train-only behaviour (e.g. dropout)
# does not make the sanity-check loss vary between runs.
model_l2r.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


In [5]:
# Right-to-left (R2L) causal BERT: checkpoint directory first, then the weights.
pt_r2l = "./outputs/en.1-percent.bert-causal-r2l"
model_r2l = BertLMHeadR2LModel.from_pretrained(pt_r2l)  # use pre-trained model

# NOTE(review): this reads the same YAML file as the L2R setup; only the
# model_type entry differs. Confirm that sharing the config is intended.
config_r2l = read_from_yaml("examples/configs/run.1-percent-bert-causal.yaml")
config_r2l["lm"]["model_type"] = "bert-causal-r2l"

In [6]:
# Put the R2L model in evaluation mode so train-only behaviour (e.g. dropout)
# does not make the sanity-check loss vary between runs.
model_r2l.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


#### Model ELMO Bert Causal

In [7]:
# ELMO-style causal BERT: checkpoint directory first, then the weights.
pt_elmo = "./outputs/en.1-percent.elmo-bert-causal"
model_elmo = ELMOBertLMHeadModel.from_pretrained(pt_elmo)  # use pre-trained model

# Run config for the ELMO model (it has its own YAML file).
config_elmo = read_from_yaml("examples/configs/run.1-percent-elmo-bert-causal.yaml")

In [8]:
# Put the ELMO model in evaluation mode so train-only behaviour (e.g. dropout)
# does not make the sanity-check loss vary between runs.
model_elmo.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


In [9]:
# model_l2r  # uncomment to display the L2R model's repr

In [10]:
# model_r2l  # uncomment to display the R2L model's repr

### Data

In [11]:
def get_dataloader(
    config_file,
    tokenizer_dir,
    model_type,
    model,
    train_path="./examples/data/text_forward-small.txt",
    max_len=128,
):
    """Build a training dataloader and tokenizer for one model.

    Uses the project's ``ELMOLMBuilder`` to construct the dataset and data
    collator, then wraps them in a Hugging Face ``Trainer`` only to obtain the
    dataloader that the trainer would iterate over.

    Args:
        config_file: Parsed run config (the notebook passes the result of
            ``read_from_yaml``). Only ``['lm']['model']['config']`` and
            ``['lm']['hf_trainer']['args']`` are read.
        tokenizer_dir: Forwarded to ``ELMOLMBuilder`` as ``tokenizer``; the
            notebook passes the model's checkpoint directory.
        model_type: Model type string forwarded to ``ELMOLMBuilder``.
        model: Model handed to ``Trainer``.
        train_path: Text file to build the dataset from. Defaults to the small
            forward-order sample file used throughout this notebook.
        max_len: ``max_len`` forwarded to ``ELMOLMBuilder``. Defaults to 128.

    Returns:
        tuple: ``(dataloader, tokenizer)`` where ``dataloader`` comes from
        ``Trainer.get_train_dataloader()`` and ``tokenizer`` is the builder's
        ``tokenizer`` attribute.
    """
    # Function-scope import, as in the original cell.
    from transformers import TrainingArguments, Trainer

    lm_builder = ELMOLMBuilder(
        model_config=config_file['lm']['model']['config'],
        tokenizer=tokenizer_dir,
        model_type=model_type,
        max_len=max_len,
    )

    # NOTE(review): _get_dataset is a private method of the builder; switch to
    # a public entry point if the builder exposes one.
    dataset = lm_builder._get_dataset(train_path)

    # Copy the YAML trainer args so the caller's config is not mutated, and
    # tolerate an empty section. Setting output_dir on the copy (instead of
    # passing it next to **args) avoids a "multiple values for keyword
    # argument" TypeError if the YAML ever defines output_dir itself; the
    # scratch directory always wins.
    trainer_kwargs = dict(config_file['lm']['hf_trainer']['args'] or {})
    trainer_kwargs["output_dir"] = "tmpout"
    args = TrainingArguments(**trainer_kwargs)

    # The Trainer is used only as a dataloader factory; no training is run.
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=lm_builder.data_collator,
        train_dataset=dataset,
    )
    return trainer.get_train_dataloader(), lm_builder.tokenizer

#### Data for Bert-Causal

In [12]:
%%capture
# One dataloader/tokenizer pair per direction. Each call passes that model's
# own checkpoint directory as the tokenizer source.
dl_l2r, tknz_l2r = get_dataloader(config_l2r, pt_l2r, "bert-causal", model_l2r)
dl_r2l, tknz_r2l = get_dataloader(config_r2l, pt_r2l, "bert-causal-r2l", model_r2l)

2022-01-08 17:02:31.434 | INFO     | newlm.lm.elmo.lm_builder:_get_dataset:145 - Constructing roBERTa style dataset
max_steps is given, it will override any value given in num_train_epochs
Didn't find file ./outputs/en.1-percent.bert-causal-r2l/tokenizer.json. We won't load it.
Didn't find file ./outputs/en.1-percent.bert-causal-r2l/added_tokens.json. We won't load it.
Didn't find file ./outputs/en.1-percent.bert-causal-r2l/special_tokens_map.json. We won't load it.
Didn't find file ./outputs/en.1-percent.bert-causal-r2l/tokenizer_config.json. We won't load it.
loading file ./outputs/en.1-percent.bert-causal-r2l/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file ./outputs/en.1-percent.bert-causal-r2l/config.json
Model config BertConfig {
  "architectures": [
    "BertLMHeadR2LModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size"

In [13]:
# Take the first batch from the L2R dataloader and show the shape of its token-id tensor.
batch_l2r = next(iter(dl_l2r))
batch_l2r['input_ids'].shape

torch.Size([1, 127])

In [14]:
# Take the first batch from the R2L dataloader and show the shape of its token-id tensor.
batch_r2l = next(iter(dl_r2l))
batch_r2l['input_ids'].shape

torch.Size([1, 127])

#### Data for ELMO Bert-Causal

In [15]:
%%capture
# Dataloader/tokenizer pair for the ELMO model, built from its own config and checkpoint directory.
dl_elmo, tknz_elmo = get_dataloader(config_elmo, pt_elmo, "elmo-bert-causal", model_elmo)

Didn't find file ./outputs/en.1-percent.elmo-bert-causal/tokenizer.json. We won't load it.
Didn't find file ./outputs/en.1-percent.elmo-bert-causal/added_tokens.json. We won't load it.
Didn't find file ./outputs/en.1-percent.elmo-bert-causal/special_tokens_map.json. We won't load it.
Didn't find file ./outputs/en.1-percent.elmo-bert-causal/tokenizer_config.json. We won't load it.
loading file ./outputs/en.1-percent.elmo-bert-causal/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file ./outputs/en.1-percent.elmo-bert-causal/config.json
Model config BertConfig {
  "architectures": [
    "ELMOBertLMHeadModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_at

In [16]:
# Take the first batch from the ELMO dataloader and show the shape of its token-id tensor.
batch_elmo = next(iter(dl_elmo))
batch_elmo['input_ids'].shape

torch.Size([1, 127])

In [17]:
import pandas as pd

def print_batch(batch_f, tknz):
    """Return the tokens of the first sequence in a batch as a DataFrame.

    Despite the name, nothing is printed: the caller gets a one-column frame
    (column ``"data"``) and the notebook displays it as the cell result.

    Args:
        batch_f: Mapping with an ``'input_ids'`` entry; only row 0 is used.
        tknz: Object providing ``convert_ids_to_tokens``.

    Returns:
        pandas.DataFrame with one row per token of the first sequence.
    """
    first_sequence_ids = batch_f['input_ids'][0]
    token_strings = tknz.convert_ids_to_tokens(first_sequence_ids)
    return pd.DataFrame({"data": token_strings})

In [18]:
# Token frames for both directions, built in the same order as before (L2R, then R2L).
df_l2r, df_r2l = (
    print_batch(batch, tokenizer)
    for batch, tokenizer in ((batch_l2r, tknz_l2r), (batch_r2l, tknz_r2l))
)

# Side-by-side view: one column per direction, aligned on token position.
token_columns = {"l2r": df_l2r["data"], "r2l": df_r2l["data"]}
pd.DataFrame(token_columns)

Unnamed: 0,l2r,r2l
0,[CLS],[CLS]
1,R,R
2,##ø,##ø
3,##d,##d
4,##berg,##berg
...,...,...
122,lived,lived
123,among,among
124,Jews,Jews
125,.,.


The L2R and R2L models receive the same (forward-ordered) input. For R2L, the input is flipped inside the model's `forward` method.

In [19]:
# Token view of the first sequence in the ELMO batch, for comparison with the L2R/R2L table above.
print_batch(batch_elmo, tknz_elmo)

Unnamed: 0,data
0,[CLS]
1,R
2,##ø
3,##d
4,##berg
...,...
122,lived
123,among
124,Jews
125,.


In [20]:
# Re-assert evaluation mode on all three models right before the forward passes.
for _model in (model_elmo, model_l2r, model_r2l):
    _model.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


## Sanity Check

In [27]:
%%capture
res_l2r = model_l2r(**batch_l2r)
res_r2l = model_r2l(**batch_r2l)

In [28]:
# Report the loss of each direction from the forward passes above.
for label, result in (("l2r_loss", res_l2r), ("r2l_loss", res_r2l)):
    print(label, result.loss)

l2r_loss tensor(4.3849, grad_fn=<NllLossBackward>)
r2l_loss tensor(4.4385, grad_fn=<NllLossBackward>)


In [22]:
# The notebook reports exp(loss) as perplexity. Compute it from the losses
# returned by the forward passes instead of from numbers copied out of an
# earlier cell's output, so the values cannot go stale when the notebook is
# re-run. `torch` is already imported in the notebook's import cell.
# detach() drops any autograd history; reshape(1) keeps the one-element tensor
# form of the previous output (e.g. tensor([80.2302])).
print("PP Bert-Causal l2r", torch.exp(res_l2r.loss.detach().reshape(1)))
print("PP Bert-Causal r2l", torch.exp(res_r2l.loss.detach().reshape(1)))

PP Bert-Causal l2r tensor([80.2302])
PP Bert-Causal r2l tensor([84.6479])


In [23]:
#### ELMO BERT-Causal
# Inference only: no_grad() stops PyTorch from recording an autograd graph that
# this notebook never backpropagates through. The loss values are unchanged;
# the tensors simply carry no grad_fn.
with torch.no_grad():
    res_elmo = model_elmo(**batch_elmo)

l2r_loss tensor(4.4801, grad_fn=<NllLossBackward>)
r2l_loss tensor(4.3589, grad_fn=<NllLossBackward>)


In [24]:
# exp(loss) per direction for the ELMO model. The loss constants are copied
# from the l2r_loss / r2l_loss lines shown in the previous cell's output.
for label, loss_value in (
    ("PP ELMO Bert-Causal l2r", 4.4801),
    ("PP ELMO Bert-Causal r2l", 4.3589),
):
    print(label, torch.exp(torch.Tensor([loss_value])))

PP ELMO Bert-Causal l2r tensor([88.2435])
PP ELMO Bert-Causal r2l tensor([78.1711])


In [25]:
# Display the complete R2L output object (loss and logits) for manual inspection.
# NOTE(review): the "Original L2R {...}" dump shown before the result is not
# printed by this cell; presumably it comes from a debug print inside the model.
model_r2l(**batch_r2l)

Original L2R
{'input_ids': tensor([[    2,    54,  1100,  1007,  4854,  1833,  3234,  1783,  1765,  9456,
          5718,  3829,  1782,  8725,  1780, 20592, 17660,  1020,    18,  1811,
          7614,  1928, 13083,  1827,  9643,  1008,  3438,    16,  1765,  9456,
          3054,  5167, 11056,  3204,    18,  1811,  3378,  8418,  3818,  4663,
          1783,  5628,    18,  1811,  4155,  2982,  3456,  1780, 11448,  1855,
          1795,  3403,  1783,  5546,    18,     3, 27648, 14056,  1792,  1780,
          7581,  1783,  3370,  5928,  1783,  1765,  3550,  6677, 16024,  1783,
          1765, 10694,  1780,  1765,  4629, 17474,  4456,    18,  1909, 17911,
          1045,    16,  3761,    16,  2956,  7581,  1905, 17193,  1868,  4164,
            18,  7581,  1905,  8576, 15844,  1783,  6188,  4048,  1779,  2021,
          3676,    17,  6273, 15442,  8479,  1983,  1905,  2292,  2202,  2049,
          3226,  4141,  3034,    18,  6273,  8319,  2459,  1795,  2485, 22558,
            18,  7581,  4

CausalLMOutputWithCrossAttentions(loss=tensor(4.4385, grad_fn=<NllLossBackward>), logits=tensor([[[-7.6350,  2.8295,  4.7944,  ..., -3.1778, -2.5064, -2.4032],
         [-7.9487,  0.1907, -0.6603,  ..., -3.8263, -4.2424, -3.2813],
         [-7.8663,  0.1811,  1.2252,  ..., -6.0053, -2.7640, -3.0149],
         ...,
         [-5.2620,  2.7503,  3.8883,  ...,  0.9219,  1.1852,  0.7565],
         [-7.2864,  2.0949, 14.6970,  ..., -2.6282,  0.7408, -0.8565],
         [-5.4013,  2.5424, 20.6766,  ...,  0.1895, -1.1299, -1.9379]]],
       grad_fn=<AddBackward0>), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)

In [21]:
# Display the complete ELMO output object for manual inspection: a single loss
# plus separate l2r_logits and r2l_logits. In the recorded run the loss (8.8390)
# equals the sum of the two directional losses printed earlier (4.4801 + 4.3589).
model_elmo(**batch_elmo)

L2R Input
{'input_ids': tensor([[    2,    54,  1072,  1017,  4856,  1835,  3236,  1785,  1767,  9458,
          5720,  3831,  1784,  8727,  1782, 20594, 17662,  1012,    18,  1813,
          7616,  1930, 13085,  1829,  9645,  1010,  3440,    16,  1767,  9458,
          3056,  5169, 11058,  3206,    18,  1813,  3380,  8420,  3820,  4665,
          1785,  5630,    18,  1813,  4157,  2984,  3458,  1782, 11450,  1857,
          1797,  3405,  1785,  5548,    18,     3, 27650, 14057,  1794,  1782,
          7583,  1785,  3372,  5930,  1785,  1767,  3552,  6679, 16026,  1785,
          1767, 10696,  1782,  1767,  4631, 17476,  4458,    18,  1911, 17912,
          1061,    16,  3763,    16,  2958,  7583,  1907, 17195,  1870,  4166,
            18,  7583,  1907,  8578, 15846,  1785,  6190,  4050,  1781,  2023,
          3678,    17,  6275, 15444,  8481,  1985,  1907,  2294,  2204,  2051,
          3228,  4143,  3036,    18,  6275,  8321,  2461,  1797,  2487, 22560,
            18,  7583,  4371

ElmoGPTCausalLMOutput(loss=tensor(8.8390, grad_fn=<AddBackward0>), logits=None, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None, l2r_hidden_states=None, r2l_hidden_states=None, l2r_logits=tensor([[[-7.2616,  2.7625, -6.6354,  ...,  0.0755,  1.3210,  1.4225],
         [-4.2509,  3.7727, -4.6725,  ...,  0.2850, -1.1770,  0.3096],
         [-6.0543,  0.6120, -5.8872,  ..., -0.0918, -3.1188, -3.3738],
         ...,
         [-9.5750,  2.8917, -9.0198,  ..., -2.6988, -4.2076, -4.2855],
         [-6.2482,  2.5143, -6.2989,  ..., -1.6477,  1.0244, -2.7994],
         [-5.5948,  2.1233, -6.3670,  ..., -0.8482,  0.5466, -0.8888]]],
       grad_fn=<UnsafeViewBackward>), r2l_logits=tensor([[[-7.0511,  2.5356,  4.3553,  ..., -4.7546, -0.8196, -4.1463],
         [-8.4824,  0.5386, -1.8902,  ..., -3.4352, -2.3649, -3.6365],
         [-8.0600,  0.3871,  1.8165,  ..., -3.3751, -4.1591, -4.9914],
         ...,
         [-5.3477,  1.0564,  2.8627,  ...,  0.5925, -3.5095, 