# Sanity Check: BERT-Causal L2R vs. R2L (and ELMO BERT-Causal) using the 1-percent models

## Prepare Data + Model

In [1]:
# Print the small forward-order sample text; the same file path is used as the dataset in get_dataloader below.
!cat ./examples/data/text_forward-small.txt

Rødberg is located in the Norwegian traditional district and valley of Numedal .
The plants are affiliated with Statkraft , the Norwegian state owned electricity company .
The final passenger service ended in 1988 . The rail line north of Rollag was closed in 1989 .

Persecution of Jews in Europe increased in the High Middle Ages in the context of the Christian Crusades .
In 1394 , 100 , 000 Jews were expelled from France .
Jews were indeed infected in numbers similar to their non - Jewish neighbors Yet they were still made scapegoats .
Jewish daily life was very satisfying .
Jews lived among Jews .


In [3]:
import torch

from newlm.lm.elmo.lm_builder import ELMOLMBuilder

from newlm.lm.elmo.modeling_elmo.elmo_head import ELMOBertLMHeadModel
from transformers import BertConfig, BertLMHeadModel
from newlm.lm.bert.modeling_bert.bert_head import BertLMHeadR2LModel

from newlm.utils.file_util import read_from_yaml

#### Model Bert Causal

In [4]:
# Left-to-right (L2R) causal BERT: restore the trained checkpoint first ...
pt_l2r = "./outputs/en.1-percent.bert-causal"
model_l2r = BertLMHeadModel.from_pretrained(pt_l2r)

# ... then read its run configuration and pin the model type explicitly.
config_l2r = read_from_yaml("examples/configs/run.1-percent-bert-causal.yaml")
config_l2r["lm"]["model_type"] = "bert-causal"

In [5]:
# Switch the L2R model to evaluation mode before any loss is computed.
model_l2r.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


In [6]:
# Right-to-left (R2L) causal BERT: restore the trained checkpoint first ...
pt_r2l = "./outputs/en.1-percent.bert-causal-r2l"
model_r2l = BertLMHeadR2LModel.from_pretrained(pt_r2l)

# ... then read the run configuration. This is the same YAML file as the L2R run;
# only the model type is overridden to the R2L variant.
config_r2l = read_from_yaml("examples/configs/run.1-percent-bert-causal.yaml")
config_r2l["lm"]["model_type"] = "bert-causal-r2l"

In [7]:
# Switch the R2L model to evaluation mode before any loss is computed.
model_r2l.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


#### Model ELMO Bert Causal

In [27]:
# ELMO-style causal BERT: restore the trained checkpoint first ...
pt_elmo = "./outputs/en.1-percent.elmo-bert-causal"
model_elmo = ELMOBertLMHeadModel.from_pretrained(pt_elmo)

# ... then read the run configuration that the data helper consumes later.
config_elmo = read_from_yaml("examples/configs/run.1-percent-elmo-bert-causal.yaml")

loading configuration file ./outputs/en.1-percent.elmo-bert-causal/config.json
Model config BertConfig {
  "architectures": [
    "ELMOBertLMHeadModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

loading weights file ./outputs/en.1-percent.elmo-bert-causal/pytorch_model.bin
All model checkpoint weights were used when initializing ELMOBertLMHeadModel.

All the weights of ELMOBertLMHeadModel were initialized from the model checkpoint at ./outputs/en.1-percent.elmo-bert-

In [28]:
# Switch the ELMO model to evaluation mode before any loss is computed.
model_elmo.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


In [11]:
# model_l2r

In [10]:
# model_r2l

### Data

In [12]:
def get_dataloader(
    config_file,
    tokenizer_dir,
    model_type,
    model,
    train_path="./examples/data/text_forward-small.txt",
    max_len=128,
    output_dir="tmpout",
):
    """Build a training dataloader and tokenizer for one model via ELMOLMBuilder.

    Parameters
    ----------
    config_file : mapping
        Parsed run configuration. Reads ``config_file['lm']['model']['config']``
        (passed to the builder) and ``config_file['lm']['hf_trainer']['args']``
        (expanded as keyword arguments into ``TrainingArguments``).
    tokenizer_dir : str
        Passed to ``ELMOLMBuilder`` as ``tokenizer``; the notebook passes the
        checkpoint directory here.
    model_type : str
        Model type forwarded to ``ELMOLMBuilder``.
    model :
        Model instance handed to the Hugging Face ``Trainer``.
    train_path : str, optional
        Text file turned into the dataset. Defaults to the small forward sample.
    max_len : int, optional
        ``max_len`` forwarded to ``ELMOLMBuilder``. Defaults to 128.
    output_dir : str, optional
        ``output_dir`` for ``TrainingArguments``. Defaults to ``"tmpout"``.

    Returns
    -------
    tuple
        ``(dataloader, tokenizer)``: the trainer's train dataloader and the
        tokenizer held by the builder.
    """
    # Kept local so the helper cell stays self-contained, as in the original.
    from transformers import TrainingArguments, Trainer

    # The builder is used purely as a helper for tokenizer, collator and dataset.
    lm_builder = ELMOLMBuilder(
        model_config=config_file['lm']['model']['config'],
        tokenizer=tokenizer_dir,
        model_type=model_type,
        max_len=max_len,
    )

    dataset = lm_builder._get_dataset(train_path)

    # NOTE(review): if the YAML trainer args ever contain `output_dir`, this call
    # raises a duplicate-keyword TypeError (same as before this change).
    training_args = TrainingArguments(
        output_dir=output_dir, **config_file['lm']['hf_trainer']['args']
    )

    # The Trainer is only instantiated to obtain its train dataloader; nothing is trained.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=lm_builder.data_collator,
        train_dataset=dataset,
    )

    return trainer.get_train_dataloader(), lm_builder.tokenizer

#### Data for Bert-Causal

In [13]:
%%capture
# Build one (dataloader, tokenizer) pair per direction with get_dataloader.
dl_l2r, tknz_l2r = get_dataloader(config_l2r, pt_l2r, "bert-causal", model_l2r)
dl_r2l, tknz_r2l = get_dataloader(config_r2l, pt_r2l, "bert-causal-r2l", model_r2l)

2022-01-08 16:48:32.435 | INFO     | newlm.lm.elmo.lm_builder:_get_dataset:145 - Constructing roBERTa style dataset
max_steps is given, it will override any value given in num_train_epochs
Didn't find file ./outputs/en.1-percent.bert-causal-r2l/tokenizer.json. We won't load it.
Didn't find file ./outputs/en.1-percent.bert-causal-r2l/added_tokens.json. We won't load it.
Didn't find file ./outputs/en.1-percent.bert-causal-r2l/special_tokens_map.json. We won't load it.
Didn't find file ./outputs/en.1-percent.bert-causal-r2l/tokenizer_config.json. We won't load it.
loading file ./outputs/en.1-percent.bert-causal-r2l/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file ./outputs/en.1-percent.bert-causal-r2l/config.json
Model config BertConfig {
  "architectures": [
    "BertLMHeadR2LModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size"

In [14]:
# Take the first batch from the L2R dataloader and display the shape of its input ids.
batch_l2r = next(iter(dl_l2r))
batch_l2r['input_ids'].shape

torch.Size([1, 127])

In [15]:
# Take the first batch from the R2L dataloader and display the shape of its input ids.
batch_r2l = next(iter(dl_r2l))
batch_r2l['input_ids'].shape

torch.Size([1, 127])

#### Data for ELMO Bert-Causal

In [29]:
%%capture
# Build the (dataloader, tokenizer) pair for the ELMO model with get_dataloader.
dl_elmo, tknz_elmo = get_dataloader(config_elmo, pt_elmo, "elmo-bert-causal", model_elmo)

Didn't find file ./outputs/en.1-percent.elmo-bert-causal/tokenizer.json. We won't load it.
Didn't find file ./outputs/en.1-percent.elmo-bert-causal/added_tokens.json. We won't load it.
Didn't find file ./outputs/en.1-percent.elmo-bert-causal/special_tokens_map.json. We won't load it.
Didn't find file ./outputs/en.1-percent.elmo-bert-causal/tokenizer_config.json. We won't load it.
loading file ./outputs/en.1-percent.elmo-bert-causal/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file ./outputs/en.1-percent.elmo-bert-causal/config.json
Model config BertConfig {
  "architectures": [
    "ELMOBertLMHeadModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_at

In [30]:
# Take the first batch from the ELMO dataloader and display the shape of its input ids.
batch_elmo = next(iter(dl_elmo))
batch_elmo['input_ids'].shape

torch.Size([1, 127])

In [16]:
import pandas as pd

def print_batch(batch_f, tknz):
    """Return the first sequence of a batch as a one-column token table.

    Parameters
    ----------
    batch_f : mapping
        Batch whose ``'input_ids'`` entry is indexable; only row 0 is used.
    tknz :
        Tokenizer-like object exposing ``convert_ids_to_tokens``.

    Returns
    -------
    pandas.DataFrame
        A frame with a single ``"data"`` column holding the token strings.
        Despite the name, nothing is printed; the caller displays the frame.
    """
    first_sequence_ids = batch_f["input_ids"][0]
    token_column = {"data": tknz.convert_ids_to_tokens(first_sequence_ids)}
    return pd.DataFrame(token_column)

In [21]:
# Tokenise the first sequence of each direction's batch (L2R first, then R2L) ...
df_l2r, df_r2l = (
    print_batch(batch, tokenizer)
    for batch, tokenizer in ((batch_l2r, tknz_l2r), (batch_r2l, tknz_r2l))
)
# ... and line the two token columns up side by side for a visual comparison.
pd.DataFrame({"l2r": df_l2r["data"], "r2l": df_r2l["data"]})

Unnamed: 0,l2r,r2l
0,[CLS],[CLS]
1,R,R
2,##ø,##ø
3,##d,##d
4,##berg,##berg
...,...,...
122,lived,lived
123,among,among
124,Jews,Jews
125,.,.


Both models receive the same forward-ordered input. The R2L model is expected to flip the sequence inside its `forward` method.

In [31]:
# Show the tokens of the first sequence in the ELMO batch.
print_batch(batch_elmo, tknz_elmo)

Unnamed: 0,data
0,[CLS]
1,R
2,##ø
3,##d
4,##berg
...,...
122,lived
123,among
124,Jews
125,.


In [32]:
# Re-assert evaluation mode on all three models right before the sanity check.
model_elmo.eval()
model_l2r.eval()
model_r2l.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


## Sanity Check

In [24]:
# Forward pass of the L2R model on its batch; print the loss the model returns.
res_l2r = model_l2r(**batch_l2r)
print("l2r_loss", res_l2r.loss)

# Same for the R2L model on its batch.
res_r2l = model_r2l(**batch_r2l)
print("r2l_loss", res_r2l.loss)

l2r_loss tensor(4.3849, grad_fn=<NllLossBackward>)
r2l_loss tensor(4.4385, grad_fn=<NllLossBackward>)


In [26]:
import torch

# Perplexity = exp(loss). Derive it from the losses computed in the sanity-check
# cell rather than re-typing the printed numbers, so the values cannot go stale
# when the checkpoint or data changes. `.detach()` drops the autograd graph and
# `.reshape(1)` keeps the 1-element tensor format of the earlier output.
print("PP Bert-Causal l2r", torch.exp(res_l2r.loss.detach()).reshape(1))
print("PP Bert-Causal r2l", torch.exp(res_r2l.loss.detach()).reshape(1))

PP Bert-Causal l2r tensor([80.2302])
PP Bert-Causal r2l tensor([84.6479])


In [33]:
#### ELMO BERT-Causal
# Forward pass of the ELMO model on its batch. This cell has no print call, so the
# l2r_loss / r2l_loss lines shown under it are presumably emitted inside the model's
# forward pass (verify in ELMOBertLMHeadModel).
res_elmo = model_elmo(**batch_elmo) 

l2r_loss tensor(4.4801, grad_fn=<NllLossBackward>)
r2l_loss tensor(4.3589, grad_fn=<NllLossBackward>)


In [34]:
# Perplexity = exp(loss) for each direction of the ELMO model.
# NOTE(review): the two loss values are copied by hand from the l2r_loss / r2l_loss
# lines printed by the previous cell; they must be updated if that output changes.
elmo_l2r_perplexity = torch.Tensor([4.4801]).exp()
elmo_r2l_perplexity = torch.Tensor([4.3589]).exp()
print("PP ELMO Bert-Causal l2r", elmo_l2r_perplexity)
print("PP ELMO Bert-Causal r2l", elmo_r2l_perplexity)

PP ELMO Bert-Causal l2r tensor([88.2435])
PP ELMO Bert-Causal r2l tensor([78.1711])
