# Sanity Check ELMO Bert-Causal vs Bert-Causal

## Prepare Data + Model

In [1]:
# Print the raw text file; the same path is used as the dataset source in get_dataloader.
!cat ./examples/data/text_forward-small.txt

Rødberg is located in the Norwegian traditional district and valley of Numedal .
The plants are affiliated with Statkraft , the Norwegian state owned electricity company .
The final passenger service ended in 1988 . The rail line north of Rollag was closed in 1989 .

Persecution of Jews in Europe increased in the High Middle Ages in the context of the Christian Crusades .
In 1394 , 100 , 000 Jews were expelled from France .
Jews were indeed infected in numbers similar to their non - Jewish neighbors Yet they were still made scapegoats .
Jewish daily life was very satisfying .
Jews lived among Jews .


In [2]:
import torch
from newlm.lm.elmo.modeling_elmo.elmo_head import ELMOBertLMHeadModel
from newlm.lm.elmo.lm_builder import ELMOLMBuilder
from transformers import BertConfig, BertLMHeadModel
from newlm.utils.file_util import read_from_yaml

#### Model Bert Causal 1M

In [3]:
# Checkpoint directory of the Bert-Causal model; later also passed to
# get_dataloader as the tokenizer location.
pt_bc = "./outputs/en.100-percent.bert-causal.1M"
# Parsed YAML run config; get_dataloader later reads its 'lm' section.
config_bc = read_from_yaml('examples/configs_gcloud/run-100-percent.bert-causal.yaml')

model_bc = BertLMHeadModel.from_pretrained(pt_bc) # use pre-trained model

In [4]:
# Switch the module to evaluation mode (turns off train-only behavior such as
# dropout) before it is used for the loss comparison.
model_bc.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


In [23]:
# model_bc  # (disabled) uncomment to display the loaded module's structure

#### Model ELMO Bert Causal 1M

In [6]:
# Checkpoint directory of the ELMO Bert-Causal model; later also passed to
# get_dataloader as the tokenizer location.
pt_elmo = "./outputs/en.100-percent.elmo-bert-causal.1M"
# Parsed YAML run config; get_dataloader later reads its 'lm' section.
config_elmo = read_from_yaml('examples/configs_gcloud/run-100-percent.elmo-bert-causal.yaml')

model_elmo = ELMOBertLMHeadModel.from_pretrained(pt_elmo) # use pre-trained model

In [7]:
# Switch the module to evaluation mode (turns off train-only behavior such as
# dropout) before it is used for the loss comparison.
model_elmo.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


In [22]:
# model_elmo  # (disabled) uncomment to display the loaded module's structure

### Data

In [9]:
def get_dataloader(config_file, tokenizer_dir, model_type, model,
                   train_path="./examples/data/text_forward-small.txt",
                   max_len=128):
    """Build the training dataloader and tokenizer for one model.

    Args:
        config_file: Parsed YAML run config. ``['lm']['model']['config']`` is
            handed to ``ELMOLMBuilder`` as ``model_config`` and
            ``['lm']['hf_trainer']['args']`` is expanded into
            ``TrainingArguments``.
        tokenizer_dir: Forwarded to ``ELMOLMBuilder`` as ``tokenizer``. The
            callers in this notebook pass the pre-trained checkpoint directory.
        model_type: Forwarded to ``ELMOLMBuilder`` as ``model_type``
            (this notebook uses "elmo-bert-causal" and "bert-causal").
        model: Model instance given to the Hugging Face ``Trainer``.
        train_path: Text file the dataset is built from. Defaults to the small
            sample file this notebook has always used.
        max_len: Forwarded to ``ELMOLMBuilder`` as ``max_len``. Defaults to the
            previously hard-coded 128.

    Returns:
        Tuple ``(dataloader, tokenizer)``: the result of
        ``Trainer.get_train_dataloader()`` and the builder's ``tokenizer``.
    """
    # Kept as a function-scope import so this cell does not depend on any
    # other cell having imported the Trainer classes.
    from transformers import TrainingArguments, Trainer

    # The builder is used only as a helper here: it supplies the dataset,
    # the data collator and the tokenizer. Despite the class name it is
    # also used for the plain "bert-causal" model type.
    lm_builder = ELMOLMBuilder(
        model_config=config_file['lm']['model']['config'],
        tokenizer=tokenizer_dir,
        model_type=model_type,
        max_len=max_len,
    )

    # NOTE: relies on the builder's private _get_dataset helper.
    dataset = lm_builder._get_dataset(train_path)

    # The Trainer is constructed only to obtain its dataloader, so the batch
    # settings come from the same hf_trainer args as a real training run.
    args = TrainingArguments(output_dir="tmpout", **config_file['lm']['hf_trainer']['args'])
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=lm_builder.data_collator,
        train_dataset=dataset,
    )

    return trainer.get_train_dataloader(), lm_builder.tokenizer

#### Data for ELMO Bert-Causal

In [10]:
%%capture
# Build the dataloader and tokenizer for ELMO Bert-Causal; the checkpoint
# directory is passed as the tokenizer location.
dl_elmo, tknz_elmo = get_dataloader(config_elmo, pt_elmo, "elmo-bert-causal", model_elmo)

2021-12-27 11:49:44.560 | INFO     | newlm.lm.elmo.lm_builder:_get_dataset:142 - Constructing roBERTa style dataset
max_steps is given, it will override any value given in num_train_epochs


In [11]:
# Take the first batch from the ELMO dataloader and display the shape of its input_ids.
batch_elmo = next(iter(dl_elmo))
batch_elmo['input_ids'].shape

torch.Size([1, 123])

#### Data for Bert-Causal

In [12]:
%%capture
# Build the dataloader and tokenizer for Bert-Causal; the checkpoint
# directory is passed as the tokenizer location.
dl_bc, tknz_bc = get_dataloader(config_bc, pt_bc, "bert-causal", model_bc)

Didn't find file ./outputs/en.100-percent.bert-causal.1M/tokenizer.json. We won't load it.
Didn't find file ./outputs/en.100-percent.bert-causal.1M/added_tokens.json. We won't load it.
Didn't find file ./outputs/en.100-percent.bert-causal.1M/special_tokens_map.json. We won't load it.
Didn't find file ./outputs/en.100-percent.bert-causal.1M/tokenizer_config.json. We won't load it.
loading file ./outputs/en.100-percent.bert-causal.1M/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file ./outputs/en.100-percent.bert-causal.1M/config.json
Model config BertConfig {
  "architectures": [
    "BertLMHeadModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "bert",
  "num_attent

In [13]:
# Take the first batch from the Bert-Causal dataloader and display the shape of its input_ids.
batch_bc = next(iter(dl_bc))
batch_bc['input_ids'].shape

torch.Size([1, 123])

In [14]:
import pandas as pd

def print_batch(batch_f, tknz):
    """Return the tokens of the first sequence in a batch as a one-column table.

    Args:
        batch_f: Mapping with an ``'input_ids'`` entry; only its first
            element (index 0) is looked at.
        tknz: Object exposing ``convert_ids_to_tokens``.

    Returns:
        A ``pandas.DataFrame`` with a single column named ``"data"`` that
        holds one token per row. Nothing is printed; the frame is returned
        so the notebook renders it.
    """
    first_sequence_ids = batch_f['input_ids'][0]
    return pd.DataFrame({"data": tknz.convert_ids_to_tokens(first_sequence_ids)})

In [15]:
# Tokens of the first sequence in the ELMO Bert-Causal batch.
print_batch(batch_elmo, tknz_elmo)

Unnamed: 0,data
0,[CLS]
1,[UNK]
2,is
3,located
4,in
...,...
118,lived
119,among
120,Jews
121,.


In [16]:
# Tokens of the first sequence in the Bert-Causal batch.
print_batch(batch_bc, tknz_bc)

Unnamed: 0,data
0,[CLS]
1,[UNK]
2,is
3,located
4,in
...,...
118,lived
119,among
120,Jews
121,.


In [17]:
# Both models were already switched to eval mode right after loading; the
# call is repeated here immediately before the forward passes.
model_elmo.eval()
model_bc.eval()
print("Model in eval mode for consistency")

Model in eval mode for consistency


## Sanity Check

In [18]:
#### ELMO BERT-Causal
# Forward pass only: torch.no_grad() avoids building an autograd graph that
# this loss comparison never uses. This cell has no print of its own; the
# l2r_loss / r2l_loss lines in the output appear to be emitted during the
# model call itself.
with torch.no_grad():
    res = model_elmo(**batch_elmo)

l2r_loss tensor(3.9105, grad_fn=<NllLossBackward>)
r2l_loss tensor(3.8871, grad_fn=<NllLossBackward>)


In [19]:
#### Bert-Causal
# Forward pass only: torch.no_grad() avoids building an autograd graph that
# this loss comparison never uses. `res` is reassigned here, so from this
# cell on it holds the Bert-Causal output.
with torch.no_grad():
    res = model_bc(**batch_bc)
print("l2r_loss", res.loss)

l2r_loss tensor(3.9318, grad_fn=<NllLossBackward>)
