In [1]:
import torch
from newlm.lm.bert.lm_builder import LMBuilder
from transformers import BertForMaskedLM

In [2]:
model_path = "./outputs/en.100-percent.bert.1M"
config_path = 'examples/configs_gcloud/run.100-percent.yaml'
model_type = "bert"

### Model Bert Causal

In [3]:
model = BertForMaskedLM.from_pretrained(
    model_path
) # use pre-trained model

In [4]:
model.eval()
print("model in eval mode for consistency")

model in eval mode for consistency


In [6]:
# model.__dict__

#### Helpers

In [7]:
from newlm.utils.file_util import read_from_yaml
config_file = read_from_yaml(config_path)

In [9]:
# trainer (helper)
from transformers import TrainingArguments, Trainer
trainer_args = TrainingArguments(output_dir="tmpout",**config_file['lm']['hf_trainer']['args'])

# lm builder (helper)
lm_builder = LMBuilder(
    model_config = config_file['lm']['model']['config'],
    tokenizer=model_path, # use pre-trained tokenizer
    max_len=128
)

In [10]:
%%capture
from newlm.glue.cls_trainer import ClsTrainer
cls_trainer = ClsTrainer(
    model_path,
    model_path,
    model_type=model_type
)

## Data

In [11]:
%%capture
ds_ori = cls_trainer.helper("mrpc", {})

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
ds_ori['train']['sentence1'][:3]

['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .']

In [13]:
%%capture
ds_moses = cls_trainer.helper("mrpc", {"detokenizer": "moses"})

2021-12-01 17:44:01.440 | INFO     | newlm.glue.cls_trainer:helper:71 - Use detokenizer moses


In [14]:
ds_moses['train']['sentence1'][:3]

['Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.',
 "Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion.",
 'They had published an advertisement on the Internet on June 10, offering the cargo for sale, he added.']

In [15]:
%%capture
ds_tb = cls_trainer.helper("mrpc", {"detokenizer": "treebank"})

2021-12-01 17:44:09.155 | INFO     | newlm.glue.cls_trainer:helper:71 - Use detokenizer treebank


In [16]:
ds_tb['train']['sentence1'][:3]

['Amrozi accused his brother, whom he called " the witness ", of deliberately distorting his evidence.',
 "Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion.",
 'They had published an advertisement on the Internet on June 10, offering the cargo for sale, he added.']

In [17]:
trainer = Trainer(model=model, args=trainer_args, data_collator=lm_builder.data_collator, train_dataset=ds_ori['train'])
dl_ori = trainer.get_train_dataloader() 

max_steps is given, it will override any value given in num_train_epochs
The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: idx, sentence2, sentence1.


In [18]:
trainer = Trainer(model=model, args=trainer_args, data_collator=lm_builder.data_collator, train_dataset=ds_moses['train'])
dl_moses = trainer.get_train_dataloader() 

max_steps is given, it will override any value given in num_train_epochs
The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: idx, sentence2, sentence1.


In [19]:
trainer = Trainer(model=model, args=trainer_args, data_collator=lm_builder.data_collator, train_dataset=ds_tb['train'])
dl_tb = trainer.get_train_dataloader() 

max_steps is given, it will override any value given in num_train_epochs
The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: idx, sentence2, sentence1.


In [20]:
batch_ori = next(iter(dl_ori))
batch_moses = next(iter(dl_moses))
batch_tb = next(iter(dl_tb))

In [21]:
model.eval()
print("model in eval mode for consistency")

model in eval mode for consistency


### RUN

In [22]:
del batch_ori['label']
batch_ori['input_ids']

tensor([[    2,  1785,  3719,  ...,  2100,    18,     3],
        [    2,  1771,  2957,  ...,     0,     0,     0],
        [    2,  1771, 19714,  ...,     0,     0,     0],
        ...,
        [    2,  9344,    11,  ...,     0,     0,     0],
        [    2,  1833,  6441,  ...,     0,     0,     0],
        [    2,  2195,  4241,  ...,     0,     0,     0]])

In [23]:
res_ori = model(**batch_ori)

In [24]:
res_ori.loss

tensor(1.7566, grad_fn=<NllLossBackward>)

In [25]:
del batch_moses['label']
batch_moses['input_ids']

tensor([[    2,  1785,  3719,  ...,  2100,    18,     3],
        [    2,  1771,     4,  ...,     0,     0,     0],
        [    2,  1771, 19714,  ...,     0,     0,     0],
        ...,
        [    2,     4,    11,  ...,     0,     0,     0],
        [    2,  1833,  6441,  ...,     0,     0,     0],
        [    2,  2195,  4241,  ...,     0,     0,     0]])

In [26]:
res_moses = model(**batch_moses)

In [27]:
res_moses.loss

tensor(1.5716, grad_fn=<NllLossBackward>)

In [28]:
del batch_tb['label']
batch_tb['input_ids']

tensor([[    2,  1785,  3719,  ...,  2100,     4,     3],
        [    2,  1771,  2957,  ...,     0,     0,     0],
        [    2,  1771, 19714,  ...,     0,     0,     0],
        ...,
        [    2,  9344,     4,  ...,     0,     0,     0],
        [    2,  1833,  5491,  ...,     0,     0,     0],
        [    2,  2195,  4241,  ...,     0,     0,     0]])

In [29]:
res_tb = model(**batch_tb)

In [30]:
res_tb.loss

tensor(1.5242, grad_fn=<NllLossBackward>)