# MLM training on our generated data

adapted from
https://huggingface.co/transformers/v2.5.1/examples.html#language-model-training
and
https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb

```
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
export TEST_FILE=/path/to/dataset/wiki.test.raw

python run_language_modeling.py \
    --output_dir=output \
    --model_type=roberta \
    --model_name_or_path=roberta-base \
    --do_train \
    --train_data_file=$TRAIN_FILE \
    --do_eval \
    --eval_data_file=$TEST_FILE \
    --mlm
```

In [1]:
import logging
import math
import os

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForPermutationLanguageModeling,
    DataCollatorForWholeWordMask,
    HfArgumentParser,
    LineByLineTextDataset,
    LineByLineWithRefDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
    pipeline
)
from transformers.trainer_utils import is_main_process
import datasets
import pandas as pd

In [2]:
CACHE = './hfcache'
MODEL_NAME = 'google/electra-small-discriminator'

In [3]:
config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE, use_fast=True)

In [5]:
model = AutoModelForMaskedLM.from_pretrained(
    MODEL_NAME,
    from_tf=False,
    config=config,
    cache_dir=CACHE,
)
model.resize_token_embeddings(len(tokenizer))


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForMaskedLM: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['generator_predictions.LayerNorm.bias', 'generator_lm_head.bias', '

Embedding(30522, 128, padding_idx=0)

# build new dataset

In [7]:
df = pd.read_csv('./contrast_dataset/data.csv', index_col='Unnamed: 0')

In [8]:
d = datasets.Dataset.from_pandas(df)

In [9]:
d

Dataset({
    features: ['x', 'y', '__index_level_0__'],
    num_rows: 1792
})

In [11]:
d['x'][0], d['y'][0]

('I thought Bobby was [MASK], but instead he was very hopeless.', 'hopeful')

### A. tokenize

In [12]:
def tokenize_function(examples):
    return tokenizer(examples["x"], max_length=512, truncation=True)

In [13]:
tokenized_datasets = d.map(tokenize_function, batched=True, num_proc=4, )

In [16]:
tokenizer.pad_token_id

0

In [17]:
labels = []
max_len = 0
for i, sentence in enumerate(tokenized_datasets['input_ids']):
    sent_labels = [0] * 30
#     print(sent_labels)
#     print(sentence)
    for j, token in enumerate(sentence):
        if token == 103:
            sent_labels[j] = tokenizer.encode(tokenized_datasets['y'][i])[1]  # cls, token, sep
        else:
            sent_labels[j] = -100
    labels.append(sent_labels)
    max_len = max(max_len, len(sentence))

In [18]:
max_len

22

In [19]:
labels

[[-100,
  -100,
  -100,
  -100,
  -100,
  17772,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [-100,
  -100,
  -100,
  -100,
  -100,
  3697,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [-100,
  -100,
  -100,
  -100,
  -100,
  5220,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [-100,
  -100,
  -100,
  -100,
  -100,
  6047,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [-100,
  -100,
  -100,
  -100,
  -100,
  3407,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [-100,
  -100,
  -

In [20]:
tokenized_datasets = tokenized_datasets.add_column('labels', labels)

In [21]:
lm_datasets = tokenized_datasets.train_test_split()

### (tokenizer check)

In [22]:
tokenizer.mask_token

'[MASK]'

In [23]:
tokenizer.mask_token_id

103

In [24]:
tokenizer.decode(lm_datasets['train'][17]['input_ids'])

'[CLS] i thought mike was [MASK], but instead he was very simple. [SEP]'

In [25]:
lm_datasets['train'][17]['labels']

[-100,
 -100,
 -100,
 -100,
 -100,
 3375,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [26]:
tokenizer.decode([103, 10300])

'[MASK] salary'

### build collator

In [27]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.0)

In [28]:
training_args = TrainingArguments(
    f"finetuned-wikitext2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
)

### This trainer stores data masked - dataloader does nothing

In [34]:
dl = trainer.get_train_dataloader()

The following columns in the training set  don't have a corresponding argument in `ElectraForMaskedLM.forward` and have been ignored: y, x, __index_level_0__.


In [36]:
for s in dl:
    print(s)
    break

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]), 'input_ids': tensor([[  101,  2008,  4562,  2003,  2205,  2658,  1012,  1045,  2215,  1037,
           103,  2028,  2612,  1012,   102,     0,     0,     0],
        [  101,  4901,  2003,  3647,  1010,  2096, 10049,  2003,  2025,  3647,
          1012, 10049,  2003,   103,  1012,   102,     0,     0],
        [  101,  2023,  4937,  2003,  2205,   103,  1012,  1045,  2359,  1037,
          2235,  2028,  2612,  1012,   102,     0,     0,     0],
        [  101,  10

# train / evaluate

In [30]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `ElectraForMaskedLM.forward` and have been ignored: y, x, __index_level_0__.
***** Running training *****
  Num examples = 1344
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 504


Epoch,Training Loss,Validation Loss
1,No log,0.0
2,No log,0.0
3,0.000000,0.0


The following columns in the evaluation set  don't have a corresponding argument in `ElectraForMaskedLM.forward` and have been ignored: y, x, __index_level_0__.
***** Running Evaluation *****
  Num examples = 448
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForMaskedLM.forward` and have been ignored: y, x, __index_level_0__.
***** Running Evaluation *****
  Num examples = 448
  Batch size = 8
Saving model checkpoint to finetuned-wikitext2/checkpoint-500
Configuration saved in finetuned-wikitext2/checkpoint-500/config.json
Model weights saved in finetuned-wikitext2/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `ElectraForMaskedLM.forward` and have been ignored: y, x, __index_level_0__.
***** Running Evaluation *****
  Num examples = 448
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=504, training_loss=0.0, metrics={'train_runtime': 31.1291, 'train_samples_per_second': 129.525, 'train_steps_per_second': 16.191, 'total_flos': 3756407082048.0, 'train_loss': 0.0, 'epoch': 3.0})

In [31]:
model.to('cpu')

ElectraForMaskedLM(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=256, 

In [32]:
p = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [33]:
p(f"HuggingFace is creating a {tokenizer.mask_token} that the community uses to solve NLP tasks.")

[{'sequence': 'huggingface is creating a nba that the community uses to solve nlp tasks.',
  'score': 0.0003424295864533633,
  'token': 6452,
  'token_str': 'nba'},
 {'sequence': 'huggingface is creating a php that the community uses to solve nlp tasks.',
  'score': 0.0003068829537369311,
  'token': 25718,
  'token_str': 'php'},
 {'sequence': 'huggingface is creating a vuelta that the community uses to solve nlp tasks.',
  'score': 0.00028197947540320456,
  'token': 21441,
  'token_str': 'vuelta'},
 {'sequence': 'huggingface is creating a html that the community uses to solve nlp tasks.',
  'score': 0.00027785517158918083,
  'token': 16129,
  'token_str': 'html'},
 {'sequence': 'huggingface is creating a schwarz that the community uses to solve nlp tasks.',
  'score': 0.00027405706350691617,
  'token': 29058,
  'token_str': 'schwarz'}]