# MLM training

adapted from
https://huggingface.co/transformers/v2.5.1/examples.html#language-model-training
and
https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb

```
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
export TEST_FILE=/path/to/dataset/wiki.test.raw

python run_language_modeling.py \
    --output_dir=output \
    --model_type=roberta \
    --model_name_or_path=roberta-base \
    --do_train \
    --train_data_file=$TRAIN_FILE \
    --do_eval \
    --eval_data_file=$TEST_FILE \
    --mlm
```

In [1]:
import logging
import math
import os

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForPermutationLanguageModeling,
    DataCollatorForWholeWordMask,
    HfArgumentParser,
    LineByLineTextDataset,
    LineByLineWithRefDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
    pipeline
)
from transformers.trainer_utils import is_main_process
import datasets

In [2]:
CACHE = './hfcache'
MODEL_NAME = 'google/electra-small-discriminator'

In [3]:
config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE, use_fast=True)

In [5]:
model = AutoModelForMaskedLM.from_pretrained(
    MODEL_NAME,
    from_tf=False,
    config=config,
    cache_dir=CACHE,
)
model.resize_token_embeddings(len(tokenizer))


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForMaskedLM: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['generator_lm_head.weight', 'generator_predictions.LayerNorm.weight

Embedding(30522, 128, padding_idx=0)

# build standard dataset

In [15]:
d = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1')

Reusing dataset wikitext (/home/sambeck/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20)


### A. tokenize

In [14]:
def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=512, truncation=True)

In [16]:
tokenized_datasets = d.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"], )

### B. stick examples together

In [17]:
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [18]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

### (tokenizer check)

In [83]:
tokenizer.mask_token

'[MASK]'

In [50]:
tokenizer.mask_token_id

103

In [82]:
tokenizer.decode(lm_datasets['train'][17]['input_ids'])

"to achieve this, the cooperative elements incorporated into the second game were removed, as they took up a large portion of memory space needed for the improvements. they also adjusted the difficulty settings and ease of play so they could appeal to new players while retaining the essential components of the series'gameplay. the newer systems were decided upon early in development. the character designs were done by raita honjou, who had worked on the previous valkyria chronicles games. when creating the nameless squad, honjou was faced with the same problem he had had during the first game : the military uniforms essentially destroyed character individuality, despite him needing to create"

In [67]:
tokenizer.decode([103, 10300])

'[MASK] salary'

### build collator

In [20]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [21]:
training_args = TrainingArguments(
    f"finetuned-wikitext2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)

### Note that trainer stores data intact - dataloader does the masking

In [58]:
trainer.train_dataset['input_ids'][0]

[101,
 102,
 101,
 1027,
 11748,
 4801,
 4360,
 11906,
 3523,
 1027,
 102,
 101,
 102,
 101,
 12411,
 5558,
 2053,
 11748,
 4801,
 4360,
 1017,
 1024,
 4895,
 2890,
 27108,
 5732,
 11906,
 1006,
 2887,
 1024,
 1856,
 1806,
 1671,
 30222,
 30218,
 30259,
 30227,
 30255,
 30258,
 30219,
 2509,
 1010,
 5507,
 1012,
 11748,
 4801,
 4360,
 1997,
 1996,
 11686,
 1017,
 1007,
 1010,
 4141,
 3615,
 2000,
 2004,
 11748,
 4801,
 4360,
 11906,
 3523,
 2648,
 2900,
 1010,
 2003,
 1037,
 8608,
 2535,
 1030,
 1011,
 1030,
 2652,
 2678,
 2208,
 2764,
 2011,
 16562,
 1998,
 2865,
 1012,
 4432,
 2005,
 1996,
 9160,
 12109,
 1012,
 2207,
 1999,
 2254,
 2249,
 1999,
 2900,
 1010,
 2009,
 2003,
 1996,
 2353,
 2208,
 1999,
 1996,
 11748,
 4801,
 4360,
 2186,
 1012,
 15440,
 1996,
 2168,
 10077,
 1997,
 8608,
 1998,
 2613,
 1030,
 1011,
 1030,
 2051,
 11247,
 2004,
 2049,
 16372,
 1010,
 1996,
 2466,
 3216,
 5903,
 2000]

In [72]:
# what happens inside train:
dl = trainer.get_train_dataloader()
for batch in dl:
    print(batch)
    break

{'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'input_ids': tensor([[  103,  1024,   102,  ...,  2025,  2035,  1996],
        [ 7164,  1996,  2177,  ...,  1006,  9122,  4027],
        [ 5228,  3037,  1999,  ...,  2623,   103,  2709],
        ...,
        [15037,   103,  1037,  ...,  1999,  1996,  4470],
        [ 2000,  3824,  5491,  ...,  2029,  2387,  3163],
        [ 3033,   103,  3469,  ...,  2025, 12599,  9932]]), 'labels': tensor([[10176,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  2177,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  2010,  -100],
        ...,
        [ -100,  1010,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  1997,  -100,  ...,  -100,  -100,  -100]]), 'token_type_i

In [70]:
batch.keys()

dict_keys(['attention_mask', 'input_ids', 'labels', 'token_type_ids'])

In [73]:
# 15% tokens replaced
batch['input_ids'][0]

tensor([  103,  1024,   102,   101,  1440,  1006, 25957,  2072,  1007,  2089,
         2022,   103,  2066,  1443,  5436, 19354,  2072,   103,  2007,  1037,
         2701,  7077,  2012,  1996,  3953,  1012,   102,   101,  1441,  1006,
         2123,  2072, 22906,  2003,  4703,  2517,  2007,   103,  3722,  7077,
         2012,  2327,  1010,  1012,   102,   101,  1446,  1010,   100,  1010,
         1998,   100,  1006,  1047,  1005,   103,  2072,  1010, 24529,  7088,
         1010,  1040,  5831,  3669,  1007,  2024,   103,  2517,  2007,   103,
          103,  7471,   103,  2012,  1996,  2327,  1010,   103,   103,  2005,
         2742,   100,   103, 24529,  7088,  1007, 12950,  1037,  1057,  2007,
         1037, 11737, 10814,  1999,  1996,  2157,  2217,  1012,   102,   101,
          103,   103,  5869,  2072,  1007,  2003,   103,  2517,  2007,   103,
          103,  8115,   103,  1012,   103,  2043,  2035,  2093,  2024,  2517,
         1010,  2027,  1005,  2128,   103,  2025,  2035,  1996])

In [74]:
# only masked tokens are labeled
batch['labels'][0]

tensor([10176,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  2517,  -100,  -100,  1006,  -100,  -100,  1007,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  1007,  -100,  -100,  -100,  -100,  1037,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  2019,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  3227,  -100,  -100,  3442,
         1010,  -100,  3210,  -100,  -100,  -100,  -100,  2061,  2008,  -100,
         -100,  -100,  1006,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         1447,  1006,  -100,  -100,  -100,  -100,  4703,  -100,  -100,  1037,
         2309,  -100,  1010,  -100,  2130,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  3227,  -100,  -100,  -100])

# train / evaluate

In [23]:
trainer.train()

***** Running training *****
  Num examples = 18535
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6951


Epoch,Training Loss,Validation Loss
1,6.3598,6.031094
2,5.7288,5.545131
3,5.579,5.497122


Saving model checkpoint to finetuned-wikitext2/checkpoint-500
Configuration saved in finetuned-wikitext2/checkpoint-500/config.json
Model weights saved in finetuned-wikitext2/checkpoint-500/pytorch_model.bin
Saving model checkpoint to finetuned-wikitext2/checkpoint-1000
Configuration saved in finetuned-wikitext2/checkpoint-1000/config.json
Model weights saved in finetuned-wikitext2/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to finetuned-wikitext2/checkpoint-1500
Configuration saved in finetuned-wikitext2/checkpoint-1500/config.json
Model weights saved in finetuned-wikitext2/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to finetuned-wikitext2/checkpoint-2000
Configuration saved in finetuned-wikitext2/checkpoint-2000/config.json
Model weights saved in finetuned-wikitext2/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1921
  Batch size = 8
Saving model checkpoint to finetuned-wikitext2/checkpoint-2500
Configuration saved in fin

TrainOutput(global_step=6951, training_loss=6.119051658203406, metrics={'train_runtime': 929.3451, 'train_samples_per_second': 59.832, 'train_steps_per_second': 7.479, 'total_flos': 408857383503360.0, 'train_loss': 6.119051658203406, 'epoch': 3.0})

In [27]:
model.to('cpu')

ElectraForMaskedLM(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=256, 

In [28]:
p = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [29]:
p(f"HuggingFace is creating a {tokenizer.mask_token} that the community uses to solve NLP tasks.")

[{'sequence': 'huggingface is creating a song that the community uses to solve nlp tasks.',
  'score': 0.003066012868657708,
  'token': 2299,
  'token_str': 'song'},
 {'sequence': 'huggingface is creating a game that the community uses to solve nlp tasks.',
  'score': 0.0021496417466551065,
  'token': 2208,
  'token_str': 'game'},
 {'sequence': 'huggingface is creating a role that the community uses to solve nlp tasks.',
  'score': 0.0019248999888077378,
  'token': 2535,
  'token_str': 'role'},
 {'sequence': 'huggingface is creating a character that the community uses to solve nlp tasks.',
  'score': 0.0018462331499904394,
  'token': 2839,
  'token_str': 'character'},
 {'sequence': 'huggingface is creating a film that the community uses to solve nlp tasks.',
  'score': 0.0017241048626601696,
  'token': 2143,
  'token_str': 'film'}]