# MLM training with standard MLM masking, following training on our data

Adapted from
https://huggingface.co/transformers/v2.5.1/examples.html#language-model-training
and
https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb



In [2]:
import logging
import math
import os
import numpy as np
from tqdm.notebook import tqdm

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForWholeWordMask,
    DataCollatorWithPadding,
    HfArgumentParser,
    LineByLineTextDataset,
    LineByLineWithRefDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
    pipeline
)
from transformers.trainer_utils import is_main_process
import datasets
import pandas as pd

In [5]:
# Local directory passed as `cache_dir` to the `from_pretrained` calls in the loading cells.
CACHE = './hfcache'
# Checkpoint directory from which the config, tokenizer and model weights are loaded.
MODEL_NAME = './model-mlm-generated-text/checkpoint-35000/'

In [6]:
# Load the model configuration from the checkpoint directory.
config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE)

In [7]:
# Load the tokenizer from the checkpoint directory, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE, use_fast=True)

In [8]:
# Load the masked-LM weights from the checkpoint directory, reusing the config loaded above.
model = AutoModelForMaskedLM.from_pretrained(
    MODEL_NAME,
    from_tf=False,
    config=config,
    cache_dir=CACHE,
)
# Size the token embeddings to the tokenizer's vocabulary; as the last expression
# of the cell, the returned embedding module is shown as the cell output.
model.resize_token_embeddings(len(tokenizer))


Embedding(30522, 128, padding_idx=0)

# Build the standard MLM dataset

In [10]:
# Load the raw WikiText-103 dataset; it provides train, validation and test splits
# with a single 'text' column.
d = datasets.load_dataset('wikitext', 'wikitext-103-raw-v1')

Downloading and preparing dataset wikitext/wikitext-103-raw-v1 (download: 183.09 MiB, generated: 523.97 MiB, post-processed: Unknown size, total: 707.06 MiB) to /home/sambeck/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20...


Downloading:   0%|          | 0.00/192M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset wikitext downloaded and prepared to /home/sambeck/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20. Subsequent calls will reuse this data.


In [11]:
# Show the available splits, their columns and row counts.
d

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

### A. tokenize

In [17]:
def tokenize_function(examples):
    """Tokenize a batch of raw text rows.

    Reads the "text" column of `examples` and passes it to the notebook-level
    `tokenizer`, truncating each row to at most 512 tokens. Returns whatever
    the tokenizer returns for that batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

In [21]:
# Tokenize every split in batches using 4 worker processes; the raw "text"
# column is removed so only the tokenizer outputs remain.
tokenized_datasets = d.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

### B. Build inputs (x) and labels (y) for the model

In [22]:
# Number of tokens per training example produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-split it into fixed-size blocks.

    Args:
        examples: mapping from feature name (e.g. "input_ids") to a list of
            per-row lists. Every feature is expected to have the same total
            length once its rows are concatenated.

    Returns:
        A dict with the same keys as `examples`, where each value is a list of
        chunks of exactly `block_size` items, plus a "labels" entry that is a
        shallow copy of the "input_ids" chunk list.

    Raises:
        KeyError: if `examples` has no "input_ids" feature.
    """
    # Local import keeps this cell runnable on its own.
    from itertools import chain

    # Flatten each feature in one linear pass. `sum(rows, [])` rebuilds the
    # accumulated list on every addition, which is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the trailing remainder that does not fill a whole block; padding
    # could be used instead if the remainder should be kept.
    total_length = (total_length // block_size) * block_size
    # Cut every feature into consecutive block_size-long chunks.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start out identical to the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [23]:
# Regroup every split into fixed-size blocks: group_texts receives batches of
# 1000 tokenized rows, and the work is spread over 4 worker processes.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [24]:
# Show the regrouped splits, their columns and row counts.
lm_datasets

DatasetDict({
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 2199
    })
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 916424
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 1921
    })
})

### (tokenizer check)

In [25]:
# The string the tokenizer uses as its mask token.
tokenizer.mask_token

'[MASK]'

In [26]:
# The vocabulary id of the mask token.
tokenizer.mask_token_id

103

In [28]:
# Decode one regrouped training block back to text as a sanity check.
tokenizer.decode(lm_datasets['train'][5]['input_ids'])

"s battle system, the blitz system, is carried over directly from valkyira chronicles. during missions, players select each unit using a top @ - @ down perspective of the battlefield map : once a character is selected, the player moves the character around the battlefield in third @ - @ person. a character can only act once per @ - @ turn, but characters can be granted multiple turns at the expense of other characters'turns. each character has a field and distance of movement limited by their action gauge. up to nine characters can be assigned to a single mission. during gameplay, characters will call out if something happens to them, such"

In [29]:
# Print up to the first 64 (input id, label) pairs of the first five training blocks.
for i in range(5):
    # Fetch the row once and read both columns from it.
    row = lm_datasets['train'][i]
    _x = row['input_ids']
    _y = row['labels']
    print('\n', i)
    # A separate position index keeps the sample index `i` from being shadowed.
    for j in range(64):
        print(_x[j])
        print(_y[j])
        # Stop early at a label of 0.
        # NOTE(review): labels are a copy of input_ids at this stage, so this
        # only triggers on token id 0 - confirm that is the intended sentinel.
        if _y[j] == 0:
            break



 0
101
101
102
102
101
101
1027
1027
11748
11748
4801
4801
4360
4360
11906
11906
3523
3523
1027
1027
102
102
101
101
102
102
101
101
12411
12411
5558
5558
2053
2053
11748
11748
4801
4801
4360
4360
1017
1017
1024
1024
4895
4895
2890
2890
27108
27108
5732
5732
11906
11906
1006
1006
2887
2887
1024
1024
1856
1856
1806
1806
1671
1671
30222
30222
30218
30218
30259
30259
30227
30227
30255
30255
30258
30258
30219
30219
2509
2509
1010
1010
5507
5507
1012
1012
11748
11748
4801
4801
4360
4360
1997
1997
1996
1996
11686
11686
1017
1017
1007
1007
1010
1010
4141
4141
3615
3615
2000
2000
2004
2004
11748
11748
4801
4801
4360
4360
11906
11906
3523
3523
2648
2648
2900
2900

 1
1996
1996
2034
2034
2208
2208
1998
1998
4076
4076
1996
1996
1000
1000
2171
2171
3238
3238
1000
1000
1010
1010
1037
1037
18476
18476
2510
2510
3131
3131
3529
3529
1996
1996
3842
3842
1997
1997
26033
26033
2401
2401
2076
2076
1996
1996
2117
2117
12124
12124
2078
2078
2162
2162
2040
2040
4685
4685
3595
3595
2304
2304
3136
3136
1998
1

### Build the data collator

In [30]:
# Collator that applies MLM masking when a batch is assembled, using a masking
# probability of 0.1.
# NOTE(review): 0.1 is lower than the 0.15 commonly used as the default for
# "standard" MLM masking - confirm this value is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.1)

In [33]:
# Hyper-parameters for the continued MLM run: one epoch, evaluation once per
# epoch, loss logged every 100 steps.
# NOTE(review): the output directory is the parent of the checkpoint loaded via
# MODEL_NAME, so checkpoints written by this run land alongside the earlier
# run's checkpoints and may replace same-numbered ones - confirm intended.
training_args = TrainingArguments(
    "model-mlm-generated-text",  # plain string: no f-string placeholders needed
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    do_train=True,
    do_eval=True,
    logging_steps=100,
    # `eval_steps` is omitted: it only applies when evaluation_strategy="steps".
    num_train_epochs=1.0,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
# Assemble the Trainer from the model, arguments, regrouped datasets and the
# MLM data collator.
# NOTE(review): evaluation uses the "test" split although a "validation" split
# is available - confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

### Inspect a training batch - the stored dataset is unmasked; the data collator applies the masking in the dataloader

In [35]:
# Ask the trainer for its training DataLoader so a collated batch can be inspected.
dl = trainer.get_train_dataloader()

In [36]:
# Pull only the first batch from the dataloader and print it. The loop variable
# `s` stays defined afterwards and is read by view_sample.
for s in dl:
    print(s)
    break

{'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'input_ids': tensor([[ 2008,  1996,  5156,  ...,  5195,  1006,  4690],
        [13182,   102,   101,  ..., 28176,  1998,  3782],
        [23453,  1012,   102,  ...,  1012,   103,  2003],
        ...,
        [  103,  2005,  1996,  ...,  3164,  2135,  1010],
        [  102,   101,  1027,  ...,  2106,  5149,  1005],
        [ 8865,  1006,  2432,  ...,  2072,  1010,  2040]]), 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [1012, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, 2009, -100],
        ...,
        [1006, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0

In [37]:
def view_sample(i, batch=None, max_tokens=64):
    """Print (input id, label) pairs for one sample of a collated batch.

    Args:
        i: index of the sample within the batch.
        batch: mapping with 'input_ids' and 'labels' entries, each indexable by
            sample. Defaults to the notebook-level batch `s` when omitted, so
            existing `view_sample(i)` calls keep working.
        max_tokens: maximum number of positions to print (default 64). Shorter
            sequences are printed in full instead of raising IndexError.

    Printing stops early after a position whose label equals 0.
    NOTE(review): the batch printed in this notebook marks ignored positions
    with -100, so a label of 0 is not expected - confirm the intended sentinel.
    """
    if batch is None:
        # Fall back to the batch captured by the inspection cell.
        batch = s
    _x = batch['input_ids'][i]
    _y = batch['labels'][i]
    print('\n', i)
    # Slicing bounds the walk by both max_tokens and the sequence length.
    for token_id, label in zip(_x[:max_tokens], _y[:max_tokens]):
        print(token_id)
        print(label)
        if label == 0:
            break


In [38]:
# Print the token ids and labels of the sample at index 7 of the inspected batch.
view_sample(7)


 7
tensor(8865)
tensor(-100)
tensor(1006)
tensor(-100)
tensor(2432)
tensor(-100)
tensor(1007)
tensor(-100)
tensor(1010)
tensor(-100)
tensor(2310)
tensor(-100)
tensor(103)
tensor(10139)
tensor(2140)
tensor(-100)
tensor(1006)
tensor(-100)
tensor(2294)
tensor(-100)
tensor(1007)
tensor(-100)
tensor(1010)
tensor(-100)
tensor(9587)
tensor(-100)
tensor(19436)
tensor(-100)
tensor(1006)
tensor(-100)
tensor(2289)
tensor(-100)
tensor(1007)
tensor(-100)
tensor(1998)
tensor(-100)
tensor(11968)
tensor(-100)
tensor(14317)
tensor(-100)
tensor(103)
tensor(3512)
tensor(23169)
tensor(-100)
tensor(1006)
tensor(-100)
tensor(2289)
tensor(-100)
tensor(1007)
tensor(-100)
tensor(1012)
tensor(-100)
tensor(2178)
tensor(-100)
tensor(6232)
tensor(-100)
tensor(2013)
tensor(-100)
tensor(1996)
tensor(-100)
tensor(7560)
tensor(-100)
tensor(1010)
tensor(-100)
tensor(7680)
tensor(-100)
tensor(4183)
tensor(-100)
tensor(1038)
tensor(-100)
tensor(12707)
tensor(-100)
tensor(2696)
tensor(-100)
tensor(7507)
tensor(-100)
tens

# train / evaluate

In [39]:
# Run training. The recorded run was stopped manually (KeyboardInterrupt) before
# the epoch finished.
trainer.train()

***** Running training *****
  Num examples = 916424
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 114553


Epoch,Training Loss,Validation Loss


Saving model checkpoint to model-mlm-generated-text/checkpoint-500
Configuration saved in model-mlm-generated-text/checkpoint-500/config.json
Model weights saved in model-mlm-generated-text/checkpoint-500/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-500/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-500/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-1000
Configuration saved in model-mlm-generated-text/checkpoint-1000/config.json
Model weights saved in model-mlm-generated-text/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-1000/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-1500
Configuration saved in model-mlm-generated-text/checkpoint-1500/config.json
Model weights saved in model-mlm-

KeyboardInterrupt: 

In [40]:
# Move the model to the CPU; assigning to `_` keeps the model repr out of the cell output.
_ = model.to('cpu')

In [41]:
# Build a fill-mask pipeline around the in-memory model and tokenizer.
p = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [42]:
# Qualitative check: ask the model to fill the masked word in a simple sentence.
p(f"It is not cold, it must be {tokenizer.mask_token}.")

[{'sequence': 'it is not cold, it must be hot.',
  'score': 0.9281871914863586,
  'token': 2980,
  'token_str': 'hot'},
 {'sequence': 'it is not cold, it must be cold.',
  'score': 0.011733351275324821,
  'token': 3147,
  'token_str': 'cold'},
 {'sequence': 'it is not cold, it must be warm.',
  'score': 0.006822537165135145,
  'token': 4010,
  'token_str': 'warm'},
 {'sequence': 'it is not cold, it must be good.',
  'score': 0.004887840244919062,
  'token': 2204,
  'token_str': 'good'},
 {'sequence': 'it is not cold, it must be cool.',
  'score': 0.0036937883123755455,
  'token': 4658,
  'token_str': 'cool'}]

In [43]:
# Save the current model (with its config and tokenizer files) to its own directory.
trainer.save_model('./model_trained_on_generated_then_wiki')

Saving model checkpoint to ./model_trained_on_generated_then_wiki
Configuration saved in ./model_trained_on_generated_then_wiki/config.json
Model weights saved in ./model_trained_on_generated_then_wiki/pytorch_model.bin
tokenizer config file saved in ./model_trained_on_generated_then_wiki/tokenizer_config.json
Special tokens file saved in ./model_trained_on_generated_then_wiki/special_tokens_map.json
