# MLM training on our generated data

adapted from
https://huggingface.co/transformers/v2.5.1/examples.html#language-model-training
and
https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb



In [1]:
import logging
import math
import os
import numpy as np
from tqdm.notebook import tqdm

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForWholeWordMask,
    DataCollatorWithPadding,
    HfArgumentParser,
    LineByLineTextDataset,
    LineByLineWithRefDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
    pipeline
)
from transformers.trainer_utils import is_main_process
import datasets
import pandas as pd

In [2]:
CACHE = './hfcache'
MODEL_NAME = 'google/electra-small-generator'

In [3]:
config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE, use_fast=True)

In [5]:
model = AutoModelForMaskedLM.from_pretrained(
    MODEL_NAME,
    from_tf=False,
    config=config,
    cache_dir=CACHE,
)
model.resize_token_embeddings(len(tokenizer))


Embedding(30522, 128, padding_idx=0)

# build new dataset

In [6]:
df = pd.read_csv('./contrast_dataset/data.csv', index_col='Unnamed: 0')

In [7]:
d = datasets.Dataset.from_pandas(df)

In [8]:
d

Dataset({
    features: ['x', 'y', '__index_level_0__'],
    num_rows: 111556
})

In [9]:
d['x'][0], d['y'][0]

('She knew that Jimmy was way too [MASK], but in this instance he was excessively positive.',
 'negative')

### A. tokenize

In [10]:
def tokenize_function(examples):
    return tokenizer(examples["x"], max_length=512, truncation=True)

In [11]:
def tokenize_function_y(examples):
    return tokenizer(examples["y"], max_length=1, truncation=True)

In [12]:
tokenized_datasets = d.map(tokenize_function, batched=True, num_proc=4, )

In [13]:
y = d.map(tokenize_function_y, batched=True)

  0%|          | 0/112 [00:00<?, ?ba/s]

In [14]:
# something about arrow format makes it insanely slow on multiple reads
ys = list(y['input_ids'])

In [15]:
ys = [y[1] for y in ys]

### B. Build x,y for model

In [16]:
N_SAMPLES = None
labels = []
inputs = []
max_len = 0
for i, sentence in tqdm(enumerate(tokenized_datasets['input_ids'][:N_SAMPLES])):
    a = np.array(sentence, dtype=int)
    y = np.ones_like(a) * -100
    y[a == 103] = ys[i]  # cls, token, sep
    padded_y = np.zeros(64, dtype=int)
    padded_x = np.zeros(64, dtype=int)
    padded_y[:len(a)] = y
    padded_x[:len(a)] = a
    labels.append(padded_y)
    inputs.append(padded_x)
    
l=len(labels)
print(l)
l = l - (l % 16)
print(l)
labels = labels[:l]
inputs = inputs[:l]

0it [00:00, ?it/s]

111556
111552


In [17]:
lm_datasets = datasets.Dataset.from_dict({'labels': labels, 'input_ids': inputs})

In [18]:
lm_datasets = lm_datasets.shuffle().train_test_split(test_size=0.1)

### (tokenizer check)

In [19]:
tokenizer.mask_token

'[MASK]'

In [20]:
tokenizer.mask_token_id

103

In [21]:
tokenizer.decode(lm_datasets['train'][4]['input_ids'])

'[CLS] christine tends to be excessively poor. i will want a [MASK] person. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [22]:
lm_datasets['train'][4]

{'labels': [-100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  4138,
  -100,
  -100,
  -100,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [101,
  10941,
  12102,
  2000,
  2022,
  11664,
  2135,
  3532,
  1012,
  1045,
  2097,
  2215,
  1037,
  103,
  2711,
  1012,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [29]:
for i in range(5):
    _x = lm_datasets['train'][i]['input_ids']
    _y = lm_datasets['train'][i]['labels']
    print('\n', i)
    for i in range(64):
        print(_x[i])
        print(_y[i])
        if _y[i] == -0:
            break



 0
101
-100
12684
-100
2003
-100
2126
-100
2205
-100
11004
-100
1012
-100
1999
-100
1037
-100
3819
-100
2088
-100
1010
-100
2045
-100
2052
-100
2022
-100
1037
-100
103
9657
2711
-100
1999
-100
2049
-100
2173
-100
1012
-100
102
-100
0
0

 1
101
-100
7188
-100
6766
-100
2003
-100
2025
-100
5410
-100
1010
-100
2002
-100
2003
-100
103
2844
1012
-100
102
-100
0
0

 2
101
-100
13219
-100
2038
-100
2019
-100
4500
-100
7939
-100
17287
-100
3508
-100
5816
-100
2000
-100
103
7968
1010
-100
2029
-100
12748
-100
2008
-100
2065
-100
1037
-100
3274
-100
2003
-100
13219
-100
2009
-100
2003
-100
2196
-100
7968
-100
1012
-100
102
-100
0
0

 3
101
-100
2014
-100
4419
-100
2411
-100
2001
-100
3565
-100
103
17772
1012
-100
2016
-100
4122
-100
1037
-100
20625
-100
2028
-100
2612
-100
1012
-100
102
-100
0
0

 4
101
-100
10941
-100
12102
-100
2000
-100
2022
-100
11664
-100
2135
-100
3532
-100
1012
-100
1045
-100
2097
-100
2215
-100
1037
-100
103
4138
2711
-100
1012
-100
102
-100
0
0


### build collator

In [30]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [31]:
training_args = TrainingArguments(
    f"model-mlm-generated-text",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
#     evaluate_during_training=True,
    do_train=True,
    do_eval=True,
    logging_steps=100,
    eval_steps=100,
)


In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

### This trainer stores data masked - dataloader does nothing

In [33]:
dl = trainer.get_train_dataloader()

In [34]:
for s in dl:
    print(s)
    break

{'labels': tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  7481,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [ -100,  8796,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
   

In [47]:
def view_sample(i):
    _x = s['input_ids'][i]
    _y = s['labels'][i]
    print('\n', i)
    for j in range(64):
        print(_x[j])
        print(_y[j])
        if _y[j] == 0:
            break


In [48]:
view_sample(7)


 7
tensor(101)
tensor(-100)
tensor(1000)
tensor(-100)
tensor(2092)
tensor(-100)
tensor(1000)
tensor(-100)
tensor(2965)
tensor(-100)
tensor(2242)
tensor(-100)
tensor(2200)
tensor(-100)
tensor(2367)
tensor(-100)
tensor(2084)
tensor(-100)
tensor(1000)
tensor(-100)
tensor(103)
tensor(5665)
tensor(1000)
tensor(-100)
tensor(1010)
tensor(-100)
tensor(3568)
tensor(-100)
tensor(2065)
tensor(-100)
tensor(1037)
tensor(-100)
tensor(2518)
tensor(-100)
tensor(2003)
tensor(-100)
tensor(2092)
tensor(-100)
tensor(2009)
tensor(-100)
tensor(2323)
tensor(-100)
tensor(2196)
tensor(-100)
tensor(2022)
tensor(-100)
tensor(5665)
tensor(-100)
tensor(1012)
tensor(-100)
tensor(102)
tensor(-100)
tensor(0)
tensor(0)


# train / evaluate

In [50]:
trainer.train()

***** Running training *****
  Num examples = 100396
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 37650


Epoch,Training Loss,Validation Loss
1,0.0016,0.001145
2,0.0011,0.001005
3,0.0007,0.00097


Saving model checkpoint to model-mlm-generated-text/checkpoint-500
Configuration saved in model-mlm-generated-text/checkpoint-500/config.json
Model weights saved in model-mlm-generated-text/checkpoint-500/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-500/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-500/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-1000
Configuration saved in model-mlm-generated-text/checkpoint-1000/config.json
Model weights saved in model-mlm-generated-text/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-1000/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-1500
Configuration saved in model-mlm-generated-text/checkpoint-1500/config.json
Model weights saved in model-mlm-

Saving model checkpoint to model-mlm-generated-text/checkpoint-10500
Configuration saved in model-mlm-generated-text/checkpoint-10500/config.json
Model weights saved in model-mlm-generated-text/checkpoint-10500/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-10500/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-10500/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-11000
Configuration saved in model-mlm-generated-text/checkpoint-11000/config.json
Model weights saved in model-mlm-generated-text/checkpoint-11000/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-11000/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-11000/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-11500
Configuration saved in model-mlm-generated-text/checkpoint-11500/config.json
Model weights sa

tokenizer config file saved in model-mlm-generated-text/checkpoint-20000/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-20000/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-20500
Configuration saved in model-mlm-generated-text/checkpoint-20500/config.json
Model weights saved in model-mlm-generated-text/checkpoint-20500/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-20500/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-20500/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-21000
Configuration saved in model-mlm-generated-text/checkpoint-21000/config.json
Model weights saved in model-mlm-generated-text/checkpoint-21000/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-21000/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-21

Saving model checkpoint to model-mlm-generated-text/checkpoint-30000
Configuration saved in model-mlm-generated-text/checkpoint-30000/config.json
Model weights saved in model-mlm-generated-text/checkpoint-30000/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-30000/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-30000/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-30500
Configuration saved in model-mlm-generated-text/checkpoint-30500/config.json
Model weights saved in model-mlm-generated-text/checkpoint-30500/pytorch_model.bin
tokenizer config file saved in model-mlm-generated-text/checkpoint-30500/tokenizer_config.json
Special tokens file saved in model-mlm-generated-text/checkpoint-30500/special_tokens_map.json
Saving model checkpoint to model-mlm-generated-text/checkpoint-31000
Configuration saved in model-mlm-generated-text/checkpoint-31000/config.json
Model weights sa

TrainOutput(global_step=37650, training_loss=0.002116158859801799, metrics={'train_runtime': 3520.6595, 'train_samples_per_second': 85.549, 'train_steps_per_second': 10.694, 'total_flos': 1107300940766208.0, 'train_loss': 0.002116158859801799, 'epoch': 3.0})

In [51]:
_ = model.to('cpu')

In [52]:
p = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [57]:
p(f"It is not cold, it must be {tokenizer.mask_token}.")

[{'sequence': 'it is not cold, it must be hot.',
  'score': 0.9999921321868896,
  'token': 2980,
  'token_str': 'hot'},
 {'sequence': 'it is not cold, it must be active.',
  'score': 2.8070273856428685e-06,
  'token': 3161,
  'token_str': 'active'},
 {'sequence': 'it is not cold, it must be warm.',
  'score': 2.3018476440483937e-06,
  'token': 4010,
  'token_str': 'warm'},
 {'sequence': 'it is not cold, it must be cold.',
  'score': 6.004694341754657e-07,
  'token': 3147,
  'token_str': 'cold'},
 {'sequence': 'it is not cold, it must be sweet.',
  'score': 4.0767889686321723e-07,
  'token': 4086,
  'token_str': 'sweet'}]

In [58]:
trainer.save_model('./model')

Saving model checkpoint to ./model
Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json
