# MLM training on our generated data

adapted from
https://huggingface.co/transformers/v2.5.1/examples.html#language-model-training
and
https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb



In [39]:
import logging
import math
import os
import numpy as np
from tqdm.notebook import tqdm

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForWholeWordMask,
    DataCollatorWithPadding,
    HfArgumentParser,
    LineByLineTextDataset,
    LineByLineWithRefDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
    pipeline
)
from transformers.trainer_utils import is_main_process
import datasets
import pandas as pd

In [2]:
CACHE = './hfcache'
MODEL_NAME = 'google/electra-small-generator'

In [3]:
config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE, use_fast=True)

In [5]:
model = AutoModelForMaskedLM.from_pretrained(
    MODEL_NAME,
    from_tf=False,
    config=config,
    cache_dir=CACHE,
)
model.resize_token_embeddings(len(tokenizer))


Embedding(30522, 128, padding_idx=0)

# build new dataset

In [6]:
df = pd.read_csv('./contrast_dataset/data.csv', index_col='Unnamed: 0')

In [7]:
d = datasets.Dataset.from_pandas(df)

In [8]:
d

Dataset({
    features: ['x', 'y', '__index_level_0__'],
    num_rows: 112436
})

In [9]:
d['x'][0], d['y'][0]

('I thought Kevin tends to be excessively [MASK], but  he was very old.',
 'young')

### A. tokenize

In [10]:
def tokenize_function(examples):
    return tokenizer(examples["x"], max_length=512, truncation=True)

In [11]:
def tokenize_function_y(examples):
    return tokenizer(examples["y"], max_length=1, truncation=True)

In [12]:
tokenized_datasets = d.map(tokenize_function, batched=True, num_proc=4, )

In [13]:
y = d.map(tokenize_function_y, batched=True)

  0%|          | 0/113 [00:00<?, ?ba/s]

In [14]:
# something about arrow format makes it insanely slow on multiple reads
ys = list(y['input_ids'])

In [15]:
ys = [y[1] for y in ys]

In [16]:
ys

[2402,
 3893,
 9191,
 2658,
 5236,
 6638,
 3563,
 3532,
 2844,
 3161,
 4603,
 15716,
 7790,
 5305,
 3161,
 2204,
 6450,
 2397,
 6450,
 2179,
 9841,
 21877,
 17772,
 19194,
 13593,
 2980,
 5776,
 3491,
 8841,
 2986,
 8796,
 2047,
 6530,
 14480,
 3893,
 2047,
 5379,
 4251,
 2658,
 4621,
 4064,
 22692,
 5665,
 4206,
 15716,
 3407,
 9191,
 2304,
 2485,
 7790,
 4064,
 9200,
 5305,
 6047,
 3893,
 7501,
 3819,
 2214,
 17145,
 3532,
 4206,
 2092,
 4138,
 2214,
 4550,
 8796,
 7098,
 2235,
 2986,
 2797,
 11004,
 12511,
 7501,
 7568,
 2066,
 7823,
 2402,
 3161,
 2986,
 4603,
 3532,
 5305,
 4121,
 7591,
 2047,
 5379,
 5220,
 12958,
 6179,
 3809,
 3733,
 6555,
 3733,
 4408,
 7501,
 7823,
 4997,
 3893,
 5236,
 21931,
 2502,
 12511,
 2047,
 4452,
 3161,
 12958,
 7568,
 5341,
 4326,
 5236,
 2917,
 3308,
 2317,
 6638,
 3835,
 2092,
 2986,
 12511,
 14768,
 6555,
 21877,
 6047,
 6530,
 11004,
 14480,
 5024,
 2759,
 5665,
 5665,
 2304,
 3147,
 3697,
 4064,
 3491,
 3893,
 10036,
 4242,
 4795,
 2214,
 6387,

### B. Build x,y for model

In [127]:
labels = []
inputs = []
max_len = 0
for i, sentence in tqdm(enumerate(tokenized_datasets['input_ids'])):
    a = np.array(sentence, dtype=int)
    y = np.ones_like(a) * -100
    y[a == 103] = ys[i]  # cls, token, sep
    padded_y = np.zeros(64, dtype=int)
    padded_x = np.zeros(64, dtype=int)
    padded_y[:len(a)] = y
    padded_x[:len(a)] = a
    labels.append(padded_y)
    inputs.append(padded_x)
    
l=len(labels)
print(l)
l = l - (l % 16)
print(l)
labels = labels[:l]
inputs = inputs[:l]

0it [00:00, ?it/s]

112436
112432


In [128]:
lm_datasets = datasets.Dataset.from_dict({'labels': labels, 'input_ids': inputs})

In [129]:
lm_datasets = lm_datasets.shuffle().train_test_split()

### (tokenizer check)

In [130]:
tokenizer.mask_token

'[MASK]'

In [131]:
tokenizer.mask_token_id

103

In [132]:
tokenizer.decode(lm_datasets['train'][4]['input_ids'])

'[CLS] howard is often [MASK]. we want a fearful person to fill the same need. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [133]:
for i in range(5):
    print(lm_datasets['train'][i]['input_ids'][:20])
    print(lm_datasets['train'][i]['labels'][:20])

[101, 2010, 2025, 5649, 12102, 2000, 2022, 15241, 103, 1012, 2002, 4122, 1037, 9657, 2028, 2000, 6039, 1996, 2168, 2342]
[-100, -100, -100, -100, -100, -100, -100, -100, 11004, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
[101, 1000, 4326, 1000, 2038, 2019, 4941, 3574, 2000, 1000, 103, 1000, 1010, 2061, 2065, 1037, 3274, 2003, 5220, 2009]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 5220, -100, -100, -100, -100, -100, -100, -100, -100, -100]
[101, 6832, 11265, 5867, 2015, 103, 1010, 2947, 2065, 1037, 3797, 2003, 18439, 2009, 2196, 2003, 6832, 1012, 102, 0]
[-100, -100, -100, -100, -100, 18439, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 0]
[101, 2009, 12102, 2000, 2022, 2172, 2205, 4064, 1012, 2016, 2359, 1037, 103, 8000, 2004, 2019, 4522, 1012, 102, 0]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 2440, -100, -100, -100, -100, -100, -100, 0]
[101, 4922, 2003, 2411, 103, 1012, 2057, 2215, 

### build collator

In [135]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [136]:
training_args = TrainingArguments(
    f"model-mlm-generated-text",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [137]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
)

### This trainer stores data masked - dataloader does nothing

In [138]:
dl = trainer.get_train_dataloader()

In [139]:
for s in dl:
    print(s)
    break

{'labels': tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  5023,  -100,  -100,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          6555,  -100,  -100,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
   

In [140]:
def view_sample(i):
    print(tokenizer.decode(s['input_ids'][i]))
    print(s['labels'][0])

In [141]:
view_sample(0)

[CLS] when a thing is shown, it will not be [MASK]. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
tensor([-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 5023,
        -100, -100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])


# train / evaluate

In [None]:
trainer.train()

***** Running training *****
  Num examples = 84324
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 31623


Epoch,Training Loss,Validation Loss


Saving model checkpoint to model-mlm-generated-text/checkpoint-500
Configuration saved in model-mlm-generated-text/checkpoint-500/config.json
Model weights saved in model-mlm-generated-text/checkpoint-500/pytorch_model.bin


In [56]:
model.to('cpu')

ElectraForMaskedLM(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=256, 

In [57]:
p = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [58]:
p(f"HuggingFace is creating a {tokenizer.mask_token} that the community uses to solve NLP tasks.")

[{'sequence': 'huggingface is creating a. that the community uses to solve nlp tasks.',
  'score': 0.0022701651323586702,
  'token': 1012,
  'token_str': '.'},
 {'sequence': 'huggingface is creating a a that the community uses to solve nlp tasks.',
  'score': 0.0019472201820462942,
  'token': 1037,
  'token_str': 'a'},
 {'sequence': 'huggingface is creating a not that the community uses to solve nlp tasks.',
  'score': 0.0016775872791185975,
  'token': 2025,
  'token_str': 'not'},
 {'sequence': 'huggingface is creating a that that the community uses to solve nlp tasks.',
  'score': 0.0015871956711634994,
  'token': 2008,
  'token_str': 'that'},
 {'sequence': 'huggingface is creating a i that the community uses to solve nlp tasks.',
  'score': 0.0015854841331019998,
  'token': 1045,
  'token_str': 'i'}]