In [1]:
!pip install -q datasets
!pip install -q transformers

In [2]:
from datasets import load_dataset

# Load only the first 5,000 rows of the "train_asks" split of the ELI5 dataset.
eli5 = load_dataset("eli5", split="train_asks[:5000]")

Reusing dataset eli5 (/root/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [3]:
# Hold out 20% of the rows as a "test" split. The fixed seed keeps the
# shuffled split identical across kernel restarts, so the results below are
# reproducible.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)

In [4]:
# Inspect the first training example (answers are still a nested dict here).
eli5["train"][0]

{'answers': {'a_id': ['cc7uh3r', 'cc80sve'],
  'score': [6, 3],
  'text': ['The best insulator possible is going to be some sort of near-perfect vacuum contained inside of a highly reflective container. This is the principle behind a Thermos flask, a double-walled container with vacuum trapped between the walls.\n\nThermal energy is actually just the jiggling around of molecules and atoms. In order to lose or gain thermal energy, that can happen a few ways: it can be transformed and lost  as long-wave radiation (eg, infrared), or if things get really hot, black-body radiation, or it can be gained by absorbing this radiation, or it can be mechanically transferred between molecules that come into "contact" with each other.\n\nYou can prevent the direct transfer of thermal energy by making sure that whatever you want to keep hot or cold is kept in a vacuum, so that there is very little other matter for it to exchange thermal energy with. And you can reflect long-wave radiation away from a

In [5]:
from transformers import AutoTokenizer

# Tokenizer matching the "distilroberta-base" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [6]:
# Flatten the nested `answers` field into top-level dotted columns
# (`answers.a_id`, `answers.score`, `answers.text`), then show one example.
eli5 = eli5.flatten()
eli5["train"][0]

{'answers.a_id': ['cc7uh3r', 'cc80sve'],
 'answers.score': [6, 3],
 'answers.text': ['The best insulator possible is going to be some sort of near-perfect vacuum contained inside of a highly reflective container. This is the principle behind a Thermos flask, a double-walled container with vacuum trapped between the walls.\n\nThermal energy is actually just the jiggling around of molecules and atoms. In order to lose or gain thermal energy, that can happen a few ways: it can be transformed and lost  as long-wave radiation (eg, infrared), or if things get really hot, black-body radiation, or it can be gained by absorbing this radiation, or it can be mechanically transferred between molecules that come into "contact" with each other.\n\nYou can prevent the direct transfer of thermal energy by making sure that whatever you want to keep hot or cold is kept in a vacuum, so that there is very little other matter for it to exchange thermal energy with. And you can reflect long-wave radiation a

In [7]:
def preprocess_function(examples):
    """Tokenize a batch of examples, treating each example's answers as one text.

    For every entry of the ``answers.text`` column, the list of answer strings
    is joined with single spaces; the joined strings are then passed to the
    module-level ``tokenizer`` with truncation and padding enabled.

    NOTE(review): ``padding=True`` puts pad tokens into the returned
    ``input_ids``. If these sequences are later concatenated and chunked, the
    pad tokens are carried into the chunks -- confirm that this is intended.
    """
    joined_answers = []
    for answer_list in examples["answers.text"]:
        joined_answers.append(" ".join(answer_list))
    return tokenizer(joined_answers, truncation=True, padding=True)

In [8]:
# Tokenize every split in batches using 4 worker processes. All original
# columns are removed so only the tokenizer outputs remain.
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

        

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# Sanity check: number of token ids in one tokenized training example.
len(tokenized_eli5["train"][2]["input_ids"])

512

In [10]:
# Number of tokens in every block produced by `group_texts`.
block_size = 128


def group_texts(examples):
    """Concatenate all sequences in a batch and re-split them into fixed-size blocks.

    Args:
        examples: Batch mapping each column name (it must include
            ``input_ids``) to a list of per-example lists.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        exactly ``block_size`` items, plus a ``labels`` column that is a
        shallow copy of the ``input_ids`` column.
    """
    # Flatten each column in linear time (sum(list_of_lists, []) is quadratic).
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence] for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the trailing remainder so that every block has exactly `block_size`
    # items; a shorter final block would produce ragged examples.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [11]:
# Regroup the tokenized examples of every split into `block_size`-token
# blocks, in batches across 4 worker processes.
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

       

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Sanity check: number of token ids in one grouped training block.
len(lm_dataset["train"][0]["input_ids"])

128

In [13]:
from transformers import DataCollatorForLanguageModeling

# Only fall back to the EOS token when the tokenizer has no pad token of its
# own; a tokenizer that already defines a pad token keeps it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Collator for masked language modeling with a masking probability of 15%.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [14]:
from transformers import AutoModelForMaskedLM

# Load the pretrained "distilroberta-base" checkpoint with a masked-LM head.
model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

In [15]:
from transformers import TrainingArguments, Trainer

# Training configuration: outputs go under ./results, evaluation runs once
# per epoch, and training lasts 3 epochs.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Train on the grouped "train" split and evaluate on the grouped "test"
# split; batches are built by the masked-LM data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

# Run the fine-tuning loop.
trainer.train()

***** Running training *****
  Num examples = 16000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6000


Epoch,Training Loss,Validation Loss
1,0.997,0.955827
2,0.9629,0.944718
3,0.9508,0.936148


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to .

TrainOutput(global_step=6000, training_loss=1.005721165974935, metrics={'train_runtime': 692.3515, 'train_samples_per_second': 69.329, 'train_steps_per_second': 8.666, 'total_flos': 1591461679104000.0, 'train_loss': 1.005721165974935, 'epoch': 3.0})