In [1]:
# Transformers installation
! pip install pyarrow==14.0.2 transformers[torch] datasets evaluate
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

# Pretraining


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForMaskedLM,
    AutoModelForMultipleChoice,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    set_seed,
)
from datasets import load_dataset
import evaluate
import numpy as np
import math
import os

In [4]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
text_datasets = {
    "train": ['data/combined_facts.txt'],
    "eval": ['data/obqa_facts_val.txt'],
    "test": ['data/obqa_facts_test.txt']
}
dataset = load_dataset("text", data_files=text_datasets)

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
# Tokenizes each batch of lines in the dataset, just truncate to 100
# no line in pretraining corpus is longer. Also, don't pad preemptively
tokenized_dataset = dataset.map(
    lambda z: tokenizer(z["text"], truncation=True, max_length=100),
    num_proc=2,
    batched=True,
    remove_columns=["text"],
)

Map (num_proc=2):   0%|          | 0/11077 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [8]:
BATCH_SIZE = 96
STEPS_PER_EPOCH = math.ceil(len(tokenized_dataset["train"]) / BATCH_SIZE)
NUM_EPOCHS = 20
print(STEPS_PER_EPOCH)

116


In [9]:
model = AutoModelForMaskedLM.from_pretrained("roberta-base")
set_seed(42)

args = TrainingArguments(
    output_dir="pretrained-combined-model",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=192,
    num_train_epochs=NUM_EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    weight_decay=0.01,
    learning_rate=2e-5,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"]
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [10]:
trainer.train()  # batch_size=96, lr=2e-5

Epoch,Training Loss,Validation Loss
1,1.8016,1.446826
2,1.6145,1.247444
3,1.4789,1.238749
4,1.4076,0.982786
5,1.4365,0.94462
6,1.3485,0.904823
7,1.3068,0.776862
8,1.2601,0.686723
9,1.2542,0.696555
10,1.2067,0.668378


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


TrainOutput(global_step=2320, training_loss=1.2575605787079909, metrics={'train_runtime': 2183.4604, 'train_samples_per_second': 101.463, 'train_steps_per_second': 1.063, 'total_flos': 4218048508354956.0, 'train_loss': 1.2575605787079909, 'epoch': 20.0})

In [11]:
!cp -r pretrained-combined-model gdrive/MyDrive

## Fine-tuning

In [12]:
openbookqa = load_dataset("openbookqa")

Downloading readme:   0%|          | 0.00/9.06k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/496k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4957 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [13]:
openbookqa['train'][:2]

{'id': ['7-980', '7-584'],
 'question_stem': ['The sun is responsible for',
  'When standing miles away from Mount Rushmore'],
 'choices': [{'text': ['puppies learning new tricks',
    'children growing up and getting old',
    'flowers wilting in a vase',
    'plants sprouting, blooming and wilting'],
   'label': ['A', 'B', 'C', 'D']},
  {'text': ['the mountains seem very close',
    'the mountains are boring',
    'the mountains look the same as from up close',
    'the mountains seem smaller than in photographs'],
   'label': ['A', 'B', 'C', 'D']}],
 'answerKey': ['D', 'D']}

The preprocessing function you want to create needs to:

1. Make four copies of the `sent1` field and combine each of them with `sent2` to recreate how a sentence starts.
2. Combine `sent2` with each of the four possible sentence endings.
3. Flatten these two lists so you can tokenize them, and then unflatten them afterward so each example has a corresponding `input_ids`, `attention_mask`, and `labels` field.

In [14]:
def preprocess_function_qa(examples):
    qstems = [[context] * 4 for context in examples["question_stem"]]
    choices = [[choice for choice in dictionary['text']] for dictionary in examples["choices"]]

    qstems = sum(qstems, [])
    choices = sum(choices, [])

    tokenized_examples = tokenizer(qstems, choices, truncation=True)
    return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

output = preprocess_function_qa(openbookqa['train'][:2])

def preprocess_labels_qa(examples):
    lettermap = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    answers = list(examples["answerKey"])
    answers = [lettermap[a] for a in answers]

    return {'label': answers}

In [15]:
tokenized_qa = openbookqa.map(preprocess_function_qa, batched=True)
tokenized_qa = tokenized_qa.map(preprocess_labels_qa, batched=True)

Map:   0%|          | 0/4957 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4957 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [16]:
print(tokenized_qa['train'][0].keys())

dict_keys(['id', 'question_stem', 'choices', 'answerKey', 'input_ids', 'attention_mask', 'label'])


🤗 Transformers doesn't have a data collator for multiple choice, so you'll need to adapt the [DataCollatorWithPadding](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorWithPadding) to create a batch of examples. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

`DataCollatorForMultipleChoice` flattens all the model inputs, applies padding, and then unflattens the results:

In [17]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # print(features)
        # print(features[0])
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [18]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [19]:
BATCH_SIZE = 32
NUM_EPOCHS = 10
STEPS_PER_EPOCH = math.ceil(len(tokenized_qa["train"]) / BATCH_SIZE)
print(STEPS_PER_EPOCH)

155


In [20]:
PRETRAINED_MODEL_PATH = "pretrained-combined-model/checkpoint-1856"
model = AutoModelForMultipleChoice.from_pretrained(PRETRAINED_MODEL_PATH)
set_seed(42)

training_args = TrainingArguments(
    output_dir="pt-combined-ft-obqa-model",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    learning_rate=5e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_qa["train"],
    eval_dataset=tokenized_qa["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at pretrained-combined-model/checkpoint-1856 and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1948,1.122551,0.528
2,0.8839,1.041908,0.59
3,0.5924,1.170174,0.58
4,0.387,1.360223,0.566
5,0.2536,1.60964,0.556
6,0.1631,1.868231,0.56
7,0.111,1.986802,0.56
8,0.0887,2.111804,0.542
9,0.0594,2.225527,0.558
10,0.0434,2.347037,0.554


TrainOutput(global_step=1550, training_loss=0.3777329094179215, metrics={'train_runtime': 1469.0232, 'train_samples_per_second': 33.744, 'train_steps_per_second': 1.055, 'total_flos': 4248738784998072.0, 'train_loss': 0.3777329094179215, 'epoch': 10.0})

In [22]:
trainer.evaluate(tokenized_qa["test"], metric_key_prefix="test")

{'test_loss': 1.123958945274353,
 'test_accuracy': 0.516,
 'test_runtime': 5.3475,
 'test_samples_per_second': 93.501,
 'test_steps_per_second': 2.992,
 'epoch': 10.0}

In [23]:
!cp -r pt-combined-ft-obqa-model/ gdrive/MyDrive