In [7]:
# Transformers installation
! pip install pyarrow==14.0.2 transformers[torch] datasets evaluate
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git



In [8]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForMaskedLM,
    AutoModelForMultipleChoice,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    set_seed,
)
from datasets import load_dataset, DatasetDict, concatenate_datasets
import evaluate
import numpy as np
import math
import os

In [10]:
openbookqa = load_dataset("openbookqa")

In [11]:
openbookqa['train'][:2]

{'id': ['7-980', '7-584'],
 'question_stem': ['The sun is responsible for',
  'When standing miles away from Mount Rushmore'],
 'choices': [{'text': ['puppies learning new tricks',
    'children growing up and getting old',
    'flowers wilting in a vase',
    'plants sprouting, blooming and wilting'],
   'label': ['A', 'B', 'C', 'D']},
  {'text': ['the mountains seem very close',
    'the mountains are boring',
    'the mountains look the same as from up close',
    'the mountains seem smaller than in photographs'],
   'label': ['A', 'B', 'C', 'D']}],
 'answerKey': ['D', 'D']}

In [12]:
csqa = DatasetDict.from_json('data/additional_qa.json')
print(openbookqa)
print(csqa)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question_stem', 'choices', 'answerKey'],
        num_rows: 4957
    })
    validation: Dataset({
        features: ['id', 'question_stem', 'choices', 'answerKey'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'question_stem', 'choices', 'answerKey'],
        num_rows: 500
    })
})
Dataset({
    features: ['id', 'answerKey', 'choices', 'question_stem'],
    num_rows: 9741
})


In [13]:
# MAKE SURE YOU RUN THIS CELL ONLY ONCE
obqa = openbookqa['train']
csqa = csqa.cast(obqa.features)
assert obqa.features.type == csqa.features.type
openbookqa['train'] = concatenate_datasets([openbookqa['train'], csqa])
assert len(openbookqa['train']) == 9741 + 4957
openbookqa

Casting the dataset:   0%|          | 0/9741 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question_stem', 'choices', 'answerKey'],
        num_rows: 14698
    })
    validation: Dataset({
        features: ['id', 'question_stem', 'choices', 'answerKey'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'question_stem', 'choices', 'answerKey'],
        num_rows: 500
    })
})

In [14]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
def preprocess_function_qa(examples):
    qstems = [[context] * 4 for context in examples["question_stem"]]
    choices = [[choice for choice in dictionary['text']] for dictionary in examples["choices"]]

    qstems = sum(qstems, [])
    choices = sum(choices, [])

    tokenized_examples = tokenizer(qstems, choices, truncation=True)
    return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

output = preprocess_function_qa(openbookqa['train'][:2])

def preprocess_labels_qa(examples):
    lettermap = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    answers = list(examples["answerKey"])
    answers = [lettermap[a] for a in answers]

    return {'label': answers}

In [16]:
tokenized_qa = openbookqa.map(preprocess_function_qa, batched=True)
tokenized_qa = tokenized_qa.map(preprocess_labels_qa, batched=True)

Map:   0%|          | 0/14698 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/14698 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [17]:
print(tokenized_qa['train'][0].keys())

dict_keys(['id', 'question_stem', 'choices', 'answerKey', 'input_ids', 'attention_mask', 'label'])


🤗 Transformers doesn't have a data collator for multiple choice, so you'll need to adapt the [DataCollatorWithPadding](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorWithPadding) to create a batch of examples. It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

`DataCollatorForMultipleChoice` flattens all the model inputs, applies padding, and then unflattens the results:

In [18]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # print(features)
        # print(features[0])
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [19]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## Train

In [20]:
BATCH_SIZE = 32
NUM_EPOCHS = 10
STEPS_PER_EPOCH = math.ceil(len(tokenized_qa["train"]) / BATCH_SIZE)
print(STEPS_PER_EPOCH)

460


In [21]:
model = AutoModelForMultipleChoice.from_pretrained("roberta-base")
set_seed(42)

training_args = TrainingArguments(
    output_dir="base-obqa-csqa-model",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    learning_rate=5e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_qa["train"],
    eval_dataset=tokenized_qa["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3895,1.386267,0.404
2,1.3883,1.386127,0.36
3,1.3875,1.386295,0.164
4,1.388,1.386294,0.3
5,1.3876,1.386295,0.23
6,1.3878,1.386295,0.25
7,1.3876,1.386295,0.27
8,1.3873,1.386295,0.23
9,1.3872,1.386294,0.268
10,1.3868,1.386295,0.23


TrainOutput(global_step=4600, training_loss=1.3877539195185122, metrics={'train_runtime': 3985.1544, 'train_samples_per_second': 36.882, 'train_steps_per_second': 1.154, 'total_flos': 1.256960670036e+16, 'train_loss': 1.3877539195185122, 'epoch': 10.0})

In [23]:
trainer.evaluate(tokenized_qa["test"], metric_key_prefix="test")

{'test_loss': 1.3861576318740845,
 'test_accuracy': 0.338,
 'test_runtime': 4.8227,
 'test_samples_per_second': 103.677,
 'test_steps_per_second': 3.318,
 'epoch': 10.0}

In [24]:
!cp -r base-obqa-csqa-model/ gdrive/MyDrive