# Options Generator

In [None]:
!pip install -r requirements.txt

First we import the necessary libraries. We need torch, datasets and transformers. We will use the BartTokenizer and BartForConditionalGeneration from the transformers library. And of course the Seq2SeqTrainer and TrainingArguments from the transformers library to train our model.

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartForConditionalGeneration

Make sure to include cuda, otherwise we are training on the cpu.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

Now we load our base model and its tokenizer using the BartTokenizer and BartForConditionalGeneration classes. The model is loaded from the 'facebook/bart-base' checkpoint.

In [None]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')

We load the questions dataset from where we saved it on our Hugging Face account. See the generated-questions-dataset notebook for more information on how to create this dataset.

In [2]:
data = load_dataset("nlp-group-6/sciq-with-generated-questions")
train_data = data['train']
val_data = data['validation']

We define some args which are the limitations to the model and our hardware.

In [3]:
max_input = 512
max_target = 128
batch_size = 36

We will define a `pre_process_data` function to preprocess the data. This function will tokenize the data and return the tokenized data.

In [None]:
def pre_process_data(data):
    # combine the question, correct answer and support text
    # we use the <s> token to separate the question, correct answer and support text
    question_answer_context = [question + "</s><s>" + correct_answer + "</s><s>" + support for question, correct_answer, support in zip(data['generated_question'], data['correct_answer'], data['support'])]
    
    # tokenize the data
    inputs = tokenizer(question_answer_context, padding="max_length", truncation=True, max_length=max_input, return_tensors="pt")
    targets = tokenizer(data['distractor1'], data['distractor2'], data['distractor3'], padding="max_length", truncation=True, max_length=max_target, return_tensors="pt")
    return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": targets.input_ids}

Now we map the entire dataset to our `pre_process_data` function, batch and shuffle the data.

In [None]:
train_data = train_data.map(pre_process_data, batched=True).shuffle(seed=42)
val_data = val_data.map(pre_process_data, batched=True).shuffle(seed=42)

In [5]:
# empty memory, just in case
torch.cuda.empty_cache()
# put the model on the device, hopefully the GPU
model.to(device)

Now we define our seq2seq trainer and training arguments, more details about this in the paper.

In [None]:
args = Seq2SeqTrainingArguments(
    output_dir="./results_option_generation",
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size= batch_size,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=32,
    predict_with_generate=True,
    eval_accumulation_steps=32,
    fp16=True #available only with CUDA
)


trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)

trainer.train()

Finally, we save the model and tokenizer to disk.

In [None]:
OUT_DIR = "sciq_options1_generator"
model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

Now, to test things out we load it and generate some options.

In [12]:
from transformers import BartTokenizer, BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained(f"./{OUT_DIR}")
tokenizer = BartTokenizer.from_pretrained(f"./{OUT_DIR}")
# put them both on the same device
_ = model.to(device)

In [14]:
input_text = "What amazing machines smash particles that are smaller than atoms into each other head-on?"
correct_answer = "particle accelerators"

input_ids = tokenizer(input_text, correct_answer, return_tensors="pt").input_ids.to(device)
output = model.generate(input_ids, max_length=128, num_beams=4, num_return_sequences=3, early_stopping=True)
outputs = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in output]
print(outputs)

['particle accelerators', 'kinetic accelerators', 'neutron accelerators']
