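# Question Generator

Fine-tunes `facebook/bart-base` on the SciQ dataset (`allenai/sciq`) so that, given a supporting passage and its correct answer, the model generates the matching question.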

In [None]:
! pip install -r requirements.txt

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartTokenizer, BartForConditionalGeneration

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Pretrained BART-base tokenizer and the matching seq2seq model.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

cpu


In [67]:
# Download SciQ and bind each split to the role it plays in the rest of the notebook.
data = load_dataset("allenai/sciq")
train_data = data['train']
# The validation split is what the trainer evaluates on after every epoch ...
eval_data = data['validation']
# ... so the test split stays untouched by training-time decisions and is
# reserved for a final check.
test_data = data['test']
print(train_data)

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
    num_rows: 11679
})


In [46]:
# Tokenization budgets and batch size shared by preprocessing, training and inference.
batch_size = 8     # examples per device, used for both training and evaluation
max_target = 128   # token cap for the tokenized question (the label sequence)
max_input = 512    # token cap for the tokenized model input

In [68]:
# dataset has:
# question, distractor3, distractor1, distractor2, correct_answer, support
def pre_process_data(data):
    """Tokenize a batch of SciQ rows into model inputs and training labels.

    Args:
        data: A batch mapping column names to lists of strings. The columns
            'support', 'correct_answer' and 'question' are read.

    Returns:
        A dict with:
          * "input_ids" / "attention_mask": the ('support', 'correct_answer')
            text pair, padded/truncated to ``max_input`` tokens.
          * "labels": the tokenized 'question', padded/truncated to
            ``max_target`` tokens, with every padding position set to -100.
    """
    # Encoder side: passage and answer are encoded together as a text pair.
    inputs = tokenizer(data['support'], data['correct_answer'], padding="max_length", truncation=True, max_length=max_input, return_tensors="pt")
    # Decoder side: the question the model should learn to produce.
    targets = tokenizer(data['question'], padding="max_length", truncation=True, max_length=max_target, return_tensors="pt")
    # Padding must not contribute to the training loss. -100 is the label value
    # the loss ignores, whereas the raw pad token id would be scored like any
    # other target token.
    labels = targets.input_ids.masked_fill(targets.input_ids == tokenizer.pad_token_id, -100)
    return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": labels}

# Tokenize every split with the same preprocessing function, then shuffle each
# one with a fixed seed so the row order is reproducible.
train_data, eval_data, test_data = (
    split.map(pre_process_data, batched=True).shuffle(seed=42)
    for split in (train_data, eval_data, test_data)
)

Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
# Ask PyTorch to release the CUDA memory it is still caching before the
# training cell below runs.
torch.cuda.empty_cache()

In [1]:
# Move the model to the selected device before handing it to the trainer.
model.to(device)

# Training configuration for the fine-tuning run.
args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy='epoch',            # evaluate on eval_dataset once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,          # effective train batch = 2 * batch_size per device
    weight_decay=0.01,
    save_total_limit=2,                     # keep only the two most recent checkpoints
    num_train_epochs=32,
    predict_with_generate=True,             # evaluation runs generate() rather than a plain forward pass
    eval_accumulation_steps=32,
    # Mixed precision is available only with CUDA, so enable it only when a GPU
    # is present; a hard-coded True makes this cell fail on a CPU-only machine.
    fp16=torch.cuda.is_available(),
)


trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
)

trainer.train()

# Save the fine-tuned weights and tokenizer to the folder the reload cell
# reads from ("models/sciq"), so the notebook runs top to bottom.
OUT_DIR = "models/sciq"
model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)


NameError: name 'model' is not defined

In [5]:
# Reload the fine-tuned checkpoint and its tokenizer from disk.
MODEL_FOLDER = "models/sciq"
checkpoint_path = f"./{MODEL_FOLDER}"

# .to() returns the module itself, so the model is loaded and placed on the
# target device in a single expression.
model = BartForConditionalGeneration.from_pretrained(checkpoint_path).to(device)
tokenizer = BartTokenizer.from_pretrained(checkpoint_path)

In [6]:
# Smoke-test the fine-tuned model on a single hand-written example.
# NOTE(review): pre_process_data tokenizes ('support', 'correct_answer') as a
# text pair, while here the answer appears to be prepended to the passage in one
# string - confirm this matches the format the saved checkpoint was trained on.
input_text = "white Milk is known for appearing very bright."
inputs = tokenizer(
    input_text,
    padding="max_length",
    truncation=True,
    max_length=max_input,
    return_tensors="pt",
)
# Every input tensor has to live on the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

result = model.generate(**inputs)
# Only one sequence was submitted, so decode the first (and only) row.
output = tokenizer.batch_decode(result, skip_special_tokens=True)[0]
print(output)




What color is milk?
