# Questions Generator

In [53]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# import BartForConditionalGeneration
from transformers import BartTokenizer, BartForConditionalGeneration

# make sure to include cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

MODEL_FOLDER = "models/sciq"
model = BartForConditionalGeneration.from_pretrained(f"./{MODEL_FOLDER}")
tokenizer = BartTokenizer.from_pretrained(f"./{MODEL_FOLDER}")
# model = BartForConditionalGeneration.from_pretrained('nlp-group-6/sciq-question-generator', token="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")

cpu


In [82]:
data = load_dataset("allenai/sciq")
test_data = data['test']
print(test_data)

Dataset({
    features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
    num_rows: 1000
})


In [59]:
max_input = 512
max_target = 128
batch_size = 8

In [83]:
# dataset has:
# question, distractor3, distractor1, distractor2, correct_answer, support
def pre_process_data(data):
    # tokenize the data
    inputs = tokenizer(data['support'], padding="max_length", truncation=True, max_length=max_input, return_tensors="pt")
    targets = tokenizer(data['question'], padding="max_length", truncation=True, max_length=max_target, return_tensors="pt")
    return {"input_ids": inputs.input_ids, "attention_mask": inputs.attention_mask, "labels": targets.input_ids}

test_data = test_data.map(pre_process_data, batched=True).shuffle(seed=42)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
# empty memory
torch.cuda.empty_cache()

In [54]:
model.to(device)
args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size= batch_size,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=32,
    predict_with_generate=True,
    eval_accumulation_steps=32,
    # fp16=True #available only with CUDA
)

trainer = Seq2SeqTrainer(
    model, 
    args,
    tokenizer=tokenizer,
)


In [20]:
from BARTScore.bart_score import BARTScorer
bart_scorer = BARTScorer(device=device, checkpoint='facebook/bart-large-cnn')
model.load_state_dict(torch.load('models/bart.pth', map_location=device))

In [80]:
from nltk.translate.bleu_score import sentence_bleu
from math import sqrt
from numpy import mean, std

def euclidean_distance(x,y):
  """ return euclidean distance between two lists """
  return sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))


def squared_sum(x):
  """ return 3 rounded square rooted value """
  return round(sqrt(sum([a*a for a in x])),3)


def cos_similarity(x,y):
  """ return cosine similarity between two lists """ 
  numerator = sum(a*b for a,b in zip(x,y))
  denominator = squared_sum(x)*squared_sum(y)
  return round(numerator/float(denominator),3)


def compute_scores(target_data, predictions, prnt=True):
    bart_score_list = []
    bleu_score_list = []

    for idx, target, prediction in zip(range(len(predictions)), target_data, predictions):
        prediction = prediction[1:]
        valid_tokens = [token for token in prediction if token != -100]
        predicted_sentence = tokenizer.decode(valid_tokens, skip_special_tokens=True)
        bleu_score = sentence_bleu([target['question'].split()], predicted_sentence.split())
        bart_score = bart_scorer.score([predicted_sentence], [target['question']])
        if prnt:
            print(idx)
            print(f"Bleu: {bleu_score}")
            print(f"Bart: {bart_score}")
            print("predic: " + predicted_sentence)
            print(f"target: " + target['question'])
            print(f"answer: {target['correct_answer']}")
            print(f"support: {target['support']}")
            print()
        
        bleu_score_list.append(bleu_score)
        bart_score_list.append(bart_score[0])
        
    print(f"Bart score mean: {mean(bart_score_list)}")
    print(f"Bart score std: {std(bart_score_list)}")
    print(f"Bleu score mean: {mean(bleu_score_list)}")
    print(f"Bart score std: {std(bleu_score_list)}")

In [84]:
predictions = trainer.predict(test_data, max_new_tokens=64)

In [85]:
compute_scores(test_data, predictions[0])

0
Bleu: 0.4835447404743731
Bart: [-1.3393480777740479]
predic: What removes phosphorylated amino acids from proteins?
target: What functions in removing phosphorylated amino acids from proteins?
answer: phosphatase
support: What is the function of a phosphatase? a. A phosphatase removes phosphorylated amino acids from proteins. A phosphatase removes the phosphate group from phosphorylated amino acid residues in a protein. A phosphatase phosphorylates serine, threonine, and tyrosine residues. A phosphatase degrades second messengers in the cell. How does NF-κB induce gene expression? a. A small, hydrophobic ligand binds to NF-κB, activating it. Phosphorylation of the inhibitor Iκ-B dissociates the complex between it and NF-κB, and allows NF-κB to enter the nucleus and stimulate transcription. NF-κB is phosphorylated and is then free to enter the nucleus and bind DNA. NF-κB is a kinase that phosphorylates a transcription factor that binds DNA and promotes protein production. Apoptosis ca

In [67]:
def contains_negative(question):
    negative_words = ["not", "except", "incorrect", "no", "never", "neither", "nor", "none", "disagree", "oppose", "reject", "disallow", "disapprove", "negate", "deny"]
    words = question.lower().split()
    for word in words:
        if word in negative_words:
            return True
    return False

for prediction in predictions[0]:
    predicted_sentence = tokenizer.decode(prediction, skip_special_tokens=True)
    if contains_negative(predicted_sentence):
        print(predicted_sentence)

TypeError: sequence item 30: expected str instance, NoneType found