# Options Evaluation

In [45]:
!pip install -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: Could not find a version that satisfies the requirement pywin32==306 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pywin32==306[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


First we import the necessary libraries: torch, datasets, and transformers. From transformers we use BartTokenizer and BartForConditionalGeneration to load the tokenizer and the models, and Seq2SeqTrainer together with Seq2SeqTrainingArguments to set up the trainer for our model.

In [None]:
import torch
from datasets import load_dataset
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartTokenizer, BartForConditionalGeneration

In [168]:
# Use CUDA when torch reports it as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# The tokenizer comes from the base BART checkpoint; the two option-generator
# checkpoints are loaded from the nlp-group-6 hub namespace.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model_dataset_questions = BartForConditionalGeneration.from_pretrained('nlp-group-6/sciq-options-generator')
model_generated_questions = BartForConditionalGeneration.from_pretrained('nlp-group-6/sciq-options-generator-generated-questions')

cpu


In [None]:
# Fetch every split of the dataset and show its structure; only the
# 'test' split is used for the evaluation below.
data = load_dataset("nlp-group-6/sciq-with-generated-questions")
print(data)
test_data = data['test']

In [None]:
# max_input:  token length every encoder input is padded/truncated to.
# max_target: maximum target length, in tokens.
# batch_size: per-device batch size passed to the training arguments.
max_input, max_target, batch_size = 512, 128, 36

In [None]:
# Columns read here: generated_question, correct_answer, support.
# (The dataset also carries question and distractor1..3, which this function ignores.)
def pre_process_data(data):
    """Tokenize one model input per example.

    Each example is rendered as
    ``generated_question + "</s><s>" + correct_answer + "</s><s>" + support``
    and tokenized on its own with the notebook-level ``tokenizer``, padded and
    truncated to ``max_input`` tokens.

    Args:
        data: column-indexable dataset exposing 'generated_question',
            'correct_answer' and 'support' as equally long sequences of strings.

    Returns:
        A list with one tokenizer output (PyTorch tensors) per example, in
        dataset order.
    """
    rows = zip(data['generated_question'], data['correct_answer'], data['support'])
    texts = [question + "</s><s>" + correct_answer + "</s><s>" + support
             for question, correct_answer, support in rows]
    return [
        tokenizer(text, padding="max_length", truncation=True, max_length=max_input, return_tensors="pt")
        for text in texts
    ]

# Tokenize the whole test split up front: one encoding per example, in dataset order.
inputs = pre_process_data(test_data)
# Disabled alternative kept for reference (batched Dataset.map):
# test_data = test_data.map(pre_process_data, batched=True)

In [None]:
# Ask PyTorch to release its cached CUDA memory before the trainer and the
# scorers are created.
torch.cuda.empty_cache()

In [None]:
# TODO: add versioning

# Put the generated-questions model on the selected device before wrapping it.
model_generated_questions.to(device)

# Trainer configuration, grouped by concern.
args = Seq2SeqTrainingArguments(
    output_dir="./results_option_generation",
    # optimisation schedule
    num_train_epochs=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    # batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation
    evaluation_strategy='epoch',
    predict_with_generate=True,
    eval_accumulation_steps=32,
    # checkpoints and precision (fp16 is enabled only when CUDA is available)
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
)

trainer = Seq2SeqTrainer(
    model=model_generated_questions,
    args=args,
    tokenizer=tokenizer,
)


In [163]:
from BARTScore.bart_score import BARTScorer

# One BARTScorer per option-generator checkpoint, so that each model's outputs
# are scored by a scorer built from that same checkpoint (see the evaluate() calls).
bart_scorer_dataset = BARTScorer(device=device, checkpoint='nlp-group-6/sciq-options-generator')
bart_scorer_generated = BARTScorer(device=device, checkpoint='nlp-group-6/sciq-options-generator-generated-questions')

tokenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

In [165]:
from nltk.translate.bleu_score import sentence_bleu
from numpy import mean, std

def evaluate(model, inputs, test_data, bart_scorer):
    """Generate distractors for every test example and report BARTScore / BLEU statistics.

    For each (tokenized input, datapoint) pair the model produces four beam-search
    candidates. The first candidate that is exactly equal to the correct answer is
    dropped, and the first three remaining candidates are scored:

    * faithfulness: BARTScore of source -> candidate
    * precision / recall / F-score against the three gold distractors
      (multi-reference BARTScore, ``agg="max"``)
    * precision / recall / F-score against the correct answer
    * unigram BLEU against the gold distractors and against the correct answer

    Per-example values are printed as they are computed; the mean (std) of every
    metric over all scored candidates is printed at the end.

    Relies on the notebook-level ``tokenizer`` for decoding.

    Args:
        model: seq2seq model exposing ``generate`` and a ``device`` attribute.
        inputs: sequence of tokenized examples (mappings of tensors), aligned
            one-to-one with ``test_data``.
        test_data: iterable of dicts with the keys 'question', 'correct_answer',
            'support', 'distractor1', 'distractor2' and 'distractor3'.
        bart_scorer: object exposing ``score(srcs, tgts, batch_size=...)`` and
            ``multi_ref_score(srcs, tgts, agg=..., batch_size=...)``.

    Returns:
        None. All results are printed.
    """
    bart_score_faithfulness_list = []  # source -> hypothesis
    bart_score_precision_distr_list = []  # reference -> hypothesis
    bart_score_recall_distr_list = []  # hypothesis -> reference
    bart_score_fscore_distr_list = []  # mean of precision and recall

    bart_score_precision_answr_list = []  # reference -> hypothesis
    bart_score_recall_answr_list = []  # hypothesis -> reference
    bart_score_fscore_answr_list = []  # mean of precision and recall

    bleu_scores_distr_list = []
    bleu_scores_answr_list = []

    def _print_summary(label, values):
        # Shared "mean (std)" formatting for the final report.
        print(f"{label}: {mean(values):.3f} ({std(values):.3f})")

    for idx, (encoded, datapoint) in enumerate(zip(inputs, test_data)):
        print(idx)
        # generate() needs its tensors on the same device as the model weights;
        # the encodings are created on the CPU, so move them explicitly.
        encoded = {key: value.to(model.device) for key, value in encoded.items()}
        output = model.generate(**encoded, max_length=128, num_beams=4, num_return_sequences=4, early_stopping=True)
        output_strings = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in output]

        # Drop only the first candidate that reproduces the correct answer, so at
        # least three of the four candidates always remain.
        filtered_output = []
        correct_answer_encountered = False
        for o in output_strings:
            if not correct_answer_encountered and o == datapoint['correct_answer']:
                correct_answer_encountered = True
            else:
                filtered_output.append(o)
        candidates = filtered_output[0:3]
        print(f"predic: {filtered_output}")
        distr_targets = [datapoint['distractor1'], datapoint['distractor2'], datapoint['distractor3']]
        print(f"target: {distr_targets}")
        print(datapoint)

        # NOTE(review): the source uses the dataset 'question' column, while the
        # encodings built by pre_process_data use 'generated_question' - confirm
        # that this is the intended reference for faithfulness.
        source = datapoint['question'] + "</s><s>" + datapoint['correct_answer'] + "</s><s>" + datapoint['support']
        print([source]*3, candidates)
        bart_score_faithfulness = bart_scorer.score([source]*3, candidates, batch_size=4)
        bart_score_faithfulness_list.extend(bart_score_faithfulness)

        bart_score_precision_distr = bart_scorer.multi_ref_score(distr_targets, [candidates]*3, agg="max", batch_size=4)
        bart_score_precision_distr_list.extend(bart_score_precision_distr)
        bart_score_recall_distr = bart_scorer.multi_ref_score(candidates, [distr_targets]*3, agg="max", batch_size=4)
        bart_score_recall_distr_list.extend(bart_score_recall_distr)
        bart_score_fscore_distr = list(mean([bart_score_precision_distr, bart_score_recall_distr], axis=0))
        bart_score_fscore_distr_list.extend(bart_score_fscore_distr)

        bart_score_precision_answr = bart_scorer.score([datapoint['correct_answer']]*3, candidates, batch_size=4)
        bart_score_precision_answr_list.extend(bart_score_precision_answr)
        bart_score_recall_answr = bart_scorer.score(candidates, [datapoint['correct_answer']]*3, batch_size=4)
        bart_score_recall_answr_list.extend(bart_score_recall_answr)
        bart_score_fscore_answr = list(mean([bart_score_precision_answr, bart_score_recall_answr], axis=0))
        bart_score_fscore_answr_list.extend(bart_score_fscore_answr)

        print(f"Faithfulness: {bart_score_faithfulness}")
        print(f"Precision distr: {bart_score_precision_distr}")
        print(f"Recall distr: {bart_score_recall_distr}")
        print(f"Fscore distr: {bart_score_fscore_distr}")
        print(f"Precision answr: {bart_score_precision_answr}")
        print(f"Recall answr: {bart_score_recall_answr}")
        print(f"Fscore answr: {bart_score_fscore_answr}")

        # sentence_bleu expects a list of tokenized references. Passing the raw
        # answer string would turn every character into its own one-character
        # reference, so the answer is tokenized and wrapped in a list.
        bleu_targets = [target.split() for target in distr_targets]
        bleu_answer = [datapoint['correct_answer'].split()]
        bleu_scores_distr = []
        bleu_scores_answr = []
        for prediction in candidates:
            hypothesis = prediction.split()
            bleu_scores_distr.append(sentence_bleu(bleu_targets, hypothesis, weights=[1.0, 0.0, 0.0, 0.0]))
            bleu_scores_answr.append(sentence_bleu(bleu_answer, hypothesis, weights=[1.0, 0.0, 0.0, 0.0]))

        bleu_scores_distr_list.extend(bleu_scores_distr)
        bleu_scores_answr_list.extend(bleu_scores_answr)
        print(f"Bleu Score distr: {bleu_scores_distr}")
        print(f"Bleu Score answr: {bleu_scores_answr}")

    print("")

    _print_summary("Bleu answr", bleu_scores_answr_list)
    _print_summary("Bleu distr", bleu_scores_distr_list)
    _print_summary("Faithful", bart_score_faithfulness_list)

    _print_summary("Precision distr", bart_score_precision_distr_list)
    _print_summary("Recall distr", bart_score_recall_distr_list)
    _print_summary("F-score distr", bart_score_fscore_distr_list)

    _print_summary("Precision answr", bart_score_precision_answr_list)
    _print_summary("Recall answr", bart_score_recall_answr_list)
    _print_summary("F-score answr", bart_score_fscore_answr_list)

In [169]:
# Evaluate the `sciq-options-generator-generated-questions` checkpoint, scored with
# the BARTScorer built from that same checkpoint.
# NOTE(review): execution counts in this notebook are not sequential (this cell ran
# after the cell below); re-run with Restart Kernel -> Run All before trusting the outputs.
evaluate(model_generated_questions, inputs, test_data, bart_scorer_generated)

0
predic: ['proteins', 'antibodies', 'antifreezes', 'recyclables']
target: ['antioxidants', 'Oxygen', 'residues']
{'question': 'Compounds that are capable of accepting electrons, such as o 2 or f2, are called what?', 'distractor3': 'residues', 'distractor1': 'antioxidants', 'distractor2': 'Oxygen', 'correct_answer': 'oxidants', 'support': 'Oxidants and Reductants Compounds that are capable of accepting electrons, such as O 2 or F2, are calledoxidants (or oxidizing agents) because they can oxidize other compounds. In the process of accepting electrons, an oxidant is reduced. Compounds that are capable of donating electrons, such as sodium metal or cyclohexane (C6H12), are calledreductants (or reducing agents) because they can cause the reduction of another compound. In the process of donating electrons, a reductant is oxidized. These relationships are summarized in Equation 3.30: Equation 3.30 Saylor URL: http://www. saylor. org/books.', 'generated_question': 'What are compounds capable

In [167]:
# Evaluate the `sciq-options-generator` checkpoint, scored with the BARTScorer built
# from that same checkpoint.
# NOTE(review): `inputs` was tokenized from the 'generated_question' column, so this
# model is also evaluated on generated questions - confirm that this is intended.
evaluate(model_dataset_questions, inputs, test_data, bart_scorer_dataset)

0
predic: ['antibodies', 'predators', 'antioxidants', 'consumers']
target: ['antioxidants', 'Oxygen', 'residues']
{'question': 'Compounds that are capable of accepting electrons, such as o 2 or f2, are called what?', 'distractor3': 'residues', 'distractor1': 'antioxidants', 'distractor2': 'Oxygen', 'correct_answer': 'oxidants', 'support': 'Oxidants and Reductants Compounds that are capable of accepting electrons, such as O 2 or F2, are calledoxidants (or oxidizing agents) because they can oxidize other compounds. In the process of accepting electrons, an oxidant is reduced. Compounds that are capable of donating electrons, such as sodium metal or cyclohexane (C6H12), are calledreductants (or reducing agents) because they can cause the reduction of another compound. In the process of donating electrons, a reductant is oxidized. These relationships are summarized in Equation 3.30: Equation 3.30 Saylor URL: http://www. saylor. org/books.', 'generated_question': 'What are compounds capable