In [3]:
import numpy as np
import torch
# Macaw-large, PTLM 
# https://github.com/allenai/macaw
# This was used in the BeliefBank Paper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [12]:
# Downloads a pretty large model
tokenizer = AutoTokenizer.from_pretrained("allenai/macaw-large")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/macaw-large")

### Example output for a simple question

In [83]:
input_string = "$answer$ ; $mcoptions$ = (A) yes (B) no; $question$ = Is a robin a virus?"
input_ids = tokenizer.encode(input_string, return_tensors="pt")
output = model.generate(input_ids, max_length=200)

tokenizer.batch_decode(output, skip_special_tokens=True)


['$answer$ = no']

In [None]:
df = pd.read_csv("beliefbank_data/calibration_questions.csv", header=None)
df

Unnamed: 0,0
0,Is an albatross a bird?|Yes
1,Is an albatross a seabird?|Yes
2,Is an albatross an animal?|Yes
3,Is an albatross a eukaryotic_organism?|Yes
4,Is an albatross a pelagic_bird?|Yes
...,...
1067,Is a daffodil a palm tree?|No
1068,Is a daffodil a crustacean?|No
1069,Is a daffodil a jellyfish?|No
1070,Is a daffodil an invertebrate?|No


In [9]:
def load_file(file_name):
    with open(file_name, 'r') as file:
        return [line.strip().split(sep="|") for line in file]
        
print(load_file('beliefbank_data/calibration_questions.csv')[0:2])


[['Is an albatross a bird?', 'Yes'], ['Is an albatross a seabird?', 'Yes']]


In [8]:
def create_question_answer_list(file_name, n):
     # n = # of (q, a) pairs to use

    q_and_a = load_file(file_name)
    questions, answers = np.split(np.array(q_and_a), 2, axis=1)
    questions = ["$answer$ ; $mcoptions$ = (A) yes (B) no; $question$ = " + item + "?"\
         for sublist in questions for item in sublist]
    answers = [item for sublist in answers for item in sublist]

    question_list = list(questions)[:n]
    answer_list = list(answers)[:n]
    # print(question_list, answer_list)
    return question_list, answer_list


In [19]:
# run MACAW on file with questions and answers
def batch_eval(file_name, n):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    question_list, answer_list = create_question_answer_list(file_name, n)

    inputs_dict = tokenizer.batch_encode_plus(question_list, max_length = 200, padding=True, truncation=True, return_tensors="pt")
    input_ids = inputs_dict.input_ids.to(device)

    labels = tokenizer.batch_encode_plus(answer_list, max_length = 3, padding=True, truncation=True, return_tensors='pt')\
        .input_ids.to(device)

    output = model.generate(input_ids, max_length=200)
    answers = tokenizer.batch_decode(output, skip_special_tokens=True)

    return answers


In [20]:
ans = batch_eval("beliefbank_data/calibration_questions.csv", 5)
print(ans)

['$answer$ = yes', '$answer$ = yes', '$answer$ = yes', '$answer$ = yes', '$answer$ = yes']


In [52]:
# run MACAW supervised training on file with questions and answers
# this function is really similar to batch_eval; only difference is that labels (answers) are included and used 
# to run the forward-pass of the model on. the output is the logits / loss.
# see: https://huggingface.co/docs/transformers/model_doc/t5#training

def train(file_name, n):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    question_list, answer_list = create_question_answer_list(file_name, n)

    inputs_dict = tokenizer.batch_encode_plus(question_list, max_length = 200, padding=True, truncation=True, return_tensors="pt")
    input_ids = inputs_dict.input_ids.to(device)

    labels = tokenizer.batch_encode_plus(answer_list, max_length = 3, padding=True, truncation=True, return_tensors='pt')\
        .input_ids.to(device)

    # instead of generate, call forward-pass of function 
    logits = model(input_ids=input_ids, labels=labels).logits
    logits_softmax = torch.nn.functional.softmax(logits, dim = 1)
    print(logits_softmax)

    loss = model(input_ids=input_ids, labels=labels).loss
    print(loss)



In [53]:
train("beliefbank_data/silver_questions.csv", 3)


tensor([[[2.4745e-01, 7.0000e-09, 9.9592e-06,  ..., 2.2646e-02,
          2.6023e-02, 1.9308e-02],
         [7.5255e-01, 1.0000e+00, 9.9999e-01,  ..., 9.7735e-01,
          9.7398e-01, 9.8069e-01]],

        [[3.4996e-01, 3.6346e-08, 2.6528e-05,  ..., 1.8303e-02,
          2.0485e-02, 1.5737e-02],
         [6.5004e-01, 1.0000e+00, 9.9997e-01,  ..., 9.8170e-01,
          9.7951e-01, 9.8426e-01]],

        [[8.5406e-02, 8.4951e-09, 3.7509e-05,  ..., 1.0096e-03,
          1.1446e-03, 8.7485e-04],
         [9.1459e-01, 1.0000e+00, 9.9996e-01,  ..., 9.9899e-01,
          9.9886e-01, 9.9913e-01]]], grad_fn=<SoftmaxBackward0>)
tensor(24.1662, grad_fn=<NllLossBackward0>)



## Basic Evaluation

In [4]:
def macaw_evaluate(n):
    macaw_pred = batch_eval("beliefbank_data/calibration_questions.csv", n)
    macaw_pred = [item[len('$answer$ = '):] for item in macaw_pred] # remove '$answer$ = '
    _ , truth = create_question_answer_list("beliefbank_data/calibration_questions.csv", n)
    # print(macaw_pred, truth)

    correct = 0
    for idx in range(n):
        if(macaw_pred[idx].lower() == truth[idx].lower()):
            correct += 1
    return correct / n # proportion of correct macaw preds
    

In [124]:
macaw_evaluate(100)

['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no', 'no', 'yes', 'no', 'yes', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'yes', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'yes', 'no', 'yes', 'no', 'no'] ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', '

0.86

- Sample by looking at constraints and finding a contradiction there
- or pick an entity and multiple facts about it (one-to-many mapping)


- no train set, only a validation set and test set. (split dataset in two, hold out some entities for test set)
- 