# MACAW Question Answering
CS 224N Final Project

In [1]:
import numpy as np
import torch
# Macaw-large, PTLM 
# https://github.com/allenai/macaw
# This was used in the BeliefBank Paper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# Downloads a pretty large model
tokenizer = AutoTokenizer.from_pretrained("allenai/macaw-large")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/macaw-large")

### Example output for a simple question

In [4]:
input_string = "$answer$ ; $mcoptions$ = (A) yes (B) no; $question$ = Is a robin a virus?"
input_ids = tokenizer.encode(input_string, return_tensors="pt")
output = model.generate(input_ids, max_length=200)

tokenizer.batch_decode(output, skip_special_tokens=True)


['$answer$ = no']

In [None]:
df = pd.read_csv("beliefbank_data/calibration_questions.csv", header=None)
df

Unnamed: 0,0
0,Is an albatross a bird?|Yes
1,Is an albatross a seabird?|Yes
2,Is an albatross an animal?|Yes
3,Is an albatross a eukaryotic_organism?|Yes
4,Is an albatross a pelagic_bird?|Yes
...,...
1067,Is a daffodil a palm tree?|No
1068,Is a daffodil a crustacean?|No
1069,Is a daffodil a jellyfish?|No
1070,Is a daffodil an invertebrate?|No


In [5]:
def load_file(file_name):
    with open(file_name, 'r') as file:
        return [line.strip().split(sep="|") for line in file]
        
print(load_file('beliefbank_data/calibration_questions.csv')[0:2])


[['Is an albatross a bird?', 'Yes'], ['Is an albatross a seabird?', 'Yes']]


In [40]:
def create_question_answer_list(file_name, n):
     # n = # of (q, a) pairs to use

    q_and_a = load_file(file_name)
    questions, answers = np.split(np.array(q_and_a), 2, axis=1)
    questions = ["$answer$ ; $mcoptions$ = (A) yes (B) no; $question$ = " + item \
         for sublist in questions for item in sublist]
    answers = [item for sublist in answers for item in sublist]

    question_list = list(questions)[:n]
    answer_list = list(answers)[:n]
    # print(question_list, answer_list)
    return question_list, answer_list


### Evaluate MACAW on our question list

In [29]:
# Evaluates MACAW on the questions list (input)
def batch_eval(file_name, n):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    question_list, _ = create_question_answer_list(file_name, n)

    inputs_dict = tokenizer.batch_encode_plus(question_list, max_length = 200, padding=True, truncation=True, return_tensors="pt")
    input_ids = inputs_dict.input_ids.to(device)

    output = model.generate(input_ids, max_length=200)
    answers = tokenizer.batch_decode(output, skip_special_tokens=True)

    return answers


In [8]:
ans = batch_eval("beliefbank_data/calibration_questions.csv", 5)
print(ans)

['$answer$ = yes', '$answer$ = yes', '$answer$ = yes', '$answer$ = yes', '$answer$ = yes']


In [50]:
### Runs MACAW supervised training on file with questions and answers
# this function is really similar to batch_eval; only difference is that labels (answers) are included and used 
# to run the forward-pass of the model on. the output is the logits / loss.

# see: https://huggingface.co/docs/transformers/model_doc/t5#training

def train(file_name, n):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    question_list, answer_list = create_question_answer_list(file_name, n)
    # append to answer_list the '$answer$ = ' string to match formatting from MACAW output
    answer_list = [("$answer$ = " + ans).lower() for ans in answer_list]

    inputs_dict = tokenizer.batch_encode_plus(question_list, max_length = 200, padding=True, truncation=True, return_tensors="pt")
    input_ids = inputs_dict.input_ids.to(device)

    labels = tokenizer.batch_encode_plus(answer_list, max_length = 15, padding=True, truncation=True, return_tensors='pt')\
        .input_ids.to(device) # max_length is set to len("$answer$ = yes")

    # instead of generate, call forward-pass of function 
    fwd = model(input_ids=input_ids, labels=labels)

    # logits
    logits = fwd.logits
    logits_softmax = torch.nn.functional.softmax(logits, dim = 1)
    print(logits_softmax)
    print(logits_softmax.shape)

    # loss
    loss = fwd.loss # - log(P(y|x))
    confidence = torch.exp(-loss)
    print(confidence)

# for i in range(20):
train("beliefbank_data/silver_questions.csv", 4)




tensor([[[9.9993e-01, 2.9274e-11, 1.6404e-04,  ..., 9.8557e-01,
          9.8302e-01, 9.8291e-01],
         [2.6385e-08, 2.3399e-13, 2.2940e-02,  ..., 7.2222e-05,
          9.2733e-05, 1.1006e-04],
         [2.5865e-07, 5.6556e-14, 3.0727e-05,  ..., 2.7700e-08,
          4.8382e-08, 2.1112e-08],
         ...,
         [3.6993e-11, 1.1108e-13, 1.5797e-01,  ..., 1.4055e-11,
          1.9758e-11, 1.4695e-11],
         [9.0024e-06, 1.1401e-10, 8.5915e-05,  ..., 1.6595e-05,
          1.4385e-05, 2.4910e-05],
         [5.8977e-05, 1.0000e+00, 4.1737e-02,  ..., 1.4342e-02,
          1.6874e-02, 1.6954e-02]],

        [[9.9996e-01, 3.2006e-11, 1.7964e-04,  ..., 9.9845e-01,
          9.9820e-01, 9.9812e-01],
         [3.3889e-08, 3.4629e-13, 3.1626e-02,  ..., 1.9254e-04,
          2.4761e-04, 2.9026e-04],
         [4.4414e-07, 8.4894e-14, 3.4328e-05,  ..., 8.8814e-08,
          1.5721e-07, 6.8148e-08],
         ...,
         [5.9780e-11, 1.3411e-13, 1.5178e-01,  ..., 3.6938e-11,
          5.256


## Basic Evaluation

In [53]:
# Basically just output the proportion of correct Macaw predictions vs our answer list.
def macaw_evaluate(n):
    macaw_pred = batch_eval("beliefbank_data/calibration_questions.csv", n)
    macaw_pred = [item[len('$answer$ = '):] for item in macaw_pred] # remove '$answer$ = '
    question , truth = create_question_answer_list("beliefbank_data/calibration_questions.csv", n)
    # print(macaw_pred, truth)

    correct = 0
    for idx in range(n):
        if(macaw_pred[idx].lower() == truth[idx].lower()):
            correct += 1
        else:
            print("Incorrect prediction made by Macaw: {}, Truth: {} for question # {}: {}".\
                format(macaw_pred[idx], truth[idx], idx, question[idx]))
    return correct / n # proportion of correct macaw preds
    

In [54]:
macaw_evaluate(20)

Incorrect prediction made by Macaw: no, Truth: Yes for question # 11: $answer$ ; $mcoptions$ = (A) yes (B) no; $question$ = Does an albatross have a face?
Incorrect prediction made by Macaw: no, Truth: Yes for question # 15: $answer$ ; $mcoptions$ = (A) yes (B) no; $question$ = Does an albatross have a head?


0.9

Notes for sampling (from constraints_v2.json)
- Sample by looking at constraints and finding a contradiction there
- or pick an entity and multiple facts about it (one-to-many mapping)