In [22]:
import os, sys
import random
import json
import nltk 
import csv
import torch
import nltk  # $ pip install nltk
from nltk.stem import PorterStemmer
from nltk.corpus import cmudict  # >>> nltk.download('cmudict')
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker

from beliefbank_data.utils import generate_assertion, generate_question, find_constraints


In [2]:
constraints_path = "beliefbank_data/constraints_v2.json"
facts_path = "beliefbank_data/silver_facts.json"

In [3]:
constraints = json.load(open(constraints_path))
facts = json.load(open(facts_path))

In [8]:
statements = [(entity, relation, label == 'yes')
              for entity, relations in facts.items() 
              for relation, label in relations.items()]
statements[:5]

[('american bison', 'IsA,mammal', True),
 ('american bison', 'IsA,american bison', True),
 ('american bison', 'IsA,animal', True),
 ('american bison', 'IsA,vertebrate', True),
 ('american bison', 'IsA,warm blooded animal', True)]

In [18]:
for base in statements:
    entity, relation, true = base
    
    filter_dict = {
        'source': relation,
        'direction': 'forward',
    }
    selected_constraints = find_constraints(constraints, filter_dict=filter_dict)
    c = random.choice(selected_constraints)
    contra = (entity, c['target'], not c['weight'] == 'yes_yes')
    print(base, contra)
    break

('american bison', 'IsA,mammal', True) ('american bison', 'IsA,vegetable', True)


In [26]:
questions, answers = zip(*[generate_question(*base), generate_question(*contra)])
question_list = list(questions)
answer_list = list(answers)

In [27]:
print(question_list)
print(answer_list)

['Is a american bison a mammal?', 'Is a american bison a vegetable?']
['Yes', 'Yes']


In [21]:
# Macaw-large, PTLM 
# https://github.com/allenai/macaw
# This was used in the BeliefBank Paper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Downloads a pretty large model
tokenizer = AutoTokenizer.from_pretrained("allenai/macaw-large")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/macaw-large")

In [56]:
def format_question(question_list):
    question_list = ["$answer$ ; $mcoptions$ = (A) yes (B) no; $question$ = " + item \
         for item in question_list]

    return question_list

# run QA model on questions to get answers and confidences
def train(question_list, answer_list):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    question_list = format_question(question_list)
    answer_list_all_yes = ["$answer$ = yes"] * len(question_list)     # pass in list of "yes"

    # train one-by-one (can't calculate individual loss if doing a batch encode/decode)
    answers = []
    confidences = []
    for idx in range(len(question_list)):
        input_ids = tokenizer.encode(question_list[idx], return_tensors="pt")
        labels = tokenizer.encode(answer_list_all_yes[idx], max_length = 15, padding=True, truncation=True, return_tensors="pt") # max_length is set to len("$answer$ = yes")

        output = model.generate(input_ids, max_length=200)

        answer = tokenizer.batch_decode(output, skip_special_tokens=True)
        answers.append(answer)
        fwd = model(input_ids=input_ids, labels=labels)

        # loss
        loss = fwd.loss # - log(P(y|x))
        confidence = torch.exp(-loss)

        # if answer is actually a no, then flip the confidence probability (because the probability is P(yes))
        if (answer == ["$answer$ = no"]):
            confidence = 1.0 - confidence

        confidences.append(confidence)

    print(answers, confidences)


train(question_list, answer_list)


[['$answer$ = yes'], ['$answer$ = no']] [tensor(0.9998, grad_fn=<ExpBackward0>), tensor(0.5888, grad_fn=<RsubBackward1>)]


In [None]:

# run NLI model on assertions from QA output