In [1]:
from binary_model import BinaryModel
from util import convert_tokens, get_batch_dataset, get_dataset
import tensorflow as tf
from config import flags
import numpy as np
import json

In [2]:
def get_binary_record_parser(config, is_test=True):
    def parse(example):
        para_limit = config.test_para_limit if is_test else config.para_limit
        ques_limit = config.test_ques_limit if is_test else config.ques_limit
        char_limit = config.char_limit
        features = tf.parse_single_example(example,
                                           features={
                                               "context_idxs": tf.FixedLenFeature([], tf.string),
                                               "ques_idxs": tf.FixedLenFeature([], tf.string),
                                               "context_char_idxs": tf.FixedLenFeature([], tf.string),
                                               "ques_char_idxs": tf.FixedLenFeature([], tf.string),
                                               "id": tf.FixedLenFeature([], tf.int64),
                                               "tag": tf.FixedLenFeature([], tf.string)
                                           })
        context_idxs = tf.reshape(tf.decode_raw(
            features["context_idxs"], tf.int32), [para_limit])
        ques_idxs = tf.reshape(tf.decode_raw(
            features["ques_idxs"], tf.int32), [ques_limit])
        context_char_idxs = tf.reshape(tf.decode_raw(
            features["context_char_idxs"], tf.int32), [para_limit, char_limit])
        ques_char_idxs = tf.reshape(tf.decode_raw(
            features["ques_char_idxs"], tf.int32), [ques_limit, char_limit])
        qa_id = features["id"]
        tag = tf.reshape(tf.decode_raw(
            features["tag"], tf.int32), [2])
        return context_idxs, ques_idxs, context_char_idxs, ques_char_idxs, qa_id, tag
    
    return parse

In [3]:
flags.DEFINE_string('f', 'give up already', 'who cares lol')
config = flags.FLAGS

In [4]:
test_eval_file = 'data/binary_test_meta.json'
test_record_file = 'data/binary_test.tf'

with open(test_eval_file, "r") as fh:
    eval_file = json.load(fh)

meta = {'total': 1382}

In [5]:
with open(config.word_emb_file, "r") as fh:
    word_mat = np.array(json.load(fh), dtype=np.float32)
with open(config.char_emb_file, "r") as fh:
    char_mat = np.array(json.load(fh), dtype=np.float32)

In [6]:
test_batch = get_dataset(test_record_file, get_binary_record_parser(
        config, is_test=True), config).make_one_shot_iterator()

In [7]:
model = BinaryModel(config, test_batch, word_mat, char_mat, trainable=False)

Instructions for updating:
Use the retry module or similar alternatives.


In [8]:
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True

sess = tf.Session(config=sess_config)

In [9]:
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint(config.save_dir + '/../binary_model'))
sess.run(tf.assign(model.is_train, tf.constant(False, dtype=tf.bool)))

INFO:tensorflow:Restoring parameters from log/model/../binary_model/badptr-savepoint-2700


False

In [10]:
total = meta['total']
print(total)

1382


In [11]:
def convert_tokens(eval_file, qa_id, pred, target):
    answer_dict = {}
    remapped_dict = {}
    for qid, p1, p2 in zip(qa_id, pred.argmax(1), target.argmax(1)):
        uuid = eval_file[str(qid)]["uuid"]
        
        answer_dict[str(qid)] = p1 == p2
        remapped_dict[uuid] = p1 == p2
    return answer_dict, remapped_dict

In [12]:
answer_dict = {}
remapped_dict = {}

for step in range(total // config.batch_size + 1):
    qa_id, loss, yp1, yp2 = sess.run([model.qa_id, model.loss, model.prediction, model.y_target])
    answer_dict_, remapped_dict_ = convert_tokens(eval_file, qa_id.tolist(), yp1, yp2)
    answer_dict.update(answer_dict_)
    remapped_dict.update(remapped_dict_)

In [13]:
from util import metric_max_over_ground_truths, exact_match_score, f1_score

In [14]:
def evaluate(eval_file, answer_dict, only=None):
    f1 = exact_match = total = 0
    for key, value in answer_dict.items():
        if only == 'adv' and len(eval_file[key]['uuid'].split('-')) == 1:
            continue
        if only == 'orig' and len(eval_file[key]['uuid'].split('-')) > 1:
            continue
        total += 1
        exact_match += 1 if value else 0
    em = 100.0 * exact_match / total
    return {'exact_match': em, 'count_true': exact_match, 'count_false': total-exact_match}

In [15]:
metrics = evaluate(eval_file, answer_dict, only='orig')
print("Unmutated data")
print("Accuracy: {}, True: {}, False: {}".format(metrics['exact_match'], metrics['count_true'], metrics['count_false']))

Unmutated data
Accuracy: 85.20408163265306, True: 334, False: 58


In [16]:
tn = metrics['count_true']
fp = metrics['count_false']

In [17]:
metrics = evaluate(eval_file, answer_dict, only='adv')
print("Mutated data")
print("Accuracy: {}, True: {}, False: {}".format(metrics['exact_match'], metrics['count_true'], metrics['count_false']))

Mutated data
Accuracy: 98.17997977755309, True: 971, False: 18


In [18]:
tp = metrics['count_true']
fn = metrics['count_false']

Recall = $\frac{TP}{TP + FN}$

In [19]:
R = tp/(tp+fn)
print(R*100)

98.17997977755309


Precision = $\frac{TP}{TP+FP}$

In [20]:
P = tp/(tp+fp)
print(P*100)

94.36345966958211


$F_1$ Measure = $\frac{2PR}{P+R}$

In [21]:
F = (2*P*R)/(P+R)
print(F*100)

96.23389494549059


Accuracy = $\frac{TP+TN}{TP+TN+FP+FN}$

In [22]:
acc = (960+287)/(meta['total']) * 100
print(acc)

90.2315484804631
