In [1]:
import numpy as np
import tensorflow as tf
import time
import os
import sys

import nsm
from nsm import data_utils
from nsm import env_factory
from nsm import graph_factory
from nsm import model_factory
from nsm import agent_factory
from nsm import executor_factory
from nsm import computer_factory
from nsm import word_embeddings

import experiment as exp

# Handle to TensorFlow's global flag values; later cells set options on it.
FLAGS = tf.app.flags.FLAGS  
# Register a dummy string flag named 'f'. Presumably this absorbs the
# `-f <connection file>` argument Jupyter passes to the kernel, which the
# flag parser would otherwise reject -- TODO confirm.
tf.app.flags.DEFINE_string('f', '', 'kernel')
# Only show errors from TensorFlow's logger.
# Set the level to tf.logging.INFO if you want to see more information.
tf.logging.set_verbosity(tf.logging.ERROR)

Instructions for updating:
Use the retry module or similar alternatives.


In [22]:
# Path to your data/wikitable folder.
# The repo's stated default location is ~/projects/data/wikitable.
# Set the NSM_DATA_DIR environment variable to point at your own copy; when it
# is unset, the machine-specific path below is used, exactly as before.
data_dir = os.path.expanduser(
    os.environ.get(
        'NSM_DATA_DIR',
        '/Users/yinpengcheng/Research/SemanticParsing/nsm/data/wikitable'))

In [23]:
# Configure an evaluation-only run with the GPU disabled. These flags are set
# before exp.create_experiment_config() is called at the end of this cell.
FLAGS.eval_only = True
FLAGS.eval_use_gpu = False
FLAGS.eval_gpu_id = 0
# NOTE(review): presumably the number of memory slots available per example;
# the loader output reports examples with more constants than this -- confirm.
FLAGS.max_n_mem = 60
# Input files, all located under data_dir.
FLAGS.eval_file = os.path.join(data_dir, 'processed_input/preprocess_14/data_split_1/dev_split.jsonl')
# unittest_file is defined here but not referenced by the other cells shown.
unittest_file = os.path.join(data_dir, 'processed_input/preprocess_14/data_split_1/train_split_shard_90-0.jsonl')
train_file = os.path.join(data_dir, 'processed_input/preprocess_14/train_examples.jsonl')

# Fill in the output folder and experiment name you want to load.
# By default, load the pretrained model in the repo. 
FLAGS.output_dir = os.path.expanduser('~/Research/SemanticParsing/nsm/neural-symbolic-machines/table/wtq/')
FLAGS.experiment_to_eval = 'pretrained_model'
experiment_config = exp.create_experiment_config()

In [90]:
# Build the agent together with the environments for the chosen file(s).
# The active choice is the training file (takes about 75-150 sec); switch to
# the commented-out line to load the dev set instead (usually 15-30 sec).
# fns = [FLAGS.eval_file]
fns = [train_file]
agent, envs = exp.init_experiment(
    fns, FLAGS.eval_use_gpu, gpu_id=str(FLAGS.eval_gpu_id))
# Switch off `punish_extra_work` on every loaded environment.
for env in envs:
    env.punish_extra_work = False
# Index the environments by name so they can be looked up by id (e.g. 'nt-34').
env_dict = {env.name: env for env in envs}

INFO:tensorflow:14152 examples in dataset.
INFO:tensorflow:2108 tables.
INFO:tensorflow:2045 unique tokens in encoder vocab
INFO:tensorflow:14152 examples in the dataset
INFO:tensorflow:creating environment #0
INFO:tensorflow:creating environment #100
INFO:tensorflow:creating environment #200
INFO:tensorflow:creating environment #300
INFO:tensorflow:Not enough memory slots for example nt-347, which has 75 constants.
INFO:tensorflow:creating environment #400
INFO:tensorflow:creating environment #500
INFO:tensorflow:creating environment #600
INFO:tensorflow:creating environment #700
INFO:tensorflow:creating environment #800
INFO:tensorflow:creating environment #900
INFO:tensorflow:creating environment #1000
INFO:tensorflow:creating environment #1100
INFO:tensorflow:creating environment #1200
INFO:tensorflow:creating environment #1300
INFO:tensorflow:creating environment #1400
INFO:tensorflow:creating environment #1500
INFO:tensorflow:creating environment #1600
INFO:tensorflow:creating en

INFO:tensorflow:memory_seq2seq_graph/MemorySeq2seq/Encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0: 320000
INFO:tensorflow:memory_seq2seq_graph/MemorySeq2seq/Encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0: 800
INFO:tensorflow:memory_seq2seq_graph/MemorySeq2seq/Encoder/dense_1/kernel:0: 80000
INFO:tensorflow:memory_seq2seq_graph/MemorySeq2seq/Encoder/dense_1/bias:0: 200
INFO:tensorflow:memory_seq2seq_graph/MemorySeq2seq/ConstantEncoder/builtin_de_embeddings:0: 6200
INFO:tensorflow:memory_seq2seq_graph/MemorySeq2seq/Decoder/rnn/attention_cell_wrapper/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0: 320000
INFO:tensorflow:memory_seq2seq_graph/MemorySeq2seq/Decoder/rnn/attention_cell_wrapper/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0: 800
INFO:tensorflow:memory_seq2seq_graph/MemorySeq2seq/Decoder/rnn/attention_cell_wrapper/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0: 320000
INFO:tensorflow:memory_seq2seq_graph/MemorySeq2seq/Decoder

# Evaluate on 5 environments / questions and show generated programs. 
Use the environment id (for example, nt-34) to find the question and its accompanying table on the website below (from the Stanford NLP group). 
https://nlp.stanford.edu/software/sempre/wikitable/viewer/#203-591

In [26]:
# Evaluate with beam search on a small slice of the loaded environments and
# print the resulting programs.
# Use 'eval_envs = envs' to evaluate on the whole validation set. Usually takes 8-10 minutes on a laptop. 
# NOTE(review): `envs` holds whatever file the loading cell selected, so
# "validation set" only applies when the dev file was loaded.
eval_envs = envs[5:10]
dev_avg_return, dev_samples, dev_samples_in_beam = exp.beam_search_eval(agent, eval_envs)
print('Accuracy on the selected {} environments are {}'.format(len(eval_envs), dev_avg_return))
print('Show the generated programs:')
# The first environment's de_vocab is used to render all samples --
# presumably the vocabulary is shared across environments; TODO confirm.
print(exp.show_samples(dev_samples, envs[0].de_vocab, env_dict=env_dict))

Accuracy on the selected 5 environments are 0.8
Show the generated programs:

env nt-24
question: who ranked right after turkey?
answer: [u'Sweden', u'Sweden']
program: ( filter_str_contain_any all_rows [u'turkey'] r.nation-string ) ( next v12 ) ( hop v13 r.nation-string ) <END>
prediction: [u'sweden']
return: 1.0
prob is 1.0

env nt-34
question: who was the top ranked competitor in this race?
answer: [u'Iryna Shpylova', u'Iryna Shpylova']
program: ( first all_rows ) ( hop v7 r.cyclist-string ) <END>
prediction: [u'iryna shpylova']
return: 1.0
prob is 1.0

env nt-15
question: what was the venue when he placed first?
answer: [u'New Delhi, India', u'New Delhi, India']
program: ( argmin all_rows r.position-number ) ( first all_rows ) ( diff v10 v9 r.year-number ) <END>
prediction: [0.0]
return: 0.0
prob is 1.0

env nt-40
question: what was the number of silver medals won by ukraine?
answer: [u'2', u'2.0']
program: ( filter_str_contain_any all_rows [u'ukraine'] r.nation-string ) ( hop v12 

# Debug the beam search by showing programs in the beam. 

In [86]:
# Inspect a single environment: print its annotated question, run beam search
# on that environment alone, and list every program that ended up in the beam.
env_id = 'nt-8611'
env = env_dict[env_id]
# Parenthesised single-argument print behaves the same on Python 2 and 3 and
# matches the print(...) calls used in the rest of this cell.
print(env.question_annotation)
dev_avg_return, dev_samples, dev_samples_in_beam = exp.beam_search_eval(agent, [env])
print('Show the {} programs in beam for environment {}:'.format(len(dev_samples_in_beam), env_id))
print(exp.show_samples(dev_samples_in_beam, envs[0].de_vocab, env_dict=env_dict))

{u'features': [[0], [0], [0], [0], [0], [0], [0], [0]], u'question': u'who has won the most bronze medals?', u'pos_tags': [u'WP', u'VBD-AUX', u'VBN', u'DT', u'RBS', u'JJ', u'NNS', u'.'], u'tokens': [u'who', u'has', u'won', u'the', u'most', u'bronze', u'medals', u'?'], u'entities': [], u'prop_features': {u'r.silver-number': [0], u'r.gold-string': [0], u'r.gold-number': [0], u'r.bronze-string': [1], u'r.nation-string': [0], u'r.silver-string': [0], u'r.total-number': [0], u'r.rank-string': [0], u'r.rank-number': [0], u'r.bronze-number': [1], u'r.total-string': [0]}, u'in_table': [0, 0, 0, 0, 0, 0, 0, 0], u'processed_tokens': [u'who', u'has', u'won', u'the', u'most', u'bronze', u'medals', u'?'], u'context': u't_203_497', u'answer': [u'Sri Lanka', u'Sri Lanka'], u'id': u'nt-8611', u'tmp_tokens': [u'who', u'has', u'won', u'the', u'most', u'bronze', u'medals', u'?']}
INFO:tensorflow:eval, batch 0: 1 envs
INFO:tensorflow:5 samples in beam, batch 0.
INFO:tensorflow:0.152117967606 sec used in e

In [91]:
# Decode the dataset: run beam search over every environment in `envs`.
# Logging is raised to INFO first, so per-batch progress lines are emitted
# while this long-running cell executes.

tf.logging.set_verbosity(tf.logging.INFO)
dataset_avg_return, dataset_samples, dataset_samples_in_beam = exp.beam_search_eval(agent, envs)

INFO:tensorflow:eval, batch 0: 50 envs
INFO:tensorflow:247 samples in beam, batch 0.
INFO:tensorflow:8.37355899811 sec used in evaluator batch 0.
INFO:tensorflow:eval, batch 1: 50 envs
INFO:tensorflow:246 samples in beam, batch 1.
INFO:tensorflow:10.6863081455 sec used in evaluator batch 1.
INFO:tensorflow:eval, batch 2: 50 envs
INFO:tensorflow:248 samples in beam, batch 2.
INFO:tensorflow:8.02242302895 sec used in evaluator batch 2.
INFO:tensorflow:eval, batch 3: 50 envs
INFO:tensorflow:249 samples in beam, batch 3.
INFO:tensorflow:7.67977285385 sec used in evaluator batch 3.
INFO:tensorflow:eval, batch 4: 50 envs
INFO:tensorflow:249 samples in beam, batch 4.
INFO:tensorflow:8.3481798172 sec used in evaluator batch 4.
INFO:tensorflow:eval, batch 5: 50 envs
INFO:tensorflow:249 samples in beam, batch 5.
INFO:tensorflow:7.82410907745 sec used in evaluator batch 5.
INFO:tensorflow:eval, batch 6: 50 envs
INFO:tensorflow:Not enough memory slots for example qa_programming, which has 75 const

INFO:tensorflow:eval, batch 37: 50 envs
INFO:tensorflow:250 samples in beam, batch 37.
INFO:tensorflow:8.40778207779 sec used in evaluator batch 37.
INFO:tensorflow:eval, batch 38: 50 envs
INFO:tensorflow:249 samples in beam, batch 38.
INFO:tensorflow:8.83371210098 sec used in evaluator batch 38.
INFO:tensorflow:eval, batch 39: 50 envs
INFO:tensorflow:246 samples in beam, batch 39.
INFO:tensorflow:8.19353795052 sec used in evaluator batch 39.
INFO:tensorflow:eval, batch 40: 50 envs
INFO:tensorflow:246 samples in beam, batch 40.
INFO:tensorflow:12.1901900768 sec used in evaluator batch 40.
INFO:tensorflow:eval, batch 41: 50 envs
INFO:tensorflow:244 samples in beam, batch 41.
INFO:tensorflow:8.21523809433 sec used in evaluator batch 41.
INFO:tensorflow:eval, batch 42: 50 envs
INFO:tensorflow:249 samples in beam, batch 42.
INFO:tensorflow:7.58965110779 sec used in evaluator batch 42.
INFO:tensorflow:eval, batch 43: 50 envs
INFO:tensorflow:245 samples in beam, batch 43.
INFO:tensorflow:8.2

INFO:tensorflow:Not enough memory slots for example qa_programming, which has 64 constants.
INFO:tensorflow:Not enough memory slots for example qa_programming, which has 64 constants.
INFO:tensorflow:244 samples in beam, batch 74.
INFO:tensorflow:7.92666816711 sec used in evaluator batch 74.
INFO:tensorflow:eval, batch 75: 50 envs
INFO:tensorflow:249 samples in beam, batch 75.
INFO:tensorflow:8.04865098 sec used in evaluator batch 75.
INFO:tensorflow:eval, batch 76: 50 envs
INFO:tensorflow:249 samples in beam, batch 76.
INFO:tensorflow:12.0574269295 sec used in evaluator batch 76.
INFO:tensorflow:eval, batch 77: 50 envs
INFO:tensorflow:246 samples in beam, batch 77.
INFO:tensorflow:8.73164105415 sec used in evaluator batch 77.
INFO:tensorflow:eval, batch 78: 50 envs
INFO:tensorflow:250 samples in beam, batch 78.
INFO:tensorflow:8.01994204521 sec used in evaluator batch 78.
INFO:tensorflow:eval, batch 79: 50 envs
INFO:tensorflow:249 samples in beam, batch 79.
INFO:tensorflow:7.677450895

INFO:tensorflow:7.73294401169 sec used in evaluator batch 110.
INFO:tensorflow:eval, batch 111: 50 envs
INFO:tensorflow:242 samples in beam, batch 111.
INFO:tensorflow:7.4926738739 sec used in evaluator batch 111.
INFO:tensorflow:eval, batch 112: 50 envs
INFO:tensorflow:249 samples in beam, batch 112.
INFO:tensorflow:8.22711491585 sec used in evaluator batch 112.
INFO:tensorflow:eval, batch 113: 50 envs
INFO:tensorflow:248 samples in beam, batch 113.
INFO:tensorflow:11.7980208397 sec used in evaluator batch 113.
INFO:tensorflow:eval, batch 114: 50 envs
INFO:tensorflow:249 samples in beam, batch 114.
INFO:tensorflow:8.02287912369 sec used in evaluator batch 114.
INFO:tensorflow:eval, batch 115: 50 envs
INFO:tensorflow:246 samples in beam, batch 115.
INFO:tensorflow:8.03357315063 sec used in evaluator batch 115.
INFO:tensorflow:eval, batch 116: 50 envs
INFO:tensorflow:248 samples in beam, batch 116.
INFO:tensorflow:8.45150518417 sec used in evaluator batch 116.
INFO:tensorflow:eval, batc

INFO:tensorflow:eval, batch 147: 50 envs
INFO:tensorflow:246 samples in beam, batch 147.
INFO:tensorflow:8.79767489433 sec used in evaluator batch 147.
INFO:tensorflow:eval, batch 148: 50 envs
INFO:tensorflow:242 samples in beam, batch 148.
INFO:tensorflow:10.9454100132 sec used in evaluator batch 148.
INFO:tensorflow:eval, batch 149: 50 envs
INFO:tensorflow:244 samples in beam, batch 149.
INFO:tensorflow:10.520815134 sec used in evaluator batch 149.
INFO:tensorflow:eval, batch 150: 50 envs
INFO:tensorflow:247 samples in beam, batch 150.
INFO:tensorflow:10.6708700657 sec used in evaluator batch 150.
INFO:tensorflow:eval, batch 151: 50 envs
INFO:tensorflow:247 samples in beam, batch 151.
INFO:tensorflow:15.1325478554 sec used in evaluator batch 151.
INFO:tensorflow:eval, batch 152: 50 envs
INFO:tensorflow:249 samples in beam, batch 152.
INFO:tensorflow:10.3158130646 sec used in evaluator batch 152.
INFO:tensorflow:eval, batch 153: 50 envs
INFO:tensorflow:246 samples in beam, batch 153.


INFO:tensorflow:eval, batch 184: 50 envs
INFO:tensorflow:247 samples in beam, batch 184.
INFO:tensorflow:13.86927104 sec used in evaluator batch 184.
INFO:tensorflow:eval, batch 185: 50 envs
INFO:tensorflow:250 samples in beam, batch 185.
INFO:tensorflow:9.13608908653 sec used in evaluator batch 185.
INFO:tensorflow:eval, batch 186: 50 envs
INFO:tensorflow:244 samples in beam, batch 186.
INFO:tensorflow:9.02978301048 sec used in evaluator batch 186.
INFO:tensorflow:eval, batch 187: 50 envs
INFO:tensorflow:248 samples in beam, batch 187.
INFO:tensorflow:9.58125090599 sec used in evaluator batch 187.
INFO:tensorflow:eval, batch 188: 50 envs
INFO:tensorflow:248 samples in beam, batch 188.
INFO:tensorflow:8.49293088913 sec used in evaluator batch 188.
INFO:tensorflow:eval, batch 189: 50 envs
INFO:tensorflow:250 samples in beam, batch 189.
INFO:tensorflow:10.4658460617 sec used in evaluator batch 189.
INFO:tensorflow:eval, batch 190: 50 envs
INFO:tensorflow:247 samples in beam, batch 190.
I

INFO:tensorflow:eval, batch 220: 50 envs
INFO:tensorflow:248 samples in beam, batch 220.
INFO:tensorflow:8.61797595024 sec used in evaluator batch 220.
INFO:tensorflow:eval, batch 221: 50 envs
INFO:tensorflow:248 samples in beam, batch 221.
INFO:tensorflow:8.55295109749 sec used in evaluator batch 221.
INFO:tensorflow:eval, batch 222: 50 envs
INFO:tensorflow:250 samples in beam, batch 222.
INFO:tensorflow:7.80124807358 sec used in evaluator batch 222.
INFO:tensorflow:eval, batch 223: 50 envs
INFO:tensorflow:247 samples in beam, batch 223.
INFO:tensorflow:8.46841096878 sec used in evaluator batch 223.
INFO:tensorflow:eval, batch 224: 50 envs
INFO:tensorflow:250 samples in beam, batch 224.
INFO:tensorflow:7.24878001213 sec used in evaluator batch 224.
INFO:tensorflow:eval, batch 225: 50 envs
INFO:tensorflow:244 samples in beam, batch 225.
INFO:tensorflow:12.4231090546 sec used in evaluator batch 225.
INFO:tensorflow:eval, batch 226: 50 envs
INFO:tensorflow:246 samples in beam, batch 226.

INFO:tensorflow:250 samples in beam, batch 257.
INFO:tensorflow:8.9394299984 sec used in evaluator batch 257.
INFO:tensorflow:eval, batch 258: 50 envs
INFO:tensorflow:250 samples in beam, batch 258.
INFO:tensorflow:8.33176422119 sec used in evaluator batch 258.
INFO:tensorflow:eval, batch 259: 50 envs
INFO:tensorflow:248 samples in beam, batch 259.
INFO:tensorflow:13.5988349915 sec used in evaluator batch 259.
INFO:tensorflow:eval, batch 260: 50 envs
INFO:tensorflow:247 samples in beam, batch 260.
INFO:tensorflow:8.89572906494 sec used in evaluator batch 260.
INFO:tensorflow:eval, batch 261: 50 envs
INFO:tensorflow:248 samples in beam, batch 261.
INFO:tensorflow:8.73163294792 sec used in evaluator batch 261.
INFO:tensorflow:eval, batch 262: 50 envs
INFO:tensorflow:241 samples in beam, batch 262.
INFO:tensorflow:8.61161899567 sec used in evaluator batch 262.
INFO:tensorflow:eval, batch 263: 50 envs
INFO:tensorflow:246 samples in beam, batch 263.
INFO:tensorflow:8.12151193619 sec used in

In [92]:
from collections import OrderedDict
# Group every beam hypothesis by environment name, keeping first-seen order.
decode_results = OrderedDict()
for sample in dataset_samples_in_beam:
    name = sample.traj.env_name
    program = agent_factory.traj_to_program(sample.traj, envs[0].de_vocab)
    human_readable_program = exp.to_human_readable_program(program, env_dict[name])
    # A hypothesis counts as correct when its final reward is exactly 1.
    is_correct = sample.traj.rewards[-1] == 1.

    # Build the record from (key, value) pairs. Keyword arguments only keep
    # their order from Python 3.6 on, so OrderedDict(program=..., ...) would
    # give an arbitrary key order on the Python 2 kernel this notebook uses.
    decode_results.setdefault(name, []).append(OrderedDict([
        ('program', human_readable_program),
        ('is_correct', is_correct),
        ('prob', sample.prob),
    ]))

# Rank each environment's hypotheses from most to least probable, so index 0
# is the top prediction.
for hyp_list in decode_results.values():
    hyp_list.sort(key=lambda hyp: hyp['prob'], reverse=True)

INFO:tensorflow:Not enough memory slots for example qa_programming, which has 75 constants.
INFO:tensorflow:Not enough memory slots for example qa_programming, which has 65 constants.
INFO:tensorflow:Not enough memory slots for example qa_programming, which has 64 constants.
INFO:tensorflow:Not enough memory slots for example qa_programming, which has 75 constants.
INFO:tensorflow:Not enough memory slots for example qa_programming, which has 76 constants.
INFO:tensorflow:Not enough memory slots for example qa_programming, which has 74 constants.
INFO:tensorflow:Not enough memory slots for example qa_programming, which has 74 constants.
INFO:tensorflow:Not enough memory slots for example qa_programming, which has 75 constants.


In [93]:
import json
# Write the decode results to disk. The `with` block closes the file handle,
# even if serialization raises, instead of leaving that to garbage collection.
with open('wtq/pretrained_model/mapo.decode_results.train.json', 'w') as results_file:
    json.dump(decode_results, results_file, indent=2)

In [94]:
# Load both sets of decode results for comparison. Each file is opened in a
# `with` block so its handle is closed as soon as the JSON has been parsed.
with open('wtq/pretrained_model/mapo.decode_results.train.json') as results_file:
    mapo_decode_results = json.load(results_file)
with open('/Users/yinpengcheng/Research/SemanticParsing/nsm/data/wikitable_reproduce/output/model.best.decode_results.json') as results_file:
    my_decode_results = json.load(results_file)

In [96]:
import numpy as np
def compute_acc(_decode_results):
    """Return the fraction of environments whose first hypothesis is correct.

    Args:
        _decode_results: mapping from environment name to a list of hypothesis
            dicts, each carrying an 'is_correct' entry. Only the first
            hypothesis of each list is scored.

    Returns:
        The mean (via np.average) of the per-environment correctness flags.
        An environment with an empty hypothesis list counts as incorrect.
    """
    # bool(...) matters for empty lists: `hyp_list and ...` alone evaluates to
    # `[]` there, which would put a list among the flags and break the average.
    acc_list = [
        bool(hyp_list) and bool(hyp_list[0]['is_correct'])
        for hyp_list in _decode_results.values()
    ]

    return np.average(acc_list)

# Top-1 accuracy of each set of decode results. Parenthesised single-argument
# print produces the same output on Python 2 and 3.
mapo_acc = compute_acc(mapo_decode_results)
print(mapo_acc)
my_acc = compute_acc(my_decode_results)
print(my_acc)

0.5735803691393819
0.4037442599788061


In [88]:
# Compare the top-ranked hypothesis of the two decoders on every environment
# and list the environments where only one of them is correct.
def _top_hyp_is_correct(hyp_list):
    """True iff hyp_list is non-empty and its first hypothesis is marked correct."""
    return bool(hyp_list) and bool(hyp_list[0]['is_correct'])


def _top_program_text(hyp_list):
    """Space-joined program of the first hypothesis, or a placeholder if there is none."""
    # An empty list is exactly how a decoder can "lose" an environment, so
    # indexing [0] unconditionally would raise IndexError for those cases.
    if not hyp_list:
        return '<no hypothesis>'
    return ' '.join([str(x) for x in hyp_list[0]['program']])


mapo_wins = []
my_wins = []
for env in envs:
    mapo_correct = _top_hyp_is_correct(mapo_decode_results[env.name])
    my_correct = _top_hyp_is_correct(my_decode_results[env.name])

    if mapo_correct and not my_correct:
        mapo_wins.append(env.name)
    elif my_correct and not mapo_correct:
        my_wins.append(env.name)

# Single-argument print(...) with %-formatting gives the same text on Python 2
# and 3; the spacing below reproduces the original print-statement output.
print('mapo wins %d' % len(mapo_wins))
print('my wins %d' % len(my_wins))

for env_name in mapo_wins:
    print(env_name)
    print('Question:  %s' % env_dict[env_name].question_annotation['question'])
    print('mapo prediction:  %s' % _top_program_text(mapo_decode_results[env_name]))
    print('my prediction %s' % _top_program_text(my_decode_results[env_name]))
    print('')
    print('')

mapo wins 245
my wins 192
nt-107
Question:  which player ranked the most?
mapo prediction:  ( mode all_rows r.player-string ) <END>
my prediction ( argmax all_rows r.rank-number ) ( hop v0 r.player-string ) <END>


nt-111
Question:  how long after fairfield was no. 1 built?
mapo prediction:  ( filter_str_contain_any all_rows fairfield r.name-string ) ( filter_str_contain_any all_rows no.1 r.name-string ) ( diff v1 v0 r.date_built-number ) <END>
my prediction ( filter_str_contain_any all_rows fairfield r.name-string ) ( hop v0 r.notes-number ) <END>


nt-160
Question:  how many gold medals did this country win during these olympics?
mapo prediction:  ( filter_str_contain_any all_rows gold r.medal-string ) ( count v0 ) <END>
my prediction ( filter_str_contain_any all_rows win r.sport-string ) ( hop v0 r.medal-string ) <END>


nt-250
Question:  what is the number of points scored on 6 february 1922?
mapo prediction:  ( filter_str_contain_any all_rows 6 february 1922 r.date-string ) ( hop 

mapo prediction:  ( filter_str_contain_any all_rows jaime quintana r.name-string ) ( hop v0 r.party-string ) <END>
my prediction ( first all_rows ) ( hop v0 r.party-string ) <END>


nt-6597
Question:  in 2010, who has the least amount of sales?
mapo prediction:  ( argmin all_rows r.2010_arms_sales_us_m-number ) ( hop v0 r.company_country-string ) <END>
my prediction ( argmin all_rows r.arms_sales_as_share_of_company_s_total_sales-number ) ( hop v0 r.company_country-string ) <END>


nt-6628
Question:  what is the number of wins for confey
mapo prediction:  ( filter_str_contain_any all_rows confey r.team-string ) ( hop v0 r.wins-number ) <END>
my prediction ( hop all_rows r.wins-string ) ( filter_str_contain_any all_rows v0 r.wins-string ) ( count v1 ) <END>


nt-6712
Question:  what number of total finals does foyle college have?
mapo prediction:  ( filter_str_contain_any all_rows foyle college r.school-string ) ( hop v0 r.total_finals-number ) <END>
my prediction ( filter_str_contain_a