Skip to content

Commit

Permalink
Fix for signal checkpoint writing unconverged soln
Browse files Browse the repository at this point in the history
This commit patches the issue (idaholab#25755) where a checkpoint can write an
unconverged solution depending on when the USR1 signal is received, thereby
affecting the accuracy of recovered simulations. The fix 1) adds logic that
ensures a checkpoint is not output unless _current_execute_flag is contained
in _execute_on, and 2) enforces that Checkpoint’s ‘execute_on’ parameter may
only be set to ‘TIMESTEP_END’ so that only converged solutions are output.
Exodiff tests are added to ensure that simulations recovered from signal-based
checkpoints result in the same solutions as the uninterrupted simulations.
  • Loading branch information
pbehne committed Oct 31, 2023
1 parent 6a27c47 commit 18ccd64
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 7 deletions.
31 changes: 26 additions & 5 deletions framework/src/outputs/Checkpoint.C
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ Checkpoint::validParams()
"cp",
"This will be appended to the file_base to create the directory name for checkpoint files.");

// Since it only makes sense to write checkpoints at the end of time steps,
// change the default value of execute_on to TIMESTEP_END
ExecFlagEnum & exec_enum = params.set<ExecFlagEnum>("execute_on", true);
exec_enum = {EXEC_TIMESTEP_END};
params.suppressParameter<ExecFlagEnum>("execute_on");

return params;
}

Expand All @@ -56,6 +62,14 @@ Checkpoint::Checkpoint(const InputParameters & parameters)
_num_files(getParam<unsigned int>("num_files")),
_suffix(getParam<std::string>("suffix"))
{
// Prevent the checkpoint from executing at any time other than TIMESTEP_END
const auto & execute_on = getParam<ExecFlagEnum>("execute_on");
if (!(execute_on.size() == 1 && execute_on.contains(EXEC_TIMESTEP_END)))
paramError("execute_on",
"The checkpoint system may only be used with execute_on = ",
"TIMESTEP_END, not '",
execute_on,
"'.");
}

std::string
Expand Down Expand Up @@ -105,14 +119,21 @@ Checkpoint::outputStep(const ExecFlagType & type)
bool
Checkpoint::shouldOutput()
{
const bool parent_should_output = FileOutput::shouldOutput();
// Check if the checkpoint should "normally" output, i.e. if it was created
// through checkpoint=true
bool should_output =
(onInterval() || _current_execute_flag == EXEC_FINAL) ? FileOutput::shouldOutput() : false;

// If this is either a auto-created checkpoint, or if its an existing checkpoint acting
// as the autosave and that checkpoint isn't on its interval, then output.
if (_is_autosave == SYSTEM_AUTOSAVE || (_is_autosave == MODIFIED_EXISTING && !should_output))
(onInterval() || _current_execute_flag == EXEC_FINAL) ? parent_should_output : false;

// If this is either a auto-created checkpoint, or if its an existing
// checkpoint acting as the autosave and that checkpoint isn't on its
// interval, then output.
// parent_should_output ensures that we output only when _execute_on contains
// _current_execute_flag (see Output::shouldOutput), ensuring that we wait
// until the end of the timestep to write, preventing the output of an
// unconverged solution.
if (parent_should_output &&
(_is_autosave == SYSTEM_AUTOSAVE || (_is_autosave == MODIFIED_EXISTING && !should_output)))
{
// If this is a pure system-created autosave through AutoCheckpointAction,
// then sync across processes and only output one time per signal received.
Expand Down
4 changes: 4 additions & 0 deletions framework/src/problems/FEProblemBase.C
Original file line number Diff line number Diff line change
Expand Up @@ -7872,6 +7872,10 @@ FEProblemBase::addOutput(const std::string & object_type,
if (_app.getParam<bool>("show_input") && parameters.get<bool>("output_screen"))
parameters.set<ExecFlagEnum>("execute_input_on") = EXEC_INITIAL;
}
// Need this because Checkpoint::validParams changes the default value of
// execute_on
else if (object_type == "Checkpoint")
exclude.push_back("execute_on");

// Apply the common parameters loaded with Outputs input syntax
const InputParameters * common = output_warehouse.getCommonParameters();
Expand Down
Binary file not shown.
Binary file not shown.
38 changes: 36 additions & 2 deletions test/tests/misc/signal_handler/tests
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,19 @@
max_threads = 1
max_parallel = 1
prereq = 'test_signal'
delete_output_before_running = true
requirement = 'The app should be able to recover from the autosaved checkpoint created by a signal.'
[]
[test_signal_recover_exodiff]
type = 'Exodiff'
input = 'simple_transient_diffusion_scaled.i'
exodiff = 'simple_transient_diffusion_scaled_out.e'
prereq = 'test_signal_recover'
should_execute = 'false'
requirement = 'The app, recovered from a signal-created checkpoint, should produce an accurate solution.'
issues = '#25755'
method = 'opt'
[]
[test_signal_parallel]
type = SignalTester
sleep_time = 1
Expand Down Expand Up @@ -59,22 +70,33 @@
min_parallel = 3
max_parallel = 3
prereq = 'test_signal_parallel'
delete_output_before_running = true
requirement = 'The app should be able to recover from a parallel autosaved checkpoint created by a signal.'
max_threads = 1
[]
[test_signal_parallel_recover_exodiff]
type = 'Exodiff'
input = 'simple_transient_diffusion_scaled.i'
exodiff = 'simple_transient_diffusion_scaled_out.e'
prereq = 'test_signal_parallel_recover'
should_execute = 'false'
requirement = 'The app, recovered from a parallel signal-created checkpoint, should produce an accurate solution.'
issues = '#25755'
method = 'opt'
[]
[test_signal_debug]
type = SignalTester
sleep_time = 4
input = 'simple_transient_diffusion_scaled.i'
requirement = 'The app should write out a checkpoint file at any time by sending a signal to it, in a debug build.'
method = 'dbg'
cli_args = 'Mesh/uniform_refine=0'
cli_args = 'Mesh/uniform_refine=0 Outputs/file_base="simple_transient_diffusion_scaled_dbg"'
recover = false
[]
[test_signal_recover_debug]
type = RunApp
input = 'simple_transient_diffusion_scaled.i'
cli_args = 'Mesh/uniform_refine=0 --recover'
cli_args = 'Mesh/uniform_refine=0 Outputs/file_base="simple_transient_diffusion_scaled_dbg" --recover'
method = 'dbg'

#verify that the recovery is actually recovering and not starting from the beginning
Expand All @@ -86,6 +108,18 @@
expect_out = 'Time Step 49,'

prereq = 'test_signal_debug'
delete_output_before_running = true
requirement = 'The app should be able to recover from the autosaved checkpoint created by a signal, in a debug build.'
[]
[test_signal_recover_exodiff_debug]
type = 'Exodiff'
input = 'simple_transient_diffusion_scaled.i'
cli_args = 'Mesh/uniform_refine=0 Outputs/file_base="simple_transient_diffusion_scaled_dbg"'
exodiff = 'simple_transient_diffusion_scaled_dbg.e'
prereq = 'test_signal_recover_debug'
should_execute = 'false'
requirement = 'The app, recovered from a signal-created checkpoint, should produce an accurate solution in a debug build.'
issues = '#25755'
method = 'dbg'
[]
[]

0 comments on commit 18ccd64

Please sign in to comment.