Skip to content

Commit

Permalink
8303861: Error handling step timeouts should never be blocked by OnEr…
Browse files Browse the repository at this point in the history
…ror and others

Reviewed-by: phh
Backport-of: 345e370fed065e2ee312fc315bfc453a29fbb2f0
  • Loading branch information
Alexey Pavlyutkin authored and Paul Hohensee committed May 9, 2023
1 parent 8c243a3 commit 1f601e9
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 36 deletions.
4 changes: 2 additions & 2 deletions src/hotspot/share/runtime/thread.cpp
Expand Up @@ -1491,8 +1491,8 @@ void WatcherThread::run() {
os::die();
}

// Wait a second, then recheck for timeout.
os::naked_short_sleep(999);
// Wait a bit, then recheck for timeout.
os::naked_short_sleep(250);
}
}

Expand Down
74 changes: 52 additions & 22 deletions src/hotspot/share/utilities/vmError.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, Red Hat, Inc. and/or its affiliates.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -1689,42 +1690,71 @@ void VMError::show_message_box(char *buf, int buflen) {
} while (yes);
}

// Timeout handling: check if a timeout happened (either a single step did
// timeout or the whole of error reporting hit ErrorLogTimeout). Interrupt
// the reporting thread if that is the case.
// Fatal error handling is subject to several timeouts:
// - a global timeout (controlled via ErrorLogTimeout)
// - local error reporting step timeouts.
//
// The latter aims to "give the JVM a kick" if it gets stuck in one particular place during
// error reporting. This prevents one error reporting step from hogging all the time allotted
// to error reporting under ErrorLogTimeout.
//
// VMError::check_timeout() is called from the watcher thread and checks for either global
// or step timeout. If a timeout happened, we interrupt the reporting thread and set either
// _reporting_did_timeout or _step_did_timeout to signal which timeout fired. Function returns
// true if the *global* timeout fired, which will cause WatcherThread to shut down the JVM
// immediately.
bool VMError::check_timeout() {

// This function is supposed to be called from watcher thread during fatal error handling only.
assert(VMError::is_error_reported(), "Only call during error handling");
assert(Thread::current()->is_Watcher_thread(), "Only call from watcher thread");

if (ErrorLogTimeout == 0) {
return false;
}

// Do not check for timeouts if we still have a message box to show to the
// user or if there are OnError handlers to be run.
if (ShowMessageBoxOnError
|| (OnError != NULL && OnError[0] != '\0')
|| Arguments::abort_hook() != NULL) {
return false;
}
// There are three situations where we suppress the *global* error timeout:
// - if the JVM is embedded and the launcher has its abort hook installed.
// That must be allowed to run.
// - if the user specified one or more OnError commands to run, and these
// did not yet run. These must have finished.
// - if the user (typically developer) specified ShowMessageBoxOnError,
// and the error box has not yet been shown
const bool ignore_global_timeout =
(ShowMessageBoxOnError
|| (OnError != NULL && OnError[0] != '\0')
|| Arguments::abort_hook() != NULL);

const jlong reporting_start_time_l = get_reporting_start_time();
const jlong now = get_current_timestamp();
// Timestamp is stored in nanos.
if (reporting_start_time_l > 0) {
const jlong end = reporting_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR;
if (end <= now) {
_reporting_did_timeout = true;
interrupt_reporting_thread();
return true; // global timeout

// Global timeout hit?
if (!ignore_global_timeout) {
const jlong reporting_start_time = get_reporting_start_time();
// Timestamp is stored in nanos.
if (reporting_start_time > 0) {
const jlong end = reporting_start_time + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR;
if (end <= now && !_reporting_did_timeout) {
// We hit ErrorLogTimeout and we haven't interrupted the reporting
// thread yet.
_reporting_did_timeout = true;
interrupt_reporting_thread();
return true; // global timeout
}
}
}

const jlong step_start_time_l = get_step_start_time();
if (step_start_time_l > 0) {
// Reporting step timeout?
const jlong step_start_time = get_step_start_time();
if (step_start_time > 0) {
// A step times out after a quarter of the total timeout. Steps are mostly fast unless they
// hang for some reason, so this simple rule allows for three hanging step and still
// hopefully leaves time enough for the rest of the steps to finish.
const jlong end = step_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4;
if (end <= now) {
const int max_step_timeout_secs = 5;
const jlong timeout_duration = MAX2((jlong)max_step_timeout_secs, (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4);
const jlong end = step_start_time + timeout_duration;
if (end <= now && !_step_did_timeout) {
// The step timed out and we haven't interrupted the reporting
// thread yet.
_step_did_timeout = true;
interrupt_reporting_thread();
return false; // (Not a global timeout)
Expand Down
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, Red Hat, Inc. and/or its affiliates.
* Copyright (c) 2017, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand All @@ -25,14 +26,17 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;

import jdk.test.lib.process.OutputAnalyzer;
import jdk.test.lib.Platform;
import jdk.test.lib.process.ProcessTools;

/*
* @test
* @test id=default
* @bug 8166944
* @summary Hanging Error Reporting steps may lead to torn error logs
* @modules java.base/jdk.internal.misc
Expand All @@ -41,6 +45,16 @@
* @author Thomas Stuefe (SAP)
*/

/*
* @test id=with-on-error
* @bug 8303861
* @summary Error handling step timeouts should never be blocked by OnError etc.
* @modules java.base/jdk.internal.misc
* @library /test/lib
* @requires (vm.debug == true) & (os.family != "windows")
* @run driver TimeoutInErrorHandlingTest with-on-error
*/

public class TimeoutInErrorHandlingTest {


Expand All @@ -67,23 +81,39 @@ public static void main(String[] args) throws Exception {
* little timeout messages to see that repeated timeout handling is basically working.
*/

ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(
"-XX:+UnlockDiagnosticVMOptions",
"-Xmx100M",
"-XX:ErrorHandlerTest=14",
"-XX:+TestUnresponsiveErrorHandler",
"-XX:ErrorLogTimeout=16", // 16 seconds big timeout = 4 seconds per little timeout
"-XX:-CreateCoredumpOnCrash",
"-version");
boolean withOnError = false;

if (args.length > 0) {
switch (args[0]) {
case "with-on-error": withOnError = true; break;
default: throw new RuntimeException("Invalid argument " + args[1]);
}
}

List<String> arguments = new ArrayList<>();
Collections.addAll(arguments,
"-XX:+UnlockDiagnosticVMOptions",
"-Xmx100M",
"-XX:ErrorHandlerTest=14",
"-XX:+TestUnresponsiveErrorHandler",
"-XX:ErrorLogTimeout=" + ERROR_LOG_TIMEOUT,
"-XX:-CreateCoredumpOnCrash");
if (withOnError) {
arguments.add("-XX:OnError=echo hi");
}
arguments.add("-version");
ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(arguments);

OutputAnalyzer output_detail = new OutputAnalyzer(pb.start());

// we should have crashed with a SIGSEGV
output_detail.shouldMatch("# A fatal error has been detected by the Java Runtime Environment:.*");
output_detail.shouldMatch("# +(?:SIGSEGV|SIGBUS|EXCEPTION_ACCESS_VIOLATION).*");

// VM should have been aborted by WatcherThread
output_detail.shouldMatch(".*timer expired, abort.*");
// Unless we specified OnError, VM should have been aborted by WatcherThread
if (!withOnError) {
output_detail.shouldMatch(".*timer expired, abort.*");
}

// extract hs-err file
String hs_err_file = output_detail.firstMatch("# *(\\S*hs_err_pid\\d+\\.log)", 1);
Expand Down

1 comment on commit 1f601e9

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.