Skip to content

Commit

Permalink
Work around JVM Bug in LongGCDisruptionTests (elastic#50731)
Browse files Browse the repository at this point in the history
There is a JVM bug causing `Thread#suspend` calls to randomly take
multiple seconds breaking these tests that call the method numerous times
in a loop. Increasing the timeout would will not work since we may call
`suspend` tens if not hundreds of times and even a small number of them
experiencing the blocking will lead to multiple minutes of waiting.

This PR detects the specific issue by timing the `Thread#suspend` calls and
skips the remainder of the test if it timed out because of the JVM bug.

Closes elastic#50047
  • Loading branch information
original-brownbear committed Jan 14, 2020
1 parent f028ab0 commit 1e2d262
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand All @@ -56,11 +58,23 @@ public class LongGCDisruption extends SingleNodeDisruption {
private Set<Thread> suspendedThreads;
private Thread blockDetectionThread;

private final AtomicBoolean sawSlowSuspendBug = new AtomicBoolean(false);

public LongGCDisruption(Random random, String disruptedNode) {
super(random);
this.disruptedNode = disruptedNode;
}

/**
* Checks if during disruption we ran into a known JVM issue that makes {@link Thread#suspend()} calls block for multiple seconds
* was observed.
* @see <a href=https://bugs.openjdk.java.net/browse/JDK-8218446>JDK-8218446</a>
* @return true if during thread suspending a call to {@link Thread#suspend()} took more than 3s
*/
public boolean sawSlowSuspendBug() {
return sawSlowSuspendBug.get();
}

@Override
public synchronized void startDisrupting() {
if (suspendedThreads == null) {
Expand Down Expand Up @@ -251,7 +265,11 @@ protected boolean suspendThreads(Set<Thread> nodeThreads) {
* assuming that it is safe.
*/
boolean definitelySafe = true;
final long startTime = System.nanoTime();
thread.suspend();
if (System.nanoTime() - startTime > TimeUnit.SECONDS.toNanos(3L)) {
sawSlowSuspendBug.set(true);
}
// double check the thread is not in a shared resource like logging; if so, let it go and come back
safe:
for (StackTraceElement stackElement : thread.getStackTrace()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
*/
package org.elasticsearch.test.disruption;

import org.elasticsearch.bootstrap.JavaVersion;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.test.ESTestCase;

Expand Down Expand Up @@ -115,8 +114,6 @@ protected long getSuspendingTimeoutInMillis() {
* but does keep retrying until all threads can be safely paused
*/
public void testNotBlockingUnsafeStackTraces() throws Exception {
assumeFalse("https://github.com/elastic/elasticsearch/issues/50047",
JavaVersion.current().equals(JavaVersion.parse("11")) || JavaVersion.current().equals(JavaVersion.parse("12")));
final String nodeName = "test_node";
LongGCDisruption disruption = new LongGCDisruption(random(), nodeName) {
@Override
Expand Down Expand Up @@ -149,14 +146,22 @@ protected Pattern[] getUnsafeClasses() {
threads[i].start();
}
// make sure some threads are under lock
disruption.startDisrupting();
try {
disruption.startDisrupting();
} catch (RuntimeException e) {
if (e.getMessage().contains("suspending node threads took too long") && disruption.sawSlowSuspendBug()) {
return;
}
throw new AssertionError(e);
}
long first = ops.get();
assertThat(lockedExecutor.lock.isLocked(), equalTo(false)); // no threads should own the lock
Thread.sleep(100);
assertThat(ops.get(), equalTo(first));
disruption.stopDisrupting();
assertBusy(() -> assertThat(ops.get(), greaterThan(first)));
} finally {
disruption.stopDisrupting();
stop.set(true);
for (final Thread thread : threads) {
thread.join();
Expand Down

0 comments on commit 1e2d262

Please sign in to comment.