Skip to content

Commit

Permalink
8312182: THPs cause huge RSS due to thread start timing issue
Browse files Browse the repository at this point in the history
8310687: JDK-8303215 is incomplete

Backport-of: 84b325b844c08809448a9c073a11443d9e3c3f8e
  • Loading branch information
tstuefe committed Aug 28, 2023
1 parent 89234bc commit 4cca633
Show file tree
Hide file tree
Showing 3 changed files with 301 additions and 7 deletions.
9 changes: 9 additions & 0 deletions src/hotspot/os/linux/globals_linux.hpp
Expand Up @@ -89,6 +89,15 @@
"to disable both the override and the printouts." \
"See prctl(PR_SET_TIMERSLACK) for more info.") \
\
product(bool, DisableTHPStackMitigation, false, DIAGNOSTIC, \
"If THPs are unconditionally enabled on the system (mode " \
"\"always\"), the JVM will prevent THP from forming in " \
"thread stacks. This switch disables that mitigation and " \
"allows THPs to form in thread stacks.") \
\
develop(bool, DelayThreadStartALot, false, \
"Artificially delay thread starts randomly for testing.") \
\


// end of RUNTIME_OS_FLAGS
Expand Down
58 changes: 51 additions & 7 deletions src/hotspot/os/linux/os_linux.cpp
Expand Up @@ -775,6 +775,10 @@ static void *thread_native_entry(Thread *thread) {

assert(osthread->pthread_id() != 0, "pthread_id was not set as expected");

if (DelayThreadStartALot) {
os::naked_short_sleep(100);
}

// call one more level start routine
thread->call_run();

Expand Down Expand Up @@ -911,6 +915,7 @@ bool os::create_thread(Thread* thread, ThreadType thr_type,
// Calculate stack size if it's not specified by caller.
size_t stack_size = os::Posix::get_initial_stack_size(thr_type, req_stack_size);
size_t guard_size = os::Linux::default_guard_size(thr_type);

// Configure glibc guard page. Must happen before calling
// get_static_tls_area_size(), which uses the guard_size.
pthread_attr_setguardsize(&attr, guard_size);
Expand All @@ -931,13 +936,16 @@ bool os::create_thread(Thread* thread, ThreadType thr_type,
}
assert(is_aligned(stack_size, os::vm_page_size()), "stack_size not aligned");

// Add an additional page to the stack size to reduce its chances of getting large page aligned
// so that the stack does not get backed by a transparent huge page.
size_t default_large_page_size = HugePages::default_static_hugepage_size();
if (default_large_page_size != 0 &&
stack_size >= default_large_page_size &&
is_aligned(stack_size, default_large_page_size)) {
stack_size += os::vm_page_size();
if (!DisableTHPStackMitigation) {
// In addition to the glibc guard page that prevents inter-thread-stack hugepage
// coalescing (see comment in os::Linux::default_guard_size()), we also make
// sure the stack size itself is not huge-page-size aligned; that makes it much
// more likely for thread stack boundaries to be unaligned as well and hence
// protects thread stacks from being targeted by khugepaged.
if (HugePages::thp_pagesize() > 0 &&
is_aligned(stack_size, HugePages::thp_pagesize())) {
stack_size += os::vm_page_size();
}
}

int status = pthread_attr_setstacksize(&attr, stack_size);
Expand Down Expand Up @@ -3077,6 +3085,27 @@ bool os::Linux::libnuma_init() {
}

size_t os::Linux::default_guard_size(os::ThreadType thr_type) {

if (!DisableTHPStackMitigation) {
// If THPs are unconditionally enabled, the following scenario can lead to huge RSS
// - parent thread spawns, in quick succession, multiple child threads
// - child threads are slow to start
// - thread stacks of future child threads are adjacent and get merged into one large VMA
// by the kernel, and subsequently transformed into huge pages by khugepaged
// - child threads come up, place JVM guard pages, thus splinter the large VMA, splinter
// the huge pages into many (still paged-in) small pages.
// The result of that sequence are thread stacks that are fully paged-in even though the
// threads did not even start yet.
// We prevent that by letting the glibc allocate a guard page, which causes a VMA with different
// permission bits to separate two ajacent thread stacks and therefore prevent merging stacks
// into one VMA.
//
// Yes, this means we have two guard sections - the glibc and the JVM one - per thread. But the
// cost for that one extra protected page is dwarfed from a large win in performance and memory
// that avoiding interference by khugepaged buys us.
return os::vm_page_size();
}

// Creating guard page is very expensive. Java thread has HotSpot
// guard pages, only enable glibc guard page for non-Java threads.
// (Remember: compiler thread is a Java thread, too!)
Expand Down Expand Up @@ -3740,6 +3769,21 @@ void os::large_page_init() {
// Query OS information first.
HugePages::initialize();

// If THPs are unconditionally enabled (THP mode "always"), khugepaged may attempt to
// coalesce small pages in thread stacks to huge pages. That costs a lot of memory and
// is usually unwanted for thread stacks. Therefore we attempt to prevent THP formation in
// thread stacks unless the user explicitly allowed THP formation by manually disabling
// -XX:+DisableTHPStackMitigation.
if (HugePages::thp_mode() == THPMode::always) {
if (DisableTHPStackMitigation) {
log_info(pagesize)("JVM will *not* prevent THPs in thread stacks. This may cause high RSS.");
} else {
log_info(pagesize)("JVM will attempt to prevent THPs in thread stacks.");
}
} else {
FLAG_SET_ERGO(DisableTHPStackMitigation, true); // Mitigation not needed
}

// 1) Handle the case where we do not want to use huge pages
if (!UseLargePages &&
!UseTransparentHugePages &&
Expand Down
241 changes: 241 additions & 0 deletions test/hotspot/jtreg/runtime/os/THPsInThreadStackPreventionTest.java
@@ -0,0 +1,241 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2023, Red Hat Inc.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

/*
* @test id=ENABLED
* @bug 8303215 8312182
* @summary On THP=always systems, we prevent THPs from forming within thread stacks
* @library /test/lib
* @requires os.family == "linux"
* @requires vm.debug
* @requires os.arch=="amd64" | os.arch=="x86_64" | os.arch=="aarch64"
* @modules java.base/jdk.internal.misc
* java.management
* @run driver THPsInThreadStackPreventionTest PATCH-ENABLED
*/

/*
* @test id=DISABLED
* @bug 8303215 8312182
* @summary On THP=always systems, we prevent THPs from forming within thread stacks (negative test)
* @library /test/lib
* @requires os.family == "linux"
* @requires vm.debug
* @requires os.arch=="amd64" | os.arch=="x86_64" | os.arch=="aarch64"
* @modules java.base/jdk.internal.misc
* java.management
* @run main/manual THPsInThreadStackPreventionTest PATCH-DISABLED
*/
import jdk.test.lib.process.OutputAnalyzer;
import jdk.test.lib.process.ProcessTools;
import jtreg.SkippedException;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Objects;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.CyclicBarrier;

public class THPsInThreadStackPreventionTest {

// We test the mitigation for "huge rss for THP=always" introduced with JDK-8312182 and JDK-8302015:
//
// We start a program that spawns a ton of threads with a stack size close to THP page size. The threads
// are idle and should not build up a lot of stack. The threads are started with an artificial delay
// between thread start and stack guardpage creation, which exacerbates the RSS bloat (for explanation
// please see 8312182).
//
// We then observe RSS of that program. We expect it to stay below a reasonable maximum. The unpatched
// version should show an RSS of ~2 GB (paying for the fully paged in thread stacks). The fixed variant should
// cost only ~200-400 MB.

static final int numThreads = 1000;
static final long threadStackSizeMB = 2; // must be 2M
static final long heapSizeMB = 64;
static final long basicRSSOverheadMB = heapSizeMB + 150;
// A successful completion of this test would show not more than X KB per thread stack.
static final long acceptableRSSPerThreadStack = 128 * 1024;
static final long acceptableRSSForAllThreadStacks = numThreads * acceptableRSSPerThreadStack;
static final long acceptableRSSLimitMB = (acceptableRSSForAllThreadStacks / (1024 * 1024)) + basicRSSOverheadMB;

private static class TestMain {

static class Sleeper extends Thread {
CyclicBarrier barrier;
public Sleeper(CyclicBarrier barrier) {
this.barrier = barrier;
}
@Override
public void run() {
try {
barrier.await(); // wait for all siblings
barrier.await(); // wait main thread to print status
} catch (InterruptedException | BrokenBarrierException e) {
e.printStackTrace();
}
}
}

public static void main(String[] args) throws BrokenBarrierException, InterruptedException {

// Fire up 1000 threads with 2M stack size each.
Sleeper[] threads = new Sleeper[numThreads];
CyclicBarrier barrier = new CyclicBarrier(numThreads + 1);

for (int i = 0; i < numThreads; i++) {
threads[i] = new Sleeper(barrier);
threads[i].start();
}

// Wait for all threads to come up
barrier.await();

// print status
String file = "/proc/self/status";
try (FileReader fr = new FileReader(file);
BufferedReader reader = new BufferedReader(fr)) {
String line;
while ((line = reader.readLine()) != null) {
System.out.println(line);
}
} catch (IOException | NumberFormatException e) { /* ignored */ }

// Signal threads to stop
barrier.await();

}
}

static class ProcSelfStatus {

public long rssMB;
public long swapMB;
public int numLifeThreads;

// Parse output from /proc/self/status
public static ProcSelfStatus parse(OutputAnalyzer o) {
ProcSelfStatus status = new ProcSelfStatus();
String s = o.firstMatch("Threads:\\s*(\\d+)", 1);
Objects.requireNonNull(s);
status.numLifeThreads = Integer.parseInt(s);
s = o.firstMatch("VmRSS:\\s*(\\d+) kB", 1);
Objects.requireNonNull(s);
status.rssMB = Long.parseLong(s) / 1024;
s = o.firstMatch("VmSwap:\\s*(\\d+) kB", 1);
Objects.requireNonNull(s);
status.swapMB = Long.parseLong(s) / 1024;
return status;
}
}

public static void main(String[] args) throws Exception {

HugePageConfiguration config = HugePageConfiguration.readFromOS();
// This issue is bound to THP=always
if (config.getThpMode() != HugePageConfiguration.THPMode.always) {
throw new SkippedException("Test only makes sense in THP \"always\" mode");
}

String[] defaultArgs = {
"-Xlog:pagesize",
"-Xmx" + heapSizeMB + "m", "-Xms" + heapSizeMB + "m", "-XX:+AlwaysPreTouch", // stabilize RSS
"-Xss" + threadStackSizeMB + "m",
"-XX:-CreateCoredumpOnCrash",
// This will delay the child threads before they create guard pages, thereby greatly increasing the
// chance of large VMA formation + hugepage coalescation; see JDK-8312182
"-XX:+DelayThreadStartALot"
};
ArrayList<String> finalargs = new ArrayList<>(Arrays.asList(defaultArgs));

switch (args[0]) {
case "PATCH-ENABLED": {
finalargs.add(TestMain.class.getName());
ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(finalargs);

OutputAnalyzer output = new OutputAnalyzer(pb.start());
output.shouldHaveExitValue(0);

// this line indicates the mitigation is active:
output.shouldContain("[pagesize] JVM will attempt to prevent THPs in thread stacks.");

ProcSelfStatus status = ProcSelfStatus.parse(output);
if (status.numLifeThreads < numThreads) {
throw new RuntimeException("Number of live threads lower than expected: " + status.numLifeThreads + ", expected " + numThreads);
} else {
System.out.println("Found " + status.numLifeThreads + " to be alive. Ok.");
}

long rssPlusSwapMB = status.swapMB + status.rssMB;

if (rssPlusSwapMB > acceptableRSSLimitMB) {
throw new RuntimeException("RSS+Swap larger than expected: " + rssPlusSwapMB + "m, expected at most " + acceptableRSSLimitMB + "m");
} else {
if (rssPlusSwapMB < heapSizeMB) { // we pretouch the java heap, so we expect to see at least that:
throw new RuntimeException("RSS+Swap suspiciously low: " + rssPlusSwapMB + "m, expected at least " + heapSizeMB + "m");
}
System.out.println("Okay: RSS+Swap=" + rssPlusSwapMB + ", within acceptable limit of " + acceptableRSSLimitMB);
}
}
break;

case "PATCH-DISABLED": {

// Only execute manually! this will allocate ~2gb of memory!

// explicitly disable the no-THP-workaround:
finalargs.add("-XX:+UnlockDiagnosticVMOptions");
finalargs.add("-XX:+DisableTHPStackMitigation");

finalargs.add(TestMain.class.getName());
ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(finalargs);
OutputAnalyzer output = new OutputAnalyzer(pb.start());

output.shouldHaveExitValue(0);

// We deliberately switched off mitigation, VM should tell us:
output.shouldContain("[pagesize] JVM will *not* prevent THPs in thread stacks. This may cause high RSS.");

// Parse output from self/status
ProcSelfStatus status = ProcSelfStatus.parse(output);
if (status.numLifeThreads < numThreads) {
throw new RuntimeException("Number of live threads lower than expected (" + status.numLifeThreads + ", expected " + numThreads +")");
} else {
System.out.println("Found " + status.numLifeThreads + " to be alive. Ok.");
}

long rssPlusSwapMB = status.swapMB + status.rssMB;

if (rssPlusSwapMB < acceptableRSSLimitMB) {
throw new RuntimeException("RSS+Swap lower than expected: " + rssPlusSwapMB + "m, expected more than " + acceptableRSSLimitMB + "m");
}
break;
}

default: throw new RuntimeException("Bad argument: " + args[0]);
}
}
}

1 comment on commit 4cca633

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.