8255984: Shenandoah: "adaptive" heuristic is prone to missing load spikes #1099

Add a "reactive" heuristic for triggering concurrent GC cycles
earthling-amzn Nov 5, 2020
Incorporate review feed back
earthling-amzn Nov 10, 2020
Merge 'reactive' heuristic into 'adaptive' heuristic
earthling-amzn Nov 10, 2020
Rename ShenandoahReactive flags to ShenandoahAdaptive flags.
earthling-amzn Nov 12, 2020
Remove const qualifier from should_start_gc
earthling-amzn Nov 12, 2020
Restore call to reset allocation counter at cycle start
earthling-amzn Nov 13, 2020
Make logging messages more consistent
earthling-amzn Nov 13, 2020
Rename variables to improve readability
earthling-amzn Nov 13, 2020
Reuse instantaneous_rate method instead of duplicating code
earthling-amzn Nov 13, 2020
Use os::elapsedTime to avoid type issues and to be consistent with ot…
earthling-amzn Nov 13, 2020
Inline calls to gc decision methods (vestige of an earlier design)
earthling-amzn Nov 13, 2020
Defend against underflow and division by zero
earthling-amzn Nov 16, 2020
Remove dependency from allocation rate to adaptive heuristic
earthling-amzn Nov 16, 2020
Fix wrong type for os::elapsedTime
earthling-amzn Nov 17, 2020
Merge branch 'master' into shenandoah-reactive-heuristic
earthling-amzn Nov 17, 2020
Avoid recomputing instantaneous allocation rate
earthling-amzn Nov 19, 2020
Remove unused member
earthling-amzn Nov 19, 2020
Sample allocation rate even if cumulative bytes allocated hasn't changed
earthling-amzn Nov 20, 2020
Do not re-sample allocation rate with higher frequency
earthling-amzn Nov 20, 2020
Factor rate check into is_spiking
earthling-amzn Nov 23, 2020
@@ -27,14 +27,36 @@
#include "gc/shenandoah/heuristics/shenandoahAdaptiveHeuristics.hpp"
#include "gc/shenandoah/shenandoahCollectionSet.hpp"
#include "gc/shenandoah/shenandoahFreeSet.hpp"
#include "gc/shenandoah/shenandoahHeap.inline.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.inline.hpp"
#include "logging/log.hpp"
#include "logging/logTag.hpp"
#include "utilities/quickSort.hpp"

// These constants are used to adjust the margin of error for the moving
// average of the allocation rate and cycle time. The units are standard
// deviations.
const double ShenandoahAdaptiveHeuristics::FULL_PENALTY_SD = 0.2;
const double ShenandoahAdaptiveHeuristics::DEGENERATE_PENALTY_SD = 0.1;

// These are used to decide if we want to make any adjustments at all
// at the end of a successful concurrent cycle.
const double ShenandoahAdaptiveHeuristics::LOWEST_EXPECTED_AVAILABLE_AT_END = -0.5;
const double ShenandoahAdaptiveHeuristics::HIGHEST_EXPECTED_AVAILABLE_AT_END = 0.5;

// These values are the confidence interval expressed as standard deviations.
// At the minimum confidence level, there is a 25% chance that the true value of
// the estimate (average cycle time or allocation rate) is not more than
// MINIMUM_CONFIDENCE standard deviations away from our estimate. Similarly, the
// MAXIMUM_CONFIDENCE interval here means there is a one in a thousand chance
// that the true value of our estimate is outside the interval. These are used
// as bounds on the adjustments applied at the outcome of a GC cycle.
const double ShenandoahAdaptiveHeuristics::MINIMUM_CONFIDENCE = 0.319; // 25%
const double ShenandoahAdaptiveHeuristics::MAXIMUM_CONFIDENCE = 3.291; // 99.9%

ShenandoahAdaptiveHeuristics::ShenandoahAdaptiveHeuristics() :
ShenandoahHeuristics() {}
_last_trigger(OTHER) { }

ShenandoahAdaptiveHeuristics::~ShenandoahAdaptiveHeuristics() {}

@@ -98,20 +120,94 @@ void ShenandoahAdaptiveHeuristics::choose_collection_set_from_regiondata(Shenand

void ShenandoahAdaptiveHeuristics::record_cycle_start() {

void ShenandoahAdaptiveHeuristics::record_success_concurrent() {

size_t available = ShenandoahHeap::heap()->free_set()->available();

double z_score = 0.0;
if ( > 0) {
z_score = (available - _available.avg()) /;

log_debug(gc, ergo)("Available: " SIZE_FORMAT " %sB, z-score=%.3f. Average available: %.1f %sB +/- %.1f %sB.",
byte_size_in_proper_unit(available), proper_unit_for_byte_size(available),
byte_size_in_proper_unit(_available.avg()), proper_unit_for_byte_size(_available.avg()),
byte_size_in_proper_unit(, proper_unit_for_byte_size(;

// In the case when a concurrent GC cycle completes successfully but with an
// unusually small amount of available memory we will adjust our trigger
// parameters so that they are more likely to initiate a new cycle.
// Conversely, when a GC cycle results in an above average amount of available
// memory, we will adjust the trigger parameters to be less likely to initiate
// a GC cycle.
// The z-score we've computed is in no way statistically related to the
// trigger parameters, but it has the nice property that worse z-scores for
// available memory indicate making larger adjustments to the trigger
// parameters. It also results in fewer adjustments as the application
// stabilizes.
// In order to avoid making endless and likely unnecessary adjustments to the
// trigger parameters, the change in available memory (with respect to the
// average) at the end of a cycle must be beyond these threshold values.
// The sign is flipped because a negative z-score indicates that the
// available memory at the end of the cycle is below average. Positive
// adjustments make the triggers more sensitive (i.e., more likely to fire).
// The z-score also gives us a measure of just how far below normal. This
// property allows us to adjust the trigger parameters proportionally.
// The `100` here is used to attenuate the size of our adjustments. This
// number was chosen empirically. It also means the adjustments at the end of
// a concurrent cycle are an order of magnitude smaller than the adjustments
// made for a degenerated or full GC cycle (which themselves were also
// chosen empirically).
adjust_last_trigger_parameters(z_score / -100);

void ShenandoahAdaptiveHeuristics::record_success_degenerated() {
// Adjust both triggers' parameters in the case of a degenerated GC because
// either of them should have triggered earlier to avoid this case.

bool ShenandoahAdaptiveHeuristics::should_start_gc() const {
void ShenandoahAdaptiveHeuristics::record_success_full() {
// Adjust both triggers' parameters in the case of a full GC because
// either of them should have triggered earlier to avoid this case.

// Clamps value to the closed range [min, max]: the value is first capped at
// max and the result is then raised to at least min (assuming MIN2/MAX2
// return the smaller/larger of their two arguments).
static double saturate(double value, double min, double max) {
return MAX2(MIN2(value, max), min);

bool ShenandoahAdaptiveHeuristics::should_start_gc() {
ShenandoahHeap* heap = ShenandoahHeap::heap();
size_t max_capacity = heap->max_capacity();
size_t capacity = heap->soft_max_capacity();
size_t available = heap->free_set()->available();
size_t allocated = heap->bytes_allocated_since_gc_start();

// Make sure the code below treats available without the soft tail.
size_t soft_tail = max_capacity - capacity;
available = (available > soft_tail) ? (available - soft_tail) : 0;

// Check if we are falling below the worst limit, time to trigger the GC, regardless of
// anything else.
// Track allocation rate even if we decide to start a cycle for other reasons.
double rate = _allocation_rate.sample(allocated);
_last_trigger = OTHER;

size_t min_threshold = capacity / 100 * ShenandoahMinFreeThreshold;
if (available < min_threshold) {
log_info(gc)("Trigger: Free (" SIZE_FORMAT "%s) is below minimum threshold (" SIZE_FORMAT "%s)",
@@ -120,7 +216,6 @@ bool ShenandoahAdaptiveHeuristics::should_start_gc() const {
return true;

// Check if we need to learn a bit about the application
const size_t max_learn = ShenandoahLearningSteps;
if (_gc_times_learned < max_learn) {
size_t init_threshold = capacity / 100 * ShenandoahInitFreeThreshold;
@@ -136,7 +231,6 @@ bool ShenandoahAdaptiveHeuristics::should_start_gc() const {
// Check if allocation headroom is still okay. This also factors in:
// 1. Some space to absorb allocation spikes
// 2. Accumulated penalties from Degenerated and Full GC

size_t allocation_headroom = available;

size_t spike_headroom = capacity / 100 * ShenandoahAllocSpikeFactor;
@@ -145,24 +239,127 @@ bool ShenandoahAdaptiveHeuristics::should_start_gc() const {
allocation_headroom -= MIN2(allocation_headroom, spike_headroom);
allocation_headroom -= MIN2(allocation_headroom, penalties);
// TODO: Allocation rate is way too averaged to be useful during state changes

double average_gc = _gc_time_history->avg();
double time_since_last = time_since_last_gc();
double allocation_rate = heap->bytes_allocated_since_gc_start() / time_since_last;
double avg_cycle_time = _gc_time_history->davg() + (_margin_of_error_sd * _gc_time_history->dsd());
double avg_alloc_rate = _allocation_rate.upper_bound(_margin_of_error_sd);
if (avg_cycle_time > allocation_headroom / avg_alloc_rate) {
log_info(gc)("Trigger: Average GC time (%.2f ms) is above the time for average allocation rate (%.0f %sB/s) to deplete free headroom (" SIZE_FORMAT "%s) (margin of error = %.2f)",
avg_cycle_time * 1000,
byte_size_in_proper_unit(avg_alloc_rate), proper_unit_for_byte_size(avg_alloc_rate),
byte_size_in_proper_unit(allocation_headroom), proper_unit_for_byte_size(allocation_headroom),

if (average_gc > allocation_headroom / allocation_rate) {
log_info(gc)("Trigger: Average GC time (%.2f ms) is above the time for allocation rate (%.0f %sB/s) to deplete free headroom (" SIZE_FORMAT "%s)",
average_gc * 1000,
byte_size_in_proper_unit(allocation_rate), proper_unit_for_byte_size(allocation_rate),
byte_size_in_proper_unit(allocation_headroom), proper_unit_for_byte_size(allocation_headroom));
log_info(gc, ergo)("Free headroom: " SIZE_FORMAT "%s (free) - " SIZE_FORMAT "%s (spike) - " SIZE_FORMAT "%s (penalties) = " SIZE_FORMAT "%s",
byte_size_in_proper_unit(available), proper_unit_for_byte_size(available),
byte_size_in_proper_unit(spike_headroom), proper_unit_for_byte_size(spike_headroom),
byte_size_in_proper_unit(penalties), proper_unit_for_byte_size(penalties),
byte_size_in_proper_unit(allocation_headroom), proper_unit_for_byte_size(allocation_headroom));
byte_size_in_proper_unit(available), proper_unit_for_byte_size(available),
byte_size_in_proper_unit(spike_headroom), proper_unit_for_byte_size(spike_headroom),
byte_size_in_proper_unit(penalties), proper_unit_for_byte_size(penalties),
byte_size_in_proper_unit(allocation_headroom), proper_unit_for_byte_size(allocation_headroom));

_last_trigger = RATE;
return true;

bool is_spiking = _allocation_rate.is_spiking(rate, _spike_threshold_sd);
if (is_spiking && avg_cycle_time > allocation_headroom / rate) {
log_info(gc)("Trigger: Average GC time (%.2f ms) is above the time for instantaneous allocation rate (%.0f %sB/s) to deplete free headroom (" SIZE_FORMAT "%s) (spike threshold = %.2f)",
avg_cycle_time * 1000,
byte_size_in_proper_unit(rate), proper_unit_for_byte_size(rate),
byte_size_in_proper_unit(allocation_headroom), proper_unit_for_byte_size(allocation_headroom),
_last_trigger = SPIKE;
return true;

return ShenandoahHeuristics::should_start_gc();

// Dispatches an adjustment of size `amount` according to the trigger recorded
// in _last_trigger.
// NOTE(review): the bodies of the RATE and SPIKE cases are not visible in this
// extract; presumably they call adjust_margin_of_error and
// adjust_spike_threshold respectively -- confirm against the full file.
void ShenandoahAdaptiveHeuristics::adjust_last_trigger_parameters(double amount) {
switch (_last_trigger) {
case RATE:
case SPIKE:
case OTHER:
// nothing to adjust here.

// Adds `amount` to the margin of error (expressed in standard deviations),
// keeping the result within [MINIMUM_CONFIDENCE, MAXIMUM_CONFIDENCE], and
// logs the new value at debug level. A negative `amount` lowers the margin.
void ShenandoahAdaptiveHeuristics::adjust_margin_of_error(double amount) {
_margin_of_error_sd = saturate(_margin_of_error_sd + amount, MINIMUM_CONFIDENCE, MAXIMUM_CONFIDENCE);
log_debug(gc, ergo)("Margin of error now %.2f", _margin_of_error_sd);

// Subtracts `amount` from the spike threshold (expressed in standard
// deviations), keeping the result within
// [MINIMUM_CONFIDENCE, MAXIMUM_CONFIDENCE], and logs the new value at debug
// level. Note the sign: a positive `amount` lowers the threshold, which is
// the opposite direction from adjust_margin_of_error.
void ShenandoahAdaptiveHeuristics::adjust_spike_threshold(double amount) {
_spike_threshold_sd = saturate(_spike_threshold_sd - amount, MINIMUM_CONFIDENCE, MAXIMUM_CONFIDENCE);
log_debug(gc, ergo)("Spike threshold now: %.2f", _spike_threshold_sd);

// Sets the minimum time between samples to the reciprocal of
// ShenandoahAdaptiveSampleFrequencyHz and constructs both sequences with
// ShenandoahAdaptiveSampleSizeSeconds * ShenandoahAdaptiveSampleFrequencyHz
// (truncated to int) as the first argument -- presumably the number of
// retained samples -- and ShenandoahAdaptiveDecayFactor as the second.
// NOTE(review): _last_sample_time and _last_sample_value do not appear in the
// visible initializer list -- confirm they are initialized in the full file.
ShenandoahAllocationRate::ShenandoahAllocationRate() :
_interval_sec(1.0 / ShenandoahAdaptiveSampleFrequencyHz),
_rate(int(ShenandoahAdaptiveSampleSizeSeconds * ShenandoahAdaptiveSampleFrequencyHz), ShenandoahAdaptiveDecayFactor),
_rate_avg(int(ShenandoahAdaptiveSampleSizeSeconds * ShenandoahAdaptiveSampleFrequencyHz), ShenandoahAdaptiveDecayFactor) {

// Computes the allocation rate for `allocated` (a running allocation count
// supplied by the caller) relative to the previously recorded sample.
// Returns 0.0 when no more than _interval_sec has elapsed since the last
// sample, or when `allocated` is below the previously recorded value;
// otherwise returns the instantaneous rate over the elapsed interval.
// NOTE(review): in the visible lines the computed rate is never added to
// _rate or _rate_avg, and the closing braces are missing, so it is unclear
// whether the baseline update below sits inside the interval check --
// confirm against the full file.
double ShenandoahAllocationRate::sample(size_t allocated) {
double now = os::elapsedTime();
double rate = 0.0;
// Rate-limit sampling to at most once per _interval_sec.
if (now - _last_sample_time > _interval_sec) {
// Skip the rate computation if the counter appears to have gone backwards.
if (allocated >= _last_sample_value) {
rate = instantaneous_rate(now, allocated);

// Record this observation as the baseline for the next sample.
_last_sample_time = now;
_last_sample_value = allocated;
return rate;

// Returns a padded estimate of the allocation rate: _rate.davg() plus `sds`
// times _rate_avg.dsd(). A larger `sds` yields a more pessimistic (higher)
// estimate.
double ShenandoahAllocationRate::upper_bound(double sds) const {
// Here we are using the standard deviation of the computed running
// average, rather than the standard deviation of the samples that went
// into the moving average. This is a much more stable value and is tied
// to the actual statistic in use (moving average over samples of averages).
return _rate.davg() + (sds * _rate_avg.dsd());

// Restarts the sampling baseline: the last sample time becomes the current
// elapsed time and the last sampled value becomes zero, so the next rate is
// measured from this point.
void ShenandoahAllocationRate::allocation_counter_reset() {
_last_sample_time = os::elapsedTime();
_last_sample_value = 0;

// Reports whether `rate` counts as an allocation spike: its z-score relative
// to the average of _rate must exceed `threshold`. A non-positive rate, or a
// non-positive deviation `sd`, never counts as a spike.
// NOTE(review): the initializer of `sd` is missing in this extract;
// presumably it is a standard deviation taken from _rate -- confirm against
// the full file.
bool ShenandoahAllocationRate::is_spiking(double rate, double threshold) const {
if (rate <= 0.0) {
return false;

double sd =;
// Guard against dividing by a zero (or negative) deviation.
if (sd > 0) {
// There is a small chance that this rate has already been sampled, but it
// seems not to matter in practice.
double z_score = (rate - _rate.avg()) / sd;
if (z_score > threshold) {
return true;
return false;

// Convenience overload: computes the instantaneous rate for `allocated`
// using the current elapsed time as the end of the measurement interval.
double ShenandoahAllocationRate::instantaneous_rate(size_t allocated) const {
return instantaneous_rate(os::elapsedTime(), allocated);

// Returns the allocation rate between the last recorded sample and the point
// (`time`, `allocated`): the growth in `allocated` divided by the elapsed
// time. Does not modify the recorded sample.
double ShenandoahAllocationRate::instantaneous_rate(double time, size_t allocated) const {
size_t last_value = _last_sample_value;
double last_time = _last_sample_time;
// Treat a counter that did not grow as zero growth; this avoids unsigned
// underflow in the subtraction.
size_t allocation_delta = (allocated > last_value) ? (allocated - last_value) : 0;
double time_delta_sec = time - last_time;
// Return 0 when time has not advanced; this avoids division by zero.
return (time_delta_sec > 0) ? (allocation_delta / time_delta_sec) : 0;
@@ -29,6 +29,28 @@
#include "gc/shenandoah/shenandoahPhaseTimings.hpp"
#include "utilities/numberSeq.hpp"

// Tracks the allocation rate from periodic samples of a running allocation
// count and exposes an average-based upper bound and a spike test.
// NOTE(review): access specifiers and the closing of the class are not
// visible in this extract.
class ShenandoahAllocationRate : public CHeapObj<mtGC> {
explicit ShenandoahAllocationRate();
// Restarts the sampling baseline at the current time with a zero count.
void allocation_counter_reset();

// Takes a sample (at most once per _interval_sec) and returns the computed
// rate, or 0.0 if no rate was computed.
double sample(size_t allocated);

// Rate since the last recorded sample, measured up to the current time.
double instantaneous_rate(size_t allocated) const;
// Average rate padded by `sds` standard deviations.
double upper_bound(double sds) const;
// True if `rate` is more than `threshold` standard deviations above average.
bool is_spiking(double rate, double threshold) const;


// Rate since the last recorded sample, measured up to the given `time`.
double instantaneous_rate(double time, size_t allocated) const;

// Time (from os::elapsedTime) at which the last sample was recorded.
double _last_sample_time;
// Allocation count recorded with the last sample.
size_t _last_sample_value;
// Minimum time, in seconds, between two samples.
double _interval_sec;
// Sequence whose average is the baseline for is_spiking and upper_bound.
TruncatedSeq _rate;
// Sequence whose deviation is used to pad the estimate in upper_bound.
TruncatedSeq _rate_avg;

class ShenandoahAdaptiveHeuristics : public ShenandoahHeuristics {
@@ -40,12 +62,70 @@ class ShenandoahAdaptiveHeuristics : public ShenandoahHeuristics {
size_t actual_free);

void record_cycle_start();
void record_success_concurrent();
void record_success_degenerated();
void record_success_full();

virtual bool should_start_gc() const;
virtual bool should_start_gc();

virtual const char* name() { return "Adaptive"; }
virtual bool is_diagnostic() { return false; }
virtual bool is_experimental() { return false; }

// These are used to adjust the margin of error and the spike threshold
// in response to GC cycle outcomes. These values are shared, but the
// margin of error and spike threshold trend in opposite directions.
const static double FULL_PENALTY_SD;
const static double DEGENERATE_PENALTY_SD;

const static double MINIMUM_CONFIDENCE;
const static double MAXIMUM_CONFIDENCE;


friend class ShenandoahAllocationRate;
// Used to record the last trigger that signaled to start a GC.
// This itself is used to decide whether or not to adjust the margin of
// error for the average cycle time and allocation rate or the allocation
// spike detection threshold.
enum Trigger {

void adjust_last_trigger_parameters(double amount);
void adjust_margin_of_error(double amount);
void adjust_spike_threshold(double amount);

ShenandoahAllocationRate _allocation_rate;

// The margin of error expressed in standard deviations to add to our
// average cycle time and allocation rate. As this value increases we
// tend to over estimate the rate at which mutators will deplete the
// heap. In other words, erring on the side of caution will trigger more
// concurrent GCs.
double _margin_of_error_sd;

// The allocation spike threshold is expressed in standard deviations.
// If the most recent sample of the allocation rate is more than this many
// standard deviations above the average rate, the sample is treated as a
// spike and a GC cycle may be started. As this value
// decreases the sensitivity to allocation spikes increases. In other
// words, lowering the spike threshold will tend to increase the number
// of concurrent GCs.
double _spike_threshold_sd;

// Remember which trigger is responsible for the last GC cycle. When the
// outcome of the cycle is evaluated we will adjust the parameters for the
// corresponding triggers. Note that successful outcomes will raise
// the spike threshold and lower the margin of error.
Trigger _last_trigger;

// Keep track of the available memory at the end of a GC cycle. This
// establishes what is 'normal' for the application and is used as a
// source of feedback to adjust trigger parameters.
TruncatedSeq _available;