-
Notifications
You must be signed in to change notification settings - Fork 6.3k
8373819: Genshen: Control thread can miss allocation failure notification (redux) #28932
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
224fb5a
0037e21
395d6e0
53dcc61
f64fedf
31df868
bd42dc6
550228b
cfd167f
8b3acc8
e416d12
15f60d4
8f4f55d
2e57f0a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -61,7 +61,12 @@ ShenandoahGenerationalControlThread::ShenandoahGenerationalControlThread() : | |
|
|
||
| void ShenandoahGenerationalControlThread::run_service() { | ||
|
|
||
| // This is the only instance of request. It is important that request.generation | ||
| // does not change between a concurrent cycle failure and the start of a degenerated | ||
| // cycle. We initialize it with the young generation to handle the pathological case | ||
| // where the very first cycle is degenerated (some tests exercise this path). | ||
| ShenandoahGCRequest request; | ||
| request.generation = _heap->young_generation(); | ||
| while (!should_terminate()) { | ||
|
|
||
| // Figure out if we have pending requests. | ||
|
|
@@ -77,12 +82,10 @@ void ShenandoahGenerationalControlThread::run_service() { | |
|
|
||
| // If the cycle was cancelled, continue the next iteration to deal with it. Otherwise, | ||
| // if there was no other cycle requested, cleanup and wait for the next request. | ||
| if (!_heap->cancelled_gc()) { | ||
| MonitorLocker ml(&_control_lock, Mutex::_no_safepoint_check_flag); | ||
| if (_requested_gc_cause == GCCause::_no_gc) { | ||
| set_gc_mode(ml, none); | ||
| ml.wait(); | ||
| } | ||
| MonitorLocker ml(&_control_lock, Mutex::_no_safepoint_check_flag); | ||
| if (_requested_gc_cause == GCCause::_no_gc) { | ||
| set_gc_mode(ml, none); | ||
| ml.wait(); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -96,38 +99,47 @@ void ShenandoahGenerationalControlThread::stop_service() { | |
| log_debug(gc, thread)("Stopping control thread"); | ||
| MonitorLocker ml(&_control_lock, Mutex::_no_safepoint_check_flag); | ||
| _heap->cancel_gc(GCCause::_shenandoah_stop_vm); | ||
| _requested_gc_cause = GCCause::_shenandoah_stop_vm; | ||
| notify_cancellation(ml, GCCause::_shenandoah_stop_vm); | ||
| notify_control_thread(ml, GCCause::_shenandoah_stop_vm); | ||
| // We can't wait here because it may interfere with the active cycle's ability | ||
| // to reach a safepoint (this runs on a java thread). | ||
| } | ||
|
|
||
| void ShenandoahGenerationalControlThread::check_for_request(ShenandoahGCRequest& request) { | ||
| // Hold the lock while we read request cause and generation | ||
| MonitorLocker ml(&_control_lock, Mutex::_no_safepoint_check_flag); | ||
| if (_heap->cancelled_gc()) { | ||
| // The previous request was cancelled. Either it was cancelled for an allocation | ||
| // failure (degenerated cycle), or old marking was cancelled to run a young collection. | ||
| // In either case, the correct generation for the next cycle can be determined by | ||
| // the cancellation cause. | ||
| request.cause = _heap->clear_cancellation(GCCause::_shenandoah_concurrent_gc); | ||
| if (request.cause == GCCause::_shenandoah_concurrent_gc) { | ||
|
|
||
| log_debug(gc, thread)("cancelled cause: %s, requested cause: %s", | ||
| GCCause::to_string(_heap->cancelled_cause()), GCCause::to_string(_requested_gc_cause)); | ||
|
|
||
| request.cause = _requested_gc_cause; | ||
| if (ShenandoahCollectorPolicy::is_allocation_failure(request.cause)) { | ||
| if (_degen_point == ShenandoahGC::_degenerated_unset) { | ||
| request.generation = _heap->young_generation(); | ||
| _degen_point = ShenandoahGC::_degenerated_outside_cycle; | ||
| } else { | ||
| assert(request.generation != nullptr, "Must know which generation to use for degenerated cycle"); | ||
| } | ||
| } else { | ||
| request.cause = _requested_gc_cause; | ||
| if (request.cause == GCCause::_shenandoah_concurrent_gc) { | ||
| // This is a regulator request. It is also possible that the regulator "canceled" an old mark, | ||
| // so we can clear that here. This clear operation will only clear the cancellation if it is | ||
| // a regulator request. | ||
| _heap->clear_cancellation(GCCause::_shenandoah_concurrent_gc); | ||
| } | ||
| request.generation = _requested_generation; | ||
|
|
||
| // Only clear these if we made a request from them. In the case of a cancelled gc, | ||
| // we do not want to inadvertently lose this pending request. | ||
| _requested_gc_cause = GCCause::_no_gc; | ||
| _requested_generation = nullptr; | ||
| } | ||
|
|
||
| log_debug(gc, thread)("request.cause: %s, request.generation: %s", | ||
| GCCause::to_string(request.cause), request.generation == nullptr ? "None" : request.generation->name()); | ||
|
|
||
| _requested_gc_cause = GCCause::_no_gc; | ||
| _requested_generation = nullptr; | ||
|
|
||
| if (request.cause == GCCause::_no_gc || request.cause == GCCause::_shenandoah_stop_vm) { | ||
| return; | ||
| } | ||
|
|
||
| assert(request.generation != nullptr, "request.generation cannot be null, cause is: %s", GCCause::to_string(request.cause)); | ||
| GCMode mode; | ||
| if (ShenandoahCollectorPolicy::is_allocation_failure(request.cause)) { | ||
| mode = prepare_for_allocation_failure_gc(request); | ||
|
|
@@ -140,11 +152,9 @@ void ShenandoahGenerationalControlThread::check_for_request(ShenandoahGCRequest& | |
| } | ||
|
|
||
| ShenandoahGenerationalControlThread::GCMode ShenandoahGenerationalControlThread::prepare_for_allocation_failure_gc(ShenandoahGCRequest &request) { | ||
|
|
||
| if (_degen_point == ShenandoahGC::_degenerated_unset) { | ||
| _degen_point = ShenandoahGC::_degenerated_outside_cycle; | ||
| request.generation = _heap->young_generation(); | ||
| } else if (request.generation->is_old()) { | ||
| // Important: not all paths update the request.generation. This is intentional. | ||
| // A degenerated cycle must use the same generation carried over from the previous request. | ||
| if (request.generation->is_old()) { | ||
| // This means we degenerated during the young bootstrap for the old generation | ||
| // cycle. The following degenerated cycle should therefore also be young. | ||
| request.generation = _heap->young_generation(); | ||
|
|
@@ -588,6 +598,8 @@ bool ShenandoahGenerationalControlThread::check_cancellation_or_degen(Shenandoah | |
| if (ShenandoahCollectorPolicy::is_allocation_failure(_heap->cancelled_cause())) { | ||
| assert(_degen_point == ShenandoahGC::_degenerated_unset, | ||
| "Should not be set yet: %s", ShenandoahGC::degen_point_to_string(_degen_point)); | ||
| MonitorLocker ml(&_control_lock, Mutex::_no_safepoint_check_flag); | ||
| _requested_gc_cause = _heap->cancelled_cause(); | ||
| _degen_point = point; | ||
| log_debug(gc, thread)("Cancellation detected:, reason: %s, degen point: %s", | ||
| GCCause::to_string(_heap->cancelled_cause()), | ||
|
|
@@ -633,9 +645,7 @@ void ShenandoahGenerationalControlThread::service_stw_degenerated_cycle(const Sh | |
|
|
||
| void ShenandoahGenerationalControlThread::request_gc(GCCause::Cause cause) { | ||
| if (ShenandoahCollectorPolicy::is_allocation_failure(cause)) { | ||
| // GC should already be cancelled. Here we are just notifying the control thread to | ||
| // wake up and handle the cancellation request, so we don't need to set _requested_gc_cause. | ||
| notify_cancellation(cause); | ||
| notify_control_thread(cause); | ||
| } else if (ShenandoahCollectorPolicy::should_handle_requested_gc(cause)) { | ||
| handle_requested_gc(cause); | ||
| } | ||
|
|
@@ -653,15 +663,16 @@ bool ShenandoahGenerationalControlThread::request_concurrent_gc(ShenandoahGenera | |
| MonitorLocker ml(&_control_lock, Mutex::_no_safepoint_check_flag); | ||
| if (gc_mode() == servicing_old) { | ||
| if (!preempt_old_marking(generation)) { | ||
| log_debug(gc, thread)("Cannot start young, old collection is not preemptible"); | ||
| // Global should be able to cause old collection to be abandoned | ||
| log_debug(gc, thread)("Cannot start %s, old collection is not preemptible", generation->name()); | ||
| return false; | ||
| } | ||
|
|
||
| // Cancel the old GC and wait for the control thread to start servicing the new request. | ||
| log_info(gc)("Preempting old generation mark to allow %s GC", generation->name()); | ||
| while (gc_mode() == servicing_old) { | ||
| ShenandoahHeap::heap()->cancel_gc(GCCause::_shenandoah_concurrent_gc); | ||
| notify_cancellation(ml, GCCause::_shenandoah_concurrent_gc); | ||
| notify_control_thread(ml, GCCause::_shenandoah_concurrent_gc, generation); | ||
| ml.wait(); | ||
| } | ||
| return true; | ||
|
|
@@ -695,21 +706,34 @@ void ShenandoahGenerationalControlThread::notify_control_thread(GCCause::Cause c | |
|
|
||
| void ShenandoahGenerationalControlThread::notify_control_thread(MonitorLocker& ml, GCCause::Cause cause, ShenandoahGeneration* generation) { | ||
| assert(_control_lock.is_locked(), "Request lock must be held here"); | ||
| log_debug(gc, thread)("Notify control (%s): %s, %s", gc_mode_name(gc_mode()), GCCause::to_string(cause), generation->name()); | ||
| _requested_gc_cause = cause; | ||
| _requested_generation = generation; | ||
| ml.notify(); | ||
| if (ShenandoahCollectorPolicy::is_allocation_failure(_requested_gc_cause)) { | ||
| // We have already observed a request to handle an allocation failure. We cannot allow | ||
| // another request (System.gc or regulator) to subvert the degenerated cycle. | ||
| log_debug(gc, thread)("Not overwriting gc cause %s with %s", GCCause::to_string(_requested_gc_cause), GCCause::to_string(cause)); | ||
| } else { | ||
| log_debug(gc, thread)("Notify control (%s): %s, %s", gc_mode_name(gc_mode()), GCCause::to_string(cause), generation->name()); | ||
| _requested_gc_cause = cause; | ||
| _requested_generation = generation; | ||
| ml.notify(); | ||
| } | ||
| } | ||
|
|
||
| void ShenandoahGenerationalControlThread::notify_cancellation(GCCause::Cause cause) { | ||
| void ShenandoahGenerationalControlThread::notify_control_thread(GCCause::Cause cause) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Apropos my comment above, this code has a bad smell that some versions of the method expect the lock to be held, and other methods acquire the lock. It makes reasoning about the code at an abstract level very error-prone, and potentially difficult to maintain correctly over time.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The overload that does the actual work always requires the lock to be held. The other overload is a convenience method that takes the lock and provides the locker to the actual method that does the work. |
||
| MonitorLocker ml(&_control_lock, Mutex::_no_safepoint_check_flag); | ||
| notify_cancellation(ml, cause); | ||
| notify_control_thread(ml, cause); | ||
| } | ||
|
|
||
| void ShenandoahGenerationalControlThread::notify_cancellation(MonitorLocker& ml, GCCause::Cause cause) { | ||
| assert(_heap->cancelled_gc(), "GC should already be cancelled"); | ||
| log_debug(gc,thread)("Notify control (%s): %s", gc_mode_name(gc_mode()), GCCause::to_string(cause)); | ||
| ml.notify(); | ||
| void ShenandoahGenerationalControlThread::notify_control_thread(MonitorLocker& ml, GCCause::Cause cause) { | ||
| assert(_control_lock.is_locked(), "Request lock must be held here"); | ||
| if (ShenandoahCollectorPolicy::is_allocation_failure(_requested_gc_cause)) { | ||
| // We have already observed a request to handle an allocation failure. We cannot allow | ||
| // another request (System.gc or regulator) to subvert the degenerated cycle. | ||
| log_debug(gc, thread)("Not overwriting gc cause %s with %s", GCCause::to_string(_requested_gc_cause), GCCause::to_string(cause)); | ||
| } else { | ||
| log_debug(gc, thread)("Notify control (%s): %s", gc_mode_name(gc_mode()), GCCause::to_string(cause)); | ||
| _requested_gc_cause = cause; | ||
| ml.notify(); | ||
| } | ||
| } | ||
|
|
||
| bool ShenandoahGenerationalControlThread::preempt_old_marking(ShenandoahGeneration* generation) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is MonitorLocker, and what is
_control_lock? Why are we checking_control_lockhere? If ml is being passed here for the purposes of notification on it, it must be the case that it's locked and the assert at line 708 is a leakage of abstraction? I see that your problem here is that along one path you do nothing and and don't post a notification and along another you do some work and post a notification. It sounds like what you want instead is an API of the following shape:and callers might do:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In addition, all of the various flavours of
notify_control_thread()with optional parameters should ideally call down into the version that has all of the parameters specified. In the cases where these parameters aren't specified, the version of the method fills default parameters.The fully-parameterized version of the method then contains and consolidates the entire logic and any invariant/assertion checking of the various parameters that make sense, and executes the relevant logic to return a boolean result to allow the caller to notify and/or wait on the monitor.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
From the declaration of
_control_lock:There are a few different paths into
notify_control_thread. Rather than expose the locking protocol to callers and duplicating the logic to notify or not, we have an entry point that acquires the lock for the caller before calling down into the method that does the actual work. On some paths, the lock is already held for other reasons, so the overload is provided for these cases (the lock is not reentrant).This is what is happening, except in the case of allocation failures where the caller does not "know" the generation being collected (assuming a collection is even running). The implementation here intentionally hides as much information as possible. Some callers do not need to provide the generation. In some cases, only the control thread itself can provide the correct generation. We could expose this piece of data to other threads so that could always include a generation in their request, but the control thread would still end up ignoring it. This would also be confusing on a first read of the code.