Skip to content

Commit

Permalink
[threads] Use refcounts for coordinating finalization and detaching
Browse files Browse the repository at this point in the history
Reverts a29ad08 (#9914)

The basic problem we want to solve is the following:
  1. All access to InternalThread:state must be protected by the
     InternalThread:synch_cs mutex
  2. We must destroy the mutex when we are done with the thread.
  3. We don't know which happens later - detaching the machine thread or
     finalizing its InternalThread managed object.

The solution is to replace InternalThread:synch_cs by InternalThread:longlived
which is a refcounted struct that holds the synch_cs.  The refcount starts out
at 2 when the thread is attached to the runtime and when we create the managed
InternalThread object that represents it.
Both detaching and finalizing the managed object will decrement the refcount,
and whichever one happens last will be responsible for destroying the mutex.

This addresses #11956 which was a race
condition due to the previous attempt to fix this lifetime problem.  The
previous attempt incorrectly used CAS in mono_thread_detach_internal while
continuing to use locking of synch_cs elsewhere. In particular
mono_thread_suspend_all_other_threads could race with
mono_thread_detach_internal: it expects to take the thread lock and test
thread->state and use the thread->suspended event, while detaching deletes
thread->suspended without taking a lock.

As a result we had a concurrency bug: in suspend_all_other_threads it's
possible to see both the old (non-Stopped) value of thread->state and the
new (NULL) value of thread->suspended.  Which leads to crashes.

---

Background - why we don't know if detaching or finalization happens first.

1. InternalThread normally outlives the machine thread. This can happen because
when one thread starts another it can hold a reference to the fresh thread's
Thread object which holds a reference to the InternalThread. So after the
machine thread is done, the older thread can query the state of the younger
Thread object. This is the normal situation.

2. During shutdown we can have the opposite situation: the InternalThread
objects are finalized first (this happens during root domain finalization), but
the machine threads are still running, and they may still return to
start_wrapper_internal and call detach_internal. So in this case we have an
InternalThread whose finalizer ran first and detach will run second.
  • Loading branch information
lambdageek committed Jan 15, 2019
1 parent 084c70b commit 27a450b
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 50 deletions.
2 changes: 1 addition & 1 deletion mcs/class/corlib/System.Threading/Thread.cs
Expand Up @@ -71,7 +71,7 @@ sealed class InternalThread : CriticalFinalizerObject {
internal int _serialized_principal_version;
private IntPtr appdomain_refs;
private int interruption_requested;
private IntPtr synch_cs;
private IntPtr longlived;
internal bool threadpool_thread;
private bool thread_interrupt_requested;
/* These are used from managed code */
Expand Down
7 changes: 5 additions & 2 deletions mono/metadata/object-internals.h
Expand Up @@ -444,7 +444,7 @@ struct _MonoInternalThread {
gpointer unused3;
gunichar2 *name;
guint32 name_len;
guint32 state;
guint32 state; /* must be accessed while longlived->synch_cs is locked */
MonoException *abort_exc;
int abort_state_handle;
guint64 tid; /* This is accessed as a gsize in the code (so it can hold a 64bit pointer on systems that need it), but needs to reserve 64 bits of space on all machines as it corresponds to a field in managed code */
Expand All @@ -458,7 +458,10 @@ struct _MonoInternalThread {
gpointer appdomain_refs;
/* This is modified using atomic ops, so keep it a gint32 */
gint32 __interruption_requested;
MonoCoopMutex *synch_cs;
/* data that must live as long as this managed object is not finalized
* or as long as the underlying thread is attached, whichever is
* longer */
MonoLongLivedThreadData *longlived;
MonoBoolean threadpool_thread;
MonoBoolean thread_interrupt_requested;
int stack_size;
Expand Down
14 changes: 14 additions & 0 deletions mono/metadata/threads-types.h
Expand Up @@ -106,6 +106,20 @@ mono_thread_create_internal_handle (MonoDomain *domain, T func, gpointer arg, Mo
}
#endif

/* Data owned by a MonoInternalThread that must live until both the finalizer
* for MonoInternalThread has run, and the underlying machine thread has
* detached.
*
* Normally a thread is first detached and then the InternalThread object is
* finalized and collected. However during shutdown, when the root domain is
* finalized, all the InternalThread objects are finalized first and the
* machine threads are detached later.
*/
typedef struct {
MonoRefCount ref; /* starts at 2 (attach + managed object creation); decremented once by detach and once by the InternalThread finalizer — whichever runs last frees this struct */
MonoCoopMutex *synch_cs; /* guards InternalThread:state; destroyed only when ref reaches 0 */
} MonoLongLivedThreadData;

void mono_threads_install_cleanup (MonoThreadCleanupFunc func);

ICALL_EXPORT
Expand Down
90 changes: 43 additions & 47 deletions mono/metadata/threads.c
Expand Up @@ -152,9 +152,6 @@ static GHashTable *contexts = NULL;
/* Cleanup queue for contexts. */
static MonoReferenceQueue *context_queue;

/* Cleanup queue for threads. */
static MonoReferenceQueue *thread_queue;

/*
* Threads which are starting up and they are not in the 'threads' hash yet.
* When mono_thread_attach_internal is called for a thread, it will be removed from this hash table.
Expand Down Expand Up @@ -470,53 +467,56 @@ thread_get_tid (MonoInternalThread *thread)
}

static void
free_synch_cs (void *user_data)
free_synch_cs (MonoCoopMutex *synch_cs)
{
MonoCoopMutex *synch_cs = (MonoCoopMutex*)user_data;
g_assert (synch_cs);
mono_coop_mutex_destroy (synch_cs);
g_free (synch_cs);
}

static void
ensure_synch_cs_set (MonoInternalThread *thread)
free_longlived_thread_data (void *user_data)
{
MonoCoopMutex *synch_cs;
MonoLongLivedThreadData *lltd = (MonoLongLivedThreadData*)user_data;
free_synch_cs (lltd->synch_cs);

if (thread->synch_cs != NULL) {
return;
}
g_free (lltd);
}

static void
init_longlived_thread_data (MonoLongLivedThreadData *lltd)
{
mono_refcount_init (lltd, free_longlived_thread_data);
mono_refcount_inc (lltd);
/* Initial refcount is 2: decremented once by
* mono_thread_detach_internal and once by the MonoInternalThread
* finalizer - whichever one happens later will deallocate. */

synch_cs = g_new0 (MonoCoopMutex, 1);
mono_coop_mutex_init_recursive (synch_cs);
lltd->synch_cs = g_new0 (MonoCoopMutex, 1);
mono_coop_mutex_init_recursive (lltd->synch_cs);

if (mono_atomic_cas_ptr ((gpointer *)&thread->synch_cs,
synch_cs, NULL) != NULL) {
/* Another thread must have installed this CS */
mono_coop_mutex_destroy (synch_cs);
g_free (synch_cs);
} else {
// If we were the ones to initialize with this synch_cs variable, we
// should associate this one with our cleanup
mono_gc_reference_queue_add (thread_queue, &thread->obj, synch_cs);
}
mono_memory_barrier ();
}

/* Drop one reference to a thread's long-lived data.  Called once from
 * mono_thread_detach_internal and once from the InternalThread finalizer;
 * the later of the two caused the refcount to hit zero, which runs
 * free_longlived_thread_data. */
static void
dec_longlived_thread_data (MonoLongLivedThreadData *data)
{
	mono_refcount_dec (data);
}

static inline void
lock_thread (MonoInternalThread *thread)
{
if (!thread->synch_cs)
ensure_synch_cs_set (thread);

g_assert (thread->synch_cs);
g_assert (thread->longlived);
g_assert (thread->longlived->synch_cs);

mono_coop_mutex_lock (thread->synch_cs);
mono_coop_mutex_lock (thread->longlived->synch_cs);
}

static inline void
unlock_thread (MonoInternalThread *thread)
{
mono_coop_mutex_unlock (thread->synch_cs);
mono_coop_mutex_unlock (thread->longlived->synch_cs);
}

static void
Expand Down Expand Up @@ -671,7 +671,8 @@ create_internal_thread_object (void)
/* only possible failure mode is OOM, from which we don't expect to recover */
mono_error_assert_ok (error);

ensure_synch_cs_set (thread);
thread->longlived = g_new0 (MonoLongLivedThreadData, 1);
init_longlived_thread_data (thread->longlived);

thread->apartment_state = ThreadApartmentState_Unknown;
thread->managed_id = get_next_managed_thread_id ();
Expand Down Expand Up @@ -940,20 +941,12 @@ mono_thread_detach_internal (MonoInternalThread *thread)
thread->abort_exc = NULL;
thread->current_appcontext = NULL;

/*
* This should be alive until after the reference queue runs the
* post-free cleanup function
*/
while (TRUE) {
guint32 old_state = thread->state;
LOCK_THREAD (thread);

guint32 new_state = old_state;
new_state |= ThreadState_Stopped;
new_state &= ~ThreadState_Background;
thread->state |= ThreadState_Stopped;
thread->state &= ~ThreadState_Background;

if (mono_atomic_cas_i32 ((gint32 *)&thread->state, new_state, old_state) == old_state)
break;
}
UNLOCK_THREAD (thread);

/*
An interruption request has leaked to cleanup. Adjust the global counter.
Expand Down Expand Up @@ -1047,6 +1040,10 @@ mono_thread_detach_internal (MonoInternalThread *thread)

mono_thread_info_unset_internal_thread_gchandle (info);

/* Possibly free synch_cs, if the finalizer for InternalThread already
* ran also. */
dec_longlived_thread_data (thread->longlived);

MONO_PROFILER_RAISE (thread_exited, (thread->tid));

/* Don't need to close the handle to this thread, even though we took a
Expand Down Expand Up @@ -1664,9 +1661,9 @@ ves_icall_System_Threading_InternalThread_Thread_free_internal (MonoInternalThre
CloseHandle (this_obj->native_handle);
#endif

// Taken care of by reference queue, but we should
// zero it out
this_obj->synch_cs = NULL;
/* Possibly free synch_cs, if the thread already detached also. */
dec_longlived_thread_data (this_obj->longlived);


if (this_obj->name) {
void *name = this_obj->name;
Expand Down Expand Up @@ -3216,7 +3213,6 @@ void mono_thread_init (MonoThreadStartCB start_cb,
mono_thread_start_cb = start_cb;
mono_thread_attach_cb = attach_cb;

thread_queue = mono_gc_reference_queue_new (free_synch_cs);
}

static gpointer
Expand Down Expand Up @@ -5460,7 +5456,7 @@ async_suspend_critical (MonoThreadInfo *info, gpointer ud)
}
}

/* LOCKING: called with @thread synch_cs held, and releases it */
/* LOCKING: called with @thread longlived->synch_cs held, and releases it */
static void
async_suspend_internal (MonoInternalThread *thread, gboolean interrupt)
{
Expand All @@ -5483,7 +5479,7 @@ async_suspend_internal (MonoInternalThread *thread, gboolean interrupt)
UNLOCK_THREAD (thread);
}

/* LOCKING: called with @thread synch_cs held, and releases it */
/* LOCKING: called with @thread longlived->synch_cs held, and releases it */
static void
self_suspend_internal (void)
{
Expand Down

0 comments on commit 27a450b

Please sign in to comment.