From 143a93f3793de7b8375da144bc9d584c3e1f0586 Mon Sep 17 00:00:00 2001
From: Nathan Hjelm
Date: Wed, 22 Jun 2016 09:47:36 -0600
Subject: [PATCH 1/2] opal/sync: remove usage of OPAL_ENABLE_MULTI_THREADS

The OPAL_ENABLE_MULTI_THREADS macro is always defined as 1. This was
causing us to always use the multi-thread path for synchronization
objects. The code has been updated to use the opal_using_threads()
function. When MPI_THREAD_MULTIPLE support is disabled at build time
(2.x only) this function is a macro evaluating to false so the
compiler will optimize out the MT-path in this case.

The OPAL_ATOMIC_ADD_32 macro has been removed and replaced by the
existing OPAL_THREAD_ADD32 macro.

Signed-off-by: Nathan Hjelm
---
 ompi/request/request.h   |  4 +--
 opal/threads/mutex.h     | 14 +++++++++
 opal/threads/wait_sync.c | 11 ++-----
 opal/threads/wait_sync.h | 65 +++++++++++++++++++---------------------
 4 files changed, 49 insertions(+), 45 deletions(-)

diff --git a/ompi/request/request.h b/ompi/request/request.h
index cb67f44c66b..9d53b6156a3 100644
--- a/ompi/request/request.h
+++ b/ompi/request/request.h
@@ -416,8 +416,8 @@ static inline int ompi_request_complete(ompi_request_t* request, bool with_signa
 
     if( OPAL_LIKELY(with_signal) ) {
         if(!OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, REQUEST_PENDING, REQUEST_COMPLETED)) {
-            ompi_wait_sync_t *tmp_sync = (ompi_wait_sync_t *) OPAL_ATOMIC_SWP_PTR(&request->req_complete,
-                                                                                  REQUEST_COMPLETED);
+            ompi_wait_sync_t *tmp_sync = (ompi_wait_sync_t *) OPAL_ATOMIC_SWAP_PTR(&request->req_complete,
+                                                                                   REQUEST_COMPLETED);
             /* In the case where another thread concurrently changed the request to REQUEST_PENDING */
             if( REQUEST_PENDING != tmp_sync )
                 wait_sync_update(tmp_sync, 1, request->req_status.MPI_ERROR);
diff --git a/opal/threads/mutex.h b/opal/threads/mutex.h
index 94c123a6754..da2733a7f1d 100644
--- a/opal/threads/mutex.h
+++ b/opal/threads/mutex.h
@@ -324,6 +324,20 @@ OPAL_THREAD_ADD_SIZE_T(volatile size_t *addr, int delta)
 
 #endif
 
+static inline void *opal_thread_swap_ptr (volatile void *ptr, void *newvalue)
+{
+    if (opal_using_threads ()) {
+        return opal_atomic_swap_ptr (ptr, newvalue);
+    }
+
+    void *old = ((void **) ptr)[0];
+    ((void **) ptr)[0] = newvalue;
+
+    return old;
+}
+
+#define OPAL_ATOMIC_SWAP_PTR(x, y) opal_thread_swap_ptr (x, y)
+
 END_C_DECLS
 
 #endif /* OPAL_MUTEX_H */
diff --git a/opal/threads/wait_sync.c b/opal/threads/wait_sync.c
index 099eb82391b..c9b91372442 100644
--- a/opal/threads/wait_sync.c
+++ b/opal/threads/wait_sync.c
@@ -3,6 +3,8 @@
  * Copyright (c) 2014-2016 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -21,15 +23,6 @@ static ompi_wait_sync_t* wait_sync_list = NULL;
         pthread_mutex_unlock( &(who)->lock);           \
     } while(0)
 
-
-int sync_wait_st(ompi_wait_sync_t *sync)
-{
-    while(sync->count > 0) {
-        opal_progress();
-    }
-    return (0 == sync->status) ? OPAL_SUCCESS : OPAL_ERROR;
-}
-
 int sync_wait_mt(ompi_wait_sync_t *sync)
 {
     if(sync->count <= 0)
diff --git a/opal/threads/wait_sync.h b/opal/threads/wait_sync.h
index da77003a1fc..3a4b9ef20f9 100644
--- a/opal/threads/wait_sync.h
+++ b/opal/threads/wait_sync.h
@@ -3,6 +3,8 @@
  * Copyright (c) 2014-2016 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -27,50 +29,44 @@ typedef struct ompi_wait_sync_t {
 #define REQUEST_PENDING   (void*)0L
 #define REQUEST_COMPLETED (void*)1L
 
-#if OPAL_ENABLE_MULTI_THREADS
-
-#define OPAL_ATOMIC_ADD_32(a,b)  opal_atomic_add_32(a,b)
-#define OPAL_ATOMIC_SWP_PTR(a,b) opal_atomic_swap_ptr(a,b)
-#define SYNC_WAIT(sync)          sync_wait_mt(sync)
-#define PTHREAD_COND_INIT(a,b)   pthread_cond_init(a,b)
-#define PTHREAD_MUTEX_INIT(a,b)  pthread_mutex_init(a,b)
+#define SYNC_WAIT(sync)          (opal_using_threads() ? sync_wait_mt (sync) : sync_wait_st (sync))
+#define PTHREAD_COND_INIT(a,b)   (opal_using_threads() ? pthread_cond_init (a,b) : 0)
+#define PTHREAD_MUTEX_INIT(a,b)  (opal_using_threads() ? pthread_mutex_init (a,b) : 0)
 
 #define WAIT_SYNC_RELEASE(sync)                       \
-    do {                                              \
+    if (opal_using_threads()) {                       \
         pthread_cond_destroy(&(sync)->condition);     \
         pthread_mutex_destroy(&(sync)->lock);         \
-    } while(0)
+    }
 
 #define WAIT_SYNC_SIGNAL(sync)                        \
-    do {                                              \
+    if (opal_using_threads()) {                       \
         pthread_mutex_lock(&(sync->lock));            \
         pthread_cond_signal(&sync->condition);        \
         pthread_mutex_unlock(&(sync->lock));          \
-    } while(0)
-
-#else
+    }
 
-#define OPAL_ATOMIC_ADD_32(a,b)  (*(a) += (b))
-#define OPAL_ATOMIC_SWP_PTR(a,b) *(a) = (b)
-#define PTHREAD_COND_INIT(a,b)
-#define PTHREAD_MUTEX_INIT(a,b)
-#define SYNC_WAIT(sync)          sync_wait_st(sync)
-#define WAIT_SYNC_RELEASE(sync)
-#define WAIT_SYNC_SIGNAL(sync)
+OPAL_DECLSPEC int sync_wait_mt(ompi_wait_sync_t *sync);
+static inline int sync_wait_st (ompi_wait_sync_t *sync)
+{
+    while (sync->count > 0) {
+        opal_progress();
+    }
 
-#endif /* OPAL_ENABLE_MULTI_THREADS */
+    return sync->status;
+}
 
-OPAL_DECLSPEC int sync_wait_mt(ompi_wait_sync_t *sync);
-OPAL_DECLSPEC int sync_wait_st(ompi_wait_sync_t *sync);
 
-#define WAIT_SYNC_INIT(sync,c)                                  \
-    do {                                                        \
-        (sync)->count = c;                                      \
-        (sync)->next = NULL;                                    \
-        (sync)->prev = NULL;                                    \
-        (sync)->status = 0;                                     \
-        PTHREAD_COND_INIT(&(sync)->condition, NULL);            \
-        PTHREAD_MUTEX_INIT(&(sync)->lock, NULL);                \
+#define WAIT_SYNC_INIT(sync,c)                                  \
+    do {                                                        \
+        (sync)->count = c;                                      \
+        (sync)->next = NULL;                                    \
+        (sync)->prev = NULL;                                    \
+        (sync)->status = 0;                                     \
+        if (opal_using_threads()) {                             \
+            PTHREAD_COND_INIT(&(sync)->condition, NULL);        \
+            PTHREAD_MUTEX_INIT(&(sync)->lock, NULL);            \
+        }                                                       \
     } while(0)
 
 /**
@@ -82,12 +78,13 @@ OPAL_DECLSPEC int sync_wait_st(ompi_wait_sync_t *sync);
 static inline void wait_sync_update(ompi_wait_sync_t *sync, int updates, int status)
 {
     if( OPAL_LIKELY(OPAL_SUCCESS == status) ) {
-        if( 0 != (OPAL_ATOMIC_ADD_32(&sync->count, -updates)) ) {
+        if( 0 != (OPAL_THREAD_ADD32(&sync->count, -updates)) ) {
             return;
         }
     } else {
-        OPAL_ATOMIC_CMPSET_32(&(sync->count), 0, 0);
-        sync->status = -1;
+        /* this is an error path so just use the atomic */
+        opal_atomic_swap_32 (&sync->count, 0);
+        sync->status = OPAL_ERROR;
     }
     WAIT_SYNC_SIGNAL(sync);
 }

From e4f920f6f96a02fd88811aafb2aa41aaad2ad83e Mon Sep 17 00:00:00 2001
From: Nathan Hjelm
Date: Wed, 22 Jun 2016 09:50:46 -0600
Subject: [PATCH 2/2] opal/progress: improve performance when there are no LP callbacks

This commit adds another check to the low-priority callback conditional
that short-circuits the atomic-add if there are no low-priority
callbacks. This should improve performance in the common case.

Signed-off-by: Nathan Hjelm
---
 opal/runtime/opal_progress.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opal/runtime/opal_progress.c b/opal/runtime/opal_progress.c
index 896b2d19a9e..2af67c8ad8f 100644
--- a/opal/runtime/opal_progress.c
+++ b/opal/runtime/opal_progress.c
@@ -221,7 +221,7 @@ opal_progress(void)
         events += (callbacks[i])();
     }
 
-    if ((OPAL_THREAD_ADD32((volatile int32_t *) &num_calls, 1) & 0x7) == 0) {
+    if (callbacks_lp_len > 0 && (OPAL_THREAD_ADD32((volatile int32_t *) &num_calls, 1) & 0x7) == 0) {
         /* run low priority callbacks once every 8 calls to opal_progress() */
         for (i = 0 ; i < callbacks_lp_len ; ++i) {
            events += (callbacks_lp[i])();
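
A standalone sketch (not part of either patch above) of the pattern both
commits rely on: gate the atomic on a runtime "are we threaded?" check, and
short-circuit the counter update entirely when no low-priority callbacks are
registered. All names here (using_threads, thread_add32, num_lp_callbacks,
progress_once) are hypothetical stand-ins for illustration, not the real OPAL
API, and the behavior is deliberately simplified.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool using_threads = false;   /* stands in for opal_using_threads() */

/* Stands in for OPAL_THREAD_ADD32: pay for the atomic only when threads
 * are actually in use, otherwise do a plain read-modify-write. */
static int32_t thread_add32(_Atomic int32_t *addr, int32_t delta)
{
    if (using_threads) {
        return atomic_fetch_add(addr, delta) + delta;   /* new value */
    }
    int32_t newval = atomic_load_explicit(addr, memory_order_relaxed) + delta;
    atomic_store_explicit(addr, newval, memory_order_relaxed);
    return newval;
}

static int num_lp_callbacks = 0;     /* stands in for callbacks_lp_len */
static _Atomic int32_t num_calls = 0;

/* Mirrors the opal_progress() change: when there are no low-priority
 * callbacks, skip the counter update (and any atomic) altogether. */
static void progress_once(void)
{
    if (num_lp_callbacks > 0 && (thread_add32(&num_calls, 1) & 0x7) == 0) {
        puts("running low-priority callbacks (every 8th call)");
    }
}

int main(void)
{
    for (int i = 0; i < 16; ++i) {
        progress_once();             /* no LP callbacks: nothing counted */
    }
    using_threads = true;            /* now take the atomic path */
    num_lp_callbacks = 1;
    for (int i = 0; i < 16; ++i) {
        progress_once();             /* prints on the 8th and 16th call */
    }
    return 0;
}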