From 143a93f3793de7b8375da144bc9d584c3e1f0586 Mon Sep 17 00:00:00 2001
From: Nathan Hjelm
Date: Wed, 22 Jun 2016 09:47:36 -0600
Subject: [PATCH 1/2] opal/sync: remove usage of OPAL_ENABLE_MULTI_THREADS

The OPAL_ENABLE_MULTI_THREADS macro is always defined as 1. This was
causing us to always use the multi-thread path for synchronization
objects. The code has been updated to use the opal_using_threads()
function. When MPI_THREAD_MULTIPLE support is disabled at build time
(2.x only) this function is a macro evaluating to false so the
compiler will optimize out the MT-path in this case.

The OPAL_ATOMIC_ADD_32 macro has been removed and replaced by the
existing OPAL_THREAD_ADD32 macro.

Signed-off-by: Nathan Hjelm
---
 ompi/request/request.h   |  4 +--
 opal/threads/mutex.h     | 14 +++++++++
 opal/threads/wait_sync.c | 11 ++-----
 opal/threads/wait_sync.h | 65 +++++++++++++++++++---------------------
 4 files changed, 49 insertions(+), 45 deletions(-)

diff --git a/ompi/request/request.h b/ompi/request/request.h
index cb67f44c66b..9d53b6156a3 100644
--- a/ompi/request/request.h
+++ b/ompi/request/request.h
@@ -416,8 +416,8 @@ static inline int ompi_request_complete(ompi_request_t* request, bool with_signa
 
     if( OPAL_LIKELY(with_signal) ) {
         if(!OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, REQUEST_PENDING, REQUEST_COMPLETED)) {
-            ompi_wait_sync_t *tmp_sync = (ompi_wait_sync_t *) OPAL_ATOMIC_SWP_PTR(&request->req_complete,
-                                                                                  REQUEST_COMPLETED);
+            ompi_wait_sync_t *tmp_sync = (ompi_wait_sync_t *) OPAL_ATOMIC_SWAP_PTR(&request->req_complete,
+                                                                                   REQUEST_COMPLETED);
             /* In the case where another thread concurrently changed the request to REQUEST_PENDING */
             if( REQUEST_PENDING != tmp_sync )
                 wait_sync_update(tmp_sync, 1, request->req_status.MPI_ERROR);
diff --git a/opal/threads/mutex.h b/opal/threads/mutex.h
index 94c123a6754..da2733a7f1d 100644
--- a/opal/threads/mutex.h
+++ b/opal/threads/mutex.h
@@ -324,6 +324,20 @@ OPAL_THREAD_ADD_SIZE_T(volatile size_t *addr, int delta)
 
 #endif
 
+static inline void *opal_thread_swap_ptr (volatile void *ptr, void *newvalue)
+{
+    if (opal_using_threads ()) {
+        return opal_atomic_swap_ptr (ptr, newvalue);
+    }
+
+    void *old = ((void **) ptr)[0];
+    ((void **) ptr)[0] = newvalue;
+
+    return old;
+}
+
+#define OPAL_ATOMIC_SWAP_PTR(x, y) opal_thread_swap_ptr (x, y)
+
 END_C_DECLS
 
 #endif /* OPAL_MUTEX_H */
diff --git a/opal/threads/wait_sync.c b/opal/threads/wait_sync.c
index 099eb82391b..c9b91372442 100644
--- a/opal/threads/wait_sync.c
+++ b/opal/threads/wait_sync.c
@@ -3,6 +3,8 @@
  * Copyright (c) 2014-2016 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -21,15 +23,6 @@ static ompi_wait_sync_t* wait_sync_list = NULL;
         pthread_mutex_unlock( &(who)->lock);           \
     } while(0)
 
-
-int sync_wait_st(ompi_wait_sync_t *sync)
-{
-    while(sync->count > 0) {
-        opal_progress();
-    }
-    return (0 == sync->status) ? OPAL_SUCCESS : OPAL_ERROR;
-}
-
 int sync_wait_mt(ompi_wait_sync_t *sync)
 {
     if(sync->count <= 0)
diff --git a/opal/threads/wait_sync.h b/opal/threads/wait_sync.h
index da77003a1fc..3a4b9ef20f9 100644
--- a/opal/threads/wait_sync.h
+++ b/opal/threads/wait_sync.h
@@ -3,6 +3,8 @@
  * Copyright (c) 2014-2016 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -27,50 +29,44 @@ typedef struct ompi_wait_sync_t {
 #define REQUEST_PENDING   (void*)0L
 #define REQUEST_COMPLETED (void*)1L
 
-#if OPAL_ENABLE_MULTI_THREADS
-
-#define OPAL_ATOMIC_ADD_32(a,b)  opal_atomic_add_32(a,b)
-#define OPAL_ATOMIC_SWP_PTR(a,b) opal_atomic_swap_ptr(a,b)
-#define SYNC_WAIT(sync)          sync_wait_mt(sync)
-#define PTHREAD_COND_INIT(a,b)   pthread_cond_init(a,b)
-#define PTHREAD_MUTEX_INIT(a,b)  pthread_mutex_init(a,b)
+#define SYNC_WAIT(sync)          (opal_using_threads() ? sync_wait_mt (sync) : sync_wait_st (sync))
+#define PTHREAD_COND_INIT(a,b)   (opal_using_threads() ? pthread_cond_init (a,b) : 0)
+#define PTHREAD_MUTEX_INIT(a,b)  (opal_using_threads() ? pthread_mutex_init (a,b) : 0)
 
 #define WAIT_SYNC_RELEASE(sync)                       \
-    do {                                              \
+    if (opal_using_threads()) {                       \
         pthread_cond_destroy(&(sync)->condition);     \
         pthread_mutex_destroy(&(sync)->lock);         \
-    } while(0)
+    }
 
 #define WAIT_SYNC_SIGNAL(sync)                        \
-    do {                                              \
+    if (opal_using_threads()) {                       \
         pthread_mutex_lock(&(sync->lock));            \
         pthread_cond_signal(&sync->condition);        \
         pthread_mutex_unlock(&(sync->lock));          \
-    } while(0)
-
-#else
+    }
 
-#define OPAL_ATOMIC_ADD_32(a,b)  (*(a) += (b))
-#define OPAL_ATOMIC_SWP_PTR(a,b) *(a) = (b)
-#define PTHREAD_COND_INIT(a,b)
-#define PTHREAD_MUTEX_INIT(a,b)
-#define SYNC_WAIT(sync)          sync_wait_st(sync)
-#define WAIT_SYNC_RELEASE(sync)
-#define WAIT_SYNC_SIGNAL(sync)
+OPAL_DECLSPEC int sync_wait_mt(ompi_wait_sync_t *sync);
+static inline int sync_wait_st (ompi_wait_sync_t *sync)
+{
+    while (sync->count > 0) {
+        opal_progress();
+    }
 
-#endif /* OPAL_ENABLE_MULTI_THREADS */
+    return sync->status;
+}
 
-OPAL_DECLSPEC int sync_wait_mt(ompi_wait_sync_t *sync);
-OPAL_DECLSPEC int sync_wait_st(ompi_wait_sync_t *sync);
 
-#define WAIT_SYNC_INIT(sync,c)                                  \
-    do {                                                        \
-        (sync)->count = c;                                      \
-        (sync)->next = NULL;                                    \
-        (sync)->prev = NULL;                                    \
-        (sync)->status = 0;                                     \
-        PTHREAD_COND_INIT(&(sync)->condition, NULL);            \
-        PTHREAD_MUTEX_INIT(&(sync)->lock, NULL);                \
+#define WAIT_SYNC_INIT(sync,c)                                  \
+    do {                                                        \
+        (sync)->count = c;                                      \
+        (sync)->next = NULL;                                    \
+        (sync)->prev = NULL;                                    \
+        (sync)->status = 0;                                     \
+        if (opal_using_threads()) {                             \
+            PTHREAD_COND_INIT(&(sync)->condition, NULL);        \
+            PTHREAD_MUTEX_INIT(&(sync)->lock, NULL);            \
+        }                                                       \
     } while(0)
 
 /**
@@ -82,12 +78,13 @@ OPAL_DECLSPEC int sync_wait_st(ompi_wait_sync_t *sync);
 static inline void wait_sync_update(ompi_wait_sync_t *sync, int updates, int status)
 {
     if( OPAL_LIKELY(OPAL_SUCCESS == status) ) {
-        if( 0 != (OPAL_ATOMIC_ADD_32(&sync->count, -updates)) ) {
+        if( 0 != (OPAL_THREAD_ADD32(&sync->count, -updates)) ) {
             return;
         }
     } else {
-        OPAL_ATOMIC_CMPSET_32(&(sync->count), 0, 0);
-        sync->status = -1;
+        /* this is an error path so just use the atomic */
+        opal_atomic_swap_32 (&sync->count, 0);
+        sync->status = OPAL_ERROR;
     }
     WAIT_SYNC_SIGNAL(sync);
 }

From e4f920f6f96a02fd88811aafb2aa41aaad2ad83e Mon Sep 17 00:00:00 2001
From: Nathan Hjelm
Date: Wed, 22 Jun 2016 09:50:46 -0600
Subject: [PATCH 2/2] opal/progress: improve performance when there are no LP callbacks

This commit adds another check to the low-priority callback conditional
that short-circuits the atomic-add if there are no low-priority
callbacks. This should improve performance in the common case.

Signed-off-by: Nathan Hjelm
---
 opal/runtime/opal_progress.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opal/runtime/opal_progress.c b/opal/runtime/opal_progress.c
index 896b2d19a9e..2af67c8ad8f 100644
--- a/opal/runtime/opal_progress.c
+++ b/opal/runtime/opal_progress.c
@@ -221,7 +221,7 @@ opal_progress(void)
         events += (callbacks[i])();
     }
 
-    if ((OPAL_THREAD_ADD32((volatile int32_t *) &num_calls, 1) & 0x7) == 0) {
+    if (callbacks_lp_len > 0 && (OPAL_THREAD_ADD32((volatile int32_t *) &num_calls, 1) & 0x7) == 0) {
         /* run low priority callbacks once every 8 calls to opal_progress() */
         for (i = 0 ; i < callbacks_lp_len ; ++i) {
            events += (callbacks_lp[i])();
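
A standalone sketch (not part of either patch above) of the pattern both
commits rely on: gate the atomic on a runtime "are we threaded?" check, and
short-circuit the counter update entirely when no low-priority callbacks are
registered. All names here (using_threads, thread_add32, num_lp_callbacks,
progress_once) are hypothetical stand-ins for illustration, not the real OPAL
API, and the behavior is deliberately simplified.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool using_threads = false;   /* stands in for opal_using_threads() */

/* Stands in for OPAL_THREAD_ADD32: pay for the atomic only when threads
 * are actually in use, otherwise do a plain read-modify-write. */
static int32_t thread_add32(_Atomic int32_t *addr, int32_t delta)
{
    if (using_threads) {
        return atomic_fetch_add(addr, delta) + delta;   /* new value */
    }
    int32_t newval = atomic_load_explicit(addr, memory_order_relaxed) + delta;
    atomic_store_explicit(addr, newval, memory_order_relaxed);
    return newval;
}

static int num_lp_callbacks = 0;     /* stands in for callbacks_lp_len */
static _Atomic int32_t num_calls = 0;

/* Mirrors the opal_progress() change: when there are no low-priority
 * callbacks, skip the counter update (and any atomic) altogether. */
static void progress_once(void)
{
    if (num_lp_callbacks > 0 && (thread_add32(&num_calls, 1) & 0x7) == 0) {
        puts("running low-priority callbacks (every 8th call)");
    }
}

int main(void)
{
    for (int i = 0; i < 16; ++i) {
        progress_once();             /* no LP callbacks: nothing counted */
    }
    using_threads = true;            /* now take the atomic path */
    num_lp_callbacks = 1;
    for (int i = 0; i < 16; ++i) {
        progress_once();             /* prints on the 8th and 16th call */
    }
    return 0;
}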