From 6a19a10fbbdc0a1fde6c046ee1d8fd79c74e8105 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 14 Jul 2015 12:09:39 -0600 Subject: [PATCH 1/3] atomic/ppc: add atomics for load-link, store-conditional, and swap This commit adds implementations of opal_atomic_ll_32/64 and opal_atomic_sc_32/64. These atomics can be used to implement more efficient lifo/fifo operations on supported platforms. The only supported platform with this commit is powerpc/power. This commit also adds an implementation of opal_atomic_swap_32/64 for powerpc. Tested with Power8. Signed-off-by: Nathan Hjelm --- opal/include/opal/sys/atomic.h | 11 +++- opal/include/opal/sys/atomic_impl.h | 26 +++++++- opal/include/opal/sys/powerpc/atomic.h | 87 ++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 3 deletions(-) diff --git a/opal/include/opal/sys/atomic.h b/opal/include/opal/sys/atomic.h index d32688155ef..2a273722878 100644 --- a/opal/include/opal/sys/atomic.h +++ b/opal/include/opal/sys/atomic.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. - * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -38,7 +39,7 @@ * - \c OPAL_HAVE_ATOMIC_MEM_BARRIER atomic memory barriers * - \c OPAL_HAVE_ATOMIC_SPINLOCKS atomic spinlocks * - \c OPAL_HAVE_ATOMIC_MATH_32 if 32 bit add/sub/cmpset can be done "atomicly" - * - \c OPAL_HAVE_ATOMIC_MATH_64 if 32 bit add/sub/cmpset can be done "atomicly" + * - \c OPAL_HAVE_ATOMIC_MATH_64 if 64 bit add/sub/cmpset can be done "atomicly" * * Note that for the Atomic math, atomic add/sub may be implemented as * C code using opal_atomic_cmpset. The appearance of atomic @@ -177,6 +178,12 @@ typedef struct opal_atomic_lock_t opal_atomic_lock_t; #ifndef OPAL_HAVE_ATOMIC_CMPSET_128 #define OPAL_HAVE_ATOMIC_CMPSET_128 0 #endif +#ifndef OPAL_HAVE_ATOMIC_LLSC_32 +#define OPAL_HAVE_ATOMIC_LLSC_32 0 +#endif +#ifndef OPAL_HAVE_ATOMIC_LLSC_64 +#define OPAL_HAVE_ATOMIC_LLSC_64 0 +#endif #endif /* DOXYGEN */ /********************************************************************** diff --git a/opal/include/opal/sys/atomic_impl.h b/opal/include/opal/sys/atomic_impl.h index e2da78e9604..16b03b485f3 100644 --- a/opal/include/opal/sys/atomic_impl.h +++ b/opal/include/opal/sys/atomic_impl.h @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ * @@ -274,6 +274,30 @@ static inline int opal_atomic_cmpset_rel_ptr(volatile void* addr, #endif /* (OPAL_HAVE_ATOMIC_SWAP_32 || OPAL_HAVE_ATOMIC_SWAP_64) */ +#if (OPAL_HAVE_ATOMIC_LLSC_32 || OPAL_HAVE_ATOMIC_LLSC_64) + +#if SIZEOF_VOID_P == 4 && OPAL_HAVE_ATOMIC_LLSC_32 + +#define opal_atomic_ll_ptr(addr) (void *) opal_atomic_ll_32((int32_t *) addr) +#define opal_atomic_sc_ptr(addr, newval) opal_atomic_sc_32((int32_t *) addr, (int32_t) newval) + +#define OPAL_HAVE_ATOMIC_LLSC_PTR 1 + +#elif SIZEOF_VOID_P == 8 && OPAL_HAVE_ATOMIC_LLSC_64 + +#define opal_atomic_ll_ptr(addr) (void *) opal_atomic_ll_64((int64_t *) addr) +#define opal_atomic_sc_ptr(addr, newval) opal_atomic_sc_64((int64_t *) addr, (int64_t) newval) + +#define OPAL_HAVE_ATOMIC_LLSC_PTR 1 + +#endif + +#endif /* (OPAL_HAVE_ATOMIC_LLSC_32 || OPAL_HAVE_ATOMIC_LLSC_64)*/ + +#if !defined(OPAL_HAVE_ATOMIC_LLSC_PTR) +#define OPAL_HAVE_ATOMIC_LLSC_PTR 0 +#endif + #if OPAL_HAVE_ATOMIC_MATH_32 || OPAL_HAVE_ATOMIC_MATH_64 diff --git a/opal/include/opal/sys/powerpc/atomic.h b/opal/include/opal/sys/powerpc/atomic.h index 6c37261ad08..427f4ff8d75 100644 --- a/opal/include/opal/sys/powerpc/atomic.h +++ b/opal/include/opal/sys/powerpc/atomic.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -40,6 +43,8 @@ #define OPAL_HAVE_ATOMIC_MEM_BARRIER 1 #define OPAL_HAVE_ATOMIC_CMPSET_32 1 +#define OPAL_HAVE_ATOMIC_SWAP_32 1 +#define OPAL_HAVE_ATOMIC_LLSC_32 1 #define OPAL_HAVE_ATOMIC_MATH_32 1 #define OPAL_HAVE_ATOMIC_ADD_32 1 @@ -48,6 +53,8 @@ #if (OPAL_ASSEMBLY_ARCH == OPAL_POWERPC64) || OPAL_ASM_SUPPORT_64BIT #define OPAL_HAVE_ATOMIC_CMPSET_64 1 +#define OPAL_HAVE_ATOMIC_SWAP_64 1 +#define OPAL_HAVE_ATOMIC_LLSC_64 1 #endif @@ -140,6 +147,32 @@ static inline int opal_atomic_cmpset_32(volatile int32_t *addr, return (ret == oldval); } +static inline int32_t opal_atomic_ll_32 (volatile int32_t *addr) +{ + int32_t ret; + + __asm__ __volatile__ ("lwarx %0, 0, %1 \n\t" + : "=&r" (ret) + : "r" (addr) + :); + return ret; +} + +static inline int opal_atomic_sc_32 (volatile int32_t *addr, int32_t newval) +{ + int32_t ret, foo; + + __asm__ __volatile__ (" stwcx. %4, 0, %3 \n\t" + " li %0,0 \n\t" + " bne- 1f \n\t" + " ori %0,%0,1 \n\t" + "1:" + : "=r" (ret), "=m" (*addr), "=r" (foo) + : "r" (addr), "r" (newval) + : "cc", "memory"); + return ret; +} + /* these two functions aren't inlined in the non-gcc case because then there would be two function calls (since neither cmpset_32 nor atomic_?mb can be inlined). Instead, we "inline" them by hand in @@ -164,6 +197,20 @@ static inline int opal_atomic_cmpset_rel_32(volatile int32_t *addr, return opal_atomic_cmpset_32(addr, oldval, newval); } +static inline int32_t opal_atomic_swap_32(volatile int32_t *addr, int32_t newval) +{ + int32_t ret; + + __asm__ __volatile__ ("1: lwarx %0, 0, %2 \n\t" + " stwcx. 
%3, 0, %2 \n\t" + " bne- 1b \n\t" + : "=&r" (ret), "=m" (*addr) + : "r" (addr), "r" (newval) + : "cc", "memory"); + + return ret; +} + #endif /* OPAL_GCC_INLINE_ASSEMBLY */ @@ -189,6 +236,32 @@ static inline int opal_atomic_cmpset_64(volatile int64_t *addr, return (ret == oldval); } +static inline int64_t opal_atomic_ll_64(volatile int64_t *addr) +{ + int64_t ret; + + __asm__ __volatile__ ("ldarx %0, 0, %1 \n\t" + : "=&r" (ret) + : "r" (addr) + :); + return ret; +} + +static inline int opal_atomic_sc_64(volatile int64_t *addr, int64_t newval) +{ + int32_t ret, foo; + + __asm__ __volatile__ (" stdcx. %4, 0, %3 \n\t" + " li %0,0 \n\t" + " bne- 1f \n\t" + " ori %0,%0,1 \n\t" + "1:" + : "=r" (ret), "=m" (*addr), "=r" (foo) + : "r" (addr), "r" (newval) + : "cc", "memory"); + return ret; +} + /* these two functions aren't inlined in the non-gcc case because then there would be two function calls (since neither cmpset_64 nor atomic_?mb can be inlined). Instead, we "inline" them by hand in @@ -213,6 +286,20 @@ static inline int opal_atomic_cmpset_rel_64(volatile int64_t *addr, return opal_atomic_cmpset_64(addr, oldval, newval); } +static inline int64_t opal_atomic_swap_64(volatile int64_t *addr, int64_t newval) +{ + int64_t ret; + + __asm__ __volatile__ ("1: ldarx %0, 0, %2 \n\t" + " stdcx. %3, 0, %2 \n\t" + " bne- 1b \n\t" + : "=&r" (ret), "=m" (*addr) + : "r" (addr), "r" (newval) + : "cc", "memory"); + + return ret; +} + #endif /* OPAL_GCC_INLINE_ASSEMBLY */ #elif (OPAL_ASSEMBLY_ARCH == OPAL_POWERPC32) && OPAL_ASM_SUPPORT_64BIT From 2a7e191dd8368db807af3d6ea184e440d411f313 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 14 Jul 2015 12:12:59 -0600 Subject: [PATCH 2/3] opal/fifo: if available use load-linked store-conditional These instructions allow a more efficient implementation of the opal_fifo_pop_atomic function. 
Signed-off-by: Nathan Hjelm --- opal/class/opal_fifo.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/opal/class/opal_fifo.h b/opal/class/opal_fifo.h index 79ba5bae6b7..604601dc356 100644 --- a/opal/class/opal_fifo.h +++ b/opal/class/opal_fifo.h @@ -216,6 +216,27 @@ static inline opal_list_item_t *opal_fifo_pop_atomic (opal_fifo_t *fifo) { opal_list_item_t *item, *next; +#if OPAL_HAVE_ATOMIC_LLSC_PTR + /* use load-linked store-conditional to avoid ABA issues */ + do { + item = opal_atomic_ll_ptr (&fifo->opal_fifo_head.data.item); + if (&fifo->opal_fifo_ghost == item) { + if (&fifo->opal_fifo_ghost == fifo->opal_fifo_tail.data.item) { + return NULL; + } + + /* fifo does not appear empty. wait for the fifo to be made + * consistent by conflicting thread. */ + continue; + } + + next = (opal_list_item_t *) item->opal_list_next; + if (opal_atomic_sc_ptr (&fifo->opal_fifo_head.data.item, next)) { + break; + } + } while (1); +#else + /* protect against ABA issues by "locking" the head */ do { if (opal_atomic_cmpset_32 ((int32_t *) &fifo->opal_fifo_head.data.counter, 0, 1)) { break; @@ -234,6 +255,7 @@ static inline opal_list_item_t *opal_fifo_pop_atomic (opal_fifo_t *fifo) next = (opal_list_item_t *) item->opal_list_next; fifo->opal_fifo_head.data.item = next; +#endif if (&fifo->opal_fifo_ghost == next) { if (!opal_atomic_cmpset_ptr (&fifo->opal_fifo_tail.data.item, item, &fifo->opal_fifo_ghost)) { From 209a7a0721a03cb55aebb2a0f7956acaae3f2835 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 14 Jul 2015 12:14:17 -0600 Subject: [PATCH 3/3] opal/lifo: add load-linked store-conditional support This commit adds implementations for opal_lifo_pop_atomic and opal_lifo_push_atomic that make use of the load-linked and store-conditional instructions. These instructions allow for a more efficient implementation on supported platforms.
Signed-off-by: Nathan Hjelm --- opal/class/opal_lifo.h | 51 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/opal/class/opal_lifo.h b/opal/class/opal_lifo.h index 2297de56c1d..a4e106343a8 100644 --- a/opal/class/opal_lifo.h +++ b/opal/class/opal_lifo.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007 Voltaire All rights reserved. * Copyright (c) 2010 IBM Corporation. All rights reserved. - * Copyright (c) 2014 Los Alamos National Security, LLC. All rights + * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights * reseved. * $COPYRIGHT$ * @@ -25,6 +25,7 @@ #define OPAL_LIFO_H_HAS_BEEN_INCLUDED #include "opal_config.h" +#include <time.h> #include "opal/class/opal_list.h" #include "opal/sys/atomic.h" @@ -180,6 +181,52 @@ static inline opal_list_item_t *opal_lifo_push_atomic (opal_lifo_t *lifo, } while (1); } +#if OPAL_HAVE_ATOMIC_LLSC_PTR + +static inline void _opal_lifo_release_cpu (void) +{ + /* NTH: there are many ways to cause the current thread to be suspended. This one + * should work well in most cases. Another approach would be to use poll (NULL, 0, ) but + * the interval will be forced to be in ms (instead of ns or us). Note that there + * is a performance improvement for the lifo test when this call is made on detection + * of contention but it may not translate into actual MPI or application performance + * improvements. */ + static struct timespec interval = { .tv_sec = 0, .tv_nsec = 100 }; + nanosleep (&interval, NULL); +} + +/* Retrieve one element from the LIFO. If we reach the ghost element then the LIFO + * is empty so we return NULL. + */ +static inline opal_list_item_t *opal_lifo_pop_atomic (opal_lifo_t* lifo) +{ + opal_list_item_t *item, *next; + int attempt = 0; + + do { + if (++attempt == 5) { + /* deliberately suspend this thread to allow other threads to run. this should + * only occur during periods of contention on the lifo.
*/ + _opal_lifo_release_cpu (); + attempt = 0; + } + + item = (opal_list_item_t *) opal_atomic_ll_ptr (&lifo->opal_lifo_head.data.item); + if (&lifo->opal_lifo_ghost == item) { + return NULL; + } + + next = (opal_list_item_t *) item->opal_list_next; + } while (!opal_atomic_sc_ptr (&lifo->opal_lifo_head.data.item, next)); + + opal_atomic_wmb (); + + item->opal_list_next = NULL; + return item; +} + +#else + /* Retrieve one element from the LIFO. If we reach the ghost element then the LIFO * is empty so we return NULL. */ @@ -216,6 +263,8 @@ static inline opal_list_item_t *opal_lifo_pop_atomic (opal_lifo_t* lifo) return item; } +#endif /* OPAL_HAVE_ATOMIC_LLSC_PTR */ + #endif /* single-threaded versions of the lifo functions */