[OpenMP] Use C++11 Atomics - barrier, tasking, and lock code
These are preliminary changes that attempt to use C++11 Atomics in the runtime.
We expect this change to improve portability across architectures/OSes.
Here is a summary of the changes.

Most variables that need synchronization operations were converted to generic
atomic variables (std::atomic<T>). Variables that are updated with a combined
CAS are packed into a single atomic variable, and partial reads and writes are
done through unpacking and packing.
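To make the pack/unpack idiom concrete, here is a minimal, self-contained
sketch; the struct and field names are hypothetical and not taken from the
patch:

```cpp
#include <atomic>
#include <cstdint>

// Hypothetical illustration of the packing idiom described above: two 32-bit
// fields that must be updated together are stored in one 64-bit atomic so a
// single compare-exchange covers both.
struct PackedCounters {
  std::atomic<uint64_t> bits{0};

  static uint64_t pack(uint32_t lo, uint32_t hi) {
    return (uint64_t)hi << 32 | lo;
  }
  static void unpack(uint64_t v, uint32_t &lo, uint32_t &hi) {
    lo = (uint32_t)v;
    hi = (uint32_t)(v >> 32);
  }

  // Atomically increment both halves with one CAS loop. On failure,
  // compare_exchange_weak refreshes `old`, so we re-unpack and retry.
  void bump_both() {
    uint64_t old = bits.load(std::memory_order_relaxed);
    uint32_t lo, hi;
    do {
      unpack(old, lo, hi);
    } while (!bits.compare_exchange_weak(old, pack(lo + 1, hi + 1),
                                         std::memory_order_acq_rel,
                                         std::memory_order_relaxed));
  }
};
```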

Patch by Hansang Bae

Differential Revision: https://reviews.llvm.org/D47903

llvm-svn: 336563
jpeyton52 committed Jul 9, 2018
1 parent 7cd3241 commit 37e2ef5
Showing 17 changed files with 433 additions and 280 deletions.
46 changes: 25 additions & 21 deletions openmp/runtime/src/kmp.h
@@ -940,7 +940,7 @@ extern int __kmp_hws_abs_flag; // absolute or per-item number requested
 // HW TSC is used to reduce overhead (clock tick instead of nanosecond).
 extern kmp_uint64 __kmp_ticks_per_msec;
 #if KMP_COMPILER_ICC
-#define KMP_NOW() _rdtsc()
+#define KMP_NOW() ((kmp_uint64)_rdtsc())
 #else
 #define KMP_NOW() __kmp_hardware_timestamp()
 #endif
@@ -2109,8 +2109,9 @@ typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? */
 
 #if OMP_40_ENABLED
 typedef struct kmp_taskgroup {
-  kmp_int32 count; // number of allocated and not yet complete tasks
-  kmp_int32 cancel_request; // request for cancellation of this taskgroup
+  std::atomic<kmp_int32> count; // number of allocated and incomplete tasks
+  std::atomic<kmp_int32>
+      cancel_request; // request for cancellation of this taskgroup
   struct kmp_taskgroup *parent; // parent taskgroup
 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work
 #if OMP_45_ENABLED
@@ -2149,8 +2150,8 @@ typedef struct kmp_base_depnode {
   kmp_uint32 id;
 #endif
 
-  volatile kmp_int32 npredecessors;
-  volatile kmp_int32 nrefs;
+  std::atomic<kmp_int32> npredecessors;
+  std::atomic<kmp_int32> nrefs;
 } kmp_base_depnode_t;
 
 union KMP_ALIGN_CACHE kmp_depnode {
@@ -2242,18 +2243,18 @@ struct kmp_taskdata { /* aligned during dynamic allocation */
   /* Currently not used except for perhaps IDB */
   kmp_taskdata_t *td_parent; /* parent task */
   kmp_int32 td_level; /* task nesting level */
-  kmp_int32 td_untied_count; /* untied task active parts counter */
+  std::atomic<kmp_int32> td_untied_count; // untied task active parts counter
   ident_t *td_ident; /* task identifier */
   // Taskwait data.
   ident_t *td_taskwait_ident;
   kmp_uint32 td_taskwait_counter;
   kmp_int32 td_taskwait_thread; /* gtid + 1 of thread encountered taskwait */
   KMP_ALIGN_CACHE kmp_internal_control_t
       td_icvs; /* Internal control variables for the task */
-  KMP_ALIGN_CACHE volatile kmp_int32
+  KMP_ALIGN_CACHE std::atomic<kmp_int32>
       td_allocated_child_tasks; /* Child tasks (+ current task) not yet
                                    deallocated */
-  volatile kmp_int32
+  std::atomic<kmp_int32>
       td_incomplete_child_tasks; /* Child tasks not yet complete */
 #if OMP_40_ENABLED
   kmp_taskgroup_t
@@ -2338,7 +2339,7 @@ typedef struct kmp_base_task_team {
   kmp_int32 tt_untied_task_encountered;
 
   KMP_ALIGN_CACHE
-  volatile kmp_int32 tt_unfinished_threads; /* #threads still active */
+  std::atomic<kmp_int32> tt_unfinished_threads; /* #threads still active */
 
   KMP_ALIGN_CACHE
   volatile kmp_uint32
@@ -2561,7 +2562,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   // ---------------------------------------------------------------------------
   KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered;
   kmp_balign_team_t t_bar[bs_last_barrier];
-  volatile int t_construct; // count of single directive encountered by team
+  std::atomic<int> t_construct; // count of single directive encountered by team
   char pad[sizeof(kmp_lock_t)]; // padding to maintain performance on big iron
 
   // Master only
@@ -2636,12 +2637,14 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   // for SERIALIZED teams nested 2 or more levels deep
 #if OMP_40_ENABLED
   // typed flag to store request state of cancellation
-  kmp_int32 t_cancel_request;
+  std::atomic<kmp_int32> t_cancel_request;
 #endif
   int t_master_active; // save on fork, restore on join
   kmp_taskq_t t_taskq; // this team's task queue
   void *t_copypriv_data; // team specific pointer to copyprivate data array
-  kmp_uint32 t_copyin_counter;
+#if KMP_OS_WINDOWS
+  std::atomic<kmp_uint32> t_copyin_counter;
+#endif
 #if USE_ITT_BUILD
   void *t_stack_id; // team specific stack stitching id (for ittnotify)
 #endif /* USE_ITT_BUILD */
@@ -2685,7 +2688,8 @@ typedef struct kmp_base_root {
   volatile int r_active; /* TRUE if some region in a nest has > 1 thread */
   // GEH: This is misnamed, should be r_in_parallel
   volatile int r_nested; // TODO: GEH - This is unused, just remove it entirely.
-  int r_in_parallel; /* keeps a count of active parallel regions per root */
+  // keeps a count of active parallel regions per root
+  std::atomic<int> r_in_parallel;
   // GEH: This is misnamed, should be r_active_levels
   kmp_team_t *r_root_team;
   kmp_team_t *r_hot_team;
@@ -2742,8 +2746,8 @@ extern int __kmp_debug_buf_atomic; /* TRUE means use atomic update of buffer
                                       entry pointer */
 
 extern char *__kmp_debug_buffer; /* Debug buffer itself */
-extern int __kmp_debug_count; /* Counter for number of lines printed in buffer
-                                 so far */
+extern std::atomic<int> __kmp_debug_count; /* Counter for number of lines
+                                              printed in buffer so far */
 extern int __kmp_debug_buf_warn_chars; /* Keep track of char increase
                                           recommended in warnings */
 /* end rotating debug buffer */
@@ -3000,7 +3004,7 @@ extern volatile int __kmp_nth;
                        threads, and those in the thread pool */
 extern volatile int __kmp_all_nth;
 extern int __kmp_thread_pool_nth;
-extern volatile int __kmp_thread_pool_active_nth;
+extern std::atomic<int> __kmp_thread_pool_active_nth;
 
 extern kmp_root_t **__kmp_root; /* root of thread hierarchy */
 /* end data protected by fork/join lock */
@@ -3009,14 +3013,14 @@ extern kmp_root_t **__kmp_root; /* root of thread hierarchy */
 extern kmp_global_t __kmp_global; /* global status */
 
 extern kmp_info_t __kmp_monitor;
-extern volatile kmp_uint32 __kmp_team_counter; // For Debugging Support Library
-extern volatile kmp_uint32 __kmp_task_counter; // For Debugging Support Library
+// For Debugging Support Library
+extern std::atomic<kmp_uint32> __kmp_team_counter;
+// For Debugging Support Library
+extern std::atomic<kmp_uint32> __kmp_task_counter;
 
 #if USE_DEBUGGER
 
 #define _KMP_GEN_ID(counter) \
-  (__kmp_debugging ? KMP_TEST_THEN_INC32((volatile kmp_int32 *)&counter) + 1 \
-                   : ~0)
+  (__kmp_debugging ? KMP_ATOMIC_INC(&counter) + 1 : ~0)
 #else
 #define _KMP_GEN_ID(counter) (~0)
 #endif /* USE_DEBUGGER */
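The _KMP_GEN_ID change above swaps KMP_TEST_THEN_INC32 for KMP_ATOMIC_INC.
Assuming KMP_ATOMIC_INC is a thin fetch_add wrapper (a guess at its shape, not
the patch's verbatim definition), the "+ 1" is still needed because fetch_add
returns the old value:

```cpp
#include <atomic>

// Hypothetical stand-in for KMP_ATOMIC_INC: an atomic post-increment that
// returns the value *before* the add, hence the "+ 1" in _KMP_GEN_ID.
template <typename T> T kmp_atomic_inc_sketch(std::atomic<T> *p) {
  return p->fetch_add(1, std::memory_order_acq_rel);
}
```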
24 changes: 11 additions & 13 deletions openmp/runtime/src/kmp_barrier.cpp
@@ -956,14 +956,12 @@ static void __kmp_hierarchical_barrier_gather(
   // All subordinates are gathered; now release parent if not master thread
 
   if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
-    KA_TRACE(
-        20,
-        ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
-         "arrived(%p): %llu => %llu\n",
-         gtid, team->t.t_id, tid,
-         __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
-         thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
-         thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
+    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
+                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
+                  gtid, team->t.t_id, tid,
+                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
+                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
+                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
     /* Mark arrival to parent: After performing this write, a worker thread may
        not assume that the team is valid any more - it could be deallocated by
        the master thread at any time. */
@@ -973,8 +971,8 @@
     ANNOTATE_BARRIER_BEGIN(this_thr);
     kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
     flag.release();
-  } else { // Leaf does special release on the "offset" bits of parent's
-           // b_arrived flag
+  } else {
+    // Leaf does special release on "offset" bits of parent's b_arrived flag
     thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
     kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
     flag.set_waiter(other_threads[thr_bar->parent_tid]);
@@ -1353,10 +1351,10 @@ int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
 #endif
 
 #if OMP_40_ENABLED
+    kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
     // Reset cancellation flag for worksharing constructs
-    if (team->t.t_cancel_request == cancel_loop ||
-        team->t.t_cancel_request == cancel_sections) {
-      team->t.t_cancel_request = cancel_noreq;
+    if (cancel_request == cancel_loop || cancel_request == cancel_sections) {
+      KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
     }
 #endif
 #if USE_ITT_BUILD
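The last hunk above shows a recurring pattern in this patch: read the atomic
once into a local with a relaxed load, test the snapshot, then store relaxed.
A minimal sketch, assuming KMP_ATOMIC_LD_RLX/KMP_ATOMIC_ST_RLX simply wrap
relaxed std::atomic load/store:

```cpp
#include <atomic>

enum kmp_cancel_kind { cancel_noreq, cancel_parallel, cancel_loop, cancel_sections };

// Both comparisons see the same snapshot, which two separate reads of the old
// volatile field did not guarantee.
void reset_worksharing_cancel(std::atomic<int> &cancel_request) {
  int snapshot = cancel_request.load(std::memory_order_relaxed); // one read
  if (snapshot == cancel_loop || snapshot == cancel_sections)
    cancel_request.store(cancel_noreq, std::memory_order_relaxed);
}
```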
10 changes: 5 additions & 5 deletions openmp/runtime/src/kmp_cancel.cpp
@@ -51,8 +51,8 @@ kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind) {
     {
       kmp_team_t *this_team = this_thr->th.th_team;
       KMP_DEBUG_ASSERT(this_team);
-      kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(
-          &(this_team->t.t_cancel_request), cancel_noreq, cncl_kind);
+      kmp_int32 old = cancel_noreq;
+      this_team->t.t_cancel_request.compare_exchange_strong(old, cncl_kind);
       if (old == cancel_noreq || old == cncl_kind) {
         // we do not have a cancellation request in this team or we do have
         // one that matches the current request -> cancel
@@ -89,8 +89,8 @@ kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind) {
 
       taskgroup = task->td_taskgroup;
       if (taskgroup) {
-        kmp_int32 old = KMP_COMPARE_AND_STORE_RET32(
-            &(taskgroup->cancel_request), cancel_noreq, cncl_kind);
+        kmp_int32 old = cancel_noreq;
+        taskgroup->cancel_request.compare_exchange_strong(old, cncl_kind);
         if (old == cancel_noreq || old == cncl_kind) {
           // we do not have a cancellation request in this taskgroup or we do
           // have one that matches the current request -> cancel
@@ -257,7 +257,7 @@ kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32 gtid) {
     if (__kmp_omp_cancellation) {
       // depending on which construct to cancel, check the flag and
       // reset the flag
-      switch (this_team->t.t_cancel_request) {
+      switch (KMP_ATOMIC_LD_RLX(&(this_team->t.t_cancel_request))) {
       case cancel_parallel:
         ret = 1;
         // ensure that threads have checked the flag, when
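The rewrite above preserves behavior because compare_exchange_strong stores
the observed value back into its expected argument on failure, so `old` holds
the prior request either way, matching what KMP_COMPARE_AND_STORE_RET32
returned. A standalone sketch of that contract:

```cpp
#include <atomic>
#include <cassert>

enum { cancel_noreq = 0, cancel_parallel = 1 };

int main() {
  std::atomic<int> request{cancel_noreq};
  int old = cancel_noreq;
  // On success, old keeps cancel_noreq; on failure, old receives the value
  // actually stored in request -- the same "return the previous value"
  // contract as the old KMP_COMPARE_AND_STORE_RET32 macro.
  request.compare_exchange_strong(old, cancel_parallel);
  assert(old == cancel_noreq && request.load() == cancel_parallel);
  return 0;
}
```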
25 changes: 12 additions & 13 deletions openmp/runtime/src/kmp_csupport.cpp
@@ -930,9 +930,10 @@ __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \
   { \
     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
-    if (l->lk.poll != KMP_LOCK_FREE(tas) || \
-        !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
-                                     KMP_LOCK_BUSY(gtid + 1, tas))) { \
+    kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
+    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
+    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
+        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
       kmp_uint32 spins; \
       KMP_FSYNC_PREPARE(l); \
       KMP_INIT_YIELD(spins); \
@@ -943,9 +944,9 @@ __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
       KMP_YIELD_SPIN(spins); \
     } \
     kmp_backoff_t backoff = __kmp_spin_backoff_params; \
-    while (l->lk.poll != KMP_LOCK_FREE(tas) || \
-           !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
-                                        KMP_LOCK_BUSY(gtid + 1, tas))) { \
+    while ( \
+        KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \
+        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \
       __kmp_spin_backoff(&backoff); \
       if (TCR_4(__kmp_nth) > \
           (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
@@ -962,17 +963,15 @@ __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \
   { \
     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
-    rc = l->lk.poll == KMP_LOCK_FREE(tas) && \
-         KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), \
-                                     KMP_LOCK_BUSY(gtid + 1, tas)); \
+    kmp_int32 tas_free = KMP_LOCK_FREE(tas); \
+    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \
+    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \
+         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \
   }
 
 // Fast-path release tas lock
 #define KMP_RELEASE_TAS_LOCK(lock, gtid) \
-  { \
-    TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); \
-    KMP_MB(); \
-  }
+  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
 
 #if KMP_USE_FUTEX
 
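The new TAS fast path is a test-and-test-and-set: a cheap relaxed load screens
out contended locks before attempting the acquire CAS, and the release store
replaces the old TCW_4 + KMP_MB() pair. A sketch under the assumption that
__kmp_atomic_compare_store_acq is an acquire-on-success CAS (assumed shape,
not the verbatim helper):

```cpp
#include <atomic>

// Sketch only: assumed shapes of the helpers used by the new fast paths.
template <typename T>
bool atomic_compare_store_acq(std::atomic<T> *p, T expected, T desired) {
  // Acquire on success so the critical section cannot float above the lock.
  return p->compare_exchange_strong(expected, desired,
                                    std::memory_order_acquire,
                                    std::memory_order_relaxed);
}

bool try_acquire_tas(std::atomic<int> *poll, int free_val, int busy_val) {
  // Test (relaxed load) then test-and-set (acquire CAS): skip the expensive
  // read-modify-write while another thread visibly holds the lock.
  return poll->load(std::memory_order_relaxed) == free_val &&
         atomic_compare_store_acq(poll, free_val, busy_val);
}

void release_tas(std::atomic<int> *poll, int free_val) {
  poll->store(free_val, std::memory_order_release); // publishes the CS
}
```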
2 changes: 2 additions & 0 deletions openmp/runtime/src/kmp_debugger.cpp
@@ -68,7 +68,9 @@ kmp_omp_struct_info_t __kmp_omp_debug_struct_info = {
     addr_and_size_of(__kmp_threads),
     addr_and_size_of(__kmp_root),
     addr_and_size_of(__kmp_threads_capacity),
+#if KMP_USE_MONITOR
     addr_and_size_of(__kmp_monitor),
+#endif
 #if !KMP_USE_DYNAMIC_LOCK
     addr_and_size_of(__kmp_user_lock_table),
 #endif
42 changes: 21 additions & 21 deletions openmp/runtime/src/kmp_global.cpp
@@ -57,8 +57,8 @@ int __kmp_init_counter = 0;
 int __kmp_root_counter = 0;
 int __kmp_version = 0;
 
-volatile kmp_uint32 __kmp_team_counter = 0;
-volatile kmp_uint32 __kmp_task_counter = 0;
+std::atomic<kmp_uint32> __kmp_team_counter = ATOMIC_VAR_INIT(0);
+std::atomic<kmp_uint32> __kmp_task_counter = ATOMIC_VAR_INIT(0);
 
 unsigned int __kmp_init_wait =
     KMP_DEFAULT_INIT_WAIT; /* initial number of spin-tests */
@@ -335,8 +335,8 @@ int __kmp_debug_buf_atomic =
     FALSE; /* TRUE means use atomic update of buffer entry pointer */
 
 char *__kmp_debug_buffer = NULL; /* Debug buffer itself */
-int __kmp_debug_count =
-    0; /* Counter for number of lines printed in buffer so far */
+std::atomic<int> __kmp_debug_count =
+    ATOMIC_VAR_INIT(0); /* number of lines printed in buffer so far */
 int __kmp_debug_buf_warn_chars =
     0; /* Keep track of char increase recommended in warnings */
 /* end rotating debug buffer */
@@ -402,7 +402,7 @@ volatile kmp_info_t *__kmp_thread_pool = NULL;
 volatile kmp_team_t *__kmp_team_pool = NULL;
 
 KMP_ALIGN_CACHE
-volatile int __kmp_thread_pool_active_nth = 0;
+std::atomic<int> __kmp_thread_pool_active_nth = ATOMIC_VAR_INIT(0);
 
 /* -------------------------------------------------
  * GLOBAL/ROOT STATE */
@@ -418,47 +418,47 @@ kmp_global_t __kmp_global = {{0}};
  * false sharing if the alignment is not large enough for these locks */
 KMP_ALIGN_CACHE_INTERNODE
 
-kmp_bootstrap_lock_t __kmp_initz_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER(
-    __kmp_initz_lock); /* Control initializations */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_bootstrap_lock_t __kmp_exit_lock; /* exit() is not always thread-safe */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not always thread-safe */
 #if KMP_USE_MONITOR
+/* control monitor thread creation */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_bootstrap_lock_t __kmp_monitor_lock; /* control monitor thread creation */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock);
 #endif
 /* used for the hack to allow threadprivate cache and __kmp_threads expansion
    to co-exist */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_bootstrap_lock_t __kmp_tp_cached_lock;
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock);
 
 KMP_ALIGN_CACHE_INTERNODE
-kmp_lock_t __kmp_global_lock; /* Control OS/global access */
+KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */
 KMP_ALIGN_CACHE_INTERNODE
 kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */
 KMP_ALIGN_CACHE_INTERNODE
-kmp_lock_t __kmp_debug_lock; /* Control I/O access for KMP_DEBUG */
+KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */
 #else
 KMP_ALIGN_CACHE
 
-kmp_bootstrap_lock_t __kmp_initz_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER(
-    __kmp_initz_lock); /* Control initializations */
-kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */
-kmp_bootstrap_lock_t __kmp_exit_lock; /* exit() is not always thread-safe */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not always thread-safe */
 #if KMP_USE_MONITOR
-kmp_bootstrap_lock_t __kmp_monitor_lock; /* control monitor thread creation */
+/* control monitor thread creation */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock);
 #endif
 /* used for the hack to allow threadprivate cache and __kmp_threads expansion
    to co-exist */
-kmp_bootstrap_lock_t __kmp_tp_cached_lock;
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock);
 
 KMP_ALIGN(128)
-kmp_lock_t __kmp_global_lock; /* Control OS/global access */
+KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */
 KMP_ALIGN(128)
 kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */
 KMP_ALIGN(128)
-kmp_lock_t __kmp_debug_lock; /* Control I/O access for KMP_DEBUG */
+KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */
 #endif
 
 /* ----------------------------------------------- */
4 changes: 1 addition & 3 deletions openmp/runtime/src/kmp_io.cpp
@@ -152,9 +152,7 @@ void __kmp_vprintf(enum kmp_io __kmp_io, char const *format, va_list ap) {
 
   if (__kmp_debug_buf && __kmp_debug_buffer != NULL) {
 
-    int dc = (__kmp_debug_buf_atomic ? KMP_TEST_THEN_INC32(&__kmp_debug_count)
-                                     : __kmp_debug_count++) %
-             __kmp_debug_buf_lines;
+    int dc = __kmp_debug_count++ % __kmp_debug_buf_lines;
     char *db = &__kmp_debug_buffer[dc * __kmp_debug_buf_chars];
     int chars = 0;
 
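With __kmp_debug_count now a std::atomic<int>, the postfix ++ is itself an
atomic fetch_add (sequentially consistent by default), which is why the
__kmp_debug_buf_atomic branch could be dropped. A tiny sketch:

```cpp
#include <atomic>

std::atomic<int> debug_count{0}; // stand-in for __kmp_debug_count

int next_buffer_line(int buf_lines) {
  // Atomic read-modify-write; equivalent to fetch_add(1) with seq_cst order.
  return debug_count++ % buf_lines;
}
```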
