Skip to content

Commit

Permalink
[OpenMP] libomp: Add new experimental barrier: two-level distributed …
Browse files Browse the repository at this point in the history
…barrier

The two-level distributed barrier is a new experimental barrier designed
for Intel hardware that performs better than the default hyper barrier
in some cases.

This barrier is designed to handle fine granularity parallelism where
barriers are used frequently with little compute and memory access
between barriers. There is no need to use it for codes with few
barriers and large granularity compute, or memory intensive
applications, as little difference will be seen between this barrier
and the default hyper barrier. This barrier is designed to work
optimally with a fixed number of threads, and has a significant setup
time, so it should NOT be used in situations where the number of threads
in a team is varied frequently.

The two-level distributed barrier is off by default -- hyper barrier
is used by default. To use this barrier, you must set all barrier
patterns to use this type, because it will not work with other barrier
patterns. Thus, to turn it on, the following settings are required:

KMP_FORKJOIN_BARRIER_PATTERN=dist,dist
KMP_PLAIN_BARRIER_PATTERN=dist,dist
KMP_REDUCTION_BARRIER_PATTERN=dist,dist

Branching factors (set with KMP_FORKJOIN_BARRIER, KMP_PLAIN_BARRIER,
and KMP_REDUCTION_BARRIER) are ignored by the two-level distributed
barrier.

This patch has been fixed for builds with ITTNotify disabled and for non-x86 builds.

Co-authored-by: Jonathan Peyton <jonathan.l.peyton@intel.com>
Co-authored-by: Vladislav Vinogradov <vlad.vinogradov@intel.com>

Differential Revision: https://reviews.llvm.org/D103121
  • Loading branch information
TerryLWilmarth authored and jpeyton52 committed Jul 29, 2021
1 parent 84a4cae commit d8e4cb9
Show file tree
Hide file tree
Showing 20 changed files with 1,588 additions and 456 deletions.
17 changes: 17 additions & 0 deletions openmp/runtime/cmake/config-ix.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,23 @@ if (NOT LIBOMP_HAVE_SHM_OPEN_NO_LRT)
set(CMAKE_REQUIRED_LIBRARIES)
endif()

# Check which aligned memory allocation functions are available.
# Step 1: probe for _mm_malloc/_mm_free. Detect xmmintrin.h first; when it is
# found, -DLIBOMP_HAVE_XMMINTRIN_H is added to the flags of the test compile
# so that the snippet below includes that header.
check_include_file(xmmintrin.h LIBOMP_HAVE_XMMINTRIN_H)
# Save the current flags so they can be restored once the compile test is done.
set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
if (LIBOMP_HAVE_XMMINTRIN_H)
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -DLIBOMP_HAVE_XMMINTRIN_H")
endif()
# Test program: allocate a 64-byte-aligned buffer with _mm_malloc and release
# it with _mm_free. LIBOMP_HAVE__MM_MALLOC records whether it compiles.
set(source_code "// check for _mm_malloc
#ifdef LIBOMP_HAVE_XMMINTRIN_H
#include <xmmintrin.h>
#endif
int main() { void *ptr = _mm_malloc(sizeof(int) * 1000, 64); _mm_free(ptr); return 0; }")
check_cxx_source_compiles("${source_code}" LIBOMP_HAVE__MM_MALLOC)
# Restore the flags saved before the _mm_malloc test.
set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
# Step 2: probe for the other aligned allocators by looking each symbol up in
# the header named alongside it.
check_symbol_exists(aligned_alloc "stdlib.h" LIBOMP_HAVE_ALIGNED_ALLOC)
check_symbol_exists(posix_memalign "stdlib.h" LIBOMP_HAVE_POSIX_MEMALIGN)
check_symbol_exists(_aligned_malloc "malloc.h" LIBOMP_HAVE__ALIGNED_MALLOC)

# Check linker flags
if(WIN32)
libomp_check_linker_flag(/SAFESEH LIBOMP_HAVE_SAFESEH_FLAG)
Expand Down
1 change: 1 addition & 0 deletions openmp/runtime/src/i18n/en_US.txt
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ Using_int_Value "%1$s value \"%2$d\" will be used."
Using_uint_Value "%1$s value \"%2$u\" will be used."
Using_uint64_Value "%1$s value \"%2$s\" will be used."
Using_str_Value "%1$s value \"%2$s\" will be used."
BarrierPatternOverride "Mixing other barrier patterns with dist is prohibited. Using dist for all barrier patterns."
MaxValueUsing "%1$s maximum value \"%2$d\" will be used."
MinValueUsing "%1$s minimum value \"%2$d\" will be used."
MemoryAllocFailed "Memory allocation failed."
Expand Down
33 changes: 33 additions & 0 deletions openmp/runtime/src/kmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ typedef unsigned int kmp_hwloc_depth_t;
#include "kmp_debug.h"
#include "kmp_lock.h"
#include "kmp_version.h"
#include "kmp_barrier.h"
#if USE_DEBUGGER
#include "kmp_debugger.h"
#endif
Expand Down Expand Up @@ -263,6 +264,7 @@ typedef union kmp_root kmp_root_p;

// Forward declarations of the flag classes. For the three class templates the
// parameters default to C = false and S = true; kmp_flag_oncore is a plain
// (non-template) class.
template <bool C = false, bool S = true> class kmp_flag_32;
template <bool C = false, bool S = true> class kmp_flag_64;
template <bool C = false, bool S = true> class kmp_atomic_flag_64;
class kmp_flag_oncore;

#ifdef __cplusplus
Expand Down Expand Up @@ -1879,6 +1881,15 @@ typedef struct kmp_disp {
0 // Thread th_reap_state: not safe to reap (tasking)
#define KMP_SAFE_TO_REAP 1 // Thread th_reap_state: safe to reap (not tasking)

// flag_type identifies the kind of storage backing a flag. The numeric value
// of every enumerator is spelled out explicitly; the values match the
// implicit numbering (0, 1, 2, ...) of an enum without initializers.
enum flag_type {
  /** atomic 32 bit flags */
  flag32 = 0,
  /** 64 bit flags */
  flag64 = 1,
  /** atomic 64 bit flags */
  atomic_flag64 = 2,
  /** special 64-bit flag for on-core barrier (hierarchical) */
  flag_oncore = 3,
  /** NOTE(review): presumably marks "no flag type recorded" -- confirm */
  flag_unset = 4
};

enum barrier_type {
bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction
barriers if enabled) */
Expand All @@ -1902,6 +1913,7 @@ typedef enum kmp_bar_pat { /* Barrier communication patterns */
bp_hyper_bar = 2, /* Hypercube-embedded tree with min
branching factor 2^n */
bp_hierarchical_bar = 3, /* Machine hierarchy tree */
bp_dist_bar = 4, /* Distributed barrier */
bp_last_bar /* Placeholder to mark the end */
} kmp_bar_pat_e;

Expand Down Expand Up @@ -2626,6 +2638,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
/* while awaiting queuing lock acquire */

volatile void *th_sleep_loc; // this points at a kmp_flag<T>
flag_type th_sleep_loc_type; // enum type of flag stored in th_sleep_loc

ident_t *th_ident;
unsigned th_x; // Random number generator data
Expand All @@ -2646,6 +2659,9 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
written by the worker thread) */
kmp_uint8 th_active_in_pool; // included in count of #active threads in pool
int th_active; // ! sleeping; 32 bits for TCR/TCW
std::atomic<kmp_uint32> th_used_in_team; // Flag indicating use in team
// 0 = not used in team; 1 = used in team;
// 2 = transitioning to not used in team; 3 = transitioning to used in team
struct cons_header *th_cons; // used for consistency check
#if KMP_USE_HIER_SCHED
// used for hierarchical scheduling
Expand Down Expand Up @@ -2825,6 +2841,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
#if USE_ITT_BUILD
void *t_stack_id; // team specific stack stitching id (for ittnotify)
#endif /* USE_ITT_BUILD */
distributedBarrier *b; // Distributed barrier data associated with team
} kmp_base_team_t;

union KMP_ALIGN_CACHE kmp_team {
Expand Down Expand Up @@ -4126,18 +4143,26 @@ template <bool C, bool S>
extern void __kmp_suspend_32(int th_gtid, kmp_flag_32<C, S> *flag);
// Suspend declarations, one per flag class. Each takes th_gtid and a pointer
// to the flag object.
template <bool C, bool S>
extern void __kmp_suspend_64(int th_gtid, kmp_flag_64<C, S> *flag);
template <bool C, bool S>
extern void __kmp_atomic_suspend_64(int th_gtid,
kmp_atomic_flag_64<C, S> *flag);
extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag);
// The mwait declarations exist only when KMP_HAVE_MWAIT or KMP_HAVE_UMWAIT is
// enabled; they take the same parameters as the suspend declarations.
#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
template <bool C, bool S>
extern void __kmp_mwait_32(int th_gtid, kmp_flag_32<C, S> *flag);
template <bool C, bool S>
extern void __kmp_mwait_64(int th_gtid, kmp_flag_64<C, S> *flag);
template <bool C, bool S>
extern void __kmp_atomic_mwait_64(int th_gtid, kmp_atomic_flag_64<C, S> *flag);
extern void __kmp_mwait_oncore(int th_gtid, kmp_flag_oncore *flag);
#endif
// Resume declarations, one per flag class. Note that the first parameter is
// named target_gtid here rather than th_gtid.
template <bool C, bool S>
extern void __kmp_resume_32(int target_gtid, kmp_flag_32<C, S> *flag);
template <bool C, bool S>
extern void __kmp_resume_64(int target_gtid, kmp_flag_64<C, S> *flag);
template <bool C, bool S>
extern void __kmp_atomic_resume_64(int target_gtid,
kmp_atomic_flag_64<C, S> *flag);
extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag);

template <bool C, bool S>
Expand All @@ -4156,6 +4181,14 @@ int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid,
void *itt_sync_obj,
#endif /* USE_ITT_BUILD */
kmp_int32 is_constrained);
// Declaration of the task-execution routine that takes a kmp_atomic_flag_64.
// Returns int; the meaning of the return value is not visible from this
// declaration.
template <bool C, bool S>
int __kmp_atomic_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid,
kmp_atomic_flag_64<C, S> *flag,
int final_spin, int *thread_finished,
#if USE_ITT_BUILD
// itt_sync_obj is part of the signature only when USE_ITT_BUILD is enabled.
void *itt_sync_obj,
#endif /* USE_ITT_BUILD */
kmp_int32 is_constrained);
int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid,
kmp_flag_oncore *flag, int final_spin,
int *thread_finished,
Expand Down
6 changes: 3 additions & 3 deletions openmp/runtime/src/kmp_atomic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -732,7 +732,7 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs,

#define OP_UPDATE_CRITICAL(TYPE, OP, LCK_ID) \
__kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid); \
(*lhs) = (TYPE)((*lhs)OP((TYPE)rhs)); \
(*lhs) = (TYPE)((*lhs)OP rhs); \
__kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);

// ------------------------------------------------------------------------
Expand Down Expand Up @@ -791,14 +791,14 @@ static inline kmp_cmplx128_a16_t operator/(kmp_cmplx128_a16_t &lhs,
{ \
TYPE old_value, new_value; \
old_value = *(TYPE volatile *)lhs; \
new_value = (TYPE)(old_value OP((TYPE)rhs)); \
new_value = (TYPE)(old_value OP rhs); \
while (!KMP_COMPARE_AND_STORE_ACQ##BITS( \
(kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value, \
*VOLATILE_CAST(kmp_int##BITS *) & new_value)) { \
KMP_DO_PAUSE; \
\
old_value = *(TYPE volatile *)lhs; \
new_value = (TYPE)(old_value OP((TYPE)rhs)); \
new_value = (TYPE)(old_value OP rhs); \
} \
}

Expand Down
Loading

0 comments on commit d8e4cb9

Please sign in to comment.