[OpenMP] Add skewed iteration distribution on hybrid systems (#69946)
This commit adds skewed distribution of iterations in the
nonmonotonic:dynamic schedule (static steal) for hybrid systems when
thread affinity is assigned. Currently, it distributes the iterations at
a 60:40 ratio in favor of performance cores. Consider a loop with
dynamic schedule type, for (int i = 0; i < 100; ++i). On a hybrid system
with 20 hardware threads (16 CORE and 4 ATOM), 88 iterations are
assigned to the performance cores and 12 iterations are assigned to the
efficient cores. Each thread on a CORE core will process 5 iterations
plus extras, and each thread on an ATOM core will process 3 iterations.
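
For concreteness, the loop above reaches this code path when it is scheduled with #pragma omp parallel for schedule(nonmonotonic: dynamic) and thread affinity is set. The following standalone sketch is illustrative only (it is not the runtime's dispatch code, and it takes the 88:12 split quoted above as a given); it reproduces the per-thread numbers:

#include <cstdio>

int main() {
  const unsigned tc = 100;                      // trip count of the example loop
  const unsigned p_threads = 16, e_threads = 4; // CORE vs. ATOM threads
  const unsigned p_iters = 88;                  // share the 60:40 skew gives CORE threads
  const unsigned e_iters = tc - p_iters;        // 12 iterations remain for ATOM threads

  unsigned big_chunk = p_iters / p_threads;     // 5 iterations per CORE thread
  unsigned small_chunk = e_iters / e_threads;   // 3 iterations per ATOM thread
  unsigned extras = p_iters % p_threads + e_iters % e_threads; // 8 leftover iterations

  // Prints: CORE: 5 + extras, ATOM: 3 (extras: 8)
  std::printf("CORE: %u + extras, ATOM: %u (extras: %u)\n", big_chunk,
              small_chunk, extras);
}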

Differential Revision: https://reviews.llvm.org/D152955
jpeyton52 committed Nov 8, 2023
1 parent 3dff285 commit 5cc603c
Showing 6 changed files with 276 additions and 56 deletions.
67 changes: 45 additions & 22 deletions openmp/runtime/src/kmp.h
@@ -27,6 +27,9 @@
 #ifndef KMP_STATIC_STEAL_ENABLED
 #define KMP_STATIC_STEAL_ENABLED 1
 #endif
+#define KMP_WEIGHTED_ITERATIONS_SUPPORTED \
+  (KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED && \
+   (KMP_ARCH_X86 || KMP_ARCH_X86_64))

 #define TASK_CURRENT_NOT_QUEUED 0
 #define TASK_CURRENT_QUEUED 1
@@ -881,14 +884,8 @@ typedef struct kmp_affinity_flags_t {
 KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4);

 typedef struct kmp_affinity_ids_t {
+  int os_id;
   int ids[KMP_HW_LAST];
-  int operator[](size_t idx) const { return ids[idx]; }
-  int &operator[](size_t idx) { return ids[idx]; }
-  kmp_affinity_ids_t &operator=(const kmp_affinity_ids_t &rhs) {
-    for (int i = 0; i < KMP_HW_LAST; ++i)
-      ids[i] = rhs[i];
-    return *this;
-  }
 } kmp_affinity_ids_t;

 typedef struct kmp_affinity_attrs_t {
@@ -938,6 +935,10 @@ extern kmp_affin_mask_t *__kmp_affin_fullMask;
 extern kmp_affin_mask_t *__kmp_affin_origMask;
 extern char *__kmp_cpuinfo_file;

+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_first_osid_with_ecore;
+#endif
+
 #endif /* KMP_AFFINITY_SUPPORTED */

 // This needs to be kept in sync with the values in omp.h !!!
@@ -1849,12 +1850,9 @@ typedef struct kmp_sched_flags {
   unsigned ordered : 1;
   unsigned nomerge : 1;
   unsigned contains_last : 1;
-#if KMP_USE_HIER_SCHED
-  unsigned use_hier : 1;
-  unsigned unused : 28;
-#else
-  unsigned unused : 29;
-#endif
+  unsigned use_hier : 1; // Used in KMP_USE_HIER_SCHED code
+  unsigned use_hybrid : 1; // Used in KMP_WEIGHTED_ITERATIONS_SUPPORTED code
+  unsigned unused : 27;
 } kmp_sched_flags_t;

 KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);
@@ -1868,26 +1866,37 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
   kmp_int32 st;
   kmp_int32 tc;
   kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+  kmp_uint32 ordered_lower;
+  kmp_uint32 ordered_upper;
+
   // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
   // a) parm3 is properly aligned and
   // b) all parm1-4 are on the same cache line.
   // Because parm1-4 are used together, performance seems to be better
   // if they are on the same cache line (not measured though).

-  struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
-    kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should
-    kmp_int32 parm2; // make no real change at least while padding is off.
+  struct KMP_ALIGN(32) {
+    kmp_int32 parm1;
+    kmp_int32 parm2;
     kmp_int32 parm3;
     kmp_int32 parm4;
   };

-  kmp_uint32 ordered_lower;
-  kmp_uint32 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+  kmp_uint32 pchunks;
+  kmp_uint32 num_procs_with_pcore;
+  kmp_int32 first_thread_with_ecore;
+#endif
 #if KMP_OS_WINDOWS
   kmp_int32 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info32_t;

+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128);
+#endif
+
 typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
   kmp_int64 count; // current chunk number for static & static-steal scheduling
   kmp_int64 ub; /* upper-bound */
@@ -1896,27 +1905,38 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
   kmp_int64 st; /* stride */
   kmp_int64 tc; /* trip count (number of iterations) */
   kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+  kmp_uint64 ordered_lower;
+  kmp_uint64 ordered_upper;
   /* parm[1-4] are used in different ways by different scheduling algorithms */

-  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+  // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
   // a) parm3 is properly aligned and
   // b) all parm1-4 are in the same cache line.
   // Because parm1-4 are used together, performance seems to be better
   // if they are in the same line (not measured though).

   struct KMP_ALIGN(32) {
     kmp_int64 parm1;
     kmp_int64 parm2;
     kmp_int64 parm3;
     kmp_int64 parm4;
   };

-  kmp_uint64 ordered_lower;
-  kmp_uint64 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+  kmp_uint64 pchunks;
+  kmp_uint64 num_procs_with_pcore;
+  kmp_int64 first_thread_with_ecore;
+#endif
+
 #if KMP_OS_WINDOWS
   kmp_int64 last_upper;
 #endif /* KMP_OS_WINDOWS */
 } dispatch_private_info64_t;

+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128);
+#endif
+
 #else /* KMP_STATIC_STEAL_ENABLED */
 typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
   kmp_int32 lb;
@@ -3862,6 +3882,9 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
 extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
 extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
 extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_get_first_osid_with_ecore(void);
+#endif
 #if KMP_OS_LINUX || KMP_OS_FREEBSD
 extern int kmp_set_thread_affinity_mask_initial(void);
 #endif
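
The new pchunks, num_procs_with_pcore, and first_thread_with_ecore fields carry the precomputed weighted split from initialization into the dispatch logic. As a rough illustration of how such fields can be consumed, here is a hypothetical helper (not the runtime's code, which lives in kmp_dispatch.cpp and additionally handles chunk sizes, stealing, and interleaved thread bindings); it assumes thread ids below num_procs_with_pcore are bound to CORE cores:

#include <cstdint>

// Hypothetical sketch: number of chunks thread `tid` initially owns, with
// leftovers handed to the lowest thread ids first.
uint32_t initial_chunks(uint32_t tid, uint32_t nchunks, uint32_t pchunks,
                        uint32_t num_procs_with_pcore, uint32_t nproc) {
  uint32_t echunks = nchunks - pchunks;           // chunks left for ATOM threads
  uint32_t eprocs = nproc - num_procs_with_pcore; // threads on ATOM cores
  uint32_t big = pchunks / num_procs_with_pcore;  // base share per CORE thread
  uint32_t small = echunks / eprocs;              // base share per ATOM thread
  uint32_t extras = pchunks % num_procs_with_pcore + echunks % eprocs;
  uint32_t base = (tid < num_procs_with_pcore) ? big : small;
  return base + (tid < extras ? 1 : 0);
}

With nchunks = 100, pchunks = 88, 16 CORE threads, and 20 threads total, this yields 6 chunks for threads 0-7, 5 for threads 8-15, and 3 for threads 16-19, consistent with the commit message.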
38 changes: 32 additions & 6 deletions openmp/runtime/src/kmp_affinity.cpp
@@ -4196,7 +4196,7 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,

   // Initialize ids and attrs thread data
   for (int i = 0; i < KMP_HW_LAST; ++i)
-    ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+    ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
   attrs = KMP_AFFINITY_ATTRS_UNKNOWN;

   // Iterate through each os id within the mask and determine
@@ -4205,19 +4205,20 @@
   int depth = __kmp_topology->get_depth();
   KMP_CPU_SET_ITERATE(cpu, mask) {
     int osid_idx = __kmp_osid_to_hwthread_map[cpu];
+    ids.os_id = cpu;
     const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
     for (int level = 0; level < depth; ++level) {
       kmp_hw_t type = __kmp_topology->get_type(level);
       int id = hw_thread.sub_ids[level];
-      if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) {
-        ids[type] = id;
+      if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
+        ids.ids[type] = id;
       } else {
         // This mask spans across multiple topology units, set it as such
         // and mark every level below as such as well.
-        ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+        ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
         for (; level < depth; ++level) {
           kmp_hw_t type = __kmp_topology->get_type(level);
-          ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+          ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
         }
       }
     }
@@ -4297,6 +4298,9 @@ static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
   if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
     machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
     __kmp_affinity_get_topology_info(affinity);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+    __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
+#endif
   }
 }

@@ -4876,7 +4880,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {

   // Set the thread topology information to default of unknown
   for (int id = 0; id < KMP_HW_LAST; ++id)
-    th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
+    th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
   th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;

   if (!KMP_AFFINITY_CAPABLE()) {
@@ -5273,6 +5277,28 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
   return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
 }

+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+// Returns first os proc id with ATOM core
+int __kmp_get_first_osid_with_ecore(void) {
+  int low = 0;
+  int high = __kmp_topology->get_num_hw_threads() - 1;
+  int mid = 0;
+  while (high - low > 1) {
+    mid = (high + low) / 2;
+    if (__kmp_topology->at(mid).attrs.get_core_type() ==
+        KMP_HW_CORE_TYPE_CORE) {
+      low = mid + 1;
+    } else {
+      high = mid;
+    }
+  }
+  if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
+    return mid;
+  }
+  return -1;
+}
+#endif
+
 // Dynamic affinity settings - Affinity balanced
 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
   KMP_DEBUG_ASSERT(th);
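
Note that this search relies on __kmp_topology keeping its hardware threads sorted so that all CORE-type entries precede ATOM-type entries; it is a partition-point binary search over that boundary, returning -1 on machines with no ATOM cores. A standalone demonstration of the same idea (a hypothetical mirror over a plain vector, assuming several hardware threads of each type):

#include <cassert>
#include <vector>

enum core_type { CORE, ATOM };

// Mirrors the search above: index of the first ATOM entry, or -1 if none.
int first_ecore_index(const std::vector<core_type> &topo) {
  int low = 0, high = (int)topo.size() - 1, mid = 0;
  while (high - low > 1) {
    mid = (high + low) / 2;
    if (topo[mid] == CORE)
      low = mid + 1;
    else
      high = mid;
  }
  return topo[mid] == ATOM ? mid : -1;
}

int main() {
  std::vector<core_type> topo(16, CORE);
  topo.insert(topo.end(), 4, ATOM); // 16 P-core entries, then 4 E-core entries
  assert(first_ecore_index(topo) == 16);
}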
