29 changes: 20 additions & 9 deletions openmp/runtime/src/kmp_dispatch.cpp
@@ -670,6 +670,7 @@ __kmp_dispatch_init(
} else {
pr->ordered = FALSE;
}

if ( schedule == kmp_sch_static ) {
schedule = __kmp_static;
} else {
@@ -761,6 +762,19 @@ __kmp_dispatch_init(
tc = 0; // zero-trip
}

// Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
// when statistics are disabled.
if (schedule == __kmp_static)
{
KMP_COUNT_BLOCK(OMP_FOR_static);
KMP_COUNT_VALUE(FOR_static_iterations, tc);
}
else
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
}

pr->u.p.lb = lb;
pr->u.p.ub = ub;
pr->u.p.st = st;
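The counting block added above compiles away entirely when statistics are off, because every stats macro then expands to ((void)0) -- the same null definitions this patch adds at the bottom of kmp_stats.h. A minimal standalone sketch of that behaviour (the function and main are illustrative only, not part of the runtime):

#include <cstdio>

// Stand-ins for the disabled-statistics case; the real definitions live in kmp_stats.h.
#define KMP_COUNT_BLOCK(name)        ((void)0)
#define KMP_COUNT_VALUE(name, value) ((void)0)

static void count_loop_kind(int schedule, int kmp_static, long tc) {
    // Mirrors the block above: with the null macros both branches are empty,
    // so the compiler can drop the comparison as dead code.
    if (schedule == kmp_static) {
        KMP_COUNT_BLOCK(OMP_FOR_static);
        KMP_COUNT_VALUE(FOR_static_iterations, tc);
    } else {
        KMP_COUNT_BLOCK(OMP_FOR_dynamic);
        KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
    }
}

int main() {
    count_loop_kind(1, 1, 20);
    std::puts("counting branches compiled away");
    return 0;
}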
@@ -1384,6 +1398,11 @@ __kmp_dispatch_next(
static const int ___kmp_size_type = sizeof( UT );
#endif

// This is potentially slightly misleading: schedule(runtime) will appear here even if the actual runtime schedule
// is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
// more than a compile-time choice to use static scheduling would.)
KMP_TIME_BLOCK(FOR_dynamic_scheduling);

int status;
dispatch_private_info_template< T > * pr;
kmp_info_t * th = __kmp_threads[ gtid ];
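To make the schedule(runtime) comment concrete: the first loop below is scheduled statically at compile time and never reaches __kmp_dispatch_next, while the second is always routed through the dynamic dispatch path (and so through this timer) even when OMP_SCHEDULE resolves to static at run time. Illustrative user code only; the function names are placeholders.

void axpy_compile_time_static(int n, float a, const float *x, float *y) {
    #pragma omp parallel for schedule(static)   // resolved at compile time, no per-chunk runtime calls
    for (int i = 0; i < n; ++i)
        y[i] += a * x[i];
}

void axpy_runtime_schedule(int n, float a, const float *x, float *y) {
    #pragma omp parallel for schedule(runtime)  // dispatched via __kmpc_dispatch_init/_next,
    for (int i = 0; i < n; ++i)                 // even if OMP_SCHEDULE resolves to "static"
        y[i] += a * x[i];
}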
@@ -2164,7 +2183,6 @@ __kmp_dist_get_bounds(
T *pupper,
typename traits_t< T >::signed_t incr
) {
KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
typedef typename traits_t< T >::unsigned_t UT;
typedef typename traits_t< T >::signed_t ST;
register kmp_uint32 team_id;
@@ -2222,6 +2240,7 @@ __kmp_dist_get_bounds(
} else {
trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
}

if( trip_count <= nteams ) {
KMP_DEBUG_ASSERT(
__kmp_static == kmp_sch_static_greedy || \
@@ -2297,7 +2316,6 @@ void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
@@ -2308,7 +2326,6 @@ void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
@@ -2321,7 +2338,6 @@ __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int64 lb, kmp_int64 ub,
kmp_int64 st, kmp_int64 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
@@ -2334,7 +2350,6 @@ __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_uint64 lb, kmp_uint64 ub,
kmp_int64 st, kmp_int64 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
@@ -2352,7 +2367,6 @@ void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
__kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
@@ -2362,7 +2376,6 @@ void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
__kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
@@ -2372,7 +2385,6 @@ void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
__kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
@@ -2382,7 +2394,6 @@ void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
KMP_COUNT_BLOCK(OMP_FOR_dynamic);
KMP_DEBUG_ASSERT( __kmp_init_serial );
__kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
__kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
39 changes: 23 additions & 16 deletions openmp/runtime/src/kmp_runtime.c
@@ -1495,7 +1495,8 @@ __kmp_fork_call(
kmp_hot_team_ptr_t **p_hot_teams;
#endif
{ // KMP_TIME_BLOCK
KMP_TIME_BLOCK(KMP_fork_call);
KMP_TIME_DEVELOPER_BLOCK(KMP_fork_call);
KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
@@ -1620,12 +1621,14 @@ __kmp_fork_call(
}
#endif

KMP_TIME_BLOCK(OMP_work);
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
{
KMP_TIME_BLOCK(OMP_work);
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
, exit_runtime_p
, exit_runtime_p
#endif
);
);
}

#if OMPT_SUPPORT
if (ompt_status & ompt_status_track) {
@@ -2224,8 +2227,8 @@ __kmp_fork_call(
} // END of timer KMP_fork_call block

{
//KMP_TIME_BLOCK(OMP_work);
KMP_TIME_BLOCK(USER_master_invoke);
KMP_TIME_BLOCK(OMP_work);
// KMP_TIME_DEVELOPER_BLOCK(USER_master_invoke);
if (! team->t.t_invoke( gtid )) {
KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
}
@@ -2280,7 +2283,7 @@ __kmp_join_call(ident_t *loc, int gtid, enum fork_context_e fork_context
#endif /* OMP_40_ENABLED */
)
{
KMP_TIME_BLOCK(KMP_join_call);
KMP_TIME_DEVELOPER_BLOCK(KMP_join_call);
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
@@ -2582,6 +2585,7 @@ __kmp_set_num_threads( int new_nth, int gtid )
else if (new_nth > __kmp_max_nth)
new_nth = __kmp_max_nth;

KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
thread = __kmp_threads[gtid];

__kmp_save_internal_controls( thread );
@@ -4790,7 +4794,7 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
kmp_internal_control_t *new_icvs,
int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
{
KMP_TIME_BLOCK(KMP_allocate_team);
KMP_TIME_DEVELOPER_BLOCK(KMP_allocate_team);
int f;
kmp_team_t *team;
int use_hot_team = ! root->r.r_active;
@@ -5577,12 +5581,12 @@ __kmp_launch_thread( kmp_info_t *this_thr )
}
#endif

KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop);
KMP_STOP_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
{
KMP_TIME_BLOCK(USER_worker_invoke);
KMP_TIME_DEVELOPER_BLOCK(USER_worker_invoke);
rc = (*pteam)->t.t_invoke( gtid );
}
KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop);
KMP_START_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
KMP_ASSERT( rc );

#if OMPT_SUPPORT
@@ -6910,12 +6914,15 @@ __kmp_invoke_task_func( int gtid )
#endif
#endif

rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
{
KMP_TIME_BLOCK(OMP_work);
rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
#if OMPT_SUPPORT
, exit_runtime_p
, exit_runtime_p
#endif
);
);
}

#if OMPT_SUPPORT && OMPT_TRACE
if (ompt_status & ompt_status_track) {
7 changes: 6 additions & 1 deletion openmp/runtime/src/kmp_sched.cpp
@@ -84,6 +84,8 @@ __kmp_for_static_init(
typename traits_t< T >::signed_t chunk
) {
KMP_COUNT_BLOCK(OMP_FOR_static);
KMP_TIME_BLOCK (FOR_static_scheduling);

typedef typename traits_t< T >::unsigned_t UT;
typedef typename traits_t< T >::signed_t ST;
/* this all has to be changed back to TID and such.. */
@@ -151,6 +153,7 @@ __kmp_for_static_init(
team_info->microtask);
}
#endif
KMP_COUNT_VALUE (FOR_static_iterations, 0);
return;
}

@@ -246,6 +249,7 @@ __kmp_for_static_init(
__kmp_error_construct( kmp_i18n_msg_CnsIterationRangeTooLarge, ct_pdo, loc );
}
}
KMP_COUNT_VALUE (FOR_static_iterations, trip_count);

/* compute remaining parameters */
switch ( schedtype ) {
@@ -372,7 +376,7 @@ __kmp_dist_for_static_init(
typename traits_t< T >::signed_t incr,
typename traits_t< T >::signed_t chunk
) {
KMP_COUNT_BLOCK(OMP_DISTR_FOR_static);
KMP_COUNT_BLOCK(OMP_DISTRIBUTE);
typedef typename traits_t< T >::unsigned_t UT;
typedef typename traits_t< T >::signed_t ST;
register kmp_uint32 tid;
@@ -437,6 +441,7 @@ __kmp_dist_for_static_init(
} else {
trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
}

*pstride = *pupper - *plower; // just in case (can be unused)
if( trip_count <= nteams ) {
KMP_DEBUG_ASSERT(
8 changes: 3 additions & 5 deletions openmp/runtime/src/kmp_stats.cpp
@@ -521,16 +521,14 @@ void kmp_stats_output_module::outputStats(const char* heading)

// Special handling for synthesized statistics.
// These just have to be coded specially here for now.
// At present we only have one: the total parallel work done in each thread.
// At present we only have a few:
// The total parallel work done in each thread.
// The variance here makes it easy to see load imbalance over the whole program (though, of course,
// it's possible to have a code with awful load balance in every parallel region but perfect load
// balance over the whole program.)
// The time spent in barriers in each thread.
allStats[TIMER_Total_work].addSample ((*it)->getTimer(TIMER_OMP_work)->getTotal());

// Time waiting for work (synthesized)
if ((t != 0) || !timeStat::workerOnly(timer_e(TIMER_OMP_await_work)))
allStats[TIMER_Total_await_work].addSample ((*it)->getTimer(TIMER_OMP_await_work)->getTotal());

// Time in explicit barriers.
allStats[TIMER_Total_barrier].addSample ((*it)->getTimer(TIMER_OMP_barrier)->getTotal());
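A worked instance of the caveat in the synthesized-statistics comment above: if three threads record OMP_work totals of 9s, 9s and 2s, the spread of Total_work (min 2s vs max 9s) exposes the imbalance; but if every parallel region is imbalanced while a different thread lags each time, the per-thread totals can still come out equal and Total_work alone will not show it.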

147 changes: 94 additions & 53 deletions openmp/runtime/src/kmp_stats.h
@@ -31,6 +31,11 @@
#include <new> // placement new
#include "kmp_stats_timing.h"

/*
* Enable developer statistics here if you want them. They are more detailed than is useful for application characterisation and
* are intended for the runtime library developer.
*/
// #define KMP_DEVELOPER_STATS 1

/*!
* @ingroup STATS_GATHERING
@@ -56,29 +61,38 @@ class stats_flags_e {
* Each thread accumulates its own count, at the end of execution the counts are aggregated treating each thread
* as a separate measurement. (Unless onlyInMaster is set, in which case there's only a single measurement).
* The min,mean,max are therefore the values for the threads.
* Adding the counter here and then putting in a KMP_BLOCK_COUNTER(name) is all you need to do.
* Adding the counter here and then putting a KMP_COUNT_BLOCK(name) at the point you want to count is all you need to do.
* All of the tables and printing is generated from this macro.
* Format is "macro(name, flags, arg)"
*
* @ingroup STATS_GATHERING
*/
#define KMP_FOREACH_COUNTER(macro, arg) \
macro (OMP_PARALLEL, stats_flags_e::onlyInMaster, arg) \
macro (OMP_NESTED_PARALLEL, 0, arg) \
macro (OMP_FOR_static, 0, arg) \
macro (OMP_FOR_dynamic, 0, arg) \
macro (OMP_DISTR_FOR_static, 0, arg) \
macro (OMP_DISTR_FOR_dynamic, 0, arg) \
macro (OMP_DISTRIBUTE, 0, arg) \
macro (OMP_BARRIER, 0, arg) \
macro (OMP_CRITICAL,0, arg) \
macro (OMP_SINGLE, 0, arg) \
macro (OMP_MASTER, 0, arg) \
macro (OMP_TEAMS, 0, arg) \
macro (OMP_set_lock, 0, arg) \
macro (OMP_test_lock, 0, arg) \
macro (OMP_test_lock_failure, 0, arg) \
macro (REDUCE_wait, 0, arg) \
macro (REDUCE_nowait, 0, arg) \
macro (OMP_TASKYIELD, 0, arg) \
macro (TASK_executed, 0, arg) \
macro (TASK_cancelled, 0, arg) \
macro (TASK_stolen, 0, arg) \
macro (LAST,0,arg)

// OMP_PARALLEL_args -- the number of arguments passed to a fork
// FOR_static_iterations -- Number of available parallel chunks of work in a static for
// FOR_dynamic_iterations -- Number of available parallel chunks of work in a dynamic for
// Both adjust for any chunking, so if there were an iteration count of 20 but a chunk size of 10, we'd record 2.
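As the doc comment above says, adding a counter is just one list entry plus one KMP_COUNT_BLOCK(name) call, because the enum and tables are generated from this X-macro list. A small self-contained imitation of the technique (toy names, not the real kmp_stats.h definitions), using the same ENUMERATE trick that appears further down in this header; how the real header builds its name table is assumed here:

#include <cstdio>

// A three-entry imitation of the counter list above.
#define MY_FOREACH_COUNTER(macro, arg)  \
    macro (OMP_PARALLEL, 0, arg)        \
    macro (OMP_FOR_static, 0, arg)      \
    macro (LAST, 0, arg)

// One pass over the list builds the enum...
#define ENUMERATE(name, ignore, prefix) prefix##name,
enum my_counter_e { MY_FOREACH_COUNTER(ENUMERATE, COUNTER_) };
// ...and a second pass can build a matching name table.
#define NAME_OF(name, ignore, prefix) #name,
static const char *my_counter_names[] = { MY_FOREACH_COUNTER(NAME_OF, ) };

int main() {
    // COUNTER_LAST counts the real entries before it; names line up by construction.
    std::printf("%d counters, first is %s\n", (int)COUNTER_LAST, my_counter_names[0]);
    return 0;
}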

/*!
* \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
*
@@ -87,72 +101,45 @@ class stats_flags_e {
*
* \details A timer collects multiple samples of some count in each thread and then finally aggregates over all the threads.
* The count is normally a time (in ticks), hence the name "timer". (But can be any value, so we use this for "number of arguments passed to fork"
* as well, or we could collect "loop iteration count" if we wanted to).
* as well).
* For timers the threads are not significant, it's the individual observations that count, so the statistics are at that level.
* Format is "macro(name, flags, arg)"
*
* @ingroup STATS_GATHERING
* @ingroup STATS_GATHERING2
*/
#define KMP_FOREACH_TIMER(macro, arg) \
macro (OMP_PARALLEL_args, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg) \
macro (FOR_static_iterations, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg) \
macro (FOR_dynamic_iterations, stats_flags_e::noUnits, arg) \
#define KMP_FOREACH_TIMER(macro, arg) \
macro (OMP_start_end, stats_flags_e::onlyInMaster, arg) \
macro (OMP_serial, stats_flags_e::onlyInMaster, arg) \
macro (OMP_work, 0, arg) \
macro (Total_work, stats_flags_e::synthesized, arg) \
macro (OMP_await_work, stats_flags_e::notInMaster, arg) \
macro (Total_await_work, stats_flags_e::synthesized, arg) \
macro (OMP_barrier, 0, arg) \
macro (Total_barrier, stats_flags_e::synthesized, arg) \
macro (OMP_test_lock, 0, arg) \
macro (FOR_static_iterations, stats_flags_e::noUnits, arg) \
macro (FOR_static_scheduling, 0, arg) \
macro (FOR_dynamic_iterations, stats_flags_e::noUnits, arg) \
macro (FOR_dynamic_scheduling, 0, arg) \
macro (KMP_fork_call, 0, arg) \
macro (KMP_join_call, 0, arg) \
macro (KMP_fork_barrier, stats_flags_e::logEvent, arg) \
macro (KMP_join_barrier, stats_flags_e::logEvent, arg) \
macro (KMP_barrier, 0, arg) \
macro (KMP_end_split_barrier, 0, arg) \
macro (KMP_wait_sleep, 0, arg) \
macro (KMP_release, 0, arg) \
macro (KMP_hier_gather, 0, arg) \
macro (KMP_hier_release, 0, arg) \
macro (KMP_hyper_gather, stats_flags_e::logEvent, arg) \
macro (KMP_hyper_release, stats_flags_e::logEvent, arg) \
macro (KMP_linear_gather, 0, arg) \
macro (KMP_linear_release, 0, arg) \
macro (KMP_tree_gather, 0, arg) \
macro (KMP_tree_release, 0, arg) \
macro (USER_master_invoke, stats_flags_e::logEvent, arg) \
macro (USER_worker_invoke, stats_flags_e::logEvent, arg) \
macro (USER_resume, stats_flags_e::logEvent, arg) \
macro (USER_suspend, stats_flags_e::logEvent, arg) \
macro (USER_launch_thread_loop, stats_flags_e::logEvent, arg) \
macro (KMP_allocate_team, 0, arg) \
macro (KMP_setup_icv_copy, 0, arg) \
macro (USER_icv_copy, 0, arg) \
macro (TASK_execution, 0, arg) \
macro (OMP_set_numthreads, stats_flags_e::noUnits, arg) \
macro (OMP_PARALLEL_args, stats_flags_e::noUnits, arg) \
macro (OMP_single, 0, arg) \
macro (OMP_master, 0, arg) \
KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
macro (LAST,0, arg)



// OMP_PARALLEL_args -- the number of arguments passed to a fork
// FOR_static_iterations -- Number of available parallel chunks of work in a static for
// FOR_dynamic_iterations -- Number of available parallel chunks of work in a dynamic for
// Both adjust for any chunking, so if there were an iteration count of 20 but a chunk size of 10, we'd record 2.
// OMP_serial -- thread zero time executing serial code
// OMP_start_end -- time from when OpenMP is initialized until the stats are printed at exit
// OMP_serial -- thread zero time executing serial code
// OMP_work -- elapsed time in code dispatched by a fork (measured in the thread)
// Total_work -- a synthesized statistic summarizing how much parallel work each thread executed.
// OMP_barrier -- time at "real" barriers
// Total_barrier -- a synthesized statistic summarizing how much time at real barriers in each thread
// OMP_set_lock -- time in lock setting
// OMP_test_lock -- time in testing a lock
// LOCK_WAIT -- time waiting for a lock
// FOR_static_scheduling -- time spent doing scheduling for a static "for"
// FOR_dynamic_scheduling -- time spent doing scheduling for a dynamic "for"
// KMP_wait_sleep -- time in __kmp_wait_sleep
// KMP_release -- time in __kmp_release

#if (KMP_DEVELOPER_STATS)
// Timers which are of interest to runtime library developers, not end users.
// These have to be explicitly enabled in addition to the other stats.

// KMP_fork_barrier -- time in __kmp_fork_barrier
// KMP_join_barrier -- time in __kmp_join_barrier
// KMP_barrier -- time in __kmp_barrier
@@ -165,6 +152,32 @@ class stats_flags_e {
// KMP_tree_release -- time in __kmp_tree_barrier_release
// KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
// KMP_hyper_release -- time in __kmp_hyper_barrier_release
# define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
macro (KMP_fork_call, 0, arg) \
macro (KMP_join_call, 0, arg) \
macro (KMP_fork_barrier, stats_flags_e::logEvent, arg) \
macro (KMP_join_barrier, stats_flags_e::logEvent, arg) \
macro (KMP_barrier, 0, arg) \
macro (KMP_end_split_barrier, 0, arg) \
macro (KMP_hier_gather, 0, arg) \
macro (KMP_hier_release, 0, arg) \
macro (KMP_hyper_gather, stats_flags_e::logEvent, arg) \
macro (KMP_hyper_release, stats_flags_e::logEvent, arg) \
macro (KMP_linear_gather, 0, arg) \
macro (KMP_linear_release, 0, arg) \
macro (KMP_tree_gather, 0, arg) \
macro (KMP_tree_release, 0, arg) \
macro (USER_master_invoke, stats_flags_e::logEvent, arg) \
macro (USER_worker_invoke, stats_flags_e::logEvent, arg) \
macro (USER_resume, stats_flags_e::logEvent, arg) \
macro (USER_suspend, stats_flags_e::logEvent, arg) \
macro (USER_launch_thread_loop, stats_flags_e::logEvent, arg) \
macro (KMP_allocate_team, 0, arg) \
macro (KMP_setup_icv_copy, 0, arg) \
macro (USER_icv_copy, 0, arg)
#else
# define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif

/*!
* \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
@@ -182,13 +195,21 @@ class stats_flags_e {
*
* @ingroup STATS_GATHERING
*/
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \
macro(OMP_serial, 0, arg) \
macro(OMP_start_end, 0, arg) \
macro(USER_icv_copy, 0, arg) \
macro(USER_launch_thread_loop, stats_flags_e::logEvent, arg) \
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \
macro(OMP_serial, 0, arg) \
macro(OMP_start_end, 0, arg) \
macro(OMP_single, 0, arg) \
macro(OMP_master, 0, arg) \
KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro,arg) \
macro(LAST, 0, arg)

#if (KMP_DEVELOPER_STATS)
# define KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro, arg) \
macro(USER_launch_thread_loop, stats_flags_e::logEvent, arg)
#else
# define KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro, arg)
#endif

#define ENUMERATE(name,ignore,prefix) prefix##name,
enum timer_e {
KMP_FOREACH_TIMER(ENUMERATE, TIMER_)
@@ -689,6 +710,21 @@ extern kmp_stats_output_module __kmp_stats_output;
*/
#define KMP_RESET_STATS() __kmp_reset_stats()

#if (KMP_DEVELOPER_STATS)
# define KMP_TIME_DEVELOPER_BLOCK(n) KMP_TIME_BLOCK(n)
# define KMP_COUNT_DEVELOPER_VALUE(n,v) KMP_COUNT_VALUE(n,v)
# define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
# define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) KMP_START_EXPLICIT_TIMER(n)
# define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) KMP_STOP_EXPLICIT_TIMER(n)
#else
// Null definitions
# define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
# define KMP_COUNT_DEVELOPER_VALUE(n,v) ((void)0)
# define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
# define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
# define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#endif

#else // KMP_STATS_ENABLED

// Null definitions
@@ -701,6 +737,11 @@ extern kmp_stats_output_module __kmp_stats_output;
#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)

#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n,v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#endif // KMP_STATS_ENABLED

#endif // KMP_STATS_H
7 changes: 7 additions & 0 deletions openmp/runtime/src/kmp_tasking.c
@@ -17,6 +17,7 @@
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
@@ -1136,6 +1137,7 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
kmp_team_t * this_team = this_thr->th.th_team;
kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
if ((taskgroup && taskgroup->cancel_request) || (this_team->t.t_cancel_request == cancel_parallel)) {
KMP_COUNT_BLOCK(TASK_cancelled);
// this task belongs to a task group and we need to cancel it
discard = 1 /* true */;
}
@@ -1146,6 +1148,8 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta
// Thunks generated by gcc take a different argument list.
//
if (!discard) {
KMP_COUNT_BLOCK(TASK_executed);
KMP_TIME_BLOCK (TASK_execution);
#endif // OMP_40_ENABLED
#ifdef KMP_GOMP_COMPAT
if (taskdata->td_flags.native) {
Expand Down Expand Up @@ -1356,6 +1360,8 @@ __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part )
kmp_info_t * thread;
int thread_finished = FALSE;

KMP_COUNT_BLOCK(OMP_TASKYIELD);

KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
gtid, loc_ref, end_part) );

Expand Down Expand Up @@ -1648,6 +1654,7 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team

__kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );

KMP_COUNT_BLOCK(TASK_stolen);
KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
"ntasks=%d head=%u tail=%u\n",
gtid, taskdata, __kmp_gtid_from_thread( victim ), task_team,
4 changes: 2 additions & 2 deletions openmp/runtime/src/z_Linux_util.c
@@ -1688,7 +1688,7 @@ __kmp_suspend_uninitialize_thread( kmp_info_t *th )
template <class C>
static inline void __kmp_suspend_template( int th_gtid, C *flag )
{
KMP_TIME_BLOCK(USER_suspend);
KMP_TIME_DEVELOPER_BLOCK(USER_suspend);
kmp_info_t *th = __kmp_threads[th_gtid];
int status;
typename C::flag_t old_spin;
@@ -1826,6 +1826,7 @@ void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) {
template <class C>
static inline void __kmp_resume_template( int target_gtid, C *flag )
{
KMP_TIME_DEVELOPER_BLOCK(USER_resume);
kmp_info_t *th = __kmp_threads[target_gtid];
int status;

@@ -1900,7 +1901,6 @@ void
void
__kmp_resume_monitor()
{
KMP_TIME_BLOCK(USER_resume);
int status;
#ifdef KMP_DEBUG
int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;