diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt index 43299ddbad845..2b7a3eb5bfce9 100644 --- a/openmp/runtime/CMakeLists.txt +++ b/openmp/runtime/CMakeLists.txt @@ -342,6 +342,10 @@ if(LIBOMP_OMPD_SUPPORT AND ((NOT LIBOMP_OMPT_SUPPORT) OR (NOT "${CMAKE_SYSTEM_NA set(LIBOMP_OMPD_SUPPORT FALSE) endif() +# OMPX Taskgraph support +# Whether to build with OMPX Taskgraph (e.g. task record & replay) +set(LIBOMP_OMPX_TASKGRAPH FALSE CACHE BOOL "OMPX-taskgraph (task record & replay)?") + # Error check hwloc support after config-ix has run if(LIBOMP_USE_HWLOC AND (NOT LIBOMP_HAVE_HWLOC)) libomp_error_say("Hwloc requested but not available") @@ -411,6 +415,7 @@ if(${OPENMP_STANDALONE_BUILD}) libomp_say("Use Adaptive locks -- ${LIBOMP_USE_ADAPTIVE_LOCKS}") libomp_say("Use quad precision -- ${LIBOMP_USE_QUAD_PRECISION}") libomp_say("Use Hwloc library -- ${LIBOMP_USE_HWLOC}") + libomp_say("Use OMPX-taskgraph -- ${LIBOMP_OMPX_TASKGRAPH}") endif() add_subdirectory(src) diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index bd50d58f0fdcb..251a5f0ce76fd 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -2487,6 +2487,62 @@ typedef struct { } ed; } kmp_event_t; +#if OMPX_TASKGRAPH +// Initial number of allocated nodes while recording +#define INIT_MAPSIZE 50 + +typedef struct kmp_taskgraph_flags { /*This needs to be exactly 32 bits */ + unsigned nowait : 1; + unsigned re_record : 1; + unsigned reserved : 30; +} kmp_taskgraph_flags_t; + +/// Represents a TDG node +typedef struct kmp_node_info { + kmp_task_t *task; // Pointer to the actual task + kmp_int32 *successors; // Array of the succesors ids + kmp_int32 nsuccessors; // Number of succesors of the node + std::atomic + npredecessors_counter; // Number of predessors on the fly + kmp_int32 npredecessors; // Total number of predecessors + kmp_int32 successors_size; // Number of allocated succesors ids + kmp_taskdata_t *parent_task; // Parent implicit task +} kmp_node_info_t; + +/// Represent a TDG's current status +typedef enum kmp_tdg_status { + KMP_TDG_NONE = 0, + KMP_TDG_RECORDING = 1, + KMP_TDG_READY = 2 +} kmp_tdg_status_t; + +/// Structure that contains a TDG +typedef struct kmp_tdg_info { + kmp_int32 tdg_id; // Unique idenfifier of the TDG + kmp_taskgraph_flags_t tdg_flags; // Flags related to a TDG + kmp_int32 map_size; // Number of allocated TDG nodes + kmp_int32 num_roots; // Number of roots tasks int the TDG + kmp_int32 *root_tasks; // Array of tasks identifiers that are roots + kmp_node_info_t *record_map; // Array of TDG nodes + kmp_tdg_status_t tdg_status = + KMP_TDG_NONE; // Status of the TDG (recording, ready...) + std::atomic num_tasks; // Number of TDG nodes + kmp_bootstrap_lock_t + graph_lock; // Protect graph attributes when updated via taskloop_recur + // Taskloop reduction related + void *rec_taskred_data; // Data to pass to __kmpc_task_reduction_init or + // __kmpc_taskred_init + kmp_int32 rec_num_taskred; +} kmp_tdg_info_t; + +extern kmp_int32 __kmp_max_tdgs; +extern kmp_tdg_info_t **__kmp_global_tdgs; +extern kmp_int32 __kmp_curr_tdg_idx; +extern kmp_int32 __kmp_successors_size; +extern std::atomic __kmp_tdg_task_id; +extern kmp_int32 __kmp_num_tdg; +#endif + #ifdef BUILD_TIED_TASK_STACK /* Tied Task stack definitions */ @@ -2534,7 +2590,12 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ unsigned complete : 1; /* 1==complete, 0==not complete */ unsigned freed : 1; /* 1==freed, 0==allocated */ unsigned native : 1; /* 1==gcc-compiled task, 0==intel */ +#if OMPX_TASKGRAPH + unsigned onced : 1; /* 1==ran once already, 0==never ran, record & replay purposes */ + unsigned reserved31 : 6; /* reserved for library use */ +#else unsigned reserved31 : 7; /* reserved for library use */ +#endif } kmp_tasking_flags_t; @@ -2583,6 +2644,10 @@ struct kmp_taskdata { /* aligned during dynamic allocation */ kmp_event_t td_allow_completion_event; #if OMPT_SUPPORT ompt_task_info_t ompt_task_info; +#endif +#if OMPX_TASKGRAPH + bool is_taskgraph = 0; // whether the task is within a TDG + kmp_tdg_info_t *tdg; // used to associate task with a TDG #endif kmp_target_data_t td_target_data; }; // struct kmp_taskdata @@ -4124,6 +4189,20 @@ KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint); +#if OMPX_TASKGRAPH +// Taskgraph's Record & Replay mechanism +// __kmp_tdg_is_recording: check whether a given TDG is recording +// status: the tdg's current status +static inline bool __kmp_tdg_is_recording(kmp_tdg_status_t status) { + return status == KMP_TDG_RECORDING; +} + +KMP_EXPORT kmp_int32 __kmpc_start_record_task(ident_t *loc, kmp_int32 gtid, + kmp_int32 input_flags, + kmp_int32 tdg_id); +KMP_EXPORT void __kmpc_end_record_task(ident_t *loc, kmp_int32 gtid, + kmp_int32 input_flags, kmp_int32 tdg_id); +#endif /* Interface to fast scalable reduce methods routines */ KMP_EXPORT kmp_int32 __kmpc_reduce_nowait( diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake index 91bb8a8312e0b..58bf64112b1a7 100644 --- a/openmp/runtime/src/kmp_config.h.cmake +++ b/openmp/runtime/src/kmp_config.h.cmake @@ -46,6 +46,8 @@ #define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT #cmakedefine01 LIBOMP_OMPD_SUPPORT #define OMPD_SUPPORT LIBOMP_OMPD_SUPPORT +#cmakedefine01 LIBOMP_OMPX_TASKGRAPH +#define OMPX_TASKGRAPH LIBOMP_OMPX_TASKGRAPH #cmakedefine01 LIBOMP_PROFILING_SUPPORT #define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT #cmakedefine01 LIBOMP_OMPT_OPTIONAL diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp index 0163846b44546..9557c519f835a 100644 --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -557,4 +557,16 @@ int __kmp_nesting_mode = 0; int __kmp_nesting_mode_nlevels = 1; int *__kmp_nesting_nth_level; +#if OMPX_TASKGRAPH +// TDG record & replay +kmp_int32 __kmp_max_tdgs = 100; +kmp_tdg_info_t **__kmp_global_tdgs = NULL; +kmp_int32 __kmp_curr_tdg_idx = + 0; // Id of the current TDG being recorded or executed +kmp_int32 __kmp_num_tdg = 0; +kmp_int32 __kmp_successors_size = 10; // Initial succesor size list for + // recording +std::atomic __kmp_tdg_task_id = 0; +#endif // end of file // + diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index 88eb150fc0c2a..e6dd0ec946554 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -1238,6 +1238,18 @@ static void __kmp_stg_parse_num_threads(char const *name, char const *value, K_DIAG(1, ("__kmp_dflt_team_nth == %d\n", __kmp_dflt_team_nth)); } // __kmp_stg_parse_num_threads +#if OMPX_TASKGRAPH +static void __kmp_stg_parse_max_tdgs(char const *name, char const *value, + void *data) { + __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_max_tdgs); +} // __kmp_stg_parse_max_tdgs + +static void __kmp_std_print_max_tdgs(kmp_str_buf_t *buffer, char const *name, + void *data) { + __kmp_stg_print_int(buffer, name, __kmp_max_tdgs); +} // __kmp_std_print_max_tdgs +#endif + static void __kmp_stg_parse_num_hidden_helper_threads(char const *name, char const *value, void *data) { @@ -5592,6 +5604,10 @@ static kmp_setting_t __kmp_stg_table[] = { {"LIBOMP_NUM_HIDDEN_HELPER_THREADS", __kmp_stg_parse_num_hidden_helper_threads, __kmp_stg_print_num_hidden_helper_threads, NULL, 0, 0}, +#if OMPX_TASKGRAPH + {"KMP_MAX_TDGS", __kmp_stg_parse_max_tdgs, __kmp_std_print_max_tdgs, NULL, + 0, 0}, +#endif #if OMPT_SUPPORT {"OMP_TOOL", __kmp_stg_parse_omp_tool, __kmp_stg_print_omp_tool, NULL, 0, diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp index ee4f96f7e5e26..22119a9d2d453 100644 --- a/openmp/runtime/src/kmp_taskdeps.cpp +++ b/openmp/runtime/src/kmp_taskdeps.cpp @@ -218,6 +218,44 @@ static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread, static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source, kmp_depnode_t *sink, kmp_task_t *sink_task) { +#if OMPX_TASKGRAPH + kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); + kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task); + if (source->dn.task && sink_task) { + // Not supporting dependency between two tasks that one is within the TDG + // and the other is not + KMP_ASSERT(task_source->is_taskgraph == task_sink->is_taskgraph); + } + if (task_sink->is_taskgraph && + __kmp_tdg_is_recording(task_sink->tdg->tdg_status)) { + kmp_node_info_t *source_info = + &task_sink->tdg->record_map[task_source->td_task_id]; + bool exists = false; + for (int i = 0; i < source_info->nsuccessors; i++) { + if (source_info->successors[i] == task_sink->td_task_id) { + exists = true; + break; + } + } + if (!exists) { + if (source_info->nsuccessors >= source_info->successors_size) { + source_info->successors_size = 2 * source_info->successors_size; + kmp_int32 *old_succ_ids = source_info->successors; + kmp_int32 *new_succ_ids = (kmp_int32 *)__kmp_allocate( + source_info->successors_size * sizeof(kmp_int32)); + source_info->successors = new_succ_ids; + __kmp_free(old_succ_ids); + } + + source_info->successors[source_info->nsuccessors] = task_sink->td_task_id; + source_info->nsuccessors++; + + kmp_node_info_t *sink_info = + &(task_sink->tdg->record_map[task_sink->td_task_id]); + sink_info->npredecessors++; + } + } +#endif #ifdef KMP_SUPPORT_GRAPH_OUTPUT kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); // do not use sink->dn.task as that is only filled after the dependences @@ -256,10 +294,23 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, // link node as successor of list elements for (kmp_depnode_list_t *p = plist; p; p = p->next) { kmp_depnode_t *dep = p->node; +#if OMPX_TASKGRAPH + kmp_tdg_status tdg_status = KMP_TDG_NONE; + if (task) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + if (td->is_taskgraph) + tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status; + if (__kmp_tdg_is_recording(tdg_status)) + __kmp_track_dependence(gtid, dep, node, task); + } +#endif if (dep->dn.task) { KMP_ACQUIRE_DEPNODE(gtid, dep); if (dep->dn.task) { - __kmp_track_dependence(gtid, dep, node, task); +#if OMPX_TASKGRAPH + if (!(__kmp_tdg_is_recording(tdg_status)) && task) +#endif + __kmp_track_dependence(gtid, dep, node, task); dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node); KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", @@ -281,16 +332,42 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, if (!sink) return 0; kmp_int32 npredecessors = 0; +#if OMPX_TASKGRAPH + kmp_tdg_status tdg_status = KMP_TDG_NONE; + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + if (task) { + if (td->is_taskgraph) + tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status; + if (__kmp_tdg_is_recording(tdg_status) && sink->dn.task) + __kmp_track_dependence(gtid, sink, source, task); + } +#endif if (sink->dn.task) { // synchronously add source to sink' list of successors KMP_ACQUIRE_DEPNODE(gtid, sink); if (sink->dn.task) { - __kmp_track_dependence(gtid, sink, source, task); +#if OMPX_TASKGRAPH + if (!(__kmp_tdg_is_recording(tdg_status)) && task) +#endif + __kmp_track_dependence(gtid, sink, source, task); sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source); KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", gtid, KMP_TASK_TO_TASKDATA(sink->dn.task), KMP_TASK_TO_TASKDATA(task))); +#if OMPX_TASKGRAPH + if (__kmp_tdg_is_recording(tdg_status)) { + kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task); + if (tdd->is_taskgraph) { + if (tdd->td_flags.onced) + // decrement npredecessors if sink->dn.task belongs to a taskgraph + // and + // 1) the task is reset to its initial state (by kmp_free_task) or + // 2) the task is complete but not yet reset + npredecessors--; + } + } +#endif npredecessors++; } KMP_RELEASE_DEPNODE(gtid, sink); @@ -595,6 +672,48 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; +#if OMPX_TASKGRAPH + // record TDG with deps + if (new_taskdata->is_taskgraph && + __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) { + kmp_tdg_info_t *tdg = new_taskdata->tdg; + // extend record_map if needed + if (new_taskdata->td_task_id >= tdg->map_size) { + __kmp_acquire_bootstrap_lock(&tdg->graph_lock); + if (new_taskdata->td_task_id >= tdg->map_size) { + kmp_uint old_size = tdg->map_size; + kmp_uint new_size = old_size * 2; + kmp_node_info_t *old_record = tdg->record_map; + kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate( + new_size * sizeof(kmp_node_info_t)); + KMP_MEMCPY(new_record, tdg->record_map, + old_size * sizeof(kmp_node_info_t)); + tdg->record_map = new_record; + + __kmp_free(old_record); + + for (kmp_int i = old_size; i < new_size; i++) { + kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate( + __kmp_successors_size * sizeof(kmp_int32)); + new_record[i].task = nullptr; + new_record[i].successors = successorsList; + new_record[i].nsuccessors = 0; + new_record[i].npredecessors = 0; + new_record[i].successors_size = __kmp_successors_size; + KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0); + } + // update the size at the end, so that we avoid other + // threads use old_record while map_size is already updated + tdg->map_size = new_size; + } + __kmp_release_bootstrap_lock(&tdg->graph_lock); + } + tdg->record_map[new_taskdata->td_task_id].task = new_task; + tdg->record_map[new_taskdata->td_task_id].parent_task = + new_taskdata->td_parent; + KMP_ATOMIC_INC(&tdg->num_tasks); + } +#endif #if OMPT_SUPPORT if (ompt_enabled.enabled) { if (!current_task->ompt_task_info.frame.enter_frame.ptr) diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h index ac6174afd3f58..d2ab515158011 100644 --- a/openmp/runtime/src/kmp_taskdeps.h +++ b/openmp/runtime/src/kmp_taskdeps.h @@ -92,6 +92,23 @@ static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) { extern void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start); static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) { + +#if OMPX_TASKGRAPH + if (task->is_taskgraph && !(__kmp_tdg_is_recording(task->tdg->tdg_status))) { + kmp_node_info_t *TaskInfo = &(task->tdg->record_map[task->td_task_id]); + + for (int i = 0; i < TaskInfo->nsuccessors; i++) { + kmp_int32 successorNumber = TaskInfo->successors[i]; + kmp_node_info_t *successor = &(task->tdg->record_map[successorNumber]); + kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->npredecessors_counter) - 1; + if (successor->task != nullptr && npredecessors == 0) { + __kmp_omp_task(gtid, successor->task, false); + } + } + return; + } +#endif + kmp_info_t *thread = __kmp_threads[gtid]; kmp_depnode_t *node = task->td_depnode; @@ -120,8 +137,12 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) { gtid, task)); KMP_ACQUIRE_DEPNODE(gtid, node); - node->dn.task = - NULL; // mark this task as finished, so no new dependencies are generated +#if OMPX_TASKGRAPH + if (!task->is_taskgraph || + (task->is_taskgraph && !__kmp_tdg_is_recording(task->tdg->tdg_status))) +#endif + node->dn.task = + NULL; // mark this task as finished, so no new dependencies are generated KMP_RELEASE_DEPNODE(gtid, node); kmp_depnode_list_t *next; diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index 7e9147eeeebf2..5b6c9edd77dae 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -37,6 +37,10 @@ static void __kmp_alloc_task_deque(kmp_info_t *thread, static int __kmp_realloc_task_threads_data(kmp_info_t *thread, kmp_task_team_t *task_team); static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); +#if OMPX_TASKGRAPH +static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id); +int __kmp_taskloop_task(int gtid, void *ptask); +#endif #ifdef BUILD_TIED_TASK_STACK @@ -281,7 +285,11 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, } // Check mutexinoutset dependencies, acquire locks kmp_depnode_t *node = tasknew->td_depnode; +#if OMPX_TASKGRAPH + if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) { +#else if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) { +#endif for (int i = 0; i < node->dn.mtx_num_locks; ++i) { KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); if (__kmp_test_lock(node->dn.mtx_locks[i], gtid)) @@ -888,12 +896,34 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata, task->data2.priority = 0; taskdata->td_flags.freed = 1; +#if OMPX_TASKGRAPH + // do not free tasks in taskgraph + if (!taskdata->is_taskgraph) { +#endif // deallocate the taskdata and shared variable blocks associated with this task #if USE_FAST_MEMORY __kmp_fast_free(thread, taskdata); #else /* ! USE_FAST_MEMORY */ __kmp_thread_free(thread, taskdata); #endif +#if OMPX_TASKGRAPH + } else { + taskdata->td_flags.complete = 0; + taskdata->td_flags.started = 0; + taskdata->td_flags.freed = 0; + taskdata->td_flags.executing = 0; + taskdata->td_flags.task_serial = + (taskdata->td_parent->td_flags.final || + taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser); + + // taskdata->td_allow_completion_event.pending_events_count = 1; + KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); + KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0); + // start at one because counts current task and children + KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1); + } +#endif + KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata)); } @@ -980,6 +1010,10 @@ static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) { flags.detachable == TASK_DETACHABLE || flags.hidden_helper; ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0; +#if OMPX_TASKGRAPH + if (taskdata->td_taskgroup && taskdata->is_taskgraph) + ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0; +#endif return ret; } @@ -999,6 +1033,10 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, kmp_info_t *thread = __kmp_threads[gtid]; kmp_task_team_t *task_team = thread->th.th_task_team; // might be NULL for serial teams... +#if OMPX_TASKGRAPH + // to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop + bool is_taskgraph; +#endif #if KMP_DEBUG kmp_int32 children = 0; #endif @@ -1008,6 +1046,10 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); +#if OMPX_TASKGRAPH + is_taskgraph = taskdata->is_taskgraph; +#endif + // Pop task from stack if tied #ifdef BUILD_TIED_TASK_STACK if (taskdata->td_flags.tiedness == TASK_TIED) { @@ -1114,6 +1156,9 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, if (completed) { taskdata->td_flags.complete = 1; // mark the task as completed +#if OMPX_TASKGRAPH + taskdata->td_flags.onced = 1; // mark the task as ran once already +#endif #if OMPT_SUPPORT // This is not a detached task, we are done here @@ -1130,7 +1175,11 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, #endif KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); KMP_DEBUG_ASSERT(children >= 0); +#if OMPX_TASKGRAPH + if (taskdata->td_taskgroup && !taskdata->is_taskgraph) +#else if (taskdata->td_taskgroup) +#endif KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); } else if (task_team && (task_team->tt.tt_found_proxy_tasks || task_team->tt.tt_hidden_helper_task_encountered)) { @@ -1169,6 +1218,19 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 ); resumed_task->td_flags.executing = 1; // resume previous task +#if OMPX_TASKGRAPH + if (is_taskgraph && __kmp_track_children_task(taskdata) && + taskdata->td_taskgroup) { + // TDG: we only release taskgroup barrier here because + // free_task_and_ancestors will call + // __kmp_free_task, which resets all task parameters such as + // taskdata->started, etc. If we release the barrier earlier, these + // parameters could be read before being reset. This is not an issue for + // non-TDG implementation because we never reuse a task(data) structure + KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); + } +#endif + KA_TRACE( 10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n", gtid, taskdata, resumed_task)); @@ -1285,6 +1347,9 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, task->td_flags.executing = 1; task->td_flags.complete = 0; task->td_flags.freed = 0; +#if OMPX_TASKGRAPH + task->td_flags.onced = 0; +#endif task->td_depnode = NULL; task->td_last_tied = task; @@ -1321,6 +1386,9 @@ void __kmp_finish_implicit_task(kmp_info_t *thread) { if (task->td_dephash) { int children; task->td_flags.complete = 1; +#if OMPX_TASKGRAPH + task->td_flags.onced = 1; +#endif children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks); kmp_tasking_flags_t flags_old = task->td_flags; if (children == 0 && flags_old.complete == 1) { @@ -1550,7 +1618,9 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, taskdata->td_flags.executing = 0; taskdata->td_flags.complete = 0; taskdata->td_flags.freed = 0; - +#if OMPX_TASKGRAPH + taskdata->td_flags.onced = 0; +#endif KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0); // start at one because counts current task and children KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1); @@ -1586,6 +1656,15 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, } } +#if OMPX_TASKGRAPH + kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx); + if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) && + (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) { + taskdata->is_taskgraph = 1; + taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx]; + taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id); + } +#endif KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n", gtid, taskdata, taskdata->td_parent)); @@ -1929,6 +2008,53 @@ kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, bool serialize_immediate) { kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); +#if OMPX_TASKGRAPH + if (new_taskdata->is_taskgraph && + __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) { + kmp_tdg_info_t *tdg = new_taskdata->tdg; + // extend the record_map if needed + if (new_taskdata->td_task_id >= new_taskdata->tdg->map_size) { + __kmp_acquire_bootstrap_lock(&tdg->graph_lock); + // map_size could have been updated by another thread if recursive + // taskloop + if (new_taskdata->td_task_id >= tdg->map_size) { + kmp_uint old_size = tdg->map_size; + kmp_uint new_size = old_size * 2; + kmp_node_info_t *old_record = tdg->record_map; + kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate( + new_size * sizeof(kmp_node_info_t)); + + KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t)); + tdg->record_map = new_record; + + __kmp_free(old_record); + + for (kmp_int i = old_size; i < new_size; i++) { + kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate( + __kmp_successors_size * sizeof(kmp_int32)); + new_record[i].task = nullptr; + new_record[i].successors = successorsList; + new_record[i].nsuccessors = 0; + new_record[i].npredecessors = 0; + new_record[i].successors_size = __kmp_successors_size; + KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0); + } + // update the size at the end, so that we avoid other + // threads use old_record while map_size is already updated + tdg->map_size = new_size; + } + __kmp_release_bootstrap_lock(&tdg->graph_lock); + } + // record a task + if (tdg->record_map[new_taskdata->td_task_id].task == nullptr) { + tdg->record_map[new_taskdata->td_task_id].task = new_task; + tdg->record_map[new_taskdata->td_task_id].parent_task = + new_taskdata->td_parent; + KMP_ATOMIC_INC(&tdg->num_tasks); + } + } +#endif + /* Should we execute the new task or queue it? For now, let's just always try to queue it. If the queue fills up, then we'll execute it. */ if (new_taskdata->td_flags.proxy == TASK_PROXY || @@ -2177,7 +2303,6 @@ static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, taskdata->ompt_task_info.frame.enter_frame = ompt_data_none; } #endif // OMPT_SUPPORT && OMPT_OPTIONAL - } KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " @@ -2445,6 +2570,17 @@ the reduction either does not use omp_orig object, or the omp_orig is accessible without help of the runtime library. */ void *__kmpc_task_reduction_init(int gtid, int num, void *data) { +#if OMPX_TASKGRAPH + kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx); + if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) { + kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx]; + this_tdg->rec_taskred_data = + __kmp_allocate(sizeof(kmp_task_red_input_t) * num); + this_tdg->rec_num_taskred = num; + KMP_MEMCPY(this_tdg->rec_taskred_data, data, + sizeof(kmp_task_red_input_t) * num); + } +#endif return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); } @@ -2461,6 +2597,17 @@ Note: this entry supposes the optional compiler-generated initializer routine has two parameters, pointer to object to be initialized and pointer to omp_orig */ void *__kmpc_taskred_init(int gtid, int num, void *data) { +#if OMPX_TASKGRAPH + kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx); + if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) { + kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx]; + this_tdg->rec_taskred_data = + __kmp_allocate(sizeof(kmp_task_red_input_t) * num); + this_tdg->rec_num_taskred = num; + KMP_MEMCPY(this_tdg->rec_taskred_data, data, + sizeof(kmp_task_red_input_t) * num); + } +#endif return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); } @@ -2507,6 +2654,18 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { kmp_int32 num = tg->reduce_num_data; kmp_int32 tid = thread->th.th_info.ds.ds_tid; +#if OMPX_TASKGRAPH + if ((thread->th.th_current_task->is_taskgraph) && + (!__kmp_tdg_is_recording( + __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) { + tg = thread->th.th_current_task->td_taskgroup; + KMP_ASSERT(tg != NULL); + KMP_ASSERT(tg->reduce_data != NULL); + arr = (kmp_taskred_data_t *)(tg->reduce_data); + num = tg->reduce_num_data; + } +#endif + KMP_ASSERT(data != NULL); while (tg != NULL) { for (int i = 0; i < num; ++i) { @@ -4278,6 +4437,9 @@ static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) { KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); taskdata->td_flags.complete = 1; // mark the task as completed +#if OMPX_TASKGRAPH + taskdata->td_flags.onced = 1; +#endif if (taskdata->td_taskgroup) KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); @@ -4476,8 +4638,14 @@ void __kmp_fulfill_event(kmp_event_t *event) { // // thread: allocating thread // task_src: pointer to source task to be duplicated +// taskloop_recur: used only when dealing with taskgraph, +// indicating whether we need to update task->td_task_id // returns: a pointer to the allocated kmp_task_t structure (task). -kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { +kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src +#if OMPX_TASKGRAPH + , int taskloop_recur +#endif +) { kmp_task_t *task; kmp_taskdata_t *taskdata; kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src); @@ -4505,7 +4673,15 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { task = KMP_TASKDATA_TO_TASK(taskdata); // Initialize new task (only specific fields not affected by memcpy) +#if OMPX_TASKGRAPH + if (!taskdata->is_taskgraph || taskloop_recur) + taskdata->td_task_id = KMP_GEN_TASK_ID(); + else if (taskdata->is_taskgraph && + __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status)) + taskdata->td_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id); +#else taskdata->td_task_id = KMP_GEN_TASK_ID(); +#endif if (task->shareds != NULL) { // need setup shareds pointer shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; task->shareds = &((char *)taskdata)[shareds_offset]; @@ -4732,7 +4908,13 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, lastpriv = 1; } } + +#if OMPX_TASKGRAPH + next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0); +#else next_task = __kmp_task_dup_alloc(thread, task); // allocate new task +#endif + kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); kmp_taskloop_bounds_t next_task_bounds = kmp_taskloop_bounds_t(next_task, task_bounds); @@ -4929,7 +5111,12 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, lb1 = ub0 + st; // create pattern task for 2nd half of the loop +#if OMPX_TASKGRAPH + next_task = __kmp_task_dup_alloc(thread, task, + /* taskloop_recur */ 1); +#else next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task +#endif // adjust lower bound (upper bound is not changed) for the 2nd half *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; if (ptask_dup != NULL) // construct firstprivates, etc. @@ -4962,6 +5149,12 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, p->codeptr_ra = codeptr_ra; #endif +#if OMPX_TASKGRAPH + kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task); + new_task_data->tdg = taskdata->tdg; + new_task_data->is_taskgraph = 0; +#endif + #if OMPT_SUPPORT // schedule new task with correct return address for OMPT events __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra); @@ -5001,6 +5194,9 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, __kmpc_taskgroup(loc, gtid); } +#if OMPX_TASKGRAPH + KMP_ATOMIC_DEC(&__kmp_tdg_task_id); +#endif // ========================================================================= // calculate loop parameters kmp_taskloop_bounds_t task_bounds(task, lb, ub); @@ -5248,3 +5444,221 @@ bool __kmpc_omp_has_task_team(kmp_int32 gtid) { return taskdata->td_task_team != NULL; } + +#if OMPX_TASKGRAPH +// __kmp_find_tdg: identify a TDG through its ID +// gtid: Global Thread ID +// tdg_id: ID of the TDG +// returns: If a TDG corresponding to this ID is found and not +// its initial state, return the pointer to it, otherwise nullptr +static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) { + kmp_tdg_info_t *res = nullptr; + if (__kmp_max_tdgs == 0) + return res; + + if (__kmp_global_tdgs == NULL) + __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate( + sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs); + + if ((__kmp_global_tdgs[tdg_id]) && + (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE)) + res = __kmp_global_tdgs[tdg_id]; + return res; +} + +// __kmp_start_record: launch the execution of a previous +// recorded TDG +// gtid: Global Thread ID +// tdg: ID of the TDG +void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) { + KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY); + KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid, + tdg->tdg_id, tdg->num_roots)); + kmp_node_info_t *this_record_map = tdg->record_map; + kmp_int32 *this_root_tasks = tdg->root_tasks; + kmp_int32 this_num_roots = tdg->num_roots; + kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks); + + kmp_info_t *thread = __kmp_threads[gtid]; + kmp_taskdata_t *parent_task = thread->th.th_current_task; + + if (tdg->rec_taskred_data) { + __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data); + } + + for (kmp_int32 j = 0; j < this_num_tasks; j++) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task); + + td->td_parent = parent_task; + this_record_map[j].parent_task = parent_task; + + kmp_taskgroup_t *parent_taskgroup = + this_record_map[j].parent_task->td_taskgroup; + + KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter, + this_record_map[j].npredecessors); + KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks); + + if (parent_taskgroup) { + KMP_ATOMIC_INC(&parent_taskgroup->count); + // The taskgroup is different so we must update it + td->td_taskgroup = parent_taskgroup; + } else if (td->td_taskgroup != nullptr) { + // If the parent doesnt have a taskgroup, remove it from the task + td->td_taskgroup = nullptr; + } + if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT) + KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks); + } + + for (kmp_int32 j = 0; j < this_num_roots; ++j) { + __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true); + } + KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid, + tdg->tdg_id, tdg->num_roots)); +} + +// __kmp_start_record: set up a TDG structure and turn the +// recording flag to true +// gtid: Global Thread ID of the encountering thread +// input_flags: Flags associated with the TDG +// tdg_id: ID of the TDG to record +static inline void __kmp_start_record(kmp_int32 gtid, + kmp_taskgraph_flags_t *flags, + kmp_int32 tdg_id) { + kmp_tdg_info_t *tdg = + (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t)); + __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg; + // Initializing the TDG structure + tdg->tdg_id = tdg_id; + tdg->map_size = INIT_MAPSIZE; + tdg->num_roots = -1; + tdg->root_tasks = nullptr; + tdg->tdg_status = KMP_TDG_RECORDING; + tdg->rec_num_taskred = 0; + tdg->rec_taskred_data = nullptr; + KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0); + + // Initializing the list of nodes in this TDG + kmp_node_info_t *this_record_map = + (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t)); + for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) { + kmp_int32 *successorsList = + (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32)); + this_record_map[i].task = nullptr; + this_record_map[i].successors = successorsList; + this_record_map[i].nsuccessors = 0; + this_record_map[i].npredecessors = 0; + this_record_map[i].successors_size = __kmp_successors_size; + KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0); + } + + __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map; +} + +// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark +// the beginning of the record process of a task region +// loc_ref: Location of TDG, not used yet +// gtid: Global Thread ID of the encountering thread +// input_flags: Flags associated with the TDG +// tdg_id: ID of the TDG to record, for now, incremental integer +// returns: 1 if we record, otherwise, 0 +kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 input_flags, kmp_int32 tdg_id) { + + kmp_int32 res; + kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags; + KA_TRACE(10, + ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n", + gtid, loc_ref, input_flags, tdg_id)); + + if (__kmp_max_tdgs == 0) { + KA_TRACE( + 10, + ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, " + "__kmp_max_tdgs = 0\n", + gtid, loc_ref, input_flags, tdg_id)); + return 1; + } + + __kmpc_taskgroup(loc_ref, gtid); + if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) { + // TODO: use re_record flag + __kmp_exec_tdg(gtid, tdg); + res = 0; + } else { + __kmp_curr_tdg_idx = tdg_id; + KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs); + __kmp_start_record(gtid, flags, tdg_id); + __kmp_num_tdg++; + res = 1; + } + KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n", + gtid, tdg_id, res ? "record" : "execute")); + return res; +} + +// __kmp_end_record: set up a TDG after recording it +// gtid: Global thread ID +// tdg: Pointer to the TDG +void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) { + // Store roots + kmp_node_info_t *this_record_map = tdg->record_map; + kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks); + kmp_int32 *this_root_tasks = + (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32)); + kmp_int32 this_map_size = tdg->map_size; + kmp_int32 this_num_roots = 0; + kmp_info_t *thread = __kmp_threads[gtid]; + + for (kmp_int32 i = 0; i < this_num_tasks; i++) { + if (this_record_map[i].npredecessors == 0) { + this_root_tasks[this_num_roots++] = i; + } + } + + // Update with roots info and mapsize + tdg->map_size = this_map_size; + tdg->num_roots = this_num_roots; + tdg->root_tasks = this_root_tasks; + KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING); + tdg->tdg_status = KMP_TDG_READY; + + if (thread->th.th_current_task->td_dephash) { + __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash); + thread->th.th_current_task->td_dephash = NULL; + } + + // Reset predecessor counter + for (kmp_int32 i = 0; i < this_num_tasks; i++) { + KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, + this_record_map[i].npredecessors); + } + KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0); +} + +// __kmpc_end_record_task: wrapper around __kmp_end_record to mark +// the end of recording phase +// +// loc_ref: Source location information +// gtid: Global thread ID +// input_flags: Flags attached to the graph +// tdg_id: ID of the TDG just finished recording +void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 input_flags, kmp_int32 tdg_id) { + kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id); + + KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording" + " tdg=%d with flags=%d\n", + gtid, loc_ref, tdg_id, input_flags)); + if (__kmp_max_tdgs) { + // TODO: use input_flags->nowait + __kmpc_end_taskgroup(loc_ref, gtid); + if (__kmp_tdg_is_recording(tdg->tdg_status)) + __kmp_end_record(gtid, tdg); + } + KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording" + " tdg=%d, its status is now READY\n", + gtid, loc_ref, tdg_id)); +} +#endif diff --git a/openmp/runtime/test/CMakeLists.txt b/openmp/runtime/test/CMakeLists.txt index 05b517fb920fd..a7790804542b7 100644 --- a/openmp/runtime/test/CMakeLists.txt +++ b/openmp/runtime/test/CMakeLists.txt @@ -30,6 +30,7 @@ update_test_compiler_features() pythonize_bool(LIBOMP_USE_HWLOC) pythonize_bool(LIBOMP_OMPT_SUPPORT) pythonize_bool(LIBOMP_OMPT_OPTIONAL) +pythonize_bool(LIBOMP_OMPX_TASKGRAPH) pythonize_bool(LIBOMP_HAVE_LIBM) pythonize_bool(LIBOMP_HAVE_LIBATOMIC) pythonize_bool(OPENMP_STANDALONE_BUILD) diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg index b1a5b7a351098..a9399288e550a 100644 --- a/openmp/runtime/test/lit.cfg +++ b/openmp/runtime/test/lit.cfg @@ -99,6 +99,9 @@ if config.has_ompt: # for callback.h config.test_flags += " -I " + config.test_source_root + "/ompt" +if config.has_ompx_taskgraph: + config.available_features.add("ompx_taskgraph") + if 'Linux' in config.operating_system: config.available_features.add("linux") diff --git a/openmp/runtime/test/lit.site.cfg.in b/openmp/runtime/test/lit.site.cfg.in index 45a18b480130f..d6c259280619b 100644 --- a/openmp/runtime/test/lit.site.cfg.in +++ b/openmp/runtime/test/lit.site.cfg.in @@ -15,6 +15,7 @@ config.operating_system = "@CMAKE_SYSTEM_NAME@" config.hwloc_library_dir = "@LIBOMP_HWLOC_LIBRARY_DIR@" config.using_hwloc = @LIBOMP_USE_HWLOC@ config.has_ompt = @LIBOMP_OMPT_SUPPORT@ and @LIBOMP_OMPT_OPTIONAL@ +config.has_ompx_taskgraph = @LIBOMP_OMPX_TASKGRAPH@ config.has_libm = @LIBOMP_HAVE_LIBM@ config.has_libatomic = @LIBOMP_HAVE_LIBATOMIC@ config.is_standalone_build = @OPENMP_STANDALONE_BUILD@ diff --git a/openmp/runtime/test/tasking/omp_record_replay.cpp b/openmp/runtime/test/tasking/omp_record_replay.cpp new file mode 100644 index 0000000000000..69ad98003a0d6 --- /dev/null +++ b/openmp/runtime/test/tasking/omp_record_replay.cpp @@ -0,0 +1,48 @@ +// REQUIRES: ompx_taskgraph +// RUN: %libomp-cxx-compile-and-run +#include +#include +#define NT 100 + +// Compiler-generated code (emulation) +typedef struct ident { + void* dummy; +} ident_t; + + +#ifdef __cplusplus +extern "C" { + int __kmpc_global_thread_num(ident_t *); + int __kmpc_start_record_task(ident_t *, int, int, int); + void __kmpc_end_record_task(ident_t *, int, int , int); +} +#endif + +void func(int *num_exec) { + (*num_exec)++; +} + +int main() { + int num_exec = 0; + int num_tasks = 0; + int x=0; + #pragma omp parallel + #pragma omp single + for (int iter = 0; iter < NT; ++iter) { + int gtid = __kmpc_global_thread_num(nullptr); + int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */0); + if (res) { + num_tasks++; + #pragma omp task + func(&num_exec); + } + __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0); + } + + assert(num_tasks==1); + assert(num_exec==NT); + + std::cout << "Passed" << std::endl; + return 0; +} +// CHECK: Passed diff --git a/openmp/runtime/test/tasking/omp_record_replay_deps.cpp b/openmp/runtime/test/tasking/omp_record_replay_deps.cpp new file mode 100644 index 0000000000000..9b6b370b30efc --- /dev/null +++ b/openmp/runtime/test/tasking/omp_record_replay_deps.cpp @@ -0,0 +1,63 @@ +// REQUIRES: ompx_taskgraph +// RUN: %libomp-cxx-compile-and-run +#include +#include +#define NT 100 +#define MULTIPLIER 100 +#define DECREMENT 5 + +int val; +// Compiler-generated code (emulation) +typedef struct ident { + void* dummy; +} ident_t; + + +#ifdef __cplusplus +extern "C" { + int __kmpc_global_thread_num(ident_t *); + int __kmpc_start_record_task(ident_t *, int, int, int); + void __kmpc_end_record_task(ident_t *, int, int, int); +} +#endif + +void sub() { + #pragma omp atomic + val -= DECREMENT; +} + +void add() { + #pragma omp atomic + val += DECREMENT; +} + +void mult() { + // no atomicity needed, can only be executed by 1 thread + // and no concurrency with other tasks possible + val *= MULTIPLIER; +} + +int main() { + val = 0; + int *x, *y; + #pragma omp parallel + #pragma omp single + for (int iter = 0; iter < NT; ++iter) { + int gtid = __kmpc_global_thread_num(nullptr); + int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0); + if (res) { + #pragma omp task depend(out:y) + add(); + #pragma omp task depend(out:x) + sub(); + #pragma omp task depend(in:x,y) + mult(); + } + __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0); + } + assert(val==0); + + std::cout << "Passed" << std::endl; + return 0; +} +// CHECK: Passed diff --git a/openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp b/openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp new file mode 100644 index 0000000000000..03252843689c4 --- /dev/null +++ b/openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp @@ -0,0 +1,76 @@ +// REQUIRES: ompx_taskgraph +// RUN: %libomp-cxx-compile-and-run +#include +#include +#define NT 20 +#define MULTIPLIER 100 +#define DECREMENT 5 + +// Compiler-generated code (emulation) +typedef struct ident { + void* dummy; +} ident_t; + +int val; +#ifdef __cplusplus +extern "C" { + int __kmpc_global_thread_num(ident_t *); + int __kmpc_start_record_task(ident_t *, int, int, int); + void __kmpc_end_record_task(ident_t *, int, int , int); +} +#endif + +void sub() { + #pragma omp atomic + val -= DECREMENT; +} + +void add() { + #pragma omp atomic + val += DECREMENT; +} + +void mult() { + // no atomicity needed, can only be executed by 1 thread + // and no concurrency with other tasks possible + val *= MULTIPLIER; +} + +int main() { + int num_tasks = 0; + int *x, *y; + #pragma omp parallel + #pragma omp single + for (int iter = 0; iter < NT; ++iter) { + int gtid = __kmpc_global_thread_num(nullptr); + int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */0); + if (res) { + num_tasks++; + #pragma omp task depend(out:y) + add(); + #pragma omp task depend(out:x) + sub(); + #pragma omp task depend(in:x,y) + mult(); + } + __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0); + res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */1); + if (res) { + num_tasks++; + #pragma omp task depend(out:y) + add(); + #pragma omp task depend(out:x) + sub(); + #pragma omp task depend(in:x,y) + mult(); + } + __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */1); + } + + assert(num_tasks==2); + assert(val==0); + + std::cout << "Passed" << std::endl; + return 0; +} +// CHECK: Passed diff --git a/openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp b/openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp new file mode 100644 index 0000000000000..3d88faeeb28ee --- /dev/null +++ b/openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp @@ -0,0 +1,50 @@ +// REQUIRES: ompx_taskgraph +// RUN: %libomp-cxx-compile-and-run +#include +#include + +#define NT 20 +#define N 128*128 + +typedef struct ident { + void* dummy; +} ident_t; + + +#ifdef __cplusplus +extern "C" { + int __kmpc_global_thread_num(ident_t *); + int __kmpc_start_record_task(ident_t *, int, int, int); + void __kmpc_end_record_task(ident_t *, int, int , int); +} +#endif + +int main() { + int num_tasks = 0; + + int array[N]; + for (int i = 0; i < N; ++i) + array[i] = 1; + + long sum = 0; + #pragma omp parallel + #pragma omp single + for (int iter = 0; iter < NT; ++iter) { + int gtid = __kmpc_global_thread_num(nullptr); + int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0); + if (res) { + num_tasks++; + #pragma omp taskloop reduction(+:sum) num_tasks(4096) + for (int i = 0; i < N; ++i) { + sum += array[i]; + } + } + __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0); + } + assert(sum==N*NT); + assert(num_tasks==1); + + std::cout << "Passed" << std::endl; + return 0; +} +// CHECK: Passed