From 29df4ab3f3c9bf37529ee04795abfd90b7691857 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sat, 28 Aug 2021 16:24:06 -0400 Subject: [PATCH] [OpenMP][Offloading] Add support for event related interfaces This patch adds the support for event related interfaces, which will be used later to fix a data race. See D104418 for more details. Reviewed By: jdoerfert, ye-luo Differential Revision: https://reviews.llvm.org/D108528 --- openmp/libomptarget/include/omptargetplugin.h | 26 +++++ .../plugins/cuda/dynamic_cuda/cuda.cpp | 6 + .../plugins/cuda/dynamic_cuda/cuda.h | 7 ++ openmp/libomptarget/plugins/cuda/src/rtl.cpp | 110 ++++++++++++++++++ openmp/libomptarget/plugins/exports | 5 + openmp/libomptarget/src/device.cpp | 35 ++++++ openmp/libomptarget/src/device.h | 24 +++- openmp/libomptarget/src/rtl.cpp | 8 ++ openmp/libomptarget/src/rtl.h | 10 ++ 9 files changed, 230 insertions(+), 1 deletion(-) diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h index b7b3eb806981c..aefad9ec25a71 100644 --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -145,6 +145,32 @@ void __tgt_rtl_set_info_flag(uint32_t); // Print the device information void __tgt_rtl_print_device_info(int32_t ID); +// Event related interfaces. It is expected to use the interfaces in the +// following way: +// 1) Create an event on the target device (__tgt_rtl_create_event). +// 2) Record the event based on the status of \p AsyncInfo->Queue at the moment +// of function call to __tgt_rtl_record_event. An event becomes "meaningful" +// once it is recorded, such that others can depend on it. +// 3) Call __tgt_rtl_wait_event to set dependence on the event. Whether the +// operation is blocking or non-blocking depends on the target. It is expected +// to be non-blocking, just set dependence and return. +// 4) Call __tgt_rtl_sync_event to sync the event. 
It is expected to block the +// thread calling the function. +// 5) Destroy the event (__tgt_rtl_destroy_event). +// { +int32_t __tgt_rtl_create_event(int32_t ID, void **Event); + +int32_t __tgt_rtl_record_event(int32_t ID, void *Event, + __tgt_async_info *AsyncInfo); + +int32_t __tgt_rtl_wait_event(int32_t ID, void *Event, + __tgt_async_info *AsyncInfo); + +int32_t __tgt_rtl_sync_event(int32_t ID, void *Event); + +int32_t __tgt_rtl_destroy_event(int32_t ID, void *Event); +// } + #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp index fb776f7ae5586..9e9cea0beb4f4 100644 --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp +++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp @@ -69,6 +69,12 @@ DLWRAP(cuMemcpyPeerAsync, 6); DLWRAP(cuCtxGetLimit, 2); DLWRAP(cuCtxSetLimit, 2); +DLWRAP(cuEventCreate, 2); +DLWRAP(cuEventRecord, 2); +DLWRAP(cuStreamWaitEvent, 3); +DLWRAP(cuEventSynchronize, 1); +DLWRAP(cuEventDestroy, 1); + DLWRAP_FINALIZE(); #ifndef DYNAMIC_CUDA_PATH diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h index 14049e1f7559a..c6aeafef2df18 100644 --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h +++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h @@ -22,6 +22,7 @@ typedef struct CUmod_st *CUmodule; typedef struct CUctx_st *CUcontext; typedef struct CUfunc_st *CUfunction; typedef struct CUstream_st *CUstream; +typedef struct CUevent_st *CUevent; typedef enum cudaError_enum { CUDA_SUCCESS = 0, @@ -248,4 +249,10 @@ CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, CUresult cuCtxGetLimit(size_t *, CUlimit); CUresult cuCtxSetLimit(CUlimit, size_t); +CUresult cuEventCreate(CUevent *, unsigned int); +CUresult cuEventRecord(CUevent, CUstream); +CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int); +CUresult 
cuEventSynchronize(CUevent); +CUresult cuEventDestroy(CUevent); + #endif diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp index 44fc67225d198..c6f51a5a57bf5 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -129,6 +129,62 @@ int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, return OFFLOAD_SUCCESS; } +int createEvent(void **P) { + CUevent Event = nullptr; + + CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT); + if (Err != CUDA_SUCCESS) { + DP("Error when creating event event = " DPxMOD "\n", DPxPTR(Event)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + *P = Event; + + return OFFLOAD_SUCCESS; +} + +int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { + CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); + CUevent Event = reinterpret_cast<CUevent>(EventPtr); + + CUresult Err = cuEventRecord(Event, Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", + DPxPTR(Stream), DPxPTR(Event)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + +int syncEvent(void *EventPtr) { + CUevent Event = reinterpret_cast<CUevent>(EventPtr); + + CUresult Err = cuEventSynchronize(Event); + if (Err != CUDA_SUCCESS) { + DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + +int destroyEvent(void *EventPtr) { + CUevent Event = reinterpret_cast<CUevent>(EventPtr); + + CUresult Err = cuEventDestroy(Event); + if (Err != CUDA_SUCCESS) { + DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + // Structure contains per-device data struct DeviceDataTy { /// List that contains all the kernels. 
@@ -1332,6 +1388,25 @@ class DeviceRTLTy { "Error returned from cuDeviceGetAttribute\n"); printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); } + + int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo, + void *EventPtr) const { + CUstream Stream = getStream(DeviceId, AsyncInfo); + CUevent Event = reinterpret_cast<CUevent>(EventPtr); + + // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from + // specific CUDA version, and defined as 0x0. In previous version, per CUDA + // API document, that argument has to be 0x0. + CUresult Err = cuStreamWaitEvent(Stream, Event, 0); + if (Err != CUDA_SUCCESS) { + DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n", + DPxPTR(Stream), DPxPTR(Event)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; + } }; DeviceRTLTy DeviceRTL; @@ -1537,6 +1612,41 @@ void __tgt_rtl_print_device_info(int32_t device_id) { DeviceRTL.printDeviceInfo(device_id); } +int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { + assert(event && "event is nullptr"); + return createEvent(event); +} + +int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, + __tgt_async_info *async_info_ptr) { + assert(async_info_ptr && "async_info_ptr is nullptr"); + assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); + assert(event_ptr && "event_ptr is nullptr"); + + return recordEvent(event_ptr, async_info_ptr); +} + +int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + assert(event_ptr && "event is nullptr"); + + return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr); +} + +int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) { + assert(event_ptr && "event is nullptr"); + + return syncEvent(event_ptr); +} + +int32_t __tgt_rtl_destroy_event(int32_t device_id, 
void *event_ptr) { + assert(event_ptr && "event is nullptr"); + + return destroyEvent(event_ptr); +} + #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports index 61cc6746defdf..0a3dc8a82c2eb 100644 --- a/openmp/libomptarget/plugins/exports +++ b/openmp/libomptarget/plugins/exports @@ -24,6 +24,11 @@ VERS1.0 { __tgt_rtl_supports_empty_images; __tgt_rtl_set_info_flag; __tgt_rtl_print_device_info; + __tgt_rtl_create_event; + __tgt_rtl_record_event; + __tgt_rtl_wait_event; + __tgt_rtl_sync_event; + __tgt_rtl_destroy_event; local: *; }; diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index f660d2321dfb6..fd7c73df722c0 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -553,6 +553,41 @@ int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) { return OFFLOAD_SUCCESS; } +int32_t DeviceTy::createEvent(void **Event) { + if (RTL->create_event) + return RTL->create_event(RTLDeviceID, Event); + + return OFFLOAD_SUCCESS; +} + +int32_t DeviceTy::recordEvent(void *Event, AsyncInfoTy &AsyncInfo) { + if (RTL->record_event) + return RTL->record_event(RTLDeviceID, Event, AsyncInfo); + + return OFFLOAD_SUCCESS; +} + +int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) { + if (RTL->wait_event) + return RTL->wait_event(RTLDeviceID, Event, AsyncInfo); + + return OFFLOAD_SUCCESS; +} + +int32_t DeviceTy::syncEvent(void *Event) { + if (RTL->sync_event) + return RTL->sync_event(RTLDeviceID, Event); + + return OFFLOAD_SUCCESS; +} + +int32_t DeviceTy::destroyEvent(void *Event) { + if (RTL->destroy_event) + return RTL->destroy_event(RTLDeviceID, Event); + + return OFFLOAD_SUCCESS; +} + /// Check whether a device has an associated RTL and initialize it if it's not /// already initialized. 
bool device_is_ready(int device_num) { diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h index 21cce3539349d..58c6316ff6c32 100644 --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -275,10 +275,32 @@ struct DeviceTy { /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails. int32_t synchronize(AsyncInfoTy &AsyncInfo); - /// Calls the corresponding print in the \p RTLDEVID + /// Calls the corresponding print in the \p RTLDEVID /// device RTL to obtain the information of the specific device. bool printDeviceInfo(int32_t RTLDevID); + /// Event related interfaces. + /// { + /// Create an event. + int32_t createEvent(void **Event); + + /// Record the event based on status in AsyncInfo->Queue at the moment the + /// function is called. + int32_t recordEvent(void *Event, AsyncInfoTy &AsyncInfo); + + /// Wait for an event. This function can be blocking or non-blocking, + /// depending on the implementation. It is expected to set a dependence on the + /// event such that corresponding operations shall only start once the event + /// is fulfilled. + int32_t waitEvent(void *Event, AsyncInfoTy &AsyncInfo); + + /// Synchronize the event. It is expected to block the thread. + int32_t syncEvent(void *Event); + + /// Destroy the event. 
+ int32_t destroyEvent(void *Event); + /// } + private: // Call to RTL void init(); // To be called only via DeviceTy::initOnce() diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp index 47d3a0f544db1..264b1d4f7d33a 100644 --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -183,6 +183,14 @@ void RTLsTy::LoadRTLs() { dlsym(dynlib_handle, "__tgt_rtl_set_info_flag"); *((void **)&R.print_device_info) = dlsym(dynlib_handle, "__tgt_rtl_print_device_info"); + *((void **)&R.create_event) = + dlsym(dynlib_handle, "__tgt_rtl_create_event"); + *((void **)&R.record_event) = + dlsym(dynlib_handle, "__tgt_rtl_record_event"); + *((void **)&R.wait_event) = dlsym(dynlib_handle, "__tgt_rtl_wait_event"); + *((void **)&R.sync_event) = dlsym(dynlib_handle, "__tgt_rtl_sync_event"); + *((void **)&R.destroy_event) = + dlsym(dynlib_handle, "__tgt_rtl_destroy_event"); } #if OMPT_SUPPORT diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h index db13927cff53f..88328d23a5284 100644 --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -57,6 +57,11 @@ struct RTLInfoTy { typedef int32_t(supports_empty_images_ty)(); typedef void(print_device_info_ty)(int32_t); typedef void(set_info_flag_ty)(uint32_t); + typedef int32_t(create_event_ty)(int32_t, void **); + typedef int32_t(record_event_ty)(int32_t, void *, __tgt_async_info *); + typedef int32_t(wait_event_ty)(int32_t, void *, __tgt_async_info *); + typedef int32_t(sync_event_ty)(int32_t, void *); + typedef int32_t(destroy_event_ty)(int32_t, void *); int32_t Idx = -1; // RTL index, index is the number of devices // of other RTLs that were registered before, @@ -95,6 +100,11 @@ struct RTLInfoTy { supports_empty_images_ty *supports_empty_images = nullptr; set_info_flag_ty *set_info_flag = nullptr; print_device_info_ty *print_device_info = nullptr; + create_event_ty *create_event = nullptr; + record_event_ty *record_event = nullptr; + wait_event_ty 
*wait_event = nullptr; + sync_event_ty *sync_event = nullptr; + destroy_event_ty *destroy_event = nullptr; // Are there images associated with this RTL. bool isUsed = false;