diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index 74b59a4ab367c..e71f0e22d8b70 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -528,6 +528,11 @@ struct DeviceTy {
   int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                        int64_t Size, AsyncInfoTy &AsyncInfo);
 
+  /// Fill \p NumValues consecutive 32-bit words on the target device,
+  /// starting at \p Ptr, with \p Val (a word-wise "memset").
+  int32_t fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                     AsyncInfoTy &AsyncInfo);
+
   /// Notify the plugin about a new mapping starting at the host address
   /// \p HstPtr and \p Size bytes.
   int32_t notifyDataMapped(void *HstPtr, int64_t Size);
diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h
index 8bdb39de9da9e..70f6c2fe0c07d 100644
--- a/openmp/libomptarget/include/omptargetplugin.h
+++ b/openmp/libomptarget/include/omptargetplugin.h
@@ -109,6 +109,17 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr,
                                       int32_t DesID, void *DstPtr, int64_t Size,
                                       __tgt_async_info *AsyncInfo);
 
+// Fill NumValues consecutive 32-bit words on the target device, starting at
+// Ptr, with Val by calling a native driver operation (a word-wise "memset").
+// In case of success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_fill_memory(int32_t DevID, void *Ptr, int32_t Val,
+                              int64_t NumValues);
+
+// Asynchronous version of __tgt_rtl_fill_memory
+int32_t __tgt_rtl_fill_memory_async(int32_t DevID, void *Ptr, int32_t Val,
+                                    int64_t NumValues,
+                                    __tgt_async_info *AsyncInfo);
+
 // De-allocate the data referenced by target ptr on the device. In case of
 // success, return zero. Otherwise, return an error code. Kind dictates what
 // allocator to use (e.g. shared, host, device).
diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
index 49a62685dcdbf..1f9dfd8061390 100644
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -48,6 +48,9 @@ struct RTLInfoTy {
   typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t);
   typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *,
                                           int64_t, __tgt_async_info *);
+  typedef int32_t(fill_memory_ty)(int32_t, void *, int32_t, int64_t);
+  typedef int32_t(fill_memory_async_ty)(int32_t, void *, int32_t, int64_t,
+                                        __tgt_async_info *);
   typedef int32_t(data_delete_ty)(int32_t, void *, int32_t);
   typedef int32_t(launch_kernel_ty)(int32_t, void *, void **, ptrdiff_t *,
                                     const KernelArgsTy *, __tgt_async_info *);
@@ -101,6 +104,8 @@ struct RTLInfoTy {
   data_retrieve_async_ty *data_retrieve_async = nullptr;
   data_exchange_ty *data_exchange = nullptr;
   data_exchange_async_ty *data_exchange_async = nullptr;
+  fill_memory_ty *fill_memory = nullptr;
+  fill_memory_async_ty *fill_memory_async = nullptr;
   data_delete_ty *data_delete = nullptr;
   launch_kernel_ty *launch_kernel = nullptr;
   init_requires_ty *init_requires = nullptr;
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 4b8ac2f5f9ff5..641e14fc46480 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2357,6 +2357,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                                           getAgent(), (uint64_t)Size);
   }
 
+  /// Fill \p NumValues consecutive 32-bit words at \p Ptr with \p Val using
+  /// hsa_amd_memory_fill. \p AsyncInfoWrapper is currently unused.
+  Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    hsa_status_t Status =
+        hsa_amd_memory_fill(Ptr, static_cast<uint32_t>(Val), NumValues);
+    return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s");
+  }
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     // TODO: Implement this function.
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 3b0b7de86a926..973d2334e1639 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -1453,6 +1453,14 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
   return Err;
 }
 
+Error GenericDeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                                  __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  auto Err = fillMemoryImpl(Ptr, Val, NumValues, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                     ptrdiff_t *ArgOffsets,
                                     KernelArgsTy &KernelArgs,
@@ -1895,6 +1903,26 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
   return OFFLOAD_SUCCESS;
 }
 
+int32_t __tgt_rtl_fill_memory(int32_t DevId, void *Ptr, int32_t Val,
+                              int64_t NumValues) {
+  return __tgt_rtl_fill_memory_async(DevId, Ptr, Val, NumValues,
+                                     /* AsyncInfoPtr */ nullptr);
+}
+
+int32_t __tgt_rtl_fill_memory_async(int32_t DevId, void *Ptr, int32_t Val,
+                                    int64_t NumValues,
+                                    __tgt_async_info *AsyncInfo) {
+  GenericDeviceTy &Device = Plugin::get().getDevice(DevId);
+  auto Err = Device.fillMemory(Ptr, Val, NumValues, AsyncInfo);
+  if (Err) {
+    REPORT("Failure to fill memory on device (%d) at pointer " DPxMOD
+           " with byte value %d and %" PRId64 " values: %s\n",
+           DevId, DPxPTR(Ptr), Val, NumValues, toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+  return OFFLOAD_SUCCESS;
+}
+
 int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
                                 void **TgtArgs, ptrdiff_t *TgtOffsets,
                                 KernelArgsTy *KernelArgs,
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 6abd1b6829ab5..bff6c4e8caf26 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -775,6 +775,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                  void *DstPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Fill \p NumValues consecutive 32-bit words at \p Ptr with \p Val.
+  Error fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                   __tgt_async_info *AsyncInfo);
+  virtual Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                               AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Run the kernel associated with \p EntryPtr
   Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
                      KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 97e49addc5608..2148739588e82 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -746,6 +746,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
                          void *DstPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override;
 
+  /// Fill memory on the target device (aka memset).
+  Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     if (auto Err = setContext())
@@ -1387,6 +1391,21 @@ Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
   return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
 }
 
+/// Fill \p NumValues consecutive 32-bit words at \p Ptr with \p Val using the
+/// non-stream cuMemsetD32 call. \p AsyncInfoWrapper is currently unused.
+Error CUDADeviceTy::fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  // Select this device's context before calling into the driver, as
+  // initAsyncInfoImpl does.
+  if (auto Err = setContext())
+    return Err;
+
+  CUdeviceptr DevPtr = reinterpret_cast<CUdeviceptr>(Ptr);
+  CUresult Res = cuMemsetD32(DevPtr, static_cast<unsigned int>(Val),
+                             static_cast<size_t>(NumValues));
+  return Plugin::check(Res, "Error in cuMemsetD32: %s");
+}
+
 GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); }
 
 GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index c0107c1f14f76..39864c54530b9 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -267,6 +267,17 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Plugin::error("dataExchangeImpl not supported");
   }
 
+  /// Fill \p NumValues consecutive 32-bit words at \p Ptr with \p Val.
+  Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    // std::memset would replicate only the low byte of Val; store whole
+    // 32-bit words so that arbitrary patterns are written correctly.
+    int32_t *Words = static_cast<int32_t *>(Ptr);
+    for (uint64_t I = 0; I < NumValues; ++I)
+      Words[I] = Val;
+    return Plugin::success();
+  }
+
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
index ecef02c8a0d3d..194f64e218b17 100644
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -320,21 +320,39 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
     // That will require the ability to execute a kernel from within
     // libomptarget.so (which we do not have at the moment).
 
-    // This is a very slow path: create a filled array on the host and upload
-    // it to the GPU device.
-    int InitialDevice = omp_get_initial_device();
-    void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
-    if (Shadow) {
-      (void)memset(Shadow, ByteVal, NumBytes);
-      (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
-                              InitialDevice);
-      (void)omp_target_free(Shadow, InitialDevice);
-    } else {
-      // If the omp_target_alloc has failed, let's just not do anything.
-      // omp_target_memset does not have any good way to fail, so we
-      // simply avoid a catastrophic failure of the process for now.
-      DP("omp_target_memset failed to fill memory due to error with "
-         "omp_target_alloc");
+    // Fast path: ask the plugin for a native fill. The plugin interface fills
+    // 32-bit words, so both the size and the address must be word-aligned.
+    bool Filled = false;
+    if (NumBytes % sizeof(int32_t) == 0 &&
+        reinterpret_cast<uintptr_t>(Ptr) % alignof(int32_t) == 0) {
+      DeviceTy &Dev = *PM->Devices[DeviceNum];
+      AsyncInfoTy AsyncInfo(Dev);
+      // Like memset, only the low byte of ByteVal is used. Replicate it in
+      // unsigned arithmetic: shifting a value >= 0x80 left by 24 bits would
+      // overflow a signed int.
+      uint32_t Pattern = static_cast<unsigned char>(ByteVal) * 0x01010101u;
+      uint64_t NumValues = NumBytes / sizeof(int32_t);
+      Filled = Dev.fillMemory(Ptr, static_cast<int32_t>(Pattern), NumValues,
+                              AsyncInfo) == OFFLOAD_SUCCESS;
+    }
+
+    if (!Filled) {
+      // This is a very slow path: create a filled array on the host and upload
+      // it to the GPU device.
+      int InitialDevice = omp_get_initial_device();
+      void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
+      if (Shadow) {
+        (void)memset(Shadow, ByteVal, NumBytes);
+        (void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
+                                InitialDevice);
+        (void)omp_target_free(Shadow, InitialDevice);
+      } else {
+        // If the omp_target_alloc has failed, let's just not do anything.
+        // omp_target_memset does not have any good way to fail, so we
+        // simply avoid a catastrophic failure of the process for now.
+        DP("omp_target_memset failed to fill memory due to error with "
+           "omp_target_alloc");
+      }
     }
   }
 
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 87ee480825217..e2a26900494a1 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -672,6 +672,22 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                                   DstPtr, Size, AsyncInfo);
 }
 
+// Run a "fill memory" operation on the target device: store Val into
+// NumValues consecutive 32-bit words starting at Ptr. Returns OFFLOAD_FAIL
+// if the plugin does not provide a fill entry point.
+int32_t DeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
+                             AsyncInfoTy &AsyncInfo) {
+  if (!AsyncInfo || !RTL->fill_memory_async || !RTL->synchronize) {
+    // The fill entry points are looked up by symbol name when the plugin is
+    // loaded and stay null if absent. An assert would compile away in
+    // release builds and leave a call through a null pointer.
+    if (!RTL->fill_memory)
+      return OFFLOAD_FAIL;
+    return RTL->fill_memory(RTLDeviceID, Ptr, Val, NumValues);
+  }
+  return RTL->fill_memory_async(RTLDeviceID, Ptr, Val, NumValues, AsyncInfo);
+}
+
 int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
   if (!RTL->data_notify_mapped)
     return OFFLOAD_SUCCESS;
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
index 86509cd69c561..2a3ae09a9abe5 100644
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -209,6 +209,10 @@ bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) {
       DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange");
   *((void **)&RTL.data_exchange_async) =
       DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async");
+  *((void **)&RTL.fill_memory) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_fill_memory");
+  *((void **)&RTL.fill_memory_async) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_fill_memory_async");
   *((void **)&RTL.is_data_exchangable) =
       DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable");
   *((void **)&RTL.supports_empty_images) =