diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h index 09bce1092f096..068c0166845a7 100644 --- a/openmp/libomptarget/DeviceRTL/include/Configuration.h +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -37,6 +37,9 @@ uint32_t getDebugKind(); /// Return the amount of dynamic shared memory that was allocated at launch. uint64_t getDynamicMemorySize(); +/// Returns the cycles per second of the device's fixed frequency clock. +uint64_t getClockFrequency(); + /// Return if debugging is enabled for the given debug kind. bool isDebugMode(DebugKind Level); diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp index ceccef625ed29..994ff2b67bb34 100644 --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -46,6 +46,10 @@ uint64_t config::getDynamicMemorySize() { return __omp_rtl_device_environment.DynamicMemSize; } +uint64_t config::getClockFrequency() { + return __omp_rtl_device_environment.ClockFrequency; +} + bool config::isDebugMode(config::DebugKind Kind) { return config::getDebugKind() & Kind; } diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp index 68ce445a16edf..a19a263e55b24 100644 --- a/openmp/libomptarget/DeviceRTL/src/Misc.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// +#include "Configuration.h" #include "Types.h" #include "Debug.h" @@ -27,14 +28,23 @@ double getWTime(); ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) -double getWTick() { return ((double)1E-9); } +double getWTick() { + // The number of ticks per second for the AMDGPU clock varies by card and can + // only be retrived by querying the driver. We rely on the device environment + // to inform us what the proper frequency is. + return 1.0 / config::getClockFrequency(); +} double getWTime() { - // The intrinsics for measuring time have undocumented frequency - // This will probably need to be found by measurement on a number of - // architectures. Until then, return 0, which is very inaccurate as a - // timer but resolves the undefined symbol at link time. - return 0; + uint64_t NumTicks = 0; + if constexpr (__has_builtin(__builtin_amdgcn_s_sendmsg_rtnl)) + NumTicks = __builtin_amdgcn_s_sendmsg_rtnl(0x83); + else if constexpr (__has_builtin(__builtin_amdgcn_s_memrealtime)) + NumTicks = __builtin_amdgcn_s_memrealtime(); + else if constexpr (__has_builtin(__builtin_amdgcn_s_memtime)) + NumTicks = __builtin_amdgcn_s_memtime(); + + return static_cast(NumTicks) * getWTick(); } #pragma omp end declare variant diff --git a/openmp/libomptarget/include/DeviceEnvironment.h b/openmp/libomptarget/include/DeviceEnvironment.h index 231492c68f762..4260002a1f036 100644 --- a/openmp/libomptarget/include/DeviceEnvironment.h +++ b/openmp/libomptarget/include/DeviceEnvironment.h @@ -20,6 +20,7 @@ struct DeviceEnvironmentTy { uint32_t NumDevices; uint32_t DeviceNum; uint32_t DynamicMemSize; + uint64_t ClockFrequency; }; #endif diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index 21436baecf9da..1fcbcf29f9e35 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1596,6 +1596,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; GridValues.GV_Warp_Size = WavefrontSize; + // Get the frequency of the steady clock. + if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY, + ClockFrequency)) + return Err; + // Load the grid values dependending on the wavefront. if (WavefrontSize == 32) GridValues = getAMDGPUGridValues<32>(); @@ -1757,6 +1762,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// See GenericDeviceTy::getComputeUnitKind(). std::string getComputeUnitKind() const override { return ComputeUnitKind; } + /// Returns the clock frequency for the given AMDGPU device. + uint64_t getClockFrequency() const override { return ClockFrequency; } + /// Allocate and construct an AMDGPU kernel. Expected constructKernelEntry(const __tgt_offload_entry &KernelEntry, @@ -2417,6 +2425,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// The GPU architecture. std::string ComputeUnitKind; + /// The frequency of the steady clock inside the device. + uint64_t ClockFrequency; + /// Reference to the host device. AMDHostDeviceTy &HostDevice; diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp index c86b2eb357936..9eaaaf817d9f1 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -509,6 +509,7 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, // TODO: The device ID used here is not the real device ID used by OpenMP. DeviceEnvironment.DeviceNum = DeviceId; DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; + DeviceEnvironment.ClockFrequency = getClockFrequency(); // Create the metainfo of the device environment global. GlobalTy DevEnvGlobal("__omp_rtl_device_environment", diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h index 189406ac1dc2a..8fe615b2f6f23 100644 --- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -732,6 +732,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy { return GridValues.GV_Default_Num_Teams; } uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } + virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } /// Get target compute unit kind (e.g., sm_80, or gfx908). virtual std::string getComputeUnitKind() const { return "unknown"; } diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index d823cbebc3d7b..c165b582f63d0 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -829,6 +829,9 @@ struct CUDADeviceTy : public GenericDeviceTy { return ComputeCapability.str(); } + /// Returns the clock frequency for the given NVPTX device. + uint64_t getClockFrequency() const override { return 1000000000; } + private: using CUDAStreamManagerTy = GenericDeviceResourceManagerTy; using CUDAEventManagerTy = GenericDeviceResourceManagerTy; diff --git a/openmp/libomptarget/test/offloading/wtime.c b/openmp/libomptarget/test/offloading/wtime.c index 230d67bced7de..2ba60aadb5bec 100644 --- a/openmp/libomptarget/test/offloading/wtime.c +++ b/openmp/libomptarget/test/offloading/wtime.c @@ -1,7 +1,6 @@ -// RUN: %libomptarget-compileopt-run-and-check-generic - -// UNSUPPORTED: amdgcn-amd-amdhsa +// RUN: %libomptarget-compileopt-and-run-generic +#include #include #include #include @@ -10,17 +9,17 @@ int main(int argc, char *argv[]) { int *data = (int *)malloc(N * sizeof(int)); -#pragma omp target map(from : data[0 : N]) + double duration = 0.0; + +#pragma omp target map(from : data[0 : N]) map(from : duration) { double start = omp_get_wtime(); for (int i = 0; i < N; ++i) data[i] = i; double end = omp_get_wtime(); - double duration = end - start; - printf("duration: %lfs\n", duration); + duration = end - start; } + assert(duration > 0.0); free(data); return 0; } - -// CHECK: duration: {{.+[1-9]+}}