Skip to content

Commit

Permalink
[Libomptarget] Correctly implement getWTime on AMDGPU
Browse files Browse the repository at this point in the history
AMDGPU provides a fixed frequency clock since some generations back.
However, the frequency is variable by card and must be looked up at
runtime. This patch adds a new device environment line for the clock
frequency so that we can use it in the same way as NVPTX. This is the
correct implementation and the version in ASO should be replaced.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D154456
  • Loading branch information
jhuber6 committed Jul 5, 2023
1 parent 6c72cee commit 6764301
Show file tree
Hide file tree
Showing 9 changed files with 47 additions and 14 deletions.
3 changes: 3 additions & 0 deletions openmp/libomptarget/DeviceRTL/include/Configuration.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ uint32_t getDebugKind();
/// Return the amount of dynamic shared memory that was allocated at launch.
uint64_t getDynamicMemorySize();

/// Returns the cycles per second of the device's fixed frequency clock.
uint64_t getClockFrequency();

/// Return if debugging is enabled for the given debug kind.
bool isDebugMode(DebugKind Level);

Expand Down
4 changes: 4 additions & 0 deletions openmp/libomptarget/DeviceRTL/src/Configuration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ uint64_t config::getDynamicMemorySize() {
return __omp_rtl_device_environment.DynamicMemSize;
}

uint64_t config::getClockFrequency() {
return __omp_rtl_device_environment.ClockFrequency;
}

bool config::isDebugMode(config::DebugKind Kind) {
return config::getDebugKind() & Kind;
}
Expand Down
22 changes: 16 additions & 6 deletions openmp/libomptarget/DeviceRTL/src/Misc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
//
//===----------------------------------------------------------------------===//

#include "Configuration.h"
#include "Types.h"

#include "Debug.h"
Expand All @@ -27,14 +28,23 @@ double getWTime();
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

double getWTick() { return ((double)1E-9); }
double getWTick() {
// The number of ticks per second for the AMDGPU clock varies by card and can
// only be retrived by querying the driver. We rely on the device environment
// to inform us what the proper frequency is.
return 1.0 / config::getClockFrequency();
}

double getWTime() {
// The intrinsics for measuring time have undocumented frequency
// This will probably need to be found by measurement on a number of
// architectures. Until then, return 0, which is very inaccurate as a
// timer but resolves the undefined symbol at link time.
return 0;
uint64_t NumTicks = 0;
if constexpr (__has_builtin(__builtin_amdgcn_s_sendmsg_rtnl))
NumTicks = __builtin_amdgcn_s_sendmsg_rtnl(0x83);
else if constexpr (__has_builtin(__builtin_amdgcn_s_memrealtime))
NumTicks = __builtin_amdgcn_s_memrealtime();
else if constexpr (__has_builtin(__builtin_amdgcn_s_memtime))
NumTicks = __builtin_amdgcn_s_memtime();

return static_cast<double>(NumTicks) * getWTick();
}

#pragma omp end declare variant
Expand Down
1 change: 1 addition & 0 deletions openmp/libomptarget/include/DeviceEnvironment.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ struct DeviceEnvironmentTy {
uint32_t NumDevices;
uint32_t DeviceNum;
uint32_t DynamicMemSize;
uint64_t ClockFrequency;
};

#endif
11 changes: 11 additions & 0 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1596,6 +1596,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Err;
GridValues.GV_Warp_Size = WavefrontSize;

// Get the frequency of the steady clock.
if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
ClockFrequency))
return Err;

// Load the grid values dependending on the wavefront.
if (WavefrontSize == 32)
GridValues = getAMDGPUGridValues<32>();
Expand Down Expand Up @@ -1757,6 +1762,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// See GenericDeviceTy::getComputeUnitKind().
std::string getComputeUnitKind() const override { return ComputeUnitKind; }

/// Returns the clock frequency for the given AMDGPU device.
uint64_t getClockFrequency() const override { return ClockFrequency; }

/// Allocate and construct an AMDGPU kernel.
Expected<GenericKernelTy *>
constructKernelEntry(const __tgt_offload_entry &KernelEntry,
Expand Down Expand Up @@ -2417,6 +2425,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// The GPU architecture.
std::string ComputeUnitKind;

/// The frequency of the steady clock inside the device.
uint64_t ClockFrequency;

/// Reference to the host device.
AMDHostDeviceTy &HostDevice;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,7 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
// TODO: The device ID used here is not the real device ID used by OpenMP.
DeviceEnvironment.DeviceNum = DeviceId;
DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
DeviceEnvironment.ClockFrequency = getClockFrequency();

// Create the metainfo of the device environment global.
GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return GridValues.GV_Default_Num_Teams;
}
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }

/// Get target compute unit kind (e.g., sm_80, or gfx908).
virtual std::string getComputeUnitKind() const { return "unknown"; }
Expand Down
3 changes: 3 additions & 0 deletions openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
return ComputeCapability.str();
}

/// Returns the clock frequency for the given NVPTX device.
uint64_t getClockFrequency() const override { return 1000000000; }

private:
using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
Expand Down
15 changes: 7 additions & 8 deletions openmp/libomptarget/test/offloading/wtime.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// RUN: %libomptarget-compileopt-run-and-check-generic

// UNSUPPORTED: amdgcn-amd-amdhsa
// RUN: %libomptarget-compileopt-and-run-generic

#include <assert.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
Expand All @@ -10,17 +9,17 @@

int main(int argc, char *argv[]) {
int *data = (int *)malloc(N * sizeof(int));
#pragma omp target map(from : data[0 : N])
double duration = 0.0;

#pragma omp target map(from : data[0 : N]) map(from : duration)
{
double start = omp_get_wtime();
for (int i = 0; i < N; ++i)
data[i] = i;
double end = omp_get_wtime();
double duration = end - start;
printf("duration: %lfs\n", duration);
duration = end - start;
}
assert(duration > 0.0);
free(data);
return 0;
}

// CHECK: duration: {{.+[1-9]+}}

0 comments on commit 6764301

Please sign in to comment.