Skip to content

Commit

Permalink
[OpenMP] Support 'omp_get_num_procs' on the device (#65501)
Browse files Browse the repository at this point in the history
Summary:
The `omp_get_num_procs()` function should return the amount of
parallelism availible. On the GPU, this was not defined. We have elected
to define this function as the maximum amount of wavefronts / warps that
can be simultaneously resident on the device. For AMDGPU this is the
number of CUs multiplied byth CU's per wave. For NVPTX this is the
maximum threads per SM divided by the warp size and multiplied by the
number of SMs.
  • Loading branch information
jhuber6 committed Sep 6, 2023
1 parent 6f38713 commit 460840c
Show file tree
Hide file tree
Showing 9 changed files with 52 additions and 10 deletions.
3 changes: 3 additions & 0 deletions openmp/libomptarget/DeviceRTL/include/Configuration.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ void *getIndirectCallTablePtr();
/// Returns the size of the indirect call table.
uint64_t getIndirectCallTableSize();

/// Returns the size of the indirect call table.
uint64_t getHardwareParallelism();

/// Return if debugging is enabled for the given debug kind.
bool isDebugMode(DebugKind Level);

Expand Down
4 changes: 4 additions & 0 deletions openmp/libomptarget/DeviceRTL/src/Configuration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ void *config::getIndirectCallTablePtr() {
__omp_rtl_device_environment.IndirectCallTable);
}

uint64_t config::getHardwareParallelism() {
return __omp_rtl_device_environment.HardwareParallelism;
}

uint64_t config::getIndirectCallTableSize() {
return __omp_rtl_device_environment.IndirectCallTableSize;
}
Expand Down
4 changes: 3 additions & 1 deletion openmp/libomptarget/DeviceRTL/src/Mapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,9 @@ uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
return NumberOfBlocks;
}

uint32_t mapping::getNumberOfProcessorElements() { __builtin_trap(); }
uint32_t mapping::getNumberOfProcessorElements() {
return static_cast<uint32_t>(config::getHardwareParallelism());
}

///}

Expand Down
1 change: 1 addition & 0 deletions openmp/libomptarget/include/Environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ struct DeviceEnvironmentTy {
uint64_t ClockFrequency;
uintptr_t IndirectCallTable;
uint64_t IndirectCallTableSize;
uint64_t HardwareParallelism;
};

// NOTE: Please don't change the order of those members as their indices are
Expand Down
13 changes: 9 additions & 4 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1942,16 +1942,21 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// AMDGPU devices do not have the concept of contexts.
Error setContext() override { return Plugin::success(); }

/// AMDGPU returns the product of the number of compute units and the waves
/// per compute unit.
uint64_t getHardwareParallelism() const override {
return HardwareParallelism;
}

/// We want to set up the RPC server for host services to the GPU if it is
/// availible.
bool shouldSetupRPCServer() const override {
return libomptargetSupportsRPC();
}

/// AMDGPU returns the product of the number of compute units and the waves
/// per compute unit.
uint64_t requestedRPCPortCount() const override {
return HardwareParallelism;
/// The RPC interface should have enough space for all availible parallelism.
uint64_t requestedRPCPortCount() const override {
return getHardwareParallelism();
}

/// Get the stream of the asynchronous info sructure or get a new one.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,7 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
DeviceEnvironment.IndirectCallTable =
reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
DeviceEnvironment.HardwareParallelism = getHardwareParallelism();

// Create the metainfo of the device environment global.
GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return OMPX_MinThreadsForLowTripCount;
}

/// Get the total amount of hardware parallelism supported by the target
/// device. This is the total amount of warps or wavefronts that can be
/// resident on the device simultaneously.
virtual uint64_t getHardwareParallelism() const { return 0; }

/// Get the RPC server running on this device.
RPCServerTy *getRPCServer() const { return RPCServer; }

Expand Down
16 changes: 11 additions & 5 deletions openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,8 +301,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
NumMuliprocessors))
return Err;
if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
MaxThreadsPerSM))
if (auto Err =
getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
MaxThreadsPerSM))
return Err;
if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
return Err;
Expand Down Expand Up @@ -373,16 +374,21 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
}

/// NVIDIA returns the product of the SM count and the number of warps that
/// fit if the maximum number of threads were scheduled on each SM.
uint64_t getHardwareParallelism() const override {
return HardwareParallelism;
}

/// We want to set up the RPC server for host services to the GPU if it is
/// availible.
bool shouldSetupRPCServer() const override {
return libomptargetSupportsRPC();
}

/// NVIDIA returns the product of the SM count and the number of warps that
/// fit if the maximum number of threads were scheduled on each SM.
/// The RPC interface should have enough space for all availible parallelism.
uint64_t requestedRPCPortCount() const override {
return HardwareParallelism;
return getHardwareParallelism();
}

/// Get the stream of the asynchronous info sructure or get a new one.
Expand Down
15 changes: 15 additions & 0 deletions openmp/libomptarget/test/api/omp_get_num_procs.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// RUN: %libomptarget-compile-run-and-check-generic

#include <stdio.h>

int omp_get_num_procs();

int main() {
int num_procs;
#pragma omp target map(from : num_procs)
{ num_procs = omp_get_num_procs(); }

// CHECK: PASS
if (num_procs > 0)
printf("PASS\n");
}

0 comments on commit 460840c

Please sign in to comment.