Skip to content

Commit

Permalink
[OpenMP] Associate the KernelEnvironment with the GenericKernelTy (#7…
Browse files Browse the repository at this point in the history
…0383)

By associating the kernel environment with the generic kernel we can
access middle-end information easily, including the launch bounds ranges
that are acceptable. By constraining the number of threads accordingly,
we now obey the user-provided bounds that were passed via attributes.
  • Loading branch information
jdoerfert committed Oct 29, 2023
1 parent d8f5a18 commit d346c82
Show file tree
Hide file tree
Showing 10 changed files with 74 additions and 105 deletions.
15 changes: 7 additions & 8 deletions clang/test/OpenMP/bug57757.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,23 @@ void foo() {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP1]], i64 0, i32 2
// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]])
// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]])
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA18:![0-9]+]], !alias.scope !13, !noalias !16
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA16:![0-9]+]], !alias.scope !13, !noalias !17
// CHECK-NEXT: switch i32 [[TMP3]], label [[DOTOMP_OUTLINED__EXIT:%.*]] [
// CHECK-NEXT: i32 0, label [[DOTUNTIED_JMP__I:%.*]]
// CHECK-NEXT: i32 1, label [[DOTUNTIED_NEXT__I:%.*]]
// CHECK-NEXT: ]
// CHECK: .untied.jmp..i:
// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA18]], !alias.scope !13, !noalias !16
// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !19
// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], !alias.scope !13, !noalias !17
// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias !13
// CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]]
// CHECK: .untied.next..i:
// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i64 0, i32 1
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 2
// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 1
// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA20:![0-9]+]], !alias.scope !16, !noalias !13
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA18]], !alias.scope !16, !noalias !13
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA21:![0-9]+]], !alias.scope !16, !noalias !13
// CHECK-NEXT: tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !19
// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA19:![0-9]+]], !noalias !13
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA16]], !noalias !13
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA20:![0-9]+]], !noalias !13
// CHECK-NEXT: tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias !13
// CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]]
// CHECK: .omp_outlined..exit:
// CHECK-NEXT: ret i32 0
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4093,8 +4093,8 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,

Function *Kernel = Builder.GetInsertBlock()->getParent();

/// Manifest the launch configuration in the metadata matching the kernel
/// environment.
// Manifest the launch configuration in the metadata matching the kernel
// environment.
if (MinTeamsVal > 1 || MaxTeamsVal > 0)
writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);

Expand Down
8 changes: 3 additions & 5 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,8 +411,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
/// generic kernel class.
struct AMDGPUKernelTy : public GenericKernelTy {
/// Create an AMDGPU kernel with a name and an execution mode.
AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
: GenericKernelTy(Name, ExecutionMode) {}
AMDGPUKernelTy(const char *Name) : GenericKernelTy(Name) {}

/// Initialize the AMDGPU kernel.
Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
Expand Down Expand Up @@ -1978,14 +1977,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {

/// Allocate and construct an AMDGPU kernel.
Expected<GenericKernelTy &>
constructKernel(const __tgt_offload_entry &KernelEntry,
OMPTgtExecModeFlags ExecMode) override {
constructKernel(const __tgt_offload_entry &KernelEntry) override {
// Allocate and construct the AMDGPU kernel.
AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
if (!AMDGPUKernel)
return Plugin::error("Failed to allocate memory for AMDGPU kernel");

new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name, ExecMode);
new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name);

return *AMDGPUKernel;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,9 +339,33 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,

ImagePtr = &Image;

PreferredNumThreads = GenericDevice.getDefaultNumThreads();
// Retrieve kernel environment object for the kernel.
GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
sizeof(KernelEnvironment), &KernelEnvironment);
GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
if (auto Err =
GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
DP("Failed to read kernel environment for '%s': %s\n"
"Using default SPMD (2) execution mode\n",
Name, ErrStr.data());
KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_SPMD;
KernelEnvironment.Configuration.MayUseNestedParallelism = /*Unknown=*/2;
KernelEnvironment.Configuration.UseGenericStateMachine = /*Unknown=*/2;
}

MaxNumThreads = GenericDevice.getThreadLimit();
// Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
? std::min(KernelEnvironment.Configuration.MaxThreads,
int32_t(GenericDevice.getThreadLimit()))
: GenericDevice.getThreadLimit();

// Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
PreferredNumThreads =
KernelEnvironment.Configuration.MinThreads > 0
? std::max(KernelEnvironment.Configuration.MinThreads,
int32_t(GenericDevice.getDefaultNumThreads()))
: GenericDevice.getDefaultNumThreads();

return initImpl(GenericDevice, Image);
}
Expand Down Expand Up @@ -890,13 +914,8 @@ Error GenericDeviceTy::registerKernelOffloadEntry(
__tgt_offload_entry &DeviceEntry) {
DeviceEntry = KernelEntry;

// Retrieve the execution mode.
auto ExecModeOrErr = getExecutionModeForKernel(KernelEntry.name, Image);
if (!ExecModeOrErr)
return ExecModeOrErr.takeError();

// Create a kernel object.
auto KernelOrErr = constructKernel(KernelEntry, *ExecModeOrErr);
auto KernelOrErr = constructKernel(KernelEntry);
if (!KernelOrErr)
return KernelOrErr.takeError();

Expand All @@ -914,45 +933,6 @@ Error GenericDeviceTy::registerKernelOffloadEntry(
return Plugin::success();
}

Expected<KernelEnvironmentTy>
GenericDeviceTy::getKernelEnvironmentForKernel(StringRef Name,
DeviceImageTy &Image) {
// Create a metadata object for the kernel environment object.
StaticGlobalTy<KernelEnvironmentTy> KernelEnv(Name.data(),
"_kernel_environment");

// Retrieve kernel environment object for the kernel.
GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
if (auto Err = GHandler.readGlobalFromImage(*this, Image, KernelEnv))
return std::move(Err);

return KernelEnv.getValue();
}

Expected<OMPTgtExecModeFlags>
GenericDeviceTy::getExecutionModeForKernel(StringRef Name,
DeviceImageTy &Image) {
auto KernelEnvOrError = getKernelEnvironmentForKernel(Name, Image);
if (!KernelEnvOrError) {
[[maybe_unused]] std::string ErrStr =
toString(KernelEnvOrError.takeError());
DP("Failed to read kernel environment for '%s': %s\n"
"Using default SPMD (2) execution mode\n",
Name.data(), ErrStr.data());
return OMP_TGT_EXEC_MODE_SPMD;
}

auto &KernelEnv = *KernelEnvOrError;
auto ExecMode = KernelEnv.Configuration.ExecMode;

// Check that the retrieved execution mode is valid.
if (!GenericKernelTy::isValidExecutionMode(ExecMode))
return Plugin::error("Invalid execution mode %d for '%s'", ExecMode,
Name.data());

return ExecMode;
}

Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
size_t Size, bool ExternallyLocked) {
// Insert the new entry into the map.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,8 @@ class DeviceImageTy {
/// implement the necessary virtual function members.
struct GenericKernelTy {
/// Construct a kernel with a name and a execution mode.
GenericKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
: Name(Name), ExecutionMode(ExecutionMode), PreferredNumThreads(0),
MaxNumThreads(0) {}
GenericKernelTy(const char *Name)
: Name(Name), PreferredNumThreads(0), MaxNumThreads(0) {}

virtual ~GenericKernelTy() {}

Expand Down Expand Up @@ -285,6 +284,11 @@ struct GenericKernelTy {
return *ImagePtr;
}

/// Return the kernel environment object for kernel \p Name.
const KernelEnvironmentTy &getKernelEnvironmentForKernel() {
return KernelEnvironment;
}

/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
switch (ExecutionMode) {
Expand All @@ -299,7 +303,7 @@ struct GenericKernelTy {
protected:
/// Get the execution mode name of the kernel.
const char *getExecutionModeName() const {
switch (ExecutionMode) {
switch (KernelEnvironment.Configuration.ExecMode) {
case OMP_TGT_EXEC_MODE_SPMD:
return "SPMD";
case OMP_TGT_EXEC_MODE_GENERIC:
Expand Down Expand Up @@ -343,19 +347,20 @@ struct GenericKernelTy {

/// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
bool isGenericSPMDMode() const {
return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD;
return KernelEnvironment.Configuration.ExecMode ==
OMP_TGT_EXEC_MODE_GENERIC_SPMD;
}
bool isGenericMode() const {
return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC;
return KernelEnvironment.Configuration.ExecMode ==
OMP_TGT_EXEC_MODE_GENERIC;
}
bool isSPMDMode() const {
return KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_SPMD;
}
bool isSPMDMode() const { return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; }

/// The kernel name.
const char *Name;

/// The execution flags of the kernel.
OMPTgtExecModeFlags ExecutionMode;

/// The image that contains this kernel.
DeviceImageTy *ImagePtr = nullptr;

Expand All @@ -365,6 +370,9 @@ struct GenericKernelTy {

/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;

/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;
};

/// Class representing a map of host pinned allocations. We track these pinned
Expand Down Expand Up @@ -819,8 +827,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {

/// Allocate and construct a kernel object.
virtual Expected<GenericKernelTy &>
constructKernel(const __tgt_offload_entry &KernelEntry,
OMPTgtExecModeFlags ExecMode) = 0;
constructKernel(const __tgt_offload_entry &KernelEntry) = 0;

/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
Expand Down Expand Up @@ -864,10 +871,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);

protected:
/// Return the execution mode used for kernel \p Name.
virtual Expected<OMPTgtExecModeFlags>
getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image);

/// Environment variables defined by the LLVM OpenMP implementation
/// regarding the initial number of streams and events.
UInt32Envar OMPX_InitialNumStreams;
Expand Down Expand Up @@ -916,10 +919,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
#endif

private:
/// Return the kernel environment object for kernel \p Name.
Expected<KernelEnvironmentTy>
getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);

DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
};
Expand Down
8 changes: 3 additions & 5 deletions openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,7 @@ struct CUDADeviceImageTy : public DeviceImageTy {
/// generic kernel class.
struct CUDAKernelTy : public GenericKernelTy {
/// Create a CUDA kernel with a name and an execution mode.
CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
: GenericKernelTy(Name, ExecMode), Func(nullptr) {}
CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}

/// Initialize the CUDA kernel.
Error initImpl(GenericDeviceTy &GenericDevice,
Expand Down Expand Up @@ -356,14 +355,13 @@ struct CUDADeviceTy : public GenericDeviceTy {

/// Allocate and construct a CUDA kernel.
Expected<GenericKernelTy &>
constructKernel(const __tgt_offload_entry &KernelEntry,
OMPTgtExecModeFlags ExecMode) override {
constructKernel(const __tgt_offload_entry &KernelEntry) override {
// Allocate and construct the CUDA kernel.
CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
if (!CUDAKernel)
return Plugin::error("Failed to allocate memory for CUDA kernel");

new (CUDAKernel) CUDAKernelTy(KernelEntry.name, ExecMode);
new (CUDAKernel) CUDAKernelTy(KernelEntry.name);

return *CUDAKernel;
}
Expand Down
20 changes: 8 additions & 12 deletions openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include "llvm/ADT/SmallVector.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/DynamicLibrary.h"

Expand All @@ -51,8 +52,7 @@ using llvm::sys::DynamicLibrary;
/// Class implementing kernel functionalities for GenELF64.
struct GenELF64KernelTy : public GenericKernelTy {
/// Construct the kernel with a name and an execution mode.
GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecMode)
: GenericKernelTy(Name, ExecMode), Func(nullptr) {}
GenELF64KernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}

/// Initialize the kernel.
Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
Expand All @@ -71,6 +71,10 @@ struct GenELF64KernelTy : public GenericKernelTy {
// Save the function pointer.
Func = (void (*)())Global.getPtr();

KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_GENERIC;
KernelEnvironment.Configuration.MayUseNestedParallelism = /* Unknown */ 2;
KernelEnvironment.Configuration.UseGenericStateMachine = /* Unknown */ 2;

// Set the maximum number of threads to a single.
MaxNumThreads = 1;
return Plugin::success();
Expand Down Expand Up @@ -137,15 +141,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {

/// Construct the kernel for a specific image on the device.
Expected<GenericKernelTy &>
constructKernel(const __tgt_offload_entry &KernelEntry,
OMPTgtExecModeFlags ExecMode) override {
constructKernel(const __tgt_offload_entry &KernelEntry) override {
// Allocate and construct the kernel.
GenELF64KernelTy *GenELF64Kernel =
Plugin::get().allocate<GenELF64KernelTy>();
if (!GenELF64Kernel)
return Plugin::error("Failed to allocate memory for GenELF64 kernel");

new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name, ExecMode);
new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name);

return *GenELF64Kernel;
}
Expand Down Expand Up @@ -325,13 +328,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
}
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }

protected:
/// Retrieve the execution mode for kernels. All kernels use the generic mode.
Expected<OMPTgtExecModeFlags>
getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image) override {
return OMP_TGT_EXEC_MODE_GENERIC;
}

private:
/// Grid values for Generic ELF64 plugins.
static constexpr GV GenELF64GridValues = {
Expand Down
3 changes: 1 addition & 2 deletions openmp/libomptarget/test/offloading/default_thread_limit.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ int main() {
for (int i = 0; i < N; ++i) {
optnone();
}
// FIXME: Use the attribute value to imply a thread_limit
// DEFAULT: {{(128|256)}} (MaxFlatWorkGroupSize: 42
// DEFAULT: 42 (MaxFlatWorkGroupSize: 42
#pragma omp target ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
#pragma omp teams distribute parallel for
for (int i = 0; i < N; ++i) {
Expand Down
4 changes: 2 additions & 2 deletions openmp/libomptarget/test/offloading/thread_state_1.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ int main() {
}
}
}
if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt > 1 && i_lvl == 2 && i_tid == 0 &&
i_nt == 1) {
if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt >= 1 && i_lvl == 2 &&
i_tid == 0 && i_nt == 1) {
// CHECK: Success
printf("Success\n");
return 0;
Expand Down
4 changes: 2 additions & 2 deletions openmp/libomptarget/test/offloading/thread_state_2.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ int main() {
}
}
}
if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt > 1 && i_lvl == 2 && i_tid == 0 &&
i_nt == 1) {
if (o_lvl == 1 && o_tid == o_nt - 1 && o_nt >= 1 && i_lvl == 2 &&
i_tid == 0 && i_nt == 1) {
// CHECK: Success
printf("Success\n");
return 0;
Expand Down

0 comments on commit d346c82

Please sign in to comment.