[OpenMP] Pass min/max thread and team count to the OMPIRBuilder (#70247)
We now provide the information about the min/max thread and team count
to the OMPIRBuilder, regardless of the source. That means we unify the
handling of `thread_limit`, `num_teams`, and `num_threads` with the
target-specific attributes (`__launch_bounds__` and
`amdgpu_flat_work_group_size`). This is in preparation for passing the
values to the runtime, and for allowing the middle end (OpenMP-opt) to
tighten the values where appropriate. There is no "real" change
introduced by this commit.
jdoerfert committed Oct 26, 2023
1 parent 57cebc7 commit 0ba57c8
Showing 12 changed files with 1,322 additions and 1,165 deletions.
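
To make the unification concrete, here is a minimal OpenMP C++ sketch in the spirit of the test updated below (clang/test/OpenMP/ompx_attributes_codegen.cpp). Only the AMD pragma appears verbatim in the visible hunks; the CUDA spelling is an assumption.

    // Sketch: target-specific launch bounds attached via ompx_attribute now feed
    // the OMPIRBuilder as min/max thread and team counts, alongside the
    // thread_limit / num_teams / num_threads clauses.
    void func() {
      // AMD: threads per team bounded to the range [10, 20].
    #pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
      {}

      // CUDA (assumed spelling): at most 45 threads per team, at least 90 teams.
    #pragma omp target ompx_attribute(__attribute__((launch_bounds(45, 90))))
      {}
    }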
81 changes: 54 additions & 27 deletions clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6021,15 +6021,46 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
};

// Get NumTeams and ThreadLimit attributes
int32_t DefaultValTeams = -1;
uint32_t DefaultValThreads = UINT32_MAX;
getNumTeamsExprForTargetDirective(CGF, D, DefaultValTeams);
getNumThreadsExprForTargetDirective(CGF, D, DefaultValThreads,
int32_t DefaultValMinTeams = 1;
int32_t DefaultValMaxTeams = -1;
uint32_t DefaultValMinThreads = 1;
uint32_t DefaultValMaxThreads = UINT32_MAX;

getNumTeamsExprForTargetDirective(CGF, D, DefaultValMinTeams,
DefaultValMaxTeams);
getNumThreadsExprForTargetDirective(CGF, D, DefaultValMaxThreads,
/*UpperBoundOnly=*/true);

OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction,
DefaultValTeams, DefaultValThreads,
IsOffloadEntry, OutlinedFn, OutlinedFnID);
for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
for (auto *A : C->getAttrs()) {
int32_t MinThreadsVal = 1, MaxThreadsVal = 0;
int32_t MinBlocksVal = 1, MaxBlocksVal = -1;
if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &MaxThreadsVal,
&MinBlocksVal, &MaxBlocksVal);
else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
CGM.handleAMDGPUFlatWorkGroupSizeAttr(
nullptr, Attr, /*ReqdWGS=*/nullptr, &MinThreadsVal, &MaxThreadsVal);
else
continue;

DefaultValMinThreads =
std::max(DefaultValMinThreads, uint32_t(MinThreadsVal));
DefaultValMaxThreads =
DefaultValMaxThreads
? std::min(DefaultValMaxThreads, uint32_t(MaxThreadsVal))
: MaxThreadsVal;
DefaultValMinTeams = DefaultValMinTeams
? std::max(DefaultValMinTeams, MinBlocksVal)
: MinBlocksVal;
DefaultValMaxTeams = std::min(DefaultValMaxTeams, MaxBlocksVal);
}
}

OMPBuilder.emitTargetRegionFunction(
EntryInfo, GenerateOutlinedFunction, DefaultValMinTeams,
DefaultValMaxTeams, DefaultValMinThreads, DefaultValMaxThreads,
IsOffloadEntry, OutlinedFn, OutlinedFnID);

if (!OutlinedFn)
return;
@@ -6038,14 +6069,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(

for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
for (auto *A : C->getAttrs()) {
if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
CGM.handleCUDALaunchBoundsAttr(OutlinedFn, Attr);
else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
CGM.handleAMDGPUFlatWorkGroupSizeAttr(OutlinedFn, Attr);
else if (auto *Attr = dyn_cast<AMDGPUWavesPerEUAttr>(A))
if (auto *Attr = dyn_cast<AMDGPUWavesPerEUAttr>(A))
CGM.handleAMDGPUWavesPerEUAttr(OutlinedFn, Attr);
else
llvm_unreachable("Unexpected attribute kind");
}
}
}
@@ -6103,8 +6128,8 @@ const Stmt *CGOpenMPRuntime::getSingleCompoundChild(ASTContext &Ctx,
}

const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
CodeGenFunction &CGF, const OMPExecutableDirective &D,
int32_t &DefaultVal) {
CodeGenFunction &CGF, const OMPExecutableDirective &D, int32_t &MinTeamsVal,
int32_t &MaxTeamsVal) {

OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
assert(isOpenMPTargetExecutionDirective(DirectiveKind) &&
@@ -6125,22 +6150,22 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
if (NumTeams->isIntegerConstantExpr(CGF.getContext()))
if (auto Constant =
NumTeams->getIntegerConstantExpr(CGF.getContext()))
DefaultVal = Constant->getExtValue();
MinTeamsVal = MaxTeamsVal = Constant->getExtValue();
return NumTeams;
}
DefaultVal = 0;
MinTeamsVal = MaxTeamsVal = 0;
return nullptr;
}
if (isOpenMPParallelDirective(NestedDir->getDirectiveKind()) ||
isOpenMPSimdDirective(NestedDir->getDirectiveKind())) {
DefaultVal = 1;
MinTeamsVal = MaxTeamsVal = 1;
return nullptr;
}
DefaultVal = 1;
MinTeamsVal = MaxTeamsVal = 1;
return nullptr;
}
// A value of -1 is used to check if we need to emit no teams region
DefaultVal = -1;
MinTeamsVal = MaxTeamsVal = -1;
return nullptr;
}
case OMPD_target_teams_loop:
@@ -6154,18 +6179,18 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
D.getSingleClause<OMPNumTeamsClause>()->getNumTeams();
if (NumTeams->isIntegerConstantExpr(CGF.getContext()))
if (auto Constant = NumTeams->getIntegerConstantExpr(CGF.getContext()))
DefaultVal = Constant->getExtValue();
MinTeamsVal = MaxTeamsVal = Constant->getExtValue();
return NumTeams;
}
DefaultVal = 0;
MinTeamsVal = MaxTeamsVal = 0;
return nullptr;
}
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_parallel_loop:
case OMPD_target_simd:
DefaultVal = 1;
MinTeamsVal = MaxTeamsVal = 1;
return nullptr;
case OMPD_parallel:
case OMPD_for:
@@ -6240,8 +6265,9 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective(
"Clauses associated with the teams directive expected to be emitted "
"only for the host!");
CGBuilderTy &Bld = CGF.Builder;
int32_t DefaultNT = -1;
const Expr *NumTeams = getNumTeamsExprForTargetDirective(CGF, D, DefaultNT);
int32_t MinNT = -1, MaxNT = -1;
const Expr *NumTeams =
getNumTeamsExprForTargetDirective(CGF, D, MinNT, MaxNT);
if (NumTeams != nullptr) {
OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();

@@ -6271,7 +6297,8 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective(
}
}

return llvm::ConstantInt::get(CGF.Int32Ty, DefaultNT);
assert(MinNT == MaxNT && "Num threads ranges require handling here.");
return llvm::ConstantInt::get(CGF.Int32Ty, MinNT);
}

/// Check for a num threads constant value (stored in \p DefaultVal), or
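
As a reading aid for the CGOpenMPRuntime.cpp hunks above, the following standalone sketch captures the merging rule the new code applies when folding attribute-derived bounds into the clause-derived defaults. The helper name is hypothetical and not part of the patch; treating 0 as "no upper bound yet" mirrors the hunk.

    #include <algorithm>
    #include <cstdint>

    // Hypothetical helper mirroring the hunk above: lower bounds only ever
    // grow, upper bounds only ever shrink, and 0 means "no upper bound
    // recorded yet".
    static void mergeThreadBounds(uint32_t &MinThreads, uint32_t &MaxThreads,
                                  int32_t AttrMinThreads, int32_t AttrMaxThreads) {
      MinThreads = std::max(MinThreads, uint32_t(AttrMinThreads));
      MaxThreads = MaxThreads ? std::min(MaxThreads, uint32_t(AttrMaxThreads))
                              : uint32_t(AttrMaxThreads);
    }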
3 changes: 2 additions & 1 deletion clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -637,7 +637,8 @@ class CGOpenMPRuntime {
/// Otherwise, return nullptr.
const Expr *getNumTeamsExprForTargetDirective(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
int32_t &DefaultVal);
int32_t &MinTeamsVal,
int32_t &MaxTeamsVal);
llvm::Value *emitNumTeamsForTargetDirective(CodeGenFunction &CGF,
const OMPExecutableDirective &D);

14 changes: 11 additions & 3 deletions clang/lib/CodeGen/CodeGenModule.h
@@ -1543,15 +1543,23 @@ class CodeGenModule : public CodeGenTypeCache {
void moveLazyEmissionStates(CodeGenModule *NewBuilder);

/// Emit the IR encoding to attach the CUDA launch bounds attribute to \p F.
/// If \p MaxThreadsVal is not nullptr, the max threads value is stored in it,
/// if a valid one was found.
void handleCUDALaunchBoundsAttr(llvm::Function *F,
const CUDALaunchBoundsAttr *A);
const CUDALaunchBoundsAttr *A,
int32_t *MaxThreadsVal = nullptr,
int32_t *MinBlocksVal = nullptr,
int32_t *MaxClusterRankVal = nullptr);

/// Emit the IR encoding to attach the AMD GPU flat-work-group-size attribute
/// to \p F. Alternatively, the work group size can be taken from a \p
/// ReqdWGS.
/// ReqdWGS. If \p MinThreadsVal is not nullptr, the min threads value is
/// stored in it, if a valid one was found. If \p MaxThreadsVal is not
/// nullptr, the max threads value is stored in it, if a valid one was found.
void handleAMDGPUFlatWorkGroupSizeAttr(
llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *A,
const ReqdWorkGroupSizeAttr *ReqdWGS = nullptr);
const ReqdWorkGroupSizeAttr *ReqdWGS = nullptr,
int32_t *MinThreadsVal = nullptr, int32_t *MaxThreadsVal = nullptr);

/// Emit the IR encoding to attach the AMD GPU waves-per-eu attribute to \p F.
void handleAMDGPUWavesPerEUAttr(llvm::Function *F,
10 changes: 8 additions & 2 deletions clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -594,7 +594,8 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
const ReqdWorkGroupSizeAttr *ReqdWGS) {
const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
int32_t *MaxThreadsVal) {
unsigned Min = 0;
unsigned Max = 0;
if (FlatWGS) {
@@ -607,8 +608,13 @@ void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
if (Min != 0) {
assert(Min <= Max && "Min must be less than or equal Max");

if (MinThreadsVal)
*MinThreadsVal = Min;
if (MaxThreadsVal)
*MaxThreadsVal = Max;
std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
if (F)
F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
} else
assert(Max == 0 && "Max must be zero");
}
45 changes: 32 additions & 13 deletions clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -287,33 +287,52 @@ bool NVPTXTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
}
}

void CodeGenModule::handleCUDALaunchBoundsAttr(
llvm::Function *F, const CUDALaunchBoundsAttr *Attr) {
void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
const CUDALaunchBoundsAttr *Attr,
int32_t *MaxThreadsVal,
int32_t *MinBlocksVal,
int32_t *MaxClusterRankVal) {
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
llvm::APSInt MaxThreads(32);
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
if (MaxThreads > 0)
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
MaxThreads.getExtValue());
if (MaxThreads > 0) {
if (MaxThreadsVal)
*MaxThreadsVal = MaxThreads.getExtValue();
if (F) {
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
MaxThreads.getExtValue());
}
}

// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
// was not specified in __launch_bounds__ or if the user specified a 0 value,
// we don't have to add a PTX directive.
if (Attr->getMinBlocks()) {
llvm::APSInt MinBlocks(32);
MinBlocks = Attr->getMinBlocks()->EvaluateKnownConstInt(getContext());
if (MinBlocks > 0)
// Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
MinBlocks.getExtValue());
if (MinBlocks > 0) {
if (MinBlocksVal)
*MinBlocksVal = MinBlocks.getExtValue();
if (F) {
// Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
MinBlocks.getExtValue());
}
}
}
if (Attr->getMaxBlocks()) {
llvm::APSInt MaxBlocks(32);
MaxBlocks = Attr->getMaxBlocks()->EvaluateKnownConstInt(getContext());
if (MaxBlocks > 0)
// Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
MaxBlocks.getExtValue());
if (MaxBlocks > 0) {
if (MaxClusterRankVal)
*MaxClusterRankVal = MaxBlocks.getExtValue();
if (F) {
// Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
MaxBlocks.getExtValue());
}
}
}
}

34 changes: 22 additions & 12 deletions clang/test/OpenMP/ompx_attributes_codegen.cpp
@@ -1,16 +1,17 @@
// REQUIRES: amdgpu-registered-target

// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
// RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA
// expected-no-diagnostics


// Check that the target attributes are set on the generated kernel
void func() {
// CHECK: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l15() #0
// CHECK: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l17()
// CHECK: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l19() #4
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l16() #0
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18()
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20() #4

#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
{}
@@ -21,11 +22,20 @@ void func() {
{}
}

// CHECK: attributes #0
// CHECK-SAME: "amdgpu-flat-work-group-size"="10,20"
// CHECK: attributes #4
// CHECK-SAME: "amdgpu-flat-work-group-size"="3,17"
// CHECK-SAME: "amdgpu-waves-per-eu"="3,7"
// AMD: attributes #0
// AMD-SAME: "amdgpu-flat-work-group-size"="10,20"
// AMD-SAME: "omp_target_thread_limit"="20"
// AMD: "omp_target_thread_limit"="45"
// AMD: attributes #4
// AMD-SAME: "amdgpu-flat-work-group-size"="3,17"
// AMD-SAME: "amdgpu-waves-per-eu"="3,7"
// AMD-SAME: "omp_target_thread_limit"="17"

// CHECK: !{ptr @__omp_offloading[[HASH]]_l17, !"maxntidx", i32 45}
// CHECK: !{ptr @__omp_offloading[[HASH]]_l17, !"minctasm", i32 90}
// It is unclear if we should use the AMD annotations for other targets, we do for now.
// NVIDIA: "omp_target_thread_limit"="20"
// NVIDIA: "omp_target_thread_limit"="45"
// NVIDIA: "omp_target_thread_limit"="17"
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l16, !"maxntidx", i32 20}
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l18, !"minctasm", i32 90}
// NVIDIA: !{ptr @__omp_offloading[[HASH2]]_l18, !"maxntidx", i32 45}
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l20, !"maxntidx", i32 17}
