Commit 367a570

shiltian, yxsamliu, and jayfoad committed
[Clang][HIP][CUDA] Add __cluster_dims__ and __no_cluster__ attribute
This PR adds basic frontend support for the `__cluster_dims__` and `__no_cluster__` attributes.

Co-authored-by: Yaxun (Sam) Liu <yaxun.liu@amd.com>
Co-authored-by: Jay Foad <jay.foad@amd.com>
1 parent 65c895d commit 367a570
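
A minimal usage sketch (mine, not part of the commit; kernel names and bodies are illustrative). The attributes attach to kernel definitions and require a cluster-capable target, i.e. sm_90+ for NVPTX or gfx1250 for AMDGPU:

// Request a 2x2x1 cluster of thread blocks for this kernel.
__global__ void __cluster_dims__(2, 2) scale(float *p, float s) {
  p[blockIdx.x * blockDim.x + threadIdx.x] *= s;
}

// Explicitly opt this kernel out of clustering.
__global__ void __no_cluster__ fill(float *p, float v) {
  p[blockIdx.x * blockDim.x + threadIdx.x] = v;
}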

12 files changed: 338 additions, 0 deletions

clang/include/clang/Basic/Attr.td

Lines changed: 17 additions & 0 deletions

@@ -1572,6 +1572,23 @@ def HIPManaged : InheritableAttr {
   let Documentation = [HIPManagedAttrDocs];
 }
 
+def CUDAClusterDims : InheritableAttr {
+  let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+  let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+  let LangOpts = [CUDA];
+  let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+  let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+  let LangOpts = [CUDA];
+  let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
 def CUDAInvalidTarget : InheritableAttr {
   let Spellings = [];
   let Subjects = SubjectList<[Function]>;
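
A rough illustration of what these definitions declare (my sketch, not from the commit): cluster_dims takes one to three integer arguments, with Y and Z optional, both attributes apply only to kernel functions, and the MutualExclusions rule rejects a declaration that carries both.

// GNU spelling; Z is omitted and defaults to 1 downstream.
__global__ void __attribute__((cluster_dims(2, 2))) k1() {}

// Error: cluster_dims and no_cluster are mutually exclusive.
// __global__ void __attribute__((cluster_dims(2))) __attribute__((no_cluster)) k2() {}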

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 8 additions & 0 deletions

@@ -13070,6 +13070,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
   "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
   "%1 attribute">, InGroup<IgnoredAttributes>;
 
+def err_cuda_cluster_attr_not_supported : Error<
+  "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+  "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
 // VTable pointer authentication errors
 def err_non_polymorphic_vtable_pointer_auth : Error<
   "cannot set vtable pointer authentication on monomorphic type %0">;
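
Sketches of code that would hit these diagnostics (my examples, not from the commit; the first assumes a device compile for a cluster-capable AMDGPU target, the second an older GPU):

// 4*4*4 = 64 thread blocks exceeds the per-cluster limit, so
// err_cuda_cluster_dims_too_large is emitted with the target's maximum.
__global__ void __cluster_dims__(4, 4, 4) too_large() {}

// On an architecture without cluster support, either attribute produces
// err_cuda_cluster_attr_not_supported.
__global__ void __no_cluster__ not_supported_here() {}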

clang/include/clang/Sema/Sema.h

Lines changed: 8 additions & 0 deletions

@@ -5010,6 +5010,14 @@ class Sema final : public SemaBase {
   void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
                            Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
 
+  /// Add a cluster_dims attribute to a particular declaration.
+  CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
+                                             Expr *X, Expr *Y, Expr *Z);
+  void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+                          Expr *Y, Expr *Z);
+  /// Add a no_cluster attribute to a particular declaration.
+  void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
+
   enum class RetainOwnershipKind { NS, CF, OS };
 
   UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 26 additions & 0 deletions

@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
 
 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+  llvm::StringMap<bool> TargetFetureMap;
+  M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
+
   const auto *ReqdWGS =
       M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
   const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
 
     F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
   }
+
+  if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
+    uint32_t X =
+        Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+    uint32_t Y =
+        Attr->getY()
+            ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+            : 1;
+    uint32_t Z =
+        Attr->getZ()
+            ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+            : 1;
+
+    llvm::SmallString<32> AttrVal;
+    llvm::raw_svector_ostream OS(AttrVal);
+    OS << X << ',' << Y << ',' << Z;
+    F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+  }
+
+  // OpenCL doesn't support cluster feature.
+  if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
+      FD->getAttr<CUDANoClusterAttr>())
+    F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
 }
 
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
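
The codegen effect, consistent with the new test later in this commit (sketch; function names illustrative): the evaluated X, Y, Z, with omitted dimensions defaulting to 1, are joined into the "amdgpu-cluster-dims" string function attribute, while __no_cluster__ (or an OpenCL kernel compiled with the gfx1250-insts feature) is encoded as "0,0,0".

// Device-compiled for gfx1250, roughly:
__global__ void __cluster_dims__(4) k1() {}     // IR: "amdgpu-cluster-dims"="4,1,1"
__global__ void __cluster_dims__(2, 3) k2() {}  // IR: "amdgpu-cluster-dims"="2,3,1"
__global__ void __no_cluster__ k3() {}          // IR: "amdgpu-cluster-dims"="0,0,0"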

clang/lib/Headers/__clang_hip_runtime_wrapper.h

Lines changed: 2 additions & 0 deletions

@@ -25,6 +25,8 @@
 #define __constant__ __attribute__((constant))
 #define __managed__ __attribute__((managed))
 
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+
 #if !defined(__cplusplus) || __cplusplus < 201103L
 #define nullptr NULL;
 #endif
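
With this macro in the HIP wrapper, the keyword-style spelling used in HIP sources is just sugar for the GNU attribute; the two declarations below are equivalent (my sketch, kernel names illustrative):

__global__ void __cluster_dims__(2, 2) a() {}               // via the wrapper macro
__global__ void __attribute__((cluster_dims(2, 2))) b() {}  // what it expands to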

clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 130 additions & 0 deletions

@@ -5676,6 +5676,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
                        AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
 }
 
+static std::pair<Expr *, int>
+makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
+                       const unsigned Idx) {
+  if (S.DiagnoseUnexpandedParameterPack(E))
+    return {nullptr, 0};
+
+  // Accept template arguments for now as they depend on something else.
+  // We'll get to check them when they eventually get instantiated.
+  if (E->isValueDependent())
+    return {E, 1};
+
+  std::optional<llvm::APSInt> I = llvm::APSInt(64);
+  if (!(I = E->getIntegerConstantExpr(S.Context))) {
+    S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
+        << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
+    return {nullptr, 0};
+  }
+  // Make sure we can fit it in 4 bits.
+  if (!I->isIntN(4)) {
+    S.Diag(E->getExprLoc(), diag::err_ice_too_large)
+        << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
+    return {nullptr, 0};
+  }
+  if (*I < 0)
+    S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
+        << &AL << Idx << E->getSourceRange();
+
+  // We may need to perform implicit conversion of the argument.
+  InitializedEntity Entity = InitializedEntity::InitializeParameter(
+      S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+  ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+  assert(!ValArg.isInvalid() &&
+         "Unexpected PerformCopyInitialization() failure.");
+
+  return {ValArg.getAs<Expr>(), I->getZExtValue()};
+}
+
+CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
+                                                 Expr *X, Expr *Y, Expr *Z) {
+  CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
+
+  int ValX = 1;
+  int ValY = 1;
+  int ValZ = 1;
+
+  std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+  if (!X)
+    return nullptr;
+
+  if (Y) {
+    std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+    if (!Y)
+      return nullptr;
+  }
+
+  if (Z) {
+    std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
+    if (!Z)
+      return nullptr;
+  }
+
+  int FlatDim = ValX * ValY * ValZ;
+  auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+                ? Context.getAuxTargetInfo()->getTriple()
+                : Context.getTargetInfo().getTriple();
+  int MaxDim = 1;
+  if (TT.isNVPTX())
+    MaxDim = 8;
+  else if (TT.isAMDGPU())
+    MaxDim = 16;
+  else
+    return nullptr;
+
+  // A maximum of 8 thread blocks in a cluster is supported as a portable
+  // cluster size in CUDA. The number is 16 for AMDGPU.
+  if (FlatDim > MaxDim) {
+    Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+    return nullptr;
+  }
+
+  return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+}
+
+void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+                              Expr *Y, Expr *Z) {
+  if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
+    D->addAttr(Attr);
+}
+
+void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
+  if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
+    D->addAttr(Attr);
+}
+
+static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  auto &TTI = S.Context.getTargetInfo();
+  auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+  if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+      (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+    S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+    return;
+  }
+
+  if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
+      !AL.checkAtMostNumArgs(S, /*Num=*/3))
+    return;
+
+  S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
+                       AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+                       AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+}
+
+static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  auto &TTI = S.Context.getTargetInfo();
+  auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+  if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+      (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+    S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+    return;
+  }
+
+  S.addNoClusterAttr(D, AL);
+}
+
 static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
                                           const ParsedAttr &AL) {
   if (!AL.isArgIdent(0)) {
@@ -7141,6 +7265,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_CUDALaunchBounds:
     handleLaunchBoundsAttr(S, D, AL);
    break;
+  case ParsedAttr::AT_CUDAClusterDims:
+    handleClusterDimsAttr(S, D, AL);
+    break;
+  case ParsedAttr::AT_CUDANoCluster:
+    handleNoClusterAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_Restrict:
     handleRestrictAttr(S, D, AL);
     break;
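
Summarizing the checks above with a sketch (mine, not from the commit; an AMDGPU device compile for gfx1250 is assumed): each dimension must be an integer constant expression that fits in four bits, the flat product is capped at 8 thread blocks for NVPTX and 16 for AMDGPU, value-dependent arguments are deferred to instantiation, and both attributes are rejected outright below sm_90 / gfx1250.

const int N = 2;

__global__ void __cluster_dims__(N, N) ok() {}   // constant expressions: accepted

// Not an integer constant expression: diagnosed via err_attribute_argument_n_type.
// extern int runtime_dim;
// __global__ void __cluster_dims__(runtime_dim) bad_non_ice() {}

// Flat size 4*4*4 = 64 exceeds the AMDGPU cap of 16: err_cuda_cluster_dims_too_large.
// __global__ void __cluster_dims__(4, 4, 4) bad_too_large() {}

// Value-dependent: checked again when the template is instantiated.
template <unsigned X> __global__ void __cluster_dims__(X) deferred() {}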

clang/lib/Sema/SemaTemplateInstantiateDecl.cpp

Lines changed: 37 additions & 0 deletions

@@ -707,6 +707,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
   S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
 }
 
+static void instantiateDependentCUDAClusterDimsAttr(
+    Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+    const CUDAClusterDimsAttr &Attr, Decl *New) {
+  EnterExpressionEvaluationContext Unevaluated(
+      S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+  Expr *XExpr = nullptr;
+  Expr *YExpr = nullptr;
+  Expr *ZExpr = nullptr;
+
+  if (Attr.getX()) {
+    ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
+    if (ResultX.isUsable())
+      XExpr = ResultX.getAs<Expr>();
+  }
+
+  if (Attr.getY()) {
+    ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
+    if (ResultY.isUsable())
+      YExpr = ResultY.getAs<Expr>();
+  }
+
+  if (Attr.getZ()) {
+    ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
+    if (ResultZ.isUsable())
+      ZExpr = ResultZ.getAs<Expr>();
+  }
+
+  if (XExpr)
+    S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
 // This doesn't take any template parameters, but we have a custom action that
 // needs to happen when the kernel itself is instantiated. We need to run the
 // ItaniumMangler to mark the names required to name this kernel.
@@ -921,6 +953,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
           *this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
     }
 
+    if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+      instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+                                              *CUDAClusterDims, New);
+    }
+
    if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
      instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
                                                New);
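
The observable behavior, matching the template cases in the new test below (my sketch): cluster dimensions written in terms of template parameters are substituted and validated at instantiation time, and the instantiated kernel gets the concrete cluster shape.

template <unsigned X, unsigned Y>
__global__ void __cluster_dims__(X, Y) tkernel() {}

// Substitution happens here; the instantiation carries cluster dims 2,4,1.
template __global__ void tkernel<2, 4>();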

clang/test/CodeGenCUDA/Inputs/cuda.h

Lines changed: 4 additions & 0 deletions

@@ -13,6 +13,8 @@
 #endif
 #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
 #define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
 #else
 #define __constant__
 #define __device__
@@ -22,6 +24,8 @@
 #define __managed__
 #define __launch_bounds__(...)
 #define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
 #endif
 
 struct dim3 {

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// HOST-NOT: "amdgpu-cluster-dims"
+
+// CHECK: "amdgpu-cluster-dims"="2,2,2"
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
+
+// CHECK: "amdgpu-cluster-dims"="2,2,1"
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,1,1"
+__global__ void __cluster_dims__(4) test_literal_1d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,2,1"
+__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
+
+// CHECK: "amdgpu-cluster-dims"="0,0,0"
+__global__ void __no_cluster__ test_no_cluster() {}
+
+// CHECK: "amdgpu-cluster-dims"="7,1,1"
+template<unsigned a>
+__global__ void __cluster_dims__(a) test_template_1d() {}
+template __global__ void test_template_1d<7>();
+
+// CHECK: "amdgpu-cluster-dims"="2,6,1"
+template<unsigned a, unsigned b>
+__global__ void __cluster_dims__(a, b) test_template_2d() {}
+template __global__ void test_template_2d<2, 6>();
+
+// CHECK: "amdgpu-cluster-dims"="1,2,3"
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
+template __global__ void test_template_3d<1, 2, 3>();

clang/test/Misc/pragma-attribute-supported-attributes-list.test

Lines changed: 2 additions & 0 deletions

@@ -35,6 +35,7 @@
 // CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
 // CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
 // CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
 // CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
 // CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
 // CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
 // CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
 // CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
 // CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
 // CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
 // CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
 // CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
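
Because both attributes are registered with SubjectMatchRule_function, they should also be applicable via #pragma clang attribute; a hedged sketch, not from the commit:

#pragma clang attribute push(__attribute__((cluster_dims(2, 2))), apply_to = function)
__global__ void k1() {}   // gets cluster_dims(2, 2)
__global__ void k2() {}   // gets cluster_dims(2, 2)
#pragma clang attribute pop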
