Skip to content

Commit

Permalink
[NVPTX/CUDA] added an optional src_size argument to __nvvm_cp_async*
Browse files Browse the repository at this point in the history
The optional argument is needed for CUDA-11+ headers when we're compiling for
sm_80+ GPUs.

Differential Revision: https://reviews.llvm.org/D150820
  • Loading branch information
Artem-B committed May 18, 2023
1 parent cbc5d42 commit e7b9c2f
Show file tree
Hide file tree
Showing 9 changed files with 142 additions and 76 deletions.
8 changes: 4 additions & 4 deletions clang/include/clang/Basic/BuiltinsNVPTX.def
Expand Up @@ -968,10 +968,10 @@ TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared, "vWi*3", "", AND(SM_80,PT
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70))

TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1.", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1.", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70))

TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Sema/Sema.h
Expand Up @@ -13558,6 +13558,8 @@ class Sema final {
bool CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID,
CallExpr *TheCall);
bool CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
CallExpr *TheCall);

bool SemaBuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall);
bool SemaBuiltinVAStartARMMicrosoft(CallExpr *Call);
Expand Down
21 changes: 21 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Expand Up @@ -18177,6 +18177,15 @@ static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
{Ptr, CGF.EmitScalarExpr(E->getArg(1))});
}

static Value *MakeCpAsync(unsigned IntrinsicID, CodeGenFunction &CGF,
const CallExpr *E, int SrcSize) {
Value *SrcSizeArg = E->getNumArgs() == 3 ? CGF.EmitScalarExpr(E->getArg(2))
: CGF.Builder.getInt32(SrcSize);
return CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
{CGF.EmitScalarExpr(E->getArg(0)),
CGF.EmitScalarExpr(E->getArg(1)), SrcSizeArg});
}

static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
const CallExpr *E, CodeGenFunction &CGF) {
auto &C = CGF.CGM.getContext();
Expand Down Expand Up @@ -18840,6 +18849,18 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_ldu_h2: {
return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
}
case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4, *this, E,
4);
case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8, *this, E,
8);
case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16, *this, E,
16);
case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16, *this, E,
16);
default:
return nullptr;
}
Expand Down
17 changes: 17 additions & 0 deletions clang/lib/Sema/SemaChecking.cpp
Expand Up @@ -2028,6 +2028,9 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
case llvm::Triple::wasm32:
case llvm::Triple::wasm64:
return CheckWebAssemblyBuiltinFunctionCall(TI, BuiltinID, TheCall);
case llvm::Triple::nvptx:
case llvm::Triple::nvptx64:
return CheckNVPTXBuiltinFunctionCall(TI, BuiltinID, TheCall);
}
}

Expand Down Expand Up @@ -4815,6 +4818,20 @@ bool Sema::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
return false;
}

bool Sema::CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID,
CallExpr *TheCall) {
switch (BuiltinID) {
case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
return checkArgCountAtMost(*this, TheCall, 3);
}

return false;
}

/// SemaBuiltinCpuSupports - Handle __builtin_cpu_supports(char *).
/// This checks that the target supports __builtin_cpu_supports and
/// that the string argument is constant and valid.
Expand Down
17 changes: 13 additions & 4 deletions clang/test/CodeGen/builtins-nvptx.c
Expand Up @@ -830,15 +830,24 @@ __device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __a
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
__nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);

// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4({{.*}}, i32 4)
__nvvm_cp_async_ca_shared_global_4(dst, src);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8({{.*}}, i32 8)
__nvvm_cp_async_ca_shared_global_8(dst, src);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16({{.*}}, i32 16)
__nvvm_cp_async_ca_shared_global_16(dst, src);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16({{.*}}, i32 16)
__nvvm_cp_async_cg_shared_global_16(dst, src);

// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4({{.*}}, i32 2)
__nvvm_cp_async_ca_shared_global_4(dst, src, 2);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8({{.*}}, i32 2)
__nvvm_cp_async_ca_shared_global_8(dst, src, 2);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16({{.*}}, i32 2)
__nvvm_cp_async_ca_shared_global_16(dst, src, 2);
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16({{.*}}, i32 2)
__nvvm_cp_async_cg_shared_global_16(dst, src, 2);

// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
__nvvm_cp_async_commit_group();
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
Expand Down
21 changes: 21 additions & 0 deletions clang/test/SemaCUDA/builtins.cu
Expand Up @@ -10,6 +10,7 @@
// RUN: -fsyntax-only -verify=host %s
// RUN: %clang_cc1 -triple nvptx64-unknown-cuda -fcuda-is-device \
// RUN: -aux-triple x86_64-unknown-unknown \
// RUN: -target-cpu sm_80 -target-feature +ptx70 \
// RUN: -fsyntax-only -verify=dev %s

#if !(defined(__amd64__) && defined(__PTX__))
Expand All @@ -28,3 +29,23 @@ __attribute__((device)) void df() {
int y = __builtin_ia32_rdtsc(); // dev-error {{reference to __host__ function '__builtin_ia32_rdtsc' in __device__ function}}
x = __builtin_abs(1);
}

#if __CUDA_ARCH__ >= 800
__attribute__((device)) void nvvm_async_copy(__attribute__((address_space(3))) void* dst,
__attribute__((address_space(1))) const void* src) {
__nvvm_cp_async_ca_shared_global_4(dst, src);
__nvvm_cp_async_ca_shared_global_8(dst, src);
__nvvm_cp_async_ca_shared_global_16(dst, src);
__nvvm_cp_async_cg_shared_global_16(dst, src);

__nvvm_cp_async_ca_shared_global_4(dst, src, 2);
__nvvm_cp_async_ca_shared_global_8(dst, src, 2);
__nvvm_cp_async_ca_shared_global_16(dst, src, 2);
__nvvm_cp_async_cg_shared_global_16(dst, src, 2);

__nvvm_cp_async_ca_shared_global_4(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
__nvvm_cp_async_ca_shared_global_8(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
__nvvm_cp_async_ca_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
__nvvm_cp_async_cg_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
}
#endif
12 changes: 4 additions & 8 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Expand Up @@ -1381,26 +1381,22 @@ def int_nvvm_cp_async_mbarrier_arrive_noinc_shared :
Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent, IntrNoCallback]>;

def int_nvvm_cp_async_ca_shared_global_4 :
ClangBuiltin<"__nvvm_cp_async_ca_shared_global_4">,
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty],
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.ca.shared.global.4">;
def int_nvvm_cp_async_ca_shared_global_8 :
ClangBuiltin<"__nvvm_cp_async_ca_shared_global_8">,
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty],
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.ca.shared.global.8">;
def int_nvvm_cp_async_ca_shared_global_16 :
ClangBuiltin<"__nvvm_cp_async_ca_shared_global_16">,
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty],
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.ca.shared.global.16">;
def int_nvvm_cp_async_cg_shared_global_16 :
ClangBuiltin<"__nvvm_cp_async_cg_shared_global_16">,
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty],
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.cg.shared.global.16">;
Expand Down
41 changes: 19 additions & 22 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Expand Up @@ -328,39 +328,36 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;

multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
!strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
[(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
multiclass CP_ASYNC_SHARED_GLOBAL_I<string cc, string cpsize, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size),
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
[(Intrin Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size)]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
!strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
[(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
def _32i: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size),
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
[(Intrin Int32Regs:$dst, Int32Regs:$src, imm:$src_size)]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size),
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
[(Intrin Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size)]>,
Requires<[hasPTX70, hasSM80]>;
def _64i: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size),
!strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
[(Intrin Int64Regs:$dst, Int64Regs:$src, imm:$src_size)]>,
Requires<[hasPTX70, hasSM80]>;
}

defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
CP_ASYNC_SHARED_GLOBAL_I<"ca", "4", int_nvvm_cp_async_ca_shared_global_4>;

defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
CP_ASYNC_SHARED_GLOBAL_I<"ca", "8", int_nvvm_cp_async_ca_shared_global_8>;

defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;

multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
!strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
[(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
Requires<[hasPTX70, hasSM80]>;
def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
!strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
[(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
Requires<[hasPTX70, hasSM80]>;
}
CP_ASYNC_SHARED_GLOBAL_I<"ca", "16", int_nvvm_cp_async_ca_shared_global_16>;

defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
CP_ASYNC_SHARED_GLOBAL_I<"cg", "16", int_nvvm_cp_async_cg_shared_global_16>;

def CP_ASYNC_COMMIT_GROUP :
NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
Expand Down

0 comments on commit e7b9c2f

Please sign in to comment.