diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 9179fa47eb690..96531def77a78 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -968,10 +968,10 @@ TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared, "vWi*3", "", AND(SM_80,PT TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1.", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1.", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70)) -TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70)) +TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70)) TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70)) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 085662d225294..5296d7000b5cc 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13558,8 +13558,6 @@ class Sema final { bool CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); - bool CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, - CallExpr *TheCall); bool SemaBuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall); bool SemaBuiltinVAStartARMMicrosoft(CallExpr *Call); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 040d57c190b7c..02345651f0abb 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18177,15 +18177,6 @@ static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF, {Ptr, CGF.EmitScalarExpr(E->getArg(1))}); } -static Value *MakeCpAsync(unsigned IntrinsicID, CodeGenFunction &CGF, - const CallExpr *E, int SrcSize) { - Value *SrcSizeArg = E->getNumArgs() == 3 ? CGF.EmitScalarExpr(E->getArg(2)) - : CGF.Builder.getInt32(SrcSize); - return CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID), - {CGF.EmitScalarExpr(E->getArg(0)), - CGF.EmitScalarExpr(E->getArg(1)), SrcSizeArg}); -} - static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, const CallExpr *E, CodeGenFunction &CGF) { auto &C = CGF.CGM.getContext(); @@ -18849,18 +18840,6 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_ldu_h2: { return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this); } - case NVPTX::BI__nvvm_cp_async_ca_shared_global_4: - return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4, *this, E, - 4); - case NVPTX::BI__nvvm_cp_async_ca_shared_global_8: - return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8, *this, E, - 8); - case NVPTX::BI__nvvm_cp_async_ca_shared_global_16: - return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16, *this, E, - 16); - case NVPTX::BI__nvvm_cp_async_cg_shared_global_16: - return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16, *this, E, - 16); default: return nullptr; } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 06739beba4f91..7cd74329d8f32 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2028,9 +2028,6 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case llvm::Triple::wasm32: case llvm::Triple::wasm64: return CheckWebAssemblyBuiltinFunctionCall(TI, BuiltinID, TheCall); - case llvm::Triple::nvptx: - case llvm::Triple::nvptx64: - return CheckNVPTXBuiltinFunctionCall(TI, BuiltinID, TheCall); } } @@ -4818,20 +4815,6 @@ bool Sema::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI, return false; } -bool Sema::CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI, - unsigned BuiltinID, - CallExpr *TheCall) { - switch (BuiltinID) { - case NVPTX::BI__nvvm_cp_async_ca_shared_global_4: - case NVPTX::BI__nvvm_cp_async_ca_shared_global_8: - case NVPTX::BI__nvvm_cp_async_ca_shared_global_16: - case NVPTX::BI__nvvm_cp_async_cg_shared_global_16: - return checkArgCountAtMost(*this, TheCall, 3); - } - - return false; -} - /// SemaBuiltinCpuSupports - Handle __builtin_cpu_supports(char *). /// This checks that the target supports __builtin_cpu_supports and /// that the string argument is constant and valid. diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 628349364dcf2..48a2d0241528b 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -830,24 +830,15 @@ __device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __a // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4({{.*}}, i32 4) + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4 __nvvm_cp_async_ca_shared_global_4(dst, src); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8({{.*}}, i32 8) + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8 __nvvm_cp_async_ca_shared_global_8(dst, src); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16({{.*}}, i32 16) + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16 __nvvm_cp_async_ca_shared_global_16(dst, src); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16({{.*}}, i32 16) + // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16 __nvvm_cp_async_cg_shared_global_16(dst, src); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4({{.*}}, i32 2) - __nvvm_cp_async_ca_shared_global_4(dst, src, 2); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8({{.*}}, i32 2) - __nvvm_cp_async_ca_shared_global_8(dst, src, 2); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16({{.*}}, i32 2) - __nvvm_cp_async_ca_shared_global_16(dst, src, 2); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16({{.*}}, i32 2) - __nvvm_cp_async_cg_shared_global_16(dst, src, 2); - // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group __nvvm_cp_async_commit_group(); // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0) diff --git a/clang/test/SemaCUDA/builtins.cu b/clang/test/SemaCUDA/builtins.cu index a3c416a96af79..78a333e511a5d 100644 --- a/clang/test/SemaCUDA/builtins.cu +++ b/clang/test/SemaCUDA/builtins.cu @@ -10,7 +10,6 @@ // RUN: -fsyntax-only -verify=host %s // RUN: %clang_cc1 -triple nvptx64-unknown-cuda -fcuda-is-device \ // RUN: -aux-triple x86_64-unknown-unknown \ -// RUN: -target-cpu sm_80 -target-feature +ptx70 \ // RUN: -fsyntax-only -verify=dev %s #if !(defined(__amd64__) && defined(__PTX__)) @@ -29,23 +28,3 @@ __attribute__((device)) void df() { int y = __builtin_ia32_rdtsc(); // dev-error {{reference to __host__ function '__builtin_ia32_rdtsc' in __device__ function}} x = __builtin_abs(1); } - -#if __CUDA_ARCH__ >= 800 -__attribute__((device)) void nvvm_async_copy(__attribute__((address_space(3))) void* dst, - __attribute__((address_space(1))) const void* src) { - __nvvm_cp_async_ca_shared_global_4(dst, src); - __nvvm_cp_async_ca_shared_global_8(dst, src); - __nvvm_cp_async_ca_shared_global_16(dst, src); - __nvvm_cp_async_cg_shared_global_16(dst, src); - - __nvvm_cp_async_ca_shared_global_4(dst, src, 2); - __nvvm_cp_async_ca_shared_global_8(dst, src, 2); - __nvvm_cp_async_ca_shared_global_16(dst, src, 2); - __nvvm_cp_async_cg_shared_global_16(dst, src, 2); - - __nvvm_cp_async_ca_shared_global_4(dst, src, 2, 3); // dev-error {{too many arguments to function call}} - __nvvm_cp_async_ca_shared_global_8(dst, src, 2, 3); // dev-error {{too many arguments to function call}} - __nvvm_cp_async_ca_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}} - __nvvm_cp_async_cg_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}} -} -#endif diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 2633027a6a62b..d1a2537fe662d 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1381,22 +1381,26 @@ def int_nvvm_cp_async_mbarrier_arrive_noinc_shared : Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent, IntrNoCallback]>; def int_nvvm_cp_async_ca_shared_global_4 : - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty], + ClangBuiltin<"__nvvm_cp_async_ca_shared_global_4">, + Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.ca.shared.global.4">; def int_nvvm_cp_async_ca_shared_global_8 : - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty], + ClangBuiltin<"__nvvm_cp_async_ca_shared_global_8">, + Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.ca.shared.global.8">; def int_nvvm_cp_async_ca_shared_global_16 : - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty], + ClangBuiltin<"__nvvm_cp_async_ca_shared_global_16">, + Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.ca.shared.global.16">; def int_nvvm_cp_async_cg_shared_global_16 : - Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty], + ClangBuiltin<"__nvvm_cp_async_cg_shared_global_16">, + Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty], [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, WriteOnly>, ReadOnly>], "llvm.nvvm.cp.async.cg.shared.global.16">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c8b971c402623..1192cc0784084 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -328,36 +328,39 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC : defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED : CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>; -multiclass CP_ASYNC_SHARED_GLOBAL_I { - def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(Intrin Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size)]>, +multiclass CP_ASYNC_CA_SHARED_GLOBAL_I { + def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src), + !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"), + [(Intrin Int32Regs:$dst, Int32Regs:$src)]>, Requires<[hasPTX70, hasSM80]>; - def _32i: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(Intrin Int32Regs:$dst, Int32Regs:$src, imm:$src_size)]>, - Requires<[hasPTX70, hasSM80]>; - def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(Intrin Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size)]>, - Requires<[hasPTX70, hasSM80]>; - def _64i: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(Intrin Int64Regs:$dst, Int64Regs:$src, imm:$src_size)]>, + def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src), + !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"), + [(Intrin Int64Regs:$dst, Int64Regs:$src)]>, Requires<[hasPTX70, hasSM80]>; } defm CP_ASYNC_CA_SHARED_GLOBAL_4 : - CP_ASYNC_SHARED_GLOBAL_I<"ca", "4", int_nvvm_cp_async_ca_shared_global_4>; + CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>; defm CP_ASYNC_CA_SHARED_GLOBAL_8 : - CP_ASYNC_SHARED_GLOBAL_I<"ca", "8", int_nvvm_cp_async_ca_shared_global_8>; + CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>; defm CP_ASYNC_CA_SHARED_GLOBAL_16 : - CP_ASYNC_SHARED_GLOBAL_I<"ca", "16", int_nvvm_cp_async_ca_shared_global_16>; + CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>; + +multiclass CP_ASYNC_CG_SHARED_GLOBAL { + def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src), + !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"), + [(Intrin Int32Regs:$dst, Int32Regs:$src)]>, + Requires<[hasPTX70, hasSM80]>; + def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src), + !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"), + [(Intrin Int64Regs:$dst, Int64Regs:$src)]>, + Requires<[hasPTX70, hasSM80]>; +} defm CP_ASYNC_CG_SHARED_GLOBAL_16 : - CP_ASYNC_SHARED_GLOBAL_I<"cg", "16", int_nvvm_cp_async_cg_shared_global_16>; + CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>; def CP_ASYNC_COMMIT_GROUP : NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>, diff --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll index a1bea301dd467..55c7a6d4874c5 100644 --- a/llvm/test/CodeGen/NVPTX/async-copy.ll +++ b/llvm/test/CodeGen/NVPTX/async-copy.ll @@ -1,35 +1,35 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} declare void @llvm.nvvm.cp.async.wait.group(i32) -; CHECK-LABEL: asyncwaitgroup +; ALL-LABEL: asyncwaitgroup define void @asyncwaitgroup() { - ; CHECK: cp.async.wait_group 8; + ; ALL: cp.async.wait_group 8; tail call void @llvm.nvvm.cp.async.wait.group(i32 8) - ; CHECK: cp.async.wait_group 0; + ; ALL: cp.async.wait_group 0; tail call void @llvm.nvvm.cp.async.wait.group(i32 0) - ; CHECK: cp.async.wait_group 16; + ; ALL: cp.async.wait_group 16; tail call void @llvm.nvvm.cp.async.wait.group(i32 16) ret void } declare void @llvm.nvvm.cp.async.wait.all() -; CHECK-LABEL: asyncwaitall +; ALL-LABEL: asyncwaitall define void @asyncwaitall() { -; CHECK: cp.async.wait_all +; ALL: cp.async.wait_all tail call void @llvm.nvvm.cp.async.wait.all() ret void } declare void @llvm.nvvm.cp.async.commit.group() -; CHECK-LABEL: asynccommitgroup +; ALL-LABEL: asynccommitgroup define void @asynccommitgroup() { -; CHECK: cp.async.commit_group +; ALL: cp.async.commit_group tail call void @llvm.nvvm.cp.async.commit.group() ret void } @@ -41,75 +41,72 @@ declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) % ; CHECK-LABEL: asyncmbarrier define void @asyncmbarrier(ptr %a) { -; The distinction between PTX32/PTX64 here is only to capture pointer register type -; in R to be used in subsequent tests. -; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}]; -; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}]; +; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriershared define void @asyncmbarriershared(ptr addrspace(3) %a) { -; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}]; +; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a) ret void } ; CHECK-LABEL: asyncmbarriernoinc define void @asyncmbarriernoinc(ptr %a) { -; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}]; +; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a) ret void } ; CHECK-LABEL: asyncmbarriernoincshared define void @asyncmbarriernoincshared(ptr addrspace(3) %a) { -; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}]; +; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}]; +; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}]; tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a) ret void } -declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) +declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) ; CHECK-LABEL: asynccasharedglobal4i8 -define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, %r{{[0-9]+}}; -; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, 1; - tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) - tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) +define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { +; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4; +; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4; + tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b) ret void } -declare void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) +declare void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b) ; CHECK-LABEL: asynccasharedglobal8i8 -define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, %r{{[0-9]+}}; -; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, 1; - tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) - tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) +define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { +; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8; +; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8; + tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b) ret void } -declare void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) +declare void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) ; CHECK-LABEL: asynccasharedglobal16i8 -define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}}; -; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1; - tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) - tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) +define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { +; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16; +; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16; + tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) ret void } -declare void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) +declare void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) ; CHECK-LABEL: asynccgsharedglobal16i8 -define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}}; -; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1; - tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) - tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1) +define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) { +; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16; +; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16; + tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b) ret void }