diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index b05a4713e6340..5705f1d8e2aaa 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2686,69 +2686,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); } -void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // {dst, mbar, src, size, multicast, cache_hint, - // multicast_flag, cache_hint_flag} - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {8} - size_t NumOps = N->getNumOperands(); - bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; - bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1; - size_t NumBaseArgs = 4; // dst, mbar, src, size - size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID - - SDLoc DL(N); - SmallVector Ops(N->ops().slice(2, NumBaseArgs)); - - // Push MultiCast operand, if available - if (IsMultiCast) - Ops.push_back(N->getOperand(MultiCastIdx)); - - // Push CacheHint operand, if available - if (IsCacheHint) - Ops.push_back(N->getOperand(MultiCastIdx + 1)); - - // Finally, the chain operand - Ops.push_back(N->getOperand(0)); - - bool IsShared32 = - CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; - unsigned Opcode = [&]() { - if (IsMultiCast && IsCacheHint) - return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC_CH - : NVPTX::CP_ASYNC_BULK_G2S_MC_CH; - if (IsMultiCast) - return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC - : NVPTX::CP_ASYNC_BULK_G2S_MC; - if (IsCacheHint) - return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_CH - : NVPTX::CP_ASYNC_BULK_G2S_CH; - return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32 - : NVPTX::CP_ASYNC_BULK_G2S; - }(); - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - -void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // src, size, cache_hint, cache_hint_flag - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {4} - size_t NumOps = N->getNumOperands(); - bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; - size_t NumArgs = IsCacheHint ? 3 : 2; // src, size, cache_hint - - SDLoc DL(N); - SmallVector Ops(N->ops().slice(2, NumArgs)); - Ops.push_back(N->getOperand(0)); // Chain operand - - unsigned Opcode = IsCacheHint - ? NVPTX::CP_ASYNC_BULK_PREFETCH_CH - : NVPTX::CP_ASYNC_BULK_PREFETCH; - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - #define TCGEN05_ST_OPCODE(SHAPE, NUM) \ (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \ : NVPTX::TCGEN05_ST_##SHAPE##_##NUM) @@ -2865,12 +2802,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { switch (IID) { default: return false; - case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster: - SelectCpAsyncBulkG2S(N); - return true; - case Intrinsic::nvvm_cp_async_bulk_prefetch_L2: - SelectCpAsyncBulkPrefetchL2(N); - return true; case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d: case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d: case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 648e8e239cf78..71a5b7ff8cd30 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -92,8 +92,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N); void SelectV2I64toI128(SDNode *N); void SelectI128toV2I64(SDNode *N); - void SelectCpAsyncBulkG2S(SDNode *N); - void SelectCpAsyncBulkPrefetchL2(SDNode *N); void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index f7b8aca0f77d8..e406c5a506623 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -544,52 +544,50 @@ multiclass CP_ASYNC_BULK_S2G_INTR { [(int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0), i16:$mask)]>, Requires<[hasPTX<86>, hasSM<100>]>; } -defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<0>; -defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<1>; +defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR; +defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR; -multiclass CP_ASYNC_BULK_G2S { - def NAME: NVPTXInst<(outs), - (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size), - !strconcat(CpAsyncBulkStr<0, 0>.G2S, " [$dst], [$src], $size, [$mbar];"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _MC: NVPTXInst<(outs), - (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc), - !strconcat(CpAsyncBulkStr<1, 0>.G2S, " [$dst], [$src], $size, [$mbar], $mc;"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), - (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch), - !strconcat(CpAsyncBulkStr<0, 1>.G2S, " [$dst], [$src], $size, [$mbar], $ch;"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _MC_CH: NVPTXInst<(outs), - (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc, Int64Regs:$ch), - !strconcat(CpAsyncBulkStr<1, 1>.G2S, " [$dst], [$src], $size, [$mbar], $mc, $ch;"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; +multiclass CP_ASYNC_BULK_G2S_INTR { + defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster; + + def NAME : NVPTXInst<(outs), + (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, + Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch), + !if(has_ch, + CpAsyncBulkStr<0, 1>.G2S # " [$dst], [$src], $size, [$mbar], $ch;", + CpAsyncBulkStr<0, 0>.G2S # " [$dst], [$src], $size, [$mbar];"), + [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>, + Requires<[hasPTX<80>, hasSM<90>]>; + + def NAME # _MC : NVPTXInst<(outs), + (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, + Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch), + !if(has_ch, + CpAsyncBulkStr<1, 1>.G2S # " [$dst], [$src], $size, [$mbar], $mask, $ch;", + CpAsyncBulkStr<1, 0>.G2S # " [$dst], [$src], $size, [$mbar], $mask;"), + [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, -1, !if(has_ch, -1, 0))]>, + Requires<[hasPTX<80>, hasSM<90>]>; } -defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S; -defm CP_ASYNC_BULK_G2S_SHARED32 : CP_ASYNC_BULK_G2S; +defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S_INTR; +defm CP_ASYNC_BULK_G2S_CH : CP_ASYNC_BULK_G2S_INTR; -multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER { - def NAME: NVPTXInst<(outs), - (ins rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size), - !strconcat(CpAsyncBulkStr<0, 0>.C2C, " [$dst], [$src], $size, [$mbar];"), - [(int_nvvm_cp_async_bulk_shared_cta_to_cluster rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size)]>, - Requires<[hasPTX<80>, hasSM<90>]>; +def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs), + (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, Int32Regs:$size), + CpAsyncBulkStr<0, 0>.C2C # " [$dst], [$src], $size, [$mbar];", + [(int_nvvm_cp_async_bulk_shared_cta_to_cluster addr:$dst, addr:$mbar, addr:$src, i32:$size)]>, + Requires<[hasPTX<80>, hasSM<90>]>; + +multiclass CP_ASYNC_BULK_PREFETCH_INTR { + def NAME : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch), + !if(has_ch, + "cp.async.bulk.prefetch.L2.global.L2::cache_hint" # " [$src], $size, $ch;", + "cp.async.bulk.prefetch.L2.global" # " [$src], $size;"), + [(int_nvvm_cp_async_bulk_prefetch_L2 addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>, + Requires<[hasPTX<80>, hasSM<90>]>; } -defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER; -defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER; +defm CP_ASYNC_BULK_PREFETCH : CP_ASYNC_BULK_PREFETCH_INTR; +defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR; -//------------------------------ -// Bulk Copy Prefetch Functions -//------------------------------ -def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs), - (ins Int64Regs:$src, Int32Regs:$size), - "cp.async.bulk.prefetch.L2.global [$src], $size;", []>, - Requires<[hasPTX<80>, hasSM<90>]>; - -def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs), - (ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch), - "cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>, - Requires<[hasPTX<80>, hasSM<90>]>; //------------------------------------- // TMA Async Bulk Tensor Copy Functions //------------------------------------- diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll index d7f2a5df5547e..46a026313d971 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll @@ -23,10 +23,10 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_3]; -; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_g2s_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2]; ; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%rd3], [%rd1], %r1, [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -42,48 +42,101 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_param_2]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_param_3]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%r2], [%rd1], %r3, [%r1], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; - tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 0, i1 0) - tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 %ch, i1 0, i1 1) - tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 0, i1 1, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 0, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 0, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 1, i1 0) tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 1, i1 1) ret void } +; Tests to verify that the immediate values for "mc and ch" land correctly in PTX. +; The values of 16 and 64 are arbitrary and do not have any significance. +define void @cp_async_bulk_g2s_imm_mc_ch(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(7) %dst, i32 %size, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_g2s_imm_mc_ch( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<6>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_imm_mc_ch_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_imm_mc_ch_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_imm_mc_ch_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_imm_mc_ch_param_3]; +; CHECK-PTX64-NEXT: mov.b64 %rd4, 64; +; CHECK-PTX64-NEXT: mov.b16 %rs1, 16; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: ld.param.b64 %rd5, [cp_async_bulk_g2s_imm_mc_ch_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%rd3], [%rd1], %r1, [%rd2], %rs1; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_g2s_imm_mc_ch_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_g2s_imm_mc_ch( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_imm_mc_ch_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_imm_mc_ch_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_imm_mc_ch_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_imm_mc_ch_param_3]; +; CHECK-PTX-SHARED32-NEXT: mov.b64 %rd2, 64; +; CHECK-PTX-SHARED32-NEXT: mov.b16 %rs1, 16; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_imm_mc_ch_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%r2], [%rd1], %r3, [%r1], %rs1; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_g2s_imm_mc_ch_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 16, i64 64, i1 1, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 16, i64 %ch, i1 1, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 64, i1 0, i1 1) + ret void +} + define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 %size, i64 %ch) { ; CHECK-PTX64-LABEL: cp_async_bulk_s2g( ; CHECK-PTX64: { ; CHECK-PTX64-NEXT: .reg .b32 %r<2>; -; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: ; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_0]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_1]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_2]; -; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3]; -; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1; +; CHECK-PTX64-NEXT: mov.b64 %rd3, 64; ; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd4; ; CHECK-PTX64-NEXT: ret; ; ; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g( ; CHECK-PTX-SHARED32: { ; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>; -; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_0]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_3]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2; +; CHECK-PTX-SHARED32-NEXT: mov.b64 %rd2, 64; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd3; ; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 64, i1 1) tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0) tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1) ret void @@ -122,16 +175,19 @@ define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) { ; CHECK-LABEL: cp_async_bulk_prefetch( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [cp_async_bulk_prefetch_param_0]; ; CHECK-NEXT: ld.param.b32 %r1, [cp_async_bulk_prefetch_param_1]; -; CHECK-NEXT: ld.param.b64 %rd2, [cp_async_bulk_prefetch_param_2]; +; CHECK-NEXT: mov.b64 %rd2, 64; ; CHECK-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2; +; CHECK-NEXT: ld.param.b64 %rd3, [cp_async_bulk_prefetch_param_2]; +; CHECK-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd3; ; CHECK-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1; ; CHECK-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 64, i1 1) tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1) - tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 0) ret void }