diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 5ad8f9ab07e40..39f0556aef5a2 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1334,6 +1334,32 @@ copied and it must be a multiple of 16. For more information, refer PTX ISA ``_. +'``llvm.nvvm.cp.async.bulk.global.to.shared.cta``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 %flag_ch) + +Overview: +""""""""" + +The '``@llvm.nvvm.cp.async.bulk.global.to.shared.cta``' intrinsic +corresponds to the ``cp.async.bulk.shared::cta.global.*`` family +of PTX instructions. These instructions initiate an asynchronous +copy of bulk data from global memory to shared::cta memory. +The 32-bit operand ``%size`` specifies the amount of memory to be +copied and it must be a multiple of 16. The last argument +(denoted by ``i1 %flag_ch``) is a compile-time constant. When set, +it indicates a valid cache_hint (``i64 %ch``) and generates the +``.L2::cache_hint`` variant of the PTX instruction. + +For more information, refer PTX ISA +``_. + '``llvm.nvvm.cp.async.bulk.shared.cta.to.global``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 2710853e17688..21badc2692037 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2716,8 +2716,19 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster [llvm_i1_ty, // Flag for cta_mask llvm_i1_ty], // Flag for cache_hint [IntrConvergent, IntrArgMemOnly, - WriteOnly>, ReadOnly>, - NoCapture>, NoCapture>, NoCapture>]>; + WriteOnly>, ReadOnly>]>; + +// From Global to Shared CTA +def int_nvvm_cp_async_bulk_global_to_shared_cta + : DefaultAttrsIntrinsicFlags<[], + [llvm_shared_ptr_ty, // dst_shared_cta_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_global_ptr_ty, // src_gmem_ptr + llvm_i32_ty, // copy_size + llvm_i64_ty], // cache_hint + [llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, IntrArgMemOnly, + WriteOnly>, ReadOnly>]>; // From Shared CTA to Shared Cluster def int_nvvm_cp_async_bulk_shared_cta_to_cluster @@ -2727,9 +2738,7 @@ def int_nvvm_cp_async_bulk_shared_cta_to_cluster llvm_shared_ptr_ty, // src_smem_ptr llvm_i32_ty], // copy_size [IntrConvergent, IntrArgMemOnly, - WriteOnly>, ReadOnly>, - NoCapture>, NoCapture>, - NoCapture>]>; + WriteOnly>, ReadOnly>]>; // From Shared CTA to Global memory def int_nvvm_cp_async_bulk_shared_cta_to_global @@ -2740,8 +2749,7 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global llvm_i64_ty], // cache_hint [llvm_i1_ty], // Flag for cache_hint [IntrConvergent, IntrArgMemOnly, - WriteOnly>, ReadOnly>, - NoCapture>, NoCapture>]>; + WriteOnly>, ReadOnly>]>; // From Shared CTA to Global memory with bytemask def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 50827bd548ad5..ea69a54e6db37 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -497,6 +497,10 @@ class CpAsyncBulkStr { # !if(mc, ".multicast::cluster", "") # !if(ch, ".L2::cache_hint", ""); + // Global to Shared CTA memory + string G2S_CTA = "cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes" + # !if(ch, ".L2::cache_hint", ""); + // Shared CTA to Cluster memory string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes"; } @@ -543,6 +547,21 @@ multiclass CP_ASYNC_BULK_G2S_INTR { defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S_INTR; defm CP_ASYNC_BULK_G2S_CH : CP_ASYNC_BULK_G2S_INTR; +multiclass CP_ASYNC_BULK_G2S_CTA_INTR { + defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cta; + + def "" : NVPTXInst<(outs), + (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, + B32:$size, B64:$ch), + !if(has_ch, + CpAsyncBulkStr<0, 1>.G2S_CTA # " [$dst], [$src], $size, [$mbar], $ch;", + CpAsyncBulkStr<0, 0>.G2S_CTA # " [$dst], [$src], $size, [$mbar];"), + [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>, + Requires<[hasPTX<86>, hasSM<90>]>; +} +defm CP_ASYNC_BULK_G2S_CTA : CP_ASYNC_BULK_G2S_CTA_INTR; +defm CP_ASYNC_BULK_G2S_CTA_CH : CP_ASYNC_BULK_G2S_CTA_INTR; + def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, B32:$size), CpAsyncBulkStr<0, 0>.C2C # " [$dst], [$src], $size, [$mbar];", diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll new file mode 100644 index 0000000000000..9872b2aa0826b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i64, i1) + +define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_g2s( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_3]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_g2s_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2]; +; CHECK-PTX64-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_g2s( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}}