
Conversation

durga4github (Contributor)

This patch moves the TMA S2G intrinsics into their own set of loops. This is in preparation for adding support for the im2colw/w128 modes to the G2S intrinsics (the S2G intrinsics do not support those modes).

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
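
For reference, here is a minimal sketch of the IR declarations the relocated S2G loops expand to, derived from the parameter lists in the diff below (note the im2col variant exists only for dim >= 3, per the !if(!ge(dim, 3), ...) guard):

; S2G, tile mode, 1D: source pointer in shared memory, tensor-map
; pointer, one i32 tensor dimension, an i64 cache hint, and a trailing
; i1 immediate flag that enables the cache hint.
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(
    ptr addrspace(3) %src_smem_ptr, ptr %tensormap_ptr,
    i32 %d0, i64 %cache_hint, i1 %flag_cache_hint)

; S2G, im2col mode, 3D: same layout, with three i32 tensor dimensions.
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(
    ptr addrspace(3) %src_smem_ptr, ptr %tensormap_ptr,
    i32 %d0, i32 %d1, i32 %d2, i64 %cache_hint, i1 %flag_cache_hint)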
llvmbot (Member) commented Jun 19, 2025

@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-backend-nvptx

Author: Durgadoss R (durga4github)

Changes

This patch moves the TMA S2G intrinsics into their own set of loops. This is in preparation for adding support for the im2colw/w128 modes to the G2S intrinsics (the S2G intrinsics do not support those modes).


Full diff: https://github.com/llvm/llvm-project/pull/144903.diff

1 file affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+30-26)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 410a0dea2bf57..0375f29ad8906 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2013,9 +2013,36 @@ class DefaultAttrsIntrinsicFlags<list<LLVMType> ret_types,
                     !foreach(i, !range(flags),
                         ImmArg<ArgIndex<!add(i, !size(param_types))>>))>;
 
-// Intrinsics for Tensor Copy using TMA
-// G2S -> From Global to Shared memory variants
-// S2G -> From Shared to Global memory variants
+// TMA Tensor Copy Intrinsics: S2G -> From Shared to Global memory variants
+foreach dim = 1...5 in {
+  defvar tensor_dim_args = !listsplat(llvm_i32_ty, dim);
+  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+    def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d :
+      DefaultAttrsIntrinsicFlags<[],
+          !listconcat([llvm_shared_ptr_ty,  // src_smem_ptr
+                       llvm_ptr_ty],        // tensormap_ptr
+                      tensor_dim_args,      // actual tensor dims
+                      [llvm_i64_ty]),       // cache_hint
+          [llvm_i1_ty],                     // Flag for cache_hint
+          [IntrConvergent,
+           ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+           NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+
+    // Intrinsics for TMA Copy with reduction
+    foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in
+      def int_nvvm_cp_async_bulk_tensor_reduce_ # red_op # _ # mode # _ # dim # d :
+        DefaultAttrsIntrinsicFlags<[],
+            !listconcat([llvm_shared_ptr_ty,  // src_smem_ptr
+                         llvm_ptr_ty],        // tensormap_ptr
+                         tensor_dim_args,     // actual tensor dims
+                        [llvm_i64_ty]),       // cache_hint
+          [llvm_i1_ty],                       // Flag for cache_hint
+          [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+           NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+  }
+}
+
+// TMA Tensor Copy Intrinsics: G2S -> From Global to Shared memory variants
 foreach dim = 1...5 in {
   defvar tensor_dim_args = !listsplat(llvm_i32_ty, dim);
 
@@ -2045,17 +2072,6 @@ foreach dim = 1...5 in {
     def int_nvvm_cp_async_bulk_tensor_g2s_ # mode # _ # dim # d :
       DefaultAttrsIntrinsicFlags<[], g2s_params, g2s_flags, g2s_props>;
 
-    def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d :
-      DefaultAttrsIntrinsicFlags<[],
-          !listconcat([llvm_shared_ptr_ty,  // src_smem_ptr
-                       llvm_ptr_ty],        // tensormap_ptr
-                      tensor_dim_args,      // actual tensor dims
-                      [llvm_i64_ty]),       // cache_hint
-          [llvm_i1_ty],                     // Flag for cache_hint
-          [IntrConvergent,
-           ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
-           NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
-
     def int_nvvm_cp_async_bulk_tensor_prefetch_ # mode # _ # dim # d :
       DefaultAttrsIntrinsicFlags<[],
           !listconcat([llvm_ptr_ty],        // tensormap_ptr
@@ -2065,18 +2081,6 @@ foreach dim = 1...5 in {
           [llvm_i1_ty],                     // Flag for cache_hint
           [IntrConvergent,
            ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
-
-    // Intrinsics for TMA Copy with reduction
-    foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in
-      def int_nvvm_cp_async_bulk_tensor_reduce_ # red_op # _ # mode # _ # dim # d :
-        DefaultAttrsIntrinsicFlags<[],
-            !listconcat([llvm_shared_ptr_ty,  // src_smem_ptr
-                         llvm_ptr_ty],        // tensormap_ptr
-                         tensor_dim_args,     // actual tensor dims
-                        [llvm_i64_ty]),       // cache_hint
-          [llvm_i1_ty],                       // Flag for cache_hint
-          [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
-           NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
   }
 }
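
A minimal usage sketch for one of the relocated reduction intrinsics (the operand layout matches s2g, with the reduction op folded into the intrinsic name; %src, %tmap, %d0, and %d1 are hypothetical values):

; Tile-mode add-reduction over a 2D tensor; the trailing i1 immarg is
; false, so the i64 cache-hint operand is ignored.
call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(
    ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1,
    i64 0, i1 false)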
 


durga4github merged commit ef04847 into llvm:main on Jun 24, 2025. 10 checks passed.
durga4github deleted the durgadossr/nvptx_tma_nfc branch on June 24, 2025 at 06:17.
DrSergei pushed a commit to DrSergei/llvm-project that referenced this pull request on Jun 24, 2025.
anthonyhatran pushed a commit to anthonyhatran/llvm-project that referenced this pull request on Jun 26, 2025.