From 8cee9237c3d137dec3cf860f5693b181aaaee884 Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Thu, 4 Sep 2025 08:01:50 +0200
Subject: [PATCH 1/4] [MLIR][NVVM] Add definition for nvvm dialect

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 44 ++++++++++++++++++++-
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 8537c7030aa8f..3dd6baa058d55 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -35,6 +35,46 @@ def LLVM_PointerSharedCluster : LLVM_PointerInAddressSpace<7>;
 //===----------------------------------------------------------------------===//
 
 def NVVM_Dialect : Dialect {
+  let summary = "The NVVM dialect that models NVIDIA's public ISA";
+
+  let description = [{
+    The NVVM dialect is MLIR's LLVM-IR-based, NVIDIA-specific backend dialect. It
+    models NVVM intrinsics and public ISA functionality and introduces NVIDIA
+    extensions to the LLVM type system and address spaces (e.g., global, shared,
+    and cluster memory), enabling faithful lowering of GPU kernels to the NVPTX
+    toolchain. Many ops have a one-to-many mapping to NVVM/PTX: a single overloaded op
+    emits one intrinsic, selected by its operand types.
+
+    **Scope and capabilities:** The dialect covers core GPU features such as
+    thread/block builtins, barriers and atomics, warp-level collectives (e.g.,
+    shuffle/vote), matrix/tensor core operations (e.g., `mma.sync`, `wgmma`),
+    tensor memory accelerator (TMA) operations, asynchronous copies (`cp.async`,
+    bulk/tensor variants) with memory barriers, cache and prefetch controls, and
+    NVVM-specific attributes and enums (e.g., FP rounding modes, memory scopes, and
+    MMA types/layouts).
+
+    **Non-goals:** NVVM is not a place for convenience or “wrapper” ops. It is not
+    intended to introduce high-level ops that expand into multiple unrelated NVVM
+    intrinsics or that lower to no intrinsic at all. Such abstractions belong in
+    higher-level dialects (e.g., `nvgpu`, `gpu`, or project-specific dialects). The
+    design intent is a thin, predictable, low-level surface with near-mechanical
+    lowering to NVVM/LLVM IR.
+
+    **Placement in the lowering pipeline:** NVVM sits below target-agnostic dialects
+    like `gpu` and NVIDIA's `nvgpu`. Typical pipelines convert `gpu`/`nvgpu` ops
+    into NVVM using `-convert-gpu-to-nvvm` and `-convert-nvgpu-to-nvvm`, then lower
+    to the LLVM dialect for final code generation via LLVM's NVPTX backend.
+
+    **Target configuration and serialization:** NVVM provides a `#nvvm.target`
+    attribute to describe the GPU target (SM, features, and flags). In conjunction
+    with `gpu` serialization (e.g., `gpu-module-to-binary`), this enables producing
+    architecture-specific GPU binaries (such as CUBIN) from nested GPU modules.
+
+    **Inline PTX:** When an intrinsic is unavailable or a performance-critical sequence
+    must be expressed directly, NVVM provides an `nvvm.inline_ptx` op to embed PTX
+    inline as a last-resort escape hatch, with explicit operands and results.
+  }];
+
   let name = "nvvm";
   let cppNamespace = "::mlir::NVVM";
   let dependentDialects = ["LLVM::LLVMDialect"];
@@ -976,7 +1016,7 @@ def NVVM_ShflOp :
   let description = [{
     The `shfl.sync` Op implements data shuffle within threads of a warp.
     The `thread_mask` denotes the threads participating in the Op where
-    the bit position corresponds to a particular thread’s laneid.
+    the bit position corresponds to a particular thread's laneid.
     The `offset` specifies a source lane or source lane offset (depending
     on `kind`). The `val` is the input value to be copied from the source.
     The `mask_and_clamp` contains two packed values specifying
@@ -1031,7 +1071,7 @@ def NVVM_VoteSyncOp
     - `ballot`: In the ballot form, the destination result is a 32 bit integer.
       In this form, the predicate from each thread in membermask are copied
       into the corresponding bit position of the result, where the bit position
-      corresponds to the thread’s lane id.
+      corresponds to the thread's lane id.
 
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-vote-sync)
   }];

From d1b2e398c6ca2132a642521f2747af21fda4be36 Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Thu, 4 Sep 2025 18:25:18 +0200
Subject: [PATCH 2/4] Update mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

Co-authored-by: Mehdi Amini
---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 3dd6baa058d55..90158c291a26c 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -42,8 +42,9 @@ def NVVM_Dialect : Dialect {
     models NVVM intrinsics and public ISA functionality and introduces NVIDIA
     extensions to the LLVM type system and address spaces (e.g., global, shared,
     and cluster memory), enabling faithful lowering of GPU kernels to the NVPTX
-    toolchain. Many ops have a one-to-many mapping to NVVM/PTX: a single overloaded op
-    emits one intrinsic, selected by its operand types.
+    toolchain. While a NVVM op maps to a single LLVM IR intrinsics, the NVVM dialect uses type
+    polymorphism and other attributes that make it so that a NVVM op can map to different LLVM
+    intrinsics.
 
     **Scope and capabilities:** The dialect covers core GPU features such as
     thread/block builtins, barriers and atomics, warp-level collectives (e.g.,

From 4774782c61836c3e5bd7ea099a3d2efde2b27c58 Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Thu, 4 Sep 2025 18:26:49 +0200
Subject: [PATCH 3/4] Update NVVMOps.td

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 90158c291a26c..2b59038d41bf1 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -63,8 +63,8 @@ def NVVM_Dialect : Dialect {
 
     **Placement in the lowering pipeline:** NVVM sits below target-agnostic dialects
     like `gpu` and NVIDIA's `nvgpu`. Typical pipelines convert `gpu`/`nvgpu` ops
-    into NVVM using `-convert-gpu-to-nvvm` and `-convert-nvgpu-to-nvvm`, then lower
-    to the LLVM dialect for final code generation via LLVM's NVPTX backend.
+    into NVVM using `-convert-gpu-to-nvvm` and `-convert-nvgpu-to-nvvm`, then translate
+    into the LLVM for final code generation via NVPTX backend.
 
     **Target configuration and serialization:** NVVM provides a `#nvvm.target`
     attribute to describe the GPU target (SM, features, and flags). In conjunction

From 19cd0aab975fc808dfd71d3550b637a466b70259 Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Fri, 5 Sep 2025 17:23:16 +0200
Subject: [PATCH 4/4] fx

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 75 +++++++++++----------
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 2b59038d41bf1..5940f6eb8052a 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -38,42 +38,45 @@ def NVVM_Dialect : Dialect {
   let summary = "The NVVM dialect that models NVIDIA's public ISA";
 
   let description = [{
-    The NVVM dialect is MLIR's LLVM-IR-based, NVIDIA-specific backend dialect. It
-    models NVVM intrinsics and public ISA functionality and introduces NVIDIA
-    extensions to the LLVM type system and address spaces (e.g., global, shared,
-    and cluster memory), enabling faithful lowering of GPU kernels to the NVPTX
-    toolchain. While a NVVM op maps to a single LLVM IR intrinsics, the NVVM dialect uses type
-    polymorphism and other attributes that make it so that a NVVM op can map to different LLVM
-    intrinsics.
-
-    **Scope and capabilities:** The dialect covers core GPU features such as
-    thread/block builtins, barriers and atomics, warp-level collectives (e.g.,
-    shuffle/vote), matrix/tensor core operations (e.g., `mma.sync`, `wgmma`),
-    tensor memory accelerator (TMA) operations, asynchronous copies (`cp.async`,
-    bulk/tensor variants) with memory barriers, cache and prefetch controls, and
-    NVVM-specific attributes and enums (e.g., FP rounding modes, memory scopes, and
-    MMA types/layouts).
-
-    **Non-goals:** NVVM is not a place for convenience or “wrapper” ops. It is not
-    intended to introduce high-level ops that expand into multiple unrelated NVVM
-    intrinsics or that lower to no intrinsic at all. Such abstractions belong in
-    higher-level dialects (e.g., `nvgpu`, `gpu`, or project-specific dialects). The
-    design intent is a thin, predictable, low-level surface with near-mechanical
-    lowering to NVVM/LLVM IR.
-
-    **Placement in the lowering pipeline:** NVVM sits below target-agnostic dialects
-    like `gpu` and NVIDIA's `nvgpu`. Typical pipelines convert `gpu`/`nvgpu` ops
-    into NVVM using `-convert-gpu-to-nvvm` and `-convert-nvgpu-to-nvvm`, then translate
-    into the LLVM for final code generation via NVPTX backend.
-
-    **Target configuration and serialization:** NVVM provides a `#nvvm.target`
-    attribute to describe the GPU target (SM, features, and flags). In conjunction
-    with `gpu` serialization (e.g., `gpu-module-to-binary`), this enables producing
-    architecture-specific GPU binaries (such as CUBIN) from nested GPU modules.
-
-    **Inline PTX:** When an intrinsic is unavailable or a performance-critical sequence
-    must be expressed directly, NVVM provides an `nvvm.inline_ptx` op to embed PTX
-    inline as a last-resort escape hatch, with explicit operands and results.
+    The NVVM dialect is MLIR's LLVM-IR-based, NVIDIA-specific backend dialect. It
+    models NVVM intrinsics and public ISA functionality and introduces NVIDIA
+    extensions to the MLIR/LLVM type system and address spaces (e.g., global,
+    shared, and cluster memory), enabling faithful lowering of GPU kernels to the
+    NVPTX toolchain. While an NVVM op usually maps to a single LLVM IR intrinsic,
+    the NVVM dialect uses type polymorphism and other attributes so that a single
+    NVVM op can map to one of several LLVM intrinsics.
+
+    **Scope and capabilities:** The dialect covers core GPU features such as
+    thread/block builtins, barriers and atomics, warp-level collectives (e.g.,
+    shuffle/vote), matrix/tensor core operations (e.g., `mma.sync`, `wgmma`),
+    tensor memory accelerator (TMA) operations, asynchronous copies (`cp.async`,
+    bulk/tensor variants) with memory barriers, cache and prefetch controls, and
+    NVVM-specific attributes and enums (e.g., FP rounding modes, memory scopes,
+    and MMA types/layouts).
+
+    **Non-goals:** NVVM is not a place for convenience or "wrapper" ops. It is
+    not intended to introduce high-level ops that expand into multiple unrelated
+    NVVM intrinsics or that lower to no intrinsic at all. Such abstractions belong
+    in higher-level dialects (e.g., `nvgpu`, `gpu`, or project-specific dialects).
+    The design intent is a thin, predictable, low-level surface with
+    near-mechanical lowering to NVVM/LLVM IR.
+
+    **Placement in the lowering pipeline:** NVVM sits below target-agnostic
+    dialects like `gpu` and NVIDIA's `nvgpu`. Typical pipelines convert
+    `gpu`/`nvgpu` ops into NVVM using `-convert-gpu-to-nvvm` and
+    `-convert-nvgpu-to-nvvm`, then translate to LLVM IR for final code
+    generation via the NVPTX backend.
+
+    **Target configuration and serialization:** NVVM provides a `#nvvm.target`
+    attribute to describe the GPU target (SM, features, and flags). In
+    conjunction with `gpu` serialization (e.g., `gpu-module-to-binary`), this
+    enables producing architecture-specific GPU binaries (such as CUBIN) from
+    nested GPU modules.
+
+    **Inline PTX:** When an intrinsic is unavailable or a performance-critical
+    sequence must be expressed directly, NVVM provides an `nvvm.inline_ptx` op to
+    embed PTX inline as a last-resort escape hatch, with explicit operands and
+    results.
   }];
 
   let name = "nvvm";
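
To make the pieces described above concrete, here is a minimal sketch of how they fit together. It is illustrative only and not part of the patch: the `@kernels`/`@warp_bcast` names and the `sm_90`/`+ptx80` target settings are placeholder assumptions.

```mlir
// A GPU module annotated with #nvvm.target. Once lowered to the LLVM dialect,
// `gpu-module-to-binary` can serialize it into an architecture-specific
// binary (e.g., CUBIN), as the description explains.
gpu.module @kernels [#nvvm.target<chip = "sm_90", features = "+ptx80">] {
  llvm.func @warp_bcast(%v: i32) attributes {nvvm.kernel} {
    %mask  = llvm.mlir.constant(-1 : i32) : i32  // all 32 lanes participate
    %lane0 = llvm.mlir.constant(0 : i32) : i32   // source lane for idx mode
    %clamp = llvm.mlir.constant(31 : i32) : i32  // full-warp segment
    // One op, several intrinsics: the shuffle kind (idx/up/down/bfly) and the
    // result type together select the concrete llvm.nvvm.shfl.sync.* intrinsic,
    // per the type-polymorphism note in the dialect description.
    %r = nvvm.shfl.sync idx %mask, %v, %lane0, %clamp : i32 -> i32
    llvm.return
  }
}
```

A typical driver would be along the lines of `mlir-opt --convert-gpu-to-nvvm --gpu-module-to-binary` (pass names as cited in the description above; exact options vary across MLIR versions).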