Conversation

@schwarzschild-radius
Contributor

This commit adds support for the tcgen05.mma family of instructions in the NVVM MLIR dialect and lowers them to LLVM intrinsics. Please refer to the PTX ISA for more information.

@llvmbot
Member

llvmbot commented Oct 21, 2025

@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-llvm

Author: Pradeep Kumar (schwarzschild-radius)

Changes

This commit adds support for the tcgen05.mma family of instructions in the NVVM MLIR dialect and lowers them to LLVM intrinsics. Please refer to the PTX ISA for more information.


Patch is 472.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164356.diff

15 Files Affected:

  • (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+639)
  • (modified) mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp (+546)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-shared.mlir (+229)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-block-scale-tensor.mlir (+229)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-invalid.mlir (+119)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-shared.mlir (+466)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-shared.mlir (+229)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-block-scale-tensor.mlir (+229)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-shared.mlir (+442)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-sp-tensor.mlir (+634)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-tensor.mlir (+634)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-shared.mlir (+133)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-shared.mlir (+133)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-sp-tensor.mlir (+133)
  • (added) mlir/test/Target/LLVMIR/nvvm/tcgen05-mma-ws-tensor.mlir (+133)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index d959464836043..a580a7f42bccc 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -4537,6 +4537,645 @@ def NVVM_ClusterLaunchControlQueryCancelOp
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma attributes
+//===----------------------------------------------------------------------===//
+
+def Tcgen05MMAKindF16          : I32EnumAttrCase<"F16",    0, "f16">;
+def Tcgen05MMAKindTF32         : I32EnumAttrCase<"TF32",   1, "tf32">;
+def Tcgen05MMAKindF8F6F4       : I32EnumAttrCase<"F8F6F4", 2, "f8f6f4">;
+def Tcgen05MMAKindINT8         : I32EnumAttrCase<"I8",     3, "i8">;
+
+def Tcgen05MMAKind : I32EnumAttr<
+  "Tcgen05MMAKind",
+  "tcgen05 MMA Supported Types",
+  [Tcgen05MMAKindF8F6F4, Tcgen05MMAKindINT8, Tcgen05MMAKindF16,
+   Tcgen05MMAKindTF32]> {
+    let cppNamespace = "::mlir::NVVM";
+    let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMAKindAttr : EnumAttr<NVVM_Dialect, Tcgen05MMAKind, "tcgen05_mma_kind"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def Tcgen05MMACollectorOpDiscard  : I32EnumAttrCase<"DISCARD", 0, "discard">;
+def Tcgen05MMACollectorOpLastUse  : I32EnumAttrCase<"LASTUSE", 1, "lastuse">;
+def Tcgen05MMACollectorOpFill     : I32EnumAttrCase<"FILL",    2, "fill">;
+def Tcgen05MMACollectorOpUse      : I32EnumAttrCase<"USE",     3, "use">;
+
+def Tcgen05MMACollectorOp : I32EnumAttr<
+  "Tcgen05MMACollectorOp",
+  "tcgen05.mma Collector Buffer Operation",
+  [Tcgen05MMACollectorOpDiscard,
+   Tcgen05MMACollectorOpLastUse,
+   Tcgen05MMACollectorOpFill,
+   Tcgen05MMACollectorOpUse]> {
+    let cppNamespace = "::mlir::NVVM";
+    let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMACollectorOpAttr : EnumAttr<NVVM_Dialect, Tcgen05MMACollectorOp, "tcgen05_mma_collectorop"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma Ops.
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05MMAOp : NVVM_Op<"tcgen05.mma", [AttrSizedOperandSegments]> {
+
+  let summary = "Performs MMA operation on 5th-gen tensor cores";
+
+  let arguments = (ins
+      // Attributes
+      Tcgen05MMAKindAttr:$kind,
+      CTAGroupKindAttr:$ctaGroup,
+      DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                        "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+      UnitAttr:$ashift,
+      // Arguments
+      LLVM_PointerTensor:$d,
+      AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+      I64:$b,
+      I32:$idesc,
+      I1:$enableInputD,
+      // Optional arguments
+      Optional<I64>:$scaleInputD,
+      Optional<FixedVectorOfLengthAndType<[4, 8], [I32]>>:$disableOutputLane
+    );
+
+  let description = [{
+    The `tcgen05.mma` is an asynchronous op that performs matrix multiplication
+    and accumulation using 5th-generation tensor cores.
+
+    ```
+    D = A * B + (D * 2^-scaleInputD)     // if `scaleInputD` is provided
+    D = A * B                            // if `enableInputD` is false
+    D = A * B + D                        // otherwise
+    ```
+
+    where:
+    - A is an `M x K` matrix in tensor memory or described using a shared memory descriptor
+    - B is a `K x N` matrix described using a shared memory descriptor
+    - D is an `M x N` accumulator matrix in tensor memory
+
+    A `shared memory descriptor` is a 64-bit value which describes the properties
+    of the multiplicand matrix in shared memory, including its location in the
+    shared memory of the current CTA. For more details, please refer to the
+    [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-shared-memory-descriptor)
+
+    - `idesc` is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+    Optional Operands:
+    - `scaleInputD` is an immediate value operand used for scaling the D matrix by 2^(-scaleInputD). The valid range is [0, 15]
+
+    - `disableOutputLane` is a vector mask for selective output
+      * vector<4 x i32> when ctaGroup is CTA_1
+      * vector<8 x i32> when ctaGroup is CTA_2
+
+    Required Attributes:
+    - `kind` specifies the computation data type and precision
+      * f16    : 16-bit floating point (half precision)
+      * tf32   : Tensor Float 32 (truncated 32-bit float)
+      * f8f6f4 : Mixed precision FP8/FP6/FP4
+      * i8     : 8-bit integer operations
+
+    - `ctaGroup` specifies CTA group configuration
+      * cta_1: MMA will be performed on the current thread's CTA
+      * cta_2: MMA will be performed on the current thread and its peer CTA
+
+    Default Attributes:
+    - `collectorOp` specifies the collector buffer operation for matrix A
+      * discard : Release buffer after use (default)
+      * lastuse : Mark buffer for last use
+      * fill    : Fill buffer
+      * use     : Use buffer without modification
+
+    - `ashift` shifts the rows of the A matrix down by one row
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma)
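+
+    Example (an illustrative sketch; the SSA value names, the `!llvm.ptr<6>`
+    tensor memory address space, and the `#nvvm.cta_group` attribute mnemonic
+    are assumptions, not taken from this patch's tests):
+
+    ```mlir
+    // A is given as a 64-bit shared memory descriptor; D lives in tensor memory.
+    nvvm.tcgen05.mma %d, %a, %b, %idesc, %enableD
+      {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>}
+      : (!llvm.ptr<6>, i64, i64, i32, i1)
+    ```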
+  }];
+
+  let assemblyFormat = [{
+    $d `,` $a `,` $b `,` $idesc `,` $enableInputD (`scale` `=` $scaleInputD^)?
+    (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)`
+  }];
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
+        Operation &op, LLVM::ModuleTranslation &mt,
+        llvm::IRBuilderBase &builder);
+  }];
+
+  let llvmBuilder = [{
+    auto [ID, args] = NVVM::Tcgen05MMAOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, ID, args);
+  }];
+
+  let hasVerifier = true;
+}
+
+def NVVM_Tcgen05MMASpOp : NVVM_Op<"tcgen05.mma.sp", [AttrSizedOperandSegments]> {
+
+  let summary = "Performs MMA operation with sparse A matrix on 5th-gen tensor cores";
+
+  let arguments = (ins
+    // Attributes
+    Tcgen05MMAKindAttr:$kind,
+    CTAGroupKindAttr:$ctaGroup,
+    DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                      "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+    UnitAttr:$ashift,
+    // Arguments
+    LLVM_PointerTensor:$d,
+    AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+    I64:$b,
+    I32:$idesc,
+    I1:$enableInputD,
+    LLVM_PointerTensor:$sparseMetadata,
+    Optional<I64>:$scaleInputD,
+    Optional<FixedVectorOfLengthAndType<[4, 8], [I32]>>:$disableOutputLane
+  );
+
+  let description = [{
+    The `tcgen05.mma.sp` op performs matrix multiplication and accumulation
+    with a sparse `A` matrix using 5th-generation tensor cores.
+
+    It executes a non-blocking `M x N x K` MMA operation:
+    ```
+    D = A * B + (D * 2^-scaleInputD)     // if `scaleInputD` is provided
+    D = A * B                            // if `enableInputD` is false
+    D = A * B + D                        // otherwise
+    ```
+
+    where:
+    - A is an `M x (K / 2)` matrix in tensor memory or described using a shared memory descriptor
+    - B is a `K x N` matrix described using a shared memory descriptor
+    - D is an `M x N` accumulator matrix in tensor memory
+    - `sparseMetadata` specifies the mapping of the `K / 2` non-zero elements to
+      the `K` elements before performing the MMA operation
+
+    A `shared memory descriptor` is a 64-bit value which describes the properties
+    of the multiplicand matrix in shared memory, including its location in the
+    shared memory of the current CTA. For more details, please refer to the
+    [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-shared-memory-descriptor)
+
+    - `idesc` is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+    Optional Operands:
+    - `scaleInputD` is an immediate value operand used for scaling the D matrix by 2^(-scaleInputD). The valid range is [0, 15]
+
+    - `disableOutputLane` is a vector mask for selective output
+      * vector<4 x i32> when ctaGroup is CTA_1
+      * vector<8 x i32> when ctaGroup is CTA_2
+
+    Required Attributes:
+    - `kind` specifies the computation data type and precision
+      * f16    : 16-bit floating point (half precision)
+      * tf32   : Tensor Float 32 (truncated 32-bit float)
+      * f8f6f4 : Mixed precision FP8/FP6/FP4
+      * i8     : 8-bit integer operations
+
+    - `ctaGroup` specifies CTA group configuration
+      * cta_1: MMA will be performed on the current thread's CTA
+      * cta_2: MMA will be performed on the current thread and its peer CTA
+
+    Default Attributes:
+    - `collectorOp` specifies the collector buffer operation for matrix A
+      * discard : Release buffer after use (default)
+      * lastuse : Mark buffer for last use
+      * fill    : Fill buffer
+      * use     : Use buffer without modification
+
+    - `ashift` shifts the rows of the A matrix down by one row
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp)
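+
+    Example (an illustrative sketch under the same assumptions as for
+    `tcgen05.mma`; the sparse metadata pointer also lives in tensor memory):
+
+    ```mlir
+    nvvm.tcgen05.mma.sp %d, %a, %b, %idesc, %enableD, %spMeta
+      {kind = #nvvm.tcgen05_mma_kind<f16>, ctaGroup = #nvvm.cta_group<cta_1>}
+      : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>)
+    ```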
+  }];
+
+  let assemblyFormat = [{
+    $d `,` $a `,` $b `,` $idesc `,` $enableInputD `,` $sparseMetadata (`scale` `=` $scaleInputD^)? (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)`
+  }];
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
+        Operation &op, LLVM::ModuleTranslation &mt,
+        llvm::IRBuilderBase &builder);
+  }];
+
+  let llvmBuilder = [{
+    auto [ID, args] = NVVM::Tcgen05MMASpOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, ID, args);
+  }];
+
+  let hasVerifier = true;
+}
+
+// tcgen05.mma.block_scale attribute
+def Tcgen05MMAKindMXF8F6F4 : I32EnumAttrCase<"MXF8F6F4", 0, "mxf8f6f4">;
+def Tcgen05MMAKindMXF4     : I32EnumAttrCase<"MXF4",     1, "mxf4">;
+def Tcgen05MMAKindMXF4NVF4 : I32EnumAttrCase<"MXF4NVF4", 2, "mxf4nvf4">;
+
+def Tcgen05MMABlockScaleKind : I32EnumAttr<
+  "Tcgen05MMABlockScaleKind",
+  "tcgen05.mma.block_scale supported types",
+  [Tcgen05MMAKindMXF8F6F4, Tcgen05MMAKindMXF4, Tcgen05MMAKindMXF4NVF4]> {
+    let cppNamespace = "::mlir::NVVM";
+    let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMABlockScaleKindAttr : EnumAttr<NVVM_Dialect, Tcgen05MMABlockScaleKind,
+                                            "tcgen05_mma_block_scale_kind"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def Tcgen05MMABlockScaleDefault : I32EnumAttrCase<"DEFAULT", 0, "default">;
+def Tcgen05MMABlockScaleBlock16 : I32EnumAttrCase<"BLOCK16", 1, "block16">;
+def Tcgen05MMABlockScaleBlock32 : I32EnumAttrCase<"BLOCK32", 2, "block32">;
+
+def Tcgen05MMABlockScale
+    : I32EnumAttr<"Tcgen05MMABlockScale",
+                  "tcgen05.mma block scale attribute",
+                  [Tcgen05MMABlockScaleDefault, Tcgen05MMABlockScaleBlock16,
+                   Tcgen05MMABlockScaleBlock32]> {
+  let cppNamespace = "::mlir::NVVM";
+  let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMABlockScaleAttr : EnumAttr<NVVM_Dialect, Tcgen05MMABlockScale,
+                                          "tcgen05_mma_block_scale"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma.block_scale Op
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05MMABlockScaleOp : NVVM_Op<"tcgen05.mma.block_scale"> {
+
+  let summary = "Performs block scaled MMA operation on 5th-gen tensor cores";
+
+  let arguments = (ins
+      // Attributes
+      Tcgen05MMABlockScaleKindAttr:$kind,
+      CTAGroupKindAttr:$ctaGroup,
+      DefaultValuedAttr<Tcgen05MMABlockScaleAttr,
+                      "Tcgen05MMABlockScale::DEFAULT">:$blockScale,
+      DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                        "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+      // Arguments
+      LLVM_PointerTensor:$d,
+      AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+      I64:$b,
+      I32:$idesc,
+      I1:$enableInputD,
+      LLVM_PointerTensor:$scaleA,
+      LLVM_PointerTensor:$scaleB
+    );
+
+  let description = [{
+    `nvvm.tcgen05.mma.block_scale` performs matrix multiply-and-accumulate
+    (MMA) using 5th-generation tensor cores. The matrices `A` and `B` are
+    scaled before the matrix multiply and accumulate operation is performed.
+
+    It executes a non-blocking `M x N x K` MMA operation:
+
+    ```
+    D = (A * scale_a) * (B * scale_b)        // if `enableInputD` is false
+    D = (A * scale_a) * (B * scale_b) + D    // otherwise
+    ```
+
+    where:
+    - A is an `M x K` matrix in tensor memory or described using a shared memory descriptor
+    - B is a `K x N` matrix described using a shared memory descriptor
+    - D is an `M x N` accumulator matrix in tensor memory
+    - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
+
+    A `shared memory descriptor` is a 64-bit value which describes the properties
+    of the multiplicand matrix in shared memory, including its location in the
+    shared memory of the current CTA. For more details, please refer to the
+    [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-shared-memory-descriptor)
+
+    - `idesc` is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+    Required Attributes:
+    - `kind` specifies the computation data type and precision
+      * mxf8f6f4 - MX-floating point formats
+      * mxf4     - MX-floating point formats (FP4)
+      * mxf4nvf4 - MXF4 + custom NVIDIA 4-bit format (with common scaling factor)
+
+    - `ctaGroup` specifies CTA group configuration
+      * cta_1: MMA will be performed on the current thread's CTA
+      * cta_2: MMA will be performed on the current thread and its peer CTA
+
+    Default Attributes:
+    - `collectorOp` specifies the collector buffer operation for matrix A
+      * discard : Release buffer after use (default)
+      * lastuse : Mark buffer for last use
+      * fill    : Fill buffer
+      * use     : Use buffer without modification
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma)
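+
+    Example (an illustrative sketch; SSA names, the `!llvm.ptr<6>` tensor
+    memory address space, and the `#nvvm.cta_group` mnemonic are assumptions):
+
+    ```mlir
+    // scaleA and scaleB are scale matrices resident in tensor memory.
+    nvvm.tcgen05.mma.block_scale %d, %a, %b, %idesc, %enableD, %scaleA, %scaleB
+      {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf8f6f4>,
+       ctaGroup = #nvvm.cta_group<cta_1>}
+      : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>)
+    ```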
+  }];
+
+  let assemblyFormat = [{
+    $d `,` $a `,` $b `,` $idesc `,` $enableInputD `,` $scaleA `,` $scaleB
+    attr-dict `:` `(` type(operands) `)`
+  }];
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase &builder);
+  }];
+
+  let llvmBuilder = [{
+    auto [ID, args] = NVVM::Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, ID, args);
+  }];
+  let hasVerifier = true;
+}
+
+def NVVM_Tcgen05MMASpBlockScaleOp : NVVM_Op<"tcgen05.mma.sp.block_scale"> {
+
+  let summary = "Performs block scaled MMA operation with sparse A matrix on 5th-gen tensor cores";
+
+  let arguments = (ins
+    // Attributes
+    Tcgen05MMABlockScaleKindAttr:$kind,
+    CTAGroupKindAttr:$ctaGroup,
+    DefaultValuedAttr<Tcgen05MMABlockScaleAttr,
+                      "Tcgen05MMABlockScale::DEFAULT">:$blockScale,
+    DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                      "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+    // Arguments
+    LLVM_PointerTensor:$d,
+    AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+    I64:$b,
+    I32:$idesc,
+    I1:$enableInputD,
+    LLVM_PointerTensor:$sparseMetadata,
+    LLVM_PointerTensor:$scaleA,
+    LLVM_PointerTensor:$scaleB
+  );
+
+  let description = [{
+    `nvvm.tcgen05.mma.sp.block_scale` is an asynchronous op that performs
+    matrix multiply-and-accumulate with a sparse `A` matrix using
+    5th-generation tensor cores.
+
+    ```
+    D = (A * scale_a) * (B * scale_b)        // if `enableInputD` is false
+    D = (A * scale_a) * (B * scale_b) + D    // otherwise
+    ```
+
+    where:
+    - A is an `M x (K / 2)` matrix in tensor memory or described using a shared memory descriptor
+    - B is a `K x N` matrix described using a shared memory descriptor
+    - D is an `M x N` accumulator matrix in tensor memory
+    - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
+
+    A `shared memory descriptor` is a 64-bit value which describes the properties
+    of the multiplicand matrix in shared memory, including its location in the
+    shared memory of the current CTA. For more details, please refer to the
+    [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-shared-memory-descriptor)
+
+    Operands:
+    - `idesc` is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
+
+    - `sparseMetadata` specifies the mapping of the `K / 2` non-zero elements to
+      the `K` elements before performing the MMA operation
+
+    Required Attributes:
+    - `kind` specifies the computation data type and precision
+      * mxf8f6f4 - MX-floating point formats
+      * mxf4     - MX-floating point formats (FP4)
+      * mxf4nvf4 - MXF4 + custom NVIDIA 4-bit format (with common scaling factor)
+
+    - `ctaGroup` specifies CTA group configuration
+      * cta_1: MMA will be performed on the current thread's CTA
+      * cta_2: MMA will be performed on the current thread and its peer CTA
+
+    Default Attributes:
+    - `collectorOp` specifies the collector buffer operation for matrix A
+      * discard : Release buffer after use (default)
+      * lastuse : Mark buffer for last use
+      * fill    : Fill buffer
+      * use     : Use buffer without modification
+
+    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp)
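+
+    Example (an illustrative sketch; SSA names, the `!llvm.ptr<6>` tensor
+    memory address space, and the `#nvvm.cta_group` mnemonic are assumptions):
+
+    ```mlir
+    nvvm.tcgen05.mma.sp.block_scale %d, %a, %b, %idesc, %enableD, %spMeta,
+      %scaleA, %scaleB
+      {kind = #nvvm.tcgen05_mma_block_scale_kind<mxf4>,
+       ctaGroup = #nvvm.cta_group<cta_1>}
+      : (!llvm.ptr<6>, i64, i64, i32, i1, !llvm.ptr<6>, !llvm.ptr<6>, !llvm.ptr<6>)
+    ```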
+  }];
+
+  let assemblyFormat = [{
+    $d `,` $a `,` $b `,` $idesc `,` $enableInputD `,` $sparseMetadata `,` $scaleA `,` $scaleB
+    attr-dict `:` `(` type(operands) `)`
+  }];
+
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair
+    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                          llvm::IRBuilderBase &builder);
+  }];
+
+  let llvmBuilder = [{
+    auto [ID, args] = NVVM::Tcgen05MMASpBlockScaleOp::getIntrinsicIDAndArgs(
+        *op, moduleTranslation, builder);
+    createIntrinsicCall(builder, ID, args);
+  }];
+
+  let hasVerifier = true;
+}
+
+def Tcgen05MMACollectorBBuffer0  : I32EnumAttrCase<"B0", 0, "b0">;
+def Tcgen05MMACollectorBBuffer1  : I32EnumAttrCase<"B1", 1, "b1">;
+def Tcgen05MMACollectorBBuffer2  : I32EnumAttrCase<"B2", 2, "b2">;
+def Tcgen05MMACollectorBBuffer3  : I32EnumAttrCase<"B3", 3, "b3">;
+
+def Tcgen05MMACollectorBBuffer : I32EnumAttr<
+  "Tcgen05MMACollectorBBuffer",
+  "tcgen05 MMA Collector Buffer B Attribute",
+  [Tcgen05MMACollectorBBuffer0,
+   Tcgen05MMACollectorBBuffer1,
+   Tcgen05MMACollectorBBuffer2,
+   Tcgen05MMACollectorBBuffer3]> {
+    let cppNamespace = "::mlir::NVVM";
+    let genSpecializedAttr = 0;
+}
+
+def Tcgen05MMACollectorBBufferAttr : EnumAttr<NVVM_Dialect, Tcgen05MMACollectorBBuffer, "tcgen05_mma_collectorb"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.mma.ws Op
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05MMAWsOp : NVVM_Op<"tcgen05.mma.ws"> {
+    let summary = "Performs weight stationary convolution MMA operation on 5th-gen tensor cores";
+
+  let arguments = (ins
+    // Attributes
+    Tcgen05MMAKindAttr:$kind,
+    DefaultValuedAttr<Tcgen05MMACollectorBBufferAttr,
+                      "Tcgen05MMACollectorBBuffer::B0">:$collectorBBuffer,
+    DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
+                      "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
+    // Arguments
+    LLVM_PointerTensor:$d,
+    AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
+    I64:$b,
+    I32:$idesc,
+    I1:$enableInputD,
+    Optional<I64>:$zeroColMask
+  );...
[truncated]

@github-actions

github-actions bot commented Oct 21, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@schwarzschild-radius force-pushed the tcgen05_mma_mlir_support branch 3 times, most recently from 6efa8ca to edd9ac3 on October 21, 2025 at 05:24
Comment on lines +4544 to +4593
def Tcgen05MMAKindF16 : I32EnumAttrCase<"F16", 0, "f16">;
def Tcgen05MMAKindTF32 : I32EnumAttrCase<"TF32", 1, "tf32">;
def Tcgen05MMAKindF8F6F4 : I32EnumAttrCase<"F8F6F4", 2, "f8f6f4">;
def Tcgen05MMAKindINT8 : I32EnumAttrCase<"I8", 3, "i8">;
Member

should we use TypeAttr in that case?

Contributor

ah, sometimes it is a mix of types like "f8f6f4", which does not have any equivalent individual type representation. So, I suppose we need to use a separate Attr for this case.

Contributor Author

Added detailed descriptions for both the Kind attributes

// NVVM tcgen05.mma Ops.
//===----------------------------------------------------------------------===//

def NVVM_Tcgen05MMAOp : NVVM_Op<"tcgen05.mma", [AttrSizedOperandSegments]> {
Member

Can we also add NVVMRequiresSMa<100> verifier check here?

Contributor Author

Added the requires clause for sm_100 and sm_110

Comment on lines 4598 to 4601
LLVM_PointerTensor:$d,
AnyTypeOf<[LLVM_PointerTensor, I64]>:$a,
I64:$b,
I32:$idesc,
Member

very nit: it's harder to get code completion with single-character variable names. Maybe we write something explicit :)

Contributor

@durga4github commented Oct 21, 2025

ok, then, in that case, can we have matA and matB ?

Member

maybe matrixA, matrixB

Contributor Author

Renamed $d -> $matrixD, $a -> $matrixA, $b -> $matrixB

I64:$b,
I32:$idesc,
I1:$enableInputD,
// Optional arguments
Member

let's remove that comment; we already have Optional on the attributes

Contributor Author

Removed both comments for both Attributes and Optional

Comment on lines 4609 to 4610
The `tcgen05.mma` is an asynchronous op which performs matrix multiplication,
and accumulation using 5th generation tensor cores
Member

Suggested change
The `tcgen05.mma` is an asynchronous op which performs matrix multiplication,
and accumulation using 5th generation tensor cores
The `tcgen05.mma` operation is an asynchronous tensor core instruction
that performs matrix multiplication, accumulation in a single fused
operation. It targets 5th-generation tensor cores, providing developers
with fine-grained control over execution and scheduling.

Contributor Author

Updated the description for all of the tcgen05.mma Ops with the above suggestion

- B is a `K x N` matrix described using shared memory descriptor
- D is an `M x N` accumulator matrix in tensor memory

`shared memory descriptor` is a 64 bit value which describes the properties
Member

maybe mention the op tcgen05.mma_smem_desc that generates a descriptor

Contributor Author

Removed the doc line and pointed shared memory descriptor creation to the tcgen05.mma_smem_desc Op, which also contains detailed documentation

DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
"Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
UnitAttr:$ashift,
// Arguments
Member

remove the comments; it's clear what is an Attribute and what is an argument

@schwarzschild-radius force-pushed the tcgen05_mma_mlir_support branch 5 times, most recently from a0ddc62 to a605a51 on October 23, 2025 at 13:32
@rajatbajpai
Contributor

LGTM, thanks!

args.push_back(DisableOutputLane);
args.push_back(builder.getInt32(static_cast<unsigned>(thisOp.getKind())));
} else {
if (hasScaleInputD) {
Contributor

nit: you can move this out of both branches.


```
+--------+--------------------------------------------+
| Matrix | A / B |
Contributor

Matrix Kind | Supported types for A/B matrices

//===----------------------------------------------------------------------===//
// NVVM tcgen05.mma Ops.
//===----------------------------------------------------------------------===//

Contributor

can remove newline here

def NVVM_Tcgen05MMAOp : NVVM_Op<"tcgen05.mma",
[AttrSizedOperandSegments,
NVVMRequiresSMa<[100, 110]>]> {

Contributor

can remove newline here

CTAGroupKindAttr:$ctaGroup,
DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
"Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
UnitAttr:$ashift,
Contributor

optional:
Should we do aShift?

* f16 : 16-bit floating point (half precision)
* tf32 : Tensor Float 32 (truncated 32-bit float)
* f8f6f4 : Mixed precision FP8/FP6/FP4
* i8 : 8-bit integer operations
Contributor

Since we documented this in the EnumAttr itself, we do not need to repeat it here.
Only line 4699 should suffice.

createIntrinsicCall(builder, ID, args);
}];

let hasVerifier = true;
Contributor

Can we move it to line 4725 after the asm-format?
(to be consistent with most of the other Ops)

let hasVerifier = true;
}

def NVVM_Tcgen05MMASpOp : NVVM_Op<"tcgen05.mma.sp",
Contributor

Does NVVM_Tcgen05SparseMMAOp read better?

Let us keep "tcgen05.mma.sp" as is.


```
+------------+-------------------------------------------+
| | A / B |
Contributor

Can we add the same header and probably move this attr definition right after the previous one above?

createIntrinsicCall(builder, ID, args);
}];

let hasVerifier = true;
Contributor

Let us move it after the asm-format


def NVVM_Tcgen05MMABlockScaleOp : NVVM_Op<"tcgen05.mma.block_scale",
[NVVMRequiresSMa<[100, 110]>]> {

Contributor

we can remove the newline here

"Tcgen05MMACollectorBBuffer",
"tcgen05 MMA Collector Buffer B Attribute",
[Tcgen05MMACollectorBBuffer0,
Tcgen05MMACollectorBBuffer1,
Contributor

can we fold this to the previous line?


if (hasAShift && !isATensor)
res = emitError(loc,
"Only A operand in tensor memory support ashift attribute");
Contributor

can we rephrase slightly?
"A-shift can be applied only when matrix A is in tensor memory"


if (hasDisableOutputLane) {
if (hasScaleInputD)
args.push_back(ScaleInputD);
Contributor

this scaleInputD push seems to be common; should we move it out?

assert(ID != notIntrinsic && "Invalid intrinsic for Tcgen05MMAOp.");

args.push_back(
builder.getInt32(static_cast<unsigned>(thisOp.getCollectorOp())));
Contributor

I wonder why this is here and not at line 2386, as a sequence of args.push_back() calls?

unsigned ctaGroup =
static_cast<unsigned>(getNVVMCtaGroupKind(thisOp.getCtaGroup()));

bool isATensor = isa<llvm::PointerType>(A->getType());
Contributor

can we move this to line 2896, so that it is easier to see the use of 'A' ?
(like how it is done in 2905-2907)

static constexpr unsigned notIntrinsic = llvm::Intrinsic::not_intrinsic;

static constexpr llvm::nvvm::CTAGroupKind
getNVVMCtaGroupKind(NVVM::CTAGroupKind ctaGroup) {
Contributor

Nice for implementing this utility!

[optional]

In all the uses of this utility below, I observe that we are casting the value to int.

Do you think it makes sense to do the cast here and return unsigned directly from here?
(Then we can name it getNVVMCtaGroupKindAsInt or something like that)

Contributor

@durga4github left a comment

I have a few tiny nits. It should be good to go after a refresh.


let assemblyFormat = [{
$matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata (`scale` `=` $scaleInputD^)? (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)`
}];
Contributor

Thanks for moving the asm-format right next to the args. It is much easier to read and relate quickly

Contributor

@durga4github left a comment

The latest revision LGTM.
Thank you for addressing the comments!

// NVVM tcgen05.mma attributes
//===----------------------------------------------------------------------===//

def Tcgen05MMAKindF16 : I32EnumAttrCase<"F16", 0, "f16">;
Member

Can we make this f16_bf16 so it's clear in the IR?


```
+-------------+--------------------------------------------+
| Matrix Kind | supported types for A / B |
Member

nice! thanks


static constexpr unsigned notIntrinsic = llvm::Intrinsic::not_intrinsic;

static constexpr llvm::nvvm::CTAGroupKind
Member

nit: can you add a comment on top of the function?

args.push_back(mt.lookupValue(thisOp.getEnableInputD()));

// [hasDisableOutputLane][hasScaleInputD][isATensor][CtaGroup][EnableAShift];
static constexpr llvm::Intrinsic::ID tcgen05MMAIDs[2][2][2][2][2] = {
Member

We implement this table in a slightly more readable way elsewhere; we can follow the same style if you like:
https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp#L1797-L1801

Contributor Author

@grypp Are you saying that we should split the table into separate ones for scaled, disable output lane etc.?

disableOutputLaneType.getNumElements() != 4) ||
(ctaGroup == NVVM::CTAGroupKind::CTA_2 &&
disableOutputLaneType.getNumElements() != 8))
res = emitError(loc) << "Disable Output Lane of length "
Member

@grypp commented Oct 27, 2025

We follow the style below when returning errors; maybe we can follow the same style here.

verifyTcgen05MMAOp(...) {
 if ...   
   return emitError(loc) << "Disable O

  return success();

Contributor Author

Updated the verifiers in the latest revision

args.push_back(mt.lookupValue(thisOp.getSparseMetadata()));

// [hasDisableOutputLane][hasScaleInputD][isATensor][CtaGroup][EnableAShift];
static constexpr llvm::Intrinsic::ID tcgen05MMASparseIDs[2][2][2][2][2] = {
Member

Same comment here about creating the table in the same style

Contributor Author

@grypp Can you please confirm what the expected change is? I looked at the code and I noticed that we are using std::array but here we have a 5 dimensional matrix, so having a std::array type would be verbose (std::array<std::array<std::array<std::array<std::array<unsigned, 2>, 2>, 2>, 2>, 2>). What do you think?

Contributor

I think Guray meant something like below.

using EnableAShiftArray = std::array<llvm::Intrinsic::ID, 2>;
using CtaGroupArray = std::array<EnableAShiftArray, 2>;
using IsATensorArray = std::array<CtaGroupArray, 2>;
using HasScaleInputDArray = std::array<IsATensorArray, 2>;
using HasDisableOutputLaneArray = std::array<HasScaleInputDArray, 2>;

// [hasDisableOutputLane][hasScaleInputD][isATensor][CtaGroup][EnableAShift]
static constexpr HasDisableOutputLaneArray tcgen05MMAIDs = {

res = emitError(loc,
llvm::formatv("{} kind does not support block16 attribute",
stringifyEnum(kind)));
return res;
Member

same comment about the style of the verifier.

Member

@grypp left a comment

The PR is in good shape. I have 2 major comments that are mostly stylistic:

  1. Verifier
  2. Creating intrinsic table

This commit adds support for the tcgen05.mma family of instructions in the NVVM MLIR dialect and lowers them to LLVM intrinsics. Please refer to the [PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions) for more information.