-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Revert "[MLIR][NVVM] Add tcgen05.mma MLIR Ops" #168583
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This reverts commit fb829bf.
|
@llvm/pr-subscribers-mlir-llvm Author: Mehdi Amini (joker-eph) — Changes: Reverts llvm/llvm-project#164356; the bots are broken. Patch is 472.27 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/168583.diff — 15 Files Affected:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 524b9f820f290..8d5bc7333d47f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -4598,551 +4598,6 @@ def NVVM_ClusterLaunchControlQueryCancelOp
}];
}
-//===----------------------------------------------------------------------===//
-// NVVM tcgen05.mma Ops
-//===----------------------------------------------------------------------===//
-
-def Tcgen05MMAKindF16 : I32EnumAttrCase<"F16", 0, "f16">;
-def Tcgen05MMAKindTF32 : I32EnumAttrCase<"TF32", 1, "tf32">;
-def Tcgen05MMAKindF8F6F4 : I32EnumAttrCase<"F8F6F4", 2, "f8f6f4">;
-def Tcgen05MMAKindINT8 : I32EnumAttrCase<"I8", 3, "i8">;
-
-def Tcgen05MMAKind : I32EnumAttr<
- "Tcgen05MMAKind",
- "tcgen05 MMA Supported Types",
- [Tcgen05MMAKindF8F6F4, Tcgen05MMAKindINT8, Tcgen05MMAKindF16,
- Tcgen05MMAKindTF32]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMAKindAttr : EnumAttr<NVVM_Dialect, Tcgen05MMAKind, "tcgen05_mma_kind"> {
- let description = [{
- The Tcgen05MMAKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp} Op. The following are supported types for each kind:
-
- ```
- +-------------+--------------------------------------------+
- | Matrix Kind | supported types for A / B |
- +-------------+--------------------------------------------+
- | f16 | f16, bf16 |
- | tf32 | tf32 |
- | f8f6f4 | e4m3, e5m2, e2m3, e3m2, e2m1 |
- | i8 | unsigned 8b, signed 8b |
- +-------------+--------------------------------------------+
- ```
- }];
- let assemblyFormat = "`<` $value `>`";
-}
-
-def Tcgen05MMACollectorOpDiscard : I32EnumAttrCase<"DISCARD", 0, "discard">;
-def Tcgen05MMACollectorOpLastUse : I32EnumAttrCase<"LASTUSE", 1, "lastuse">;
-def Tcgen05MMACollectorOpFill : I32EnumAttrCase<"FILL", 2, "fill">;
-def Tcgen05MMACollectorOpUse : I32EnumAttrCase<"USE", 3, "use">;
-
-def Tcgen05MMACollectorOp : I32EnumAttr<
- "Tcgen05MMACollectorOp",
- "tcgen05.mma Collector Buffer Operation",
- [Tcgen05MMACollectorOpDiscard,
- Tcgen05MMACollectorOpLastUse,
- Tcgen05MMACollectorOpFill,
- Tcgen05MMACollectorOpUse]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMACollectorOpAttr : EnumAttr<NVVM_Dialect, Tcgen05MMACollectorOp, "tcgen05_mma_collectorop"> {
- let description = [{
- Tcgen05MMACollectorOp attribute specifies the collector buffer operations.
- The following are the supported operations:
- * discard : Release buffer after use (default)
- * lastuse : Mark buffer for last use
- * fill : Fill buffer
- * use : Use buffer without modification
- }];
- let assemblyFormat = "`<` $value `>`";
-}
-
-def NVVM_Tcgen05MMAOp : NVVM_Op<"tcgen05.mma",
- [AttrSizedOperandSegments,
- NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs MMA operation on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma` operation is an asynchronous tensor core instruction that
- performs matrix multiplication, accumulation in a single fused operation. It
- targets 5th-generation tensor cores, providing developers with fine-grained
- control over execution and scheduling.
-
- ```
- D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided
- D = A * B // if `enableInputD` is false
- D = A * B + D // otherwise
- ```
-
- where:
- - A is an `M x K` matrix in tensor memory or described using shared memory descriptor
- - B is a `K x N` matrix described using shared memory descriptor
- - D is an `M x N` accumulator matrix in tensor memory
-
- The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op
-
- - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
-
- Optional Operands:
- - `scaleInputD` is an Immediate value operand used for scaling D matrix by 2 ^ (-scaleInputD). The valid range is [0, 15]
-
- - `disableOutputLane` is a vector mask for selective output
- * vector<4 x i32> when ctaGroup is CTA_1
- * vector<8 x i32> when ctaGroup is CTA_2
-
- Required Attributes:
- - `kind` is a Tcgen05MMAKind attribute
-
- - `ctaGroup` specifies CTA group configuration
- * cta_1: MMA will be performed on the current thread's CTA
- * cta_2: MMA will be performed on the current thread and it's peer CTA
-
- Default Attributes:
- - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer
-
- - `aShift` shifts the rows of the A matrix down by one row and can only be
- applied if A is in tensor memory
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma)
- }];
-
- let arguments = (ins
- Tcgen05MMAKindAttr:$kind,
- CTAGroupKindAttr:$ctaGroup,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- UnitAttr:$aShift,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc,
- I1:$enableInputD,
- Optional<I64>:$scaleInputD,
- Optional<FixedVectorOfLengthAndType<[4, 8], [I32]>>:$disableOutputLane
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`scale` `=` $scaleInputD^)?
- (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)`
- }];
-
- let hasVerifier = true;
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
- Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] = NVVM::Tcgen05MMAOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def NVVM_Tcgen05MMASparseOp : NVVM_Op<"tcgen05.mma.sp",
- [AttrSizedOperandSegments,
- NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs MMA operation with sparse A matrix on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma.sp` operation is an asynchronous tensor core instruction
- that performs matrix multiplication, accumulation with sparse `A` matrix in
- a single fused operation. It targets 5th-generation tensor cores, providing
- developers with fine-grained control over execution and scheduling.
-
- ```
- D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided
- D = A * B // if `enableInputD` is false
- D = A * B + D // otherwise
- ```
-
- where:
- - A is an `M x (K / 2)` matrix in tensor memory or described using shared memory descriptor
- - B is a `K x N` matrix described using shared memory descriptor
- - D is an `M x N` accumulator matrix in tensor memory
- - sparseMetadata located in tensor memory specifies the mapping of the `K / 2`
- non-zero elements to the K elements before performing the MMA operation
-
- Other attributes and operands are similar to that of tcgen05.mma Op
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp)
- }];
-
- let arguments = (ins
- Tcgen05MMAKindAttr:$kind,
- CTAGroupKindAttr:$ctaGroup,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- UnitAttr:$aShift,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc,
- I1:$enableInputD,
- LLVM_PointerTensor:$sparseMetadata,
- Optional<I64>:$scaleInputD,
- Optional<FixedVectorOfLengthAndType<[4, 8], [I32]>>:$disableOutputLane
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata (`scale` `=` $scaleInputD^)? (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)`
- }];
-
- let hasVerifier = true;
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
- Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] = NVVM::Tcgen05MMASparseOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def Tcgen05MMAKindMXF8F6F4 : I32EnumAttrCase<"MXF8F6F4", 0, "mxf8f6f4">;
-def Tcgen05MMAKindMXF4 : I32EnumAttrCase<"MXF4", 1, "mxf4">;
-def Tcgen05MMAKindMXF4NVF4 : I32EnumAttrCase<"MXF4NVF4", 2, "mxf4nvf4">;
-
-def Tcgen05MMABlockScaleKind : I32EnumAttr<
- "Tcgen05MMABlockScaleKind",
- "tcgen05.mma.block_scale supported types",
- [Tcgen05MMAKindMXF8F6F4, Tcgen05MMAKindMXF4, Tcgen05MMAKindMXF4NVF4]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMABlockScaleKindAttr : EnumAttr<NVVM_Dialect, Tcgen05MMABlockScaleKind,
- "tcgen05_mma_block_scale_kind"> {
- let description = [{
- The Tcgen05MMABlockScaleKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp}.block_scale Op. The following are supported types for each kind:
-
- ```
- +--------------+-------------------------------------------+
- | Matrix Kind | supported types for A / B |
- +--------------+-------------------------------------------+
- | mxf8f6f4 | e4m3, e5m3, e2m3, e3m2, e2m1 |
- | mxf4 | e2m1 |
- | mxf4nvf4 | e2m1 |
- +--------------+-------------------------------------------+
- ```
- }];
- let assemblyFormat = "`<` $value `>`";
-}
-
-def Tcgen05MMABlockScaleDefault : I32EnumAttrCase<"DEFAULT", 0, "default">;
-def Tcgen05MMABlockScaleBlock16 : I32EnumAttrCase<"BLOCK16", 1, "block16">;
-def Tcgen05MMABlockScaleBlock32 : I32EnumAttrCase<"BLOCK32", 2, "block32">;
-
-def Tcgen05MMABlockScale
- : I32EnumAttr<"Tcgen05MMABlockScale",
- "tcgen05.mma block scale attribute",
- [Tcgen05MMABlockScaleDefault, Tcgen05MMABlockScaleBlock16,
- Tcgen05MMABlockScaleBlock32]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMABlockScaleAttr : EnumAttr<NVVM_Dialect, Tcgen05MMABlockScale,
- "tcgen05_mma_block_scale"> {
- let assemblyFormat = "`<` $value `>`";
-}
-
-def NVVM_Tcgen05MMABlockScaleOp : NVVM_Op<"tcgen05.mma.block_scale",
- [NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs block scaled MMA operation on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma.block_scale` operation is an asynchronous tensor core instruction
- that performs matrix multiplication, accumulation with block scaling in a
- single fused operation. It targets 5th-generation tensor cores, providing
- developers with fine-grained control over execution and scheduling.
-
- ```
- D = (A * scale_a) * (B * scale_b)` // if `enableInputD` is false
- D = (A * scale_a) * (B * scale_b) + D`
- ```
-
- where:
- - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor
- - B is a K x N matrix described using shared memory descriptor
- - D is an M x N accumulator matrix in tensor memory
- - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
-
- The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op
-
- - `idesc` is a 32 bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
-
- Required Attributes:
- - `kind` is a Tcgen05MMABlockScaleKind attribute
-
- - `ctaGroup` specifies CTA group configuration
- * cta_1: MMA will be performed on the current thread's CTA
- * cta_2: MMA will be performed on the current thread and it's peer CTA
-
- Default Attributes:
- - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma)
- }];
-
- let arguments = (ins
- Tcgen05MMABlockScaleKindAttr:$kind,
- CTAGroupKindAttr:$ctaGroup,
- DefaultValuedAttr<Tcgen05MMABlockScaleAttr,
- "Tcgen05MMABlockScale::DEFAULT">:$blockScale,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc, I1:$enableInputD,
- LLVM_PointerTensor:$scaleA,
- LLVM_PointerTensor:$scaleB
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $scaleA `,` $scaleB
- attr-dict `:` `(` type(operands) `)`
- }];
-
- let hasVerifier = true;
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair
- getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] = NVVM::Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def NVVM_Tcgen05MMASparseBlockScaleOp : NVVM_Op<"tcgen05.mma.sp.block_scale",
- [NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs block scaled MMA operation with sparse A matrix on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma.sp.block_scale` operation is an asynchronous tensor core
- instruction that performs matrix multiplication, accumulation with block
- scaling, and sparse `A` matrix in a single fused operation. It targets
- 5th-generation tensor cores, providing developers with fine-grained control
- over execution, and scheduling.
-
- ```
- D = (A * scale_a) * (B * scale_b) // if `enableInputD` is specified
- D = (A * scale_a) * (B * scale_b) + D // otherwise
- ```
-
- where:
- - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor
- - B is a K x N matrix described using shared memory descriptor
- - D is an M x N accumulator matrix in tensor memory
- - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
-
- Other attributes and operands are similar to that of tcgen05.mma.block_scale Op
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp)
- }];
-
- let arguments = (ins
- Tcgen05MMABlockScaleKindAttr:$kind,
- CTAGroupKindAttr:$ctaGroup,
- DefaultValuedAttr<Tcgen05MMABlockScaleAttr,
- "Tcgen05MMABlockScale::DEFAULT">:$blockScale,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc,
- I1:$enableInputD,
- LLVM_PointerTensor:$sparseMetadata,
- LLVM_PointerTensor:$scaleA,
- LLVM_PointerTensor:$scaleB
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata `,` $scaleA `,` $scaleB
- attr-dict `:` `(` type(operands) `)`
- }];
-
- let hasVerifier = true;
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair
- getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] = NVVM::Tcgen05MMASparseBlockScaleOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def Tcgen05MMACollectorBBuffer0 : I32EnumAttrCase<"B0", 0, "b0">;
-def Tcgen05MMACollectorBBuffer1 : I32EnumAttrCase<"B1", 1, "b1">;
-def Tcgen05MMACollectorBBuffer2 : I32EnumAttrCase<"B2", 2, "b2">;
-def Tcgen05MMACollectorBBuffer3 : I32EnumAttrCase<"B3", 3, "b3">;
-
-def Tcgen05MMACollectorBBuffer : I32EnumAttr<
- "Tcgen05MMACollectorBBuffer",
- "tcgen05 MMA Collector Buffer B Attribute",
- [Tcgen05MMACollectorBBuffer0, Tcgen05MMACollectorBBuffer1, Tcgen05MMACollectorBBuffer2,
- Tcgen05MMACollectorBBuffer3]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMACollectorBBufferAttr : EnumAttr<NVVM_Dialect, Tcgen05MMACollectorBBuffer, "tcgen05_mma_collectorb"> {
- let assemblyFormat = "`<` $value `>`";
-}
-
-def NVVM_Tcgen05MMAWsOp : NVVM_Op<"tcgen05.mma.ws",
- [NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs weight stationary convolution MMA operation on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma.ws` operation is an asynchronous tensor core instruction
- that performs weight stationary convolution matrix multiplication, accumulation
- in a single fused operation. It targets 5th-generation tensor cores, providing
- developers with fine-grained control over execution, and scheduling.
-
- ```
- D = A * B` // if `enableInputD` is false
- D = A * B + D` // otherwise
- ```
-
- where:
- - A is an `M x K` matrix in tensor memory or described using shared memory descriptor
- - B is a `K x N` matrix described using shared memory descriptor
- - D is an `M x N` accumulator matrix in tensor memory
-
- The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op
-
- - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
-
- Optional Operands:
- - zeroColMask is a 64 bit value representing the [Zero-column mask descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-zero-column-mask-descriptor)
-
- Required Attributes:
- - `kind` is a Tcgen05MMAKind attribute
-
- Default Valued Attributes:
- - collectorBBuffer specifies collector buffer for matrix B: b0 (default), b1, b2, b3
-
- - collectorOp is a Tcgen05MMACollectorOp attribute with matrix B as the collector buffer
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-ws)
- }];
-
- let arguments = (ins
- Tcgen05MMAKindAttr:$kind,
- DefaultValuedAttr<Tcgen05MMACollectorBBufferAttr,
- "Tcgen05MMACollectorBBuffer::B0">:$collectorBBuffer,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc,
- I1:$enableInputD,
- Optional<I64>:$zeroColMask
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`,` $zeroColMask^)?
- attr-dict `:` `(` type(operands) `)`
- }];
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
- Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] =
- NVVM::Tcgen05MMAWsOp::getIntrinsicIDAndArgs(*op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def NVVM_Tcgen...
[truncated]
|
|
@llvm/pr-subscribers-mlir Author: Mehdi Amini (joker-eph) — Changes: Reverts llvm/llvm-project#164356; the bots are broken. Patch is 472.27 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/168583.diff — 15 Files Affected:
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 524b9f820f290..8d5bc7333d47f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -4598,551 +4598,6 @@ def NVVM_ClusterLaunchControlQueryCancelOp
}];
}
-//===----------------------------------------------------------------------===//
-// NVVM tcgen05.mma Ops
-//===----------------------------------------------------------------------===//
-
-def Tcgen05MMAKindF16 : I32EnumAttrCase<"F16", 0, "f16">;
-def Tcgen05MMAKindTF32 : I32EnumAttrCase<"TF32", 1, "tf32">;
-def Tcgen05MMAKindF8F6F4 : I32EnumAttrCase<"F8F6F4", 2, "f8f6f4">;
-def Tcgen05MMAKindINT8 : I32EnumAttrCase<"I8", 3, "i8">;
-
-def Tcgen05MMAKind : I32EnumAttr<
- "Tcgen05MMAKind",
- "tcgen05 MMA Supported Types",
- [Tcgen05MMAKindF8F6F4, Tcgen05MMAKindINT8, Tcgen05MMAKindF16,
- Tcgen05MMAKindTF32]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMAKindAttr : EnumAttr<NVVM_Dialect, Tcgen05MMAKind, "tcgen05_mma_kind"> {
- let description = [{
- The Tcgen05MMAKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp} Op. The following are supported types for each kind:
-
- ```
- +-------------+--------------------------------------------+
- | Matrix Kind | supported types for A / B |
- +-------------+--------------------------------------------+
- | f16 | f16, bf16 |
- | tf32 | tf32 |
- | f8f6f4 | e4m3, e5m2, e2m3, e3m2, e2m1 |
- | i8 | unsigned 8b, signed 8b |
- +-------------+--------------------------------------------+
- ```
- }];
- let assemblyFormat = "`<` $value `>`";
-}
-
-def Tcgen05MMACollectorOpDiscard : I32EnumAttrCase<"DISCARD", 0, "discard">;
-def Tcgen05MMACollectorOpLastUse : I32EnumAttrCase<"LASTUSE", 1, "lastuse">;
-def Tcgen05MMACollectorOpFill : I32EnumAttrCase<"FILL", 2, "fill">;
-def Tcgen05MMACollectorOpUse : I32EnumAttrCase<"USE", 3, "use">;
-
-def Tcgen05MMACollectorOp : I32EnumAttr<
- "Tcgen05MMACollectorOp",
- "tcgen05.mma Collector Buffer Operation",
- [Tcgen05MMACollectorOpDiscard,
- Tcgen05MMACollectorOpLastUse,
- Tcgen05MMACollectorOpFill,
- Tcgen05MMACollectorOpUse]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMACollectorOpAttr : EnumAttr<NVVM_Dialect, Tcgen05MMACollectorOp, "tcgen05_mma_collectorop"> {
- let description = [{
- Tcgen05MMACollectorOp attribute specifies the collector buffer operations.
- The following are the supported operations:
- * discard : Release buffer after use (default)
- * lastuse : Mark buffer for last use
- * fill : Fill buffer
- * use : Use buffer without modification
- }];
- let assemblyFormat = "`<` $value `>`";
-}
-
-def NVVM_Tcgen05MMAOp : NVVM_Op<"tcgen05.mma",
- [AttrSizedOperandSegments,
- NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs MMA operation on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma` operation is an asynchronous tensor core instruction that
- performs matrix multiplication, accumulation in a single fused operation. It
- targets 5th-generation tensor cores, providing developers with fine-grained
- control over execution and scheduling.
-
- ```
- D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided
- D = A * B // if `enableInputD` is false
- D = A * B + D // otherwise
- ```
-
- where:
- - A is an `M x K` matrix in tensor memory or described using shared memory descriptor
- - B is a `K x N` matrix described using shared memory descriptor
- - D is an `M x N` accumulator matrix in tensor memory
-
- The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op
-
- - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
-
- Optional Operands:
- - `scaleInputD` is an Immediate value operand used for scaling D matrix by 2 ^ (-scaleInputD). The valid range is [0, 15]
-
- - `disableOutputLane` is a vector mask for selective output
- * vector<4 x i32> when ctaGroup is CTA_1
- * vector<8 x i32> when ctaGroup is CTA_2
-
- Required Attributes:
- - `kind` is a Tcgen05MMAKind attribute
-
- - `ctaGroup` specifies CTA group configuration
- * cta_1: MMA will be performed on the current thread's CTA
- * cta_2: MMA will be performed on the current thread and it's peer CTA
-
- Default Attributes:
- - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer
-
- - `aShift` shifts the rows of the A matrix down by one row and can only be
- applied if A is in tensor memory
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma)
- }];
-
- let arguments = (ins
- Tcgen05MMAKindAttr:$kind,
- CTAGroupKindAttr:$ctaGroup,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- UnitAttr:$aShift,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc,
- I1:$enableInputD,
- Optional<I64>:$scaleInputD,
- Optional<FixedVectorOfLengthAndType<[4, 8], [I32]>>:$disableOutputLane
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`scale` `=` $scaleInputD^)?
- (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)`
- }];
-
- let hasVerifier = true;
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
- Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] = NVVM::Tcgen05MMAOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def NVVM_Tcgen05MMASparseOp : NVVM_Op<"tcgen05.mma.sp",
- [AttrSizedOperandSegments,
- NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs MMA operation with sparse A matrix on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma.sp` operation is an asynchronous tensor core instruction
- that performs matrix multiplication, accumulation with sparse `A` matrix in
- a single fused operation. It targets 5th-generation tensor cores, providing
- developers with fine-grained control over execution and scheduling.
-
- ```
- D = A * B + (D * 2^ -scaleInputD) // if `scaleInputD` is provided
- D = A * B // if `enableInputD` is false
- D = A * B + D // otherwise
- ```
-
- where:
- - A is an `M x (K / 2)` matrix in tensor memory or described using shared memory descriptor
- - B is a `K x N` matrix described using shared memory descriptor
- - D is an `M x N` accumulator matrix in tensor memory
- - sparseMetadata located in tensor memory specifies the mapping of the `K / 2`
- non-zero elements to the K elements before performing the MMA operation
-
- Other attributes and operands are similar to that of tcgen05.mma Op
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp)
- }];
-
- let arguments = (ins
- Tcgen05MMAKindAttr:$kind,
- CTAGroupKindAttr:$ctaGroup,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- UnitAttr:$aShift,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc,
- I1:$enableInputD,
- LLVM_PointerTensor:$sparseMetadata,
- Optional<I64>:$scaleInputD,
- Optional<FixedVectorOfLengthAndType<[4, 8], [I32]>>:$disableOutputLane
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata (`scale` `=` $scaleInputD^)? (`mask` `=` $disableOutputLane^)? attr-dict `:` `(` type(operands) `)`
- }];
-
- let hasVerifier = true;
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
- Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] = NVVM::Tcgen05MMASparseOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def Tcgen05MMAKindMXF8F6F4 : I32EnumAttrCase<"MXF8F6F4", 0, "mxf8f6f4">;
-def Tcgen05MMAKindMXF4 : I32EnumAttrCase<"MXF4", 1, "mxf4">;
-def Tcgen05MMAKindMXF4NVF4 : I32EnumAttrCase<"MXF4NVF4", 2, "mxf4nvf4">;
-
-def Tcgen05MMABlockScaleKind : I32EnumAttr<
- "Tcgen05MMABlockScaleKind",
- "tcgen05.mma.block_scale supported types",
- [Tcgen05MMAKindMXF8F6F4, Tcgen05MMAKindMXF4, Tcgen05MMAKindMXF4NVF4]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMABlockScaleKindAttr : EnumAttr<NVVM_Dialect, Tcgen05MMABlockScaleKind,
- "tcgen05_mma_block_scale_kind"> {
- let description = [{
- The Tcgen05MMABlockScaleKind attribute describes the allowed set of types for matrix A and B in the tcgen05.mma.{sp}.block_scale Op. The following are supported types for each kind:
-
- ```
- +--------------+-------------------------------------------+
- | Matrix Kind | supported types for A / B |
- +--------------+-------------------------------------------+
- | mxf8f6f4 | e4m3, e5m3, e2m3, e3m2, e2m1 |
- | mxf4 | e2m1 |
- | mxf4nvf4 | e2m1 |
- +--------------+-------------------------------------------+
- ```
- }];
- let assemblyFormat = "`<` $value `>`";
-}
-
-def Tcgen05MMABlockScaleDefault : I32EnumAttrCase<"DEFAULT", 0, "default">;
-def Tcgen05MMABlockScaleBlock16 : I32EnumAttrCase<"BLOCK16", 1, "block16">;
-def Tcgen05MMABlockScaleBlock32 : I32EnumAttrCase<"BLOCK32", 2, "block32">;
-
-def Tcgen05MMABlockScale
- : I32EnumAttr<"Tcgen05MMABlockScale",
- "tcgen05.mma block scale attribute",
- [Tcgen05MMABlockScaleDefault, Tcgen05MMABlockScaleBlock16,
- Tcgen05MMABlockScaleBlock32]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMABlockScaleAttr : EnumAttr<NVVM_Dialect, Tcgen05MMABlockScale,
- "tcgen05_mma_block_scale"> {
- let assemblyFormat = "`<` $value `>`";
-}
-
-def NVVM_Tcgen05MMABlockScaleOp : NVVM_Op<"tcgen05.mma.block_scale",
- [NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs block scaled MMA operation on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma.block_scale` operation is an asynchronous tensor core instruction
- that performs matrix multiplication, accumulation with block scaling in a
- single fused operation. It targets 5th-generation tensor cores, providing
- developers with fine-grained control over execution and scheduling.
-
- ```
- D = (A * scale_a) * (B * scale_b)` // if `enableInputD` is false
- D = (A * scale_a) * (B * scale_b) + D`
- ```
-
- where:
- - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor
- - B is a K x N matrix described using shared memory descriptor
- - D is an M x N accumulator matrix in tensor memory
- - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
-
- The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op
-
- - `idesc` is a 32 bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
-
- Required Attributes:
- - `kind` is a Tcgen05MMABlockScaleKind attribute
-
- - `ctaGroup` specifies CTA group configuration
- * cta_1: MMA will be performed on the current thread's CTA
- * cta_2: MMA will be performed on the current thread and it's peer CTA
-
- Default Attributes:
- - collectorOp is a Tcgen05MMACollectorOp attribute with matrix A as the collector buffer
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma)
- }];
-
- let arguments = (ins
- Tcgen05MMABlockScaleKindAttr:$kind,
- CTAGroupKindAttr:$ctaGroup,
- DefaultValuedAttr<Tcgen05MMABlockScaleAttr,
- "Tcgen05MMABlockScale::DEFAULT">:$blockScale,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc, I1:$enableInputD,
- LLVM_PointerTensor:$scaleA,
- LLVM_PointerTensor:$scaleB
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $scaleA `,` $scaleB
- attr-dict `:` `(` type(operands) `)`
- }];
-
- let hasVerifier = true;
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair
- getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] = NVVM::Tcgen05MMABlockScaleOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def NVVM_Tcgen05MMASparseBlockScaleOp : NVVM_Op<"tcgen05.mma.sp.block_scale",
- [NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs block scaled MMA operation with sparse A matrix on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma.sp.block_scale` operation is an asynchronous tensor core
- instruction that performs matrix multiplication, accumulation with block
- scaling, and sparse `A` matrix in a single fused operation. It targets
- 5th-generation tensor cores, providing developers with fine-grained control
- over execution, and scheduling.
-
- ```
- D = (A * scale_a) * (B * scale_b) // if `enableInputD` is false
- D = (A * scale_a) * (B * scale_b) + D // otherwise
- ```
-
- where:
- - A is an M x (K / 2) matrix in tensor memory or described using shared memory descriptor
- - B is a K x N matrix described using shared memory descriptor
- - D is an M x N accumulator matrix in tensor memory
- - `scale_a` and `scale_b` are matrices in tensor memory used to scale `A` and `B` respectively
-
- Other attributes and operands are similar to those of the tcgen05.mma.block_scale Op
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-sp)
- }];
-
- let arguments = (ins
- Tcgen05MMABlockScaleKindAttr:$kind,
- CTAGroupKindAttr:$ctaGroup,
- DefaultValuedAttr<Tcgen05MMABlockScaleAttr,
- "Tcgen05MMABlockScale::DEFAULT">:$blockScale,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc,
- I1:$enableInputD,
- LLVM_PointerTensor:$sparseMetadata,
- LLVM_PointerTensor:$scaleA,
- LLVM_PointerTensor:$scaleB
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD `,` $sparseMetadata `,` $scaleA `,` $scaleB
- attr-dict `:` `(` type(operands) `)`
- }];
-
- let hasVerifier = true;
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair
- getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] = NVVM::Tcgen05MMASparseBlockScaleOp::getIntrinsicIDAndArgs(
- *op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def Tcgen05MMACollectorBBuffer0 : I32EnumAttrCase<"B0", 0, "b0">;
-def Tcgen05MMACollectorBBuffer1 : I32EnumAttrCase<"B1", 1, "b1">;
-def Tcgen05MMACollectorBBuffer2 : I32EnumAttrCase<"B2", 2, "b2">;
-def Tcgen05MMACollectorBBuffer3 : I32EnumAttrCase<"B3", 3, "b3">;
-
-def Tcgen05MMACollectorBBuffer : I32EnumAttr<
- "Tcgen05MMACollectorBBuffer",
- "tcgen05 MMA Collector Buffer B Attribute",
- [Tcgen05MMACollectorBBuffer0, Tcgen05MMACollectorBBuffer1, Tcgen05MMACollectorBBuffer2,
- Tcgen05MMACollectorBBuffer3]> {
- let cppNamespace = "::mlir::NVVM";
- let genSpecializedAttr = 0;
-}
-
-def Tcgen05MMACollectorBBufferAttr : EnumAttr<NVVM_Dialect, Tcgen05MMACollectorBBuffer, "tcgen05_mma_collectorb"> {
- let assemblyFormat = "`<` $value `>`";
-}
-
-def NVVM_Tcgen05MMAWsOp : NVVM_Op<"tcgen05.mma.ws",
- [NVVMRequiresSMa<[100, 110]>]> {
- let summary = "Performs weight stationary convolution MMA operation on 5th-gen tensor cores";
-
- let description = [{
- The `tcgen05.mma.ws` operation is an asynchronous tensor core instruction
- that performs weight stationary convolution matrix multiplication, accumulation
- in a single fused operation. It targets 5th-generation tensor cores, providing
- developers with fine-grained control over execution, and scheduling.
-
- ```
- D = A * B // if `enableInputD` is false
- D = A * B + D // otherwise
- ```
-
- where:
- - A is an `M x K` matrix in tensor memory or described using shared memory descriptor
- - B is a `K x N` matrix described using shared memory descriptor
- - D is an `M x N` accumulator matrix in tensor memory
-
- The `shared memory descriptor` can be generated using `tcgen05.mma_smem_desc` Op
-
- - idesc is a 32-bit value representing the [Instruction Descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instruction-descriptor)
-
- Optional Operands:
- - zeroColMask is a 64 bit value representing the [Zero-column mask descriptor](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-zero-column-mask-descriptor)
-
- Required Attributes:
- - `kind` is a Tcgen05MMAKind attribute
-
- Default Valued Attributes:
- - collectorBBuffer specifies collector buffer for matrix B: b0 (default), b1, b2, b3
-
- - collectorOp is a Tcgen05MMACollectorOp attribute with matrix B as the collector buffer
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma-ws)
- }];
-
- let arguments = (ins
- Tcgen05MMAKindAttr:$kind,
- DefaultValuedAttr<Tcgen05MMACollectorBBufferAttr,
- "Tcgen05MMACollectorBBuffer::B0">:$collectorBBuffer,
- DefaultValuedAttr<Tcgen05MMACollectorOpAttr,
- "Tcgen05MMACollectorOp::DISCARD">:$collectorOp,
- LLVM_PointerTensor:$matrixD,
- AnyTypeOf<[LLVM_PointerTensor, I64]>:$matrixA,
- I64:$matrixB,
- I32:$idesc,
- I1:$enableInputD,
- Optional<I64>:$zeroColMask
- );
-
- let assemblyFormat = [{
- $matrixD `,` $matrixA `,` $matrixB `,` $idesc `,` $enableInputD (`,` $zeroColMask^)?
- attr-dict `:` `(` type(operands) `)`
- }];
-
- let extraClassDeclaration = [{
- static mlir::NVVM::IDArgPair getIntrinsicIDAndArgs(
- Operation &op, LLVM::ModuleTranslation &mt,
- llvm::IRBuilderBase &builder);
- }];
-
- let llvmBuilder = [{
- auto [ID, args] =
- NVVM::Tcgen05MMAWsOp::getIntrinsicIDAndArgs(*op, moduleTranslation, builder);
- createIntrinsicCall(builder, ID, args);
- }];
-}
-
-def NVVM_Tcgen...
[truncated]
|
Reland commit fb829bf with additional fixes relating to post-merge CI failures
Reverts #164356
The bots are broken.