From 2bc3761acae2ba1e5fd0687991fe8fcd16cf4656 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Thu, 4 Dec 2025 18:29:22 +0000
Subject: [PATCH 1/2] [mlir][AMDGPU] Rename gfx1250 packed extension ops,
 change firstScaleLane

The current name of scaled_ext_packed816 was, in retrospect, bothering
me, since it just has a bunch of numbers on the end and doesn't really
reflect the wave-wide nature of the operation. On top of that, the fact
that firstScaleLane was 0 or 1 meant it might be read as the first lane
being 1 (and not what it actually was, 16), which also seemed weird.

Therefore, before this op sees any use,
1. Rename it to scaled_ext_packed_matrix
2. Change the semantics of firstScaleLane to actually point at the lane
   where the scales start (valid options currently are 0 or 16, the two
   halves of a wave32 wave).

(Disclaimer: the mechanical updates were done via AI.)
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  33 +++----
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           |  99 ++++++++---------
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |   8 +-
 .../Conversion/AMDGPUToROCDL/gfx1250.mlir     |  86 ++++++++--------
 mlir/test/Dialect/AMDGPU/ops.mlir             |  80 +++++++--------
 5 files changed, 154 insertions(+), 152 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 16eaf28ddd95b..6ac84c646e3ae 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -146,12 +146,8 @@ def AMDGPU_ExtPackedFp8Op :
   }];
 }
 
-def IsValidBlockSize: AttrConstraint<
-  CPred<"::llvm::is_contained({16, 32}, ::llvm::cast<::mlir::IntegerAttr>($_self).getInt())">,
-  "whose value is 16 or 32">;
-
-def AMDGPU_ScaledExtPacked816Op
-    : AMDGPU_Op<"scaled_ext_packed816", [Pure, AllShapesMatch<["source", "res"]>]>,
+def AMDGPU_ScaledExtPackedMatrixOp
+    : AMDGPU_Op<"scaled_ext_packed_matrix", [Pure, AllShapesMatch<["source", "res"]>]>,
       Arguments<(
           ins AnyTypeOf<[FixedVectorOfShapeAndType<[8], F4E2M1FN>,
                          FixedVectorOfShapeAndType<[8], F8E4M3FN>,
@@ -159,8 +155,8 @@ def AMDGPU_ScaledExtPacked816Op
                          FixedVectorOfShapeAndType<[16], F6E2M3FN>,
                          FixedVectorOfShapeAndType<[16], F6E3M2FN>]>:$source,
           FixedVectorOfShapeAndType<[4], F8E8M0FNU>:$scale,
-          ConfinedAttr<I32Attr, [IsValidBlockSize]>:$blockSize,
-          ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>:$firstScaleLane,
+          ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$blockSize,
+          ConfinedAttr<I32Attr, [IntIsOneOf<[0, 16]>]>:$firstScaleLane,
           ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<3>]>:$firstScaleByte)>,
       Results<(
           outs AnyTypeOf<[FixedVectorOfShapeAndType<[8], F32>,
@@ -170,9 +166,12 @@ def AMDGPU_ScaledExtPacked816Op
                           FixedVectorOfShapeAndType<[16], F16>,
                           FixedVectorOfShapeAndType<[16], BF16>]>:$res)> {
 
-  let summary = "Extend a vector of packed floating point values";
+  let summary = "Extend a wave-wide matrix of packed floating point values";
 
   let description = [{
+    Extend a matrix of microfloats (8 or 16 elements per lane) using a set of
+    scales that may be stored on other lanes.
+
     The scales applied to the input microfloats are stored in bytes which come from
    the `scales` input provided in a *half* of the wave identified by `firstScaleLane`.
    The bytes used are selected by `firstScaleByte` and depend
@@ -192,14 +191,14 @@ def AMDGPU_ScaledExtPacked816Op
     ```mlir
     // Input: 8-element vector of F8E4M3FN, converting to F32
     // Lanes 0-15 read from byte 0, lanes 16-31 read from byte 1
-    %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
+    %result = amdgpu.scaled_ext_packed_matrix %source scale(%scales)
       blockSize(32) firstScaleLane(0) firstScaleByte(0)
       : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32>
 
     // Input: 16-element vector of F6E2M3FN, converting to F16
     // Lanes 0-15 read from byte 2, lanes 16-31 read from byte 3
-    %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
-      blockSize(32) firstScaleLane(1) firstScaleByte(2)
+    %result = amdgpu.scaled_ext_packed_matrix %source scale(%scales)
+      blockSize(32) firstScaleLane(16) firstScaleByte(2)
       : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
     ```
 
@@ -211,19 +210,19 @@ def AMDGPU_ScaledExtPacked816Op
     ```mlir
     // Input: 8-element vector of F8E5M2, converting to BF16
     // Lanes 0-15 read from byte 0, lanes 16-31 read from byte 2 (0+2)
-    %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
+    %result = amdgpu.scaled_ext_packed_matrix %source scale(%scales)
       blockSize(16) firstScaleLane(0) firstScaleByte(0)
       : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16>
 
     // Input: 16-element vector of F6E3M2FN, converting to F32
     // Lanes 0-15 read from byte 1, lanes 16-31 read from byte 3 (1+2)
-    %result = amdgpu.scaled_ext_packed816 %source scale(%scales)
-      blockSize(16) firstScaleLane(1) firstScaleByte(1)
+    %result = amdgpu.scaled_ext_packed_matrix %source scale(%scales)
+      blockSize(16) firstScaleLane(16) firstScaleByte(1)
       : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32>
     ```
 
     Note: the layout for the scales generally mirrors how the WMMA
-    instructions use for matix scales. These selection operands allows
+    instructions use matrix scales. These selection operands allow
     one to choose portions of the matrix to convert.
 
     When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 32,
@@ -233,7 +232,7 @@ def AMDGPU_ScaledExtPacked816Op
     When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 16,
     the following combinations are allowed:
     * `firstScaleLane(0), firstScaleByte(0)`
-    * `firstScaleLane(1), firstScaleByte(2)`
+    * `firstScaleLane(16), firstScaleByte(2)`
     all other combinations are reserved.
 
     Available on gfx1250+.
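[Editor's note, not part of the patch: the examples in the op description above imply a simple rule for which byte of the packed scales each half of the wave reads. The sketch below is illustration only; `selectScaleBytes` is a hypothetical helper, not an API introduced by this change, and the +1/+2 stride is read directly off the blockSize(32) and blockSize(16) examples.]

```cpp
#include <cassert>
#include <utility>

// Lanes 0-15 of the wave read one byte of the packed scales (which live in
// the wave half selected by firstScaleLane) and lanes 16-31 read another;
// the distance between the two bytes depends on the block size.
std::pair<int, int> selectScaleBytes(int blockSize, int firstScaleByte) {
  assert(blockSize == 16 || blockSize == 32);
  int stride = blockSize == 32 ? 1 : 2;
  return {firstScaleByte, firstScaleByte + stride};
}

// selectScaleBytes(32, 0) == {0, 1} and selectScaleBytes(16, 1) == {1, 3},
// matching the lane comments in the examples above.
```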
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 2b6938712dad2..26175b86bf262 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1506,16 +1506,17 @@ struct ExtPackedFp8OpLowering final
       ConversionPatternRewriter &rewriter) const override;
 };
 
-struct ScaledExtPacked816OpLowering final
-    : public ConvertOpToLLVMPattern<ScaledExtPacked816Op> {
-  ScaledExtPacked816OpLowering(const LLVMTypeConverter &converter,
-                               Chipset chipset)
-      : ConvertOpToLLVMPattern<ScaledExtPacked816Op>(converter),
+struct ScaledExtPackedMatrixOpLowering final
+    : public ConvertOpToLLVMPattern<ScaledExtPackedMatrixOp> {
+  ScaledExtPackedMatrixOpLowering(const LLVMTypeConverter &converter,
+                                  Chipset chipset)
+      : ConvertOpToLLVMPattern<ScaledExtPackedMatrixOp>(converter),
         chipset(chipset) {}
   Chipset chipset;
 
   LogicalResult
-  matchAndRewrite(ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
+  matchAndRewrite(ScaledExtPackedMatrixOp op,
+                  ScaledExtPackedMatrixOpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override;
 };
 
@@ -1627,34 +1628,35 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
   return success();
 }
 
-int32_t getScaleSel(int32_t blockSize, unsigned bitWidth,
-                    int32_t firstScaleLane, int32_t firstScaleByte) {
-  // When lowering amdgpu.scaled_ext_packed816 to rocdl.cvt.scale.pk*.f*.f*
-  // operations, the attributes blockSize, sourceType, firstScaleLane and
+int32_t getScaleSel(int32_t blockSize, unsigned bitWidth, int32_t scaleWaveHalf,
+                    int32_t firstScaleByte) {
+  // When lowering amdgpu.scaled_ext_packed_matrix to rocdl.cvt.scale.pk*.f*.f*
+  // operations, the attributes blockSize, sourceType, scaleWaveHalf, and
   // firstScaleByte are merged into a single attribute scaleSel. This is how
-  // those values are merged together.
+  // those values are merged together. (Note: scaleWaveHalf isn't a high-level
+  // attribute but is derived from firstScaleLane.)
   assert(llvm::is_contained({16, 32}, blockSize));
   assert(llvm::is_contained(llvm::ArrayRef{4, 6, 8}, bitWidth));
 
-  const bool is_fp8 = bitWidth == 8;
-  const bool is_block_16 = blockSize == 16;
+  const bool isFp8 = bitWidth == 8;
+  const bool isBlock16 = blockSize == 16;
 
-  if (!is_fp8) {
-    int bit_0 = is_block_16;
+  if (!isFp8) {
+    int32_t bit0 = isBlock16;
     assert(llvm::is_contained({0, 1, 2}, firstScaleByte));
-    int bit_1 = (firstScaleByte == 2) << 1;
+    int32_t bit1 = (firstScaleByte == 2) << 1;
     assert(llvm::is_contained({0, 1}, firstScaleLane));
-    int bit_2 = firstScaleLane << 2;
-    return bit_2 | bit_1 | bit_0;
+    int32_t bit2 = scaleWaveHalf << 2;
+    return bit2 | bit1 | bit0;
   }
 
-  int bit_0 = is_block_16;
+  int32_t bit0 = isBlock16;
   // firstScaleByte is guaranteed to be defined by two bits.
   assert(llvm::is_contained({0, 1, 2, 3}, firstScaleByte));
-  int bit_2_and_1 = firstScaleByte << 1;
+  int32_t bits2and1 = firstScaleByte << 1;
   assert(llvm::is_contained({0, 1}, firstScaleLane));
-  int bit_3 = firstScaleLane << 3;
-  int bits = bit_3 | bit_2_and_1 | bit_0;
+  int32_t bit3 = scaleWaveHalf << 3;
+  int32_t bits = bit3 | bits2and1 | bit0;
   // These are invalid cases.
   assert(!llvm::is_contained(
       {0b0011, 0b0101, 0b0111, 0b1000, 0b1001, 0b1011, 0b1111}, bits));
@@ -1717,8 +1719,8 @@ scaledExtPacked816ToIntrinsic(Type srcElemType, Type destElemType) {
       "instructions");
 }
 
-LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
-    ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
+LogicalResult ScaledExtPackedMatrixOpLowering::matchAndRewrite(
+    ScaledExtPackedMatrixOp op, ScaledExtPackedMatrixOpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   using fp4 = Float4E2M1FNType;
   using fp8 = Float8E4M3FNType;
@@ -1732,7 +1734,9 @@ LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
         "Scaled fp packed conversion instructions are not available on target "
         "architecture and their emulation is not implemented");
   }
-  int32_t firstScaleLane = op.getFirstScaleLane();
+  // Convert user-facing firstScaleLane (0 or 16) to the half of the wave that
+  // is being selected.
+  int32_t scaleWaveHalf = op.getFirstScaleLane() / 16;
   int32_t firstScaleByte = op.getFirstScaleByte();
   int32_t blockSize = op.getBlockSize();
   auto sourceType = cast<VectorType>(op.getSource().getType());
@@ -1770,7 +1774,7 @@ LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
       "no intrinsic matching packed scaled conversion on the given chipset");
 
   int32_t scaleSel =
-      getScaleSel(blockSize, bitWidth, firstScaleLane, firstScaleByte);
+      getScaleSel(blockSize, bitWidth, scaleWaveHalf, firstScaleByte);
   Value castedScale =
       LLVM::BitcastOp::create(rewriter, loc, i32, adaptor.getScale());
   Value castedSource =
@@ -2388,27 +2392,26 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                                    RewritePatternSet &patterns,
                                                    Chipset chipset) {
   populateAMDGPUMemorySpaceAttributeConversions(converter);
-  patterns
-      .add<FatRawBufferCastLowering,
-           RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
-           RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
-           RawBufferOpLowering<RawBufferAtomicFaddOp,
-                               ROCDL::RawPtrBufferAtomicFaddOp>,
-           RawBufferOpLowering<RawBufferAtomicFmaxOp,
-                               ROCDL::RawPtrBufferAtomicFmaxOp>,
-           RawBufferOpLowering<RawBufferAtomicSmaxOp,
-                               ROCDL::RawPtrBufferAtomicSmaxOp>,
-           RawBufferOpLowering<RawBufferAtomicUminOp,
-                               ROCDL::RawPtrBufferAtomicUminOp>,
-           RawBufferOpLowering<RawBufferAtomicCmpswapOp,
-                               ROCDL::RawPtrBufferAtomicCmpSwapOp>,
-           AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
-           SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
-           WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering,
-           ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
-           PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
-           GatherToLDSOpLowering, TransposeLoadOpLowering,
-           AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering>(converter,
-                                                              chipset);
+  patterns.add<
+      FatRawBufferCastLowering,
+      RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
+      RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
+      RawBufferOpLowering<RawBufferAtomicFaddOp,
+                          ROCDL::RawPtrBufferAtomicFaddOp>,
+      RawBufferOpLowering<RawBufferAtomicFmaxOp,
+                          ROCDL::RawPtrBufferAtomicFmaxOp>,
+      RawBufferOpLowering<RawBufferAtomicSmaxOp,
+                          ROCDL::RawPtrBufferAtomicSmaxOp>,
+      RawBufferOpLowering<RawBufferAtomicUminOp,
+                          ROCDL::RawPtrBufferAtomicUminOp>,
+      RawBufferOpLowering<RawBufferAtomicCmpswapOp,
+                          ROCDL::RawPtrBufferAtomicCmpSwapOp>,
+      AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
+      SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
+      WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedMatrixOpLowering,
+      ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
+      PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
+      GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering,
+      AMDGPUMakeDmaBaseLowering>(converter, chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
 }
 
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 8b58c3b1dd182..f78eca621da52 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -343,9 +343,9 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
 }
 
 //===----------------------------------------------------------------------===//
-// ScaledExtPacked816Op
+// ScaledExtPackedMatrixOp
//===----------------------------------------------------------------------===// -LogicalResult ScaledExtPacked816Op::verify() { +LogicalResult ScaledExtPackedMatrixOp::verify() { int blockSize = getBlockSize(); assert(llvm::is_contained({16, 32}, blockSize) && "invalid block size"); @@ -376,10 +376,10 @@ LogicalResult ScaledExtPacked816Op::verify() { } else { if (is_block_16) { bool is_valid = ((firstScaleLane == 0) && (firstScaleByte == 0)) || - ((firstScaleLane == 1) && (firstScaleByte == 2)); + ((firstScaleLane == 16) && (firstScaleByte == 2)); if (!is_valid) { return emitOpError("blockSize of 16 can only have (firstScaleLane, " - "firstScaleByte) be (0, 0) or (1, 2) for f8."); + "firstScaleByte) be (0, 0) or (16, 2) for f8."); } } } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index 27daea58f8f92..d0ec69d6fea6e 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -1,165 +1,165 @@ // RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --split-input-file --verify-diagnostics \ // RUN: | FileCheck %s -// CHECK-LABEL: @scaled_ext_packed816_fp4 +// CHECK-LABEL: @scaled_ext_packed_matrix_fp4 // CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf4E2M1FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) -func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { +func.func @scaled_ext_packed_matrix_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> // CHECK: %[[SOURCE_8xi4:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf4E2M1FN> to vector<8xi4> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 // CHECK: rocdl.cvt.scale.pk8.f16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf16> - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 // CHECK: rocdl.cvt.scale.pk8.bf16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xbf16> - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 // CHECK: rocdl.cvt.scale.pk8.f32.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf32> - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : 
vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf32> func.return %ret0, %ret1, %ret2: vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: @scaled_ext_packed816_fp8 +// CHECK-LABEL: @scaled_ext_packed_matrix_fp8 // CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E4M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) -func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { +func.func @scaled_ext_packed_matrix_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E4M3FN> to vector<8xi8> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> // CHECK: rocdl.cvt.scale.pk8.f16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16> - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> // CHECK: rocdl.cvt.scale.pk8.bf16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16> - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> // CHECK: rocdl.cvt.scale.pk8.f32.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32> - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: @scaled_ext_packed816_bf8 +// CHECK-LABEL: @scaled_ext_packed_matrix_bf8 // CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E5M2>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) -func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { +func.func @scaled_ext_packed_matrix_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E5M2> to vector<8xi8> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast 
%[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> // CHECK: %[[RES:.+]] = rocdl.cvt.scale.pk8.f16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16> - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> // CHECK: rocdl.cvt.scale.pk8.bf16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16> - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> // CHECK: rocdl.cvt.scale.pk8.f32.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32> - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: @scaled_ext_packed816_fp6 +// CHECK-LABEL: @scaled_ext_packed_matrix_fp6 // CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E2M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) -func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { +func.func @scaled_ext_packed_matrix_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E2M3FN> to vector<16xi6> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> // CHECK: rocdl.cvt.scale.pk16.f16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16> - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> // CHECK: rocdl.cvt.scale.pk16.bf16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16> - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + %ret1 = 
amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> // CHECK: rocdl.cvt.scale.pk16.f32.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32> - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32> return %ret0, %ret1, %ret2: vector<16xf16>, vector<16xbf16>, vector<16xf32> } -// CHECK-LABEL: @scaled_ext_packed816_bf6 +// CHECK-LABEL: @scaled_ext_packed_matrix_bf6 // CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E3M2FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) -func.func @scaled_ext_packed816_bf6(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { +func.func @scaled_ext_packed_matrix_bf6(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E3M2FN> to vector<16xi6> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> // CHECK: rocdl.cvt.scale.pk16.f16.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16> - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> // CHECK: rocdl.cvt.scale.pk16.bf16.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16> - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + %ret1 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> // CHECK: rocdl.cvt.scale.pk16.f32.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32> - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32> return %ret0, %ret1, %ret2: vector<16xf16>, vector<16xbf16>, vector<16xf32> } // ----- -func.func 
@amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> +func.func @amdgpu.scaled_ext_packed_matrix_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed_matrix' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}} + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> func.return } // ----- -func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> +func.func @amdgpu.scaled_ext_packed_matrix_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed_matrix' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}} + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> func.return } // ----- -func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have (firstScaleLane, firstScaleByte) be (0, 0) or (1, 2) for f8.}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> +func.func @amdgpu.scaled_ext_packed_matrix_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed_matrix' op blockSize of 16 can only have (firstScaleLane, firstScaleByte) be (0, 0) or (16, 2) for f8.}} + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> func.return } // ----- -func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op failed to verify that all of {source, res} have same shape}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16> +func.func @amdgpu.scaled_ext_packed_matrix_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed_matrix' op failed to verify that all of {source, res} have same shape}} + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16> func.return } // ----- -func.func 
@amdgpu.scaled_ext_packed816_invalid_src_elem_type(%v: vector<16xf16>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op operand #0 must be}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf16>, vector<4xf8E8M0FNU> -> vector<16xf16> +func.func @amdgpu.scaled_ext_packed_matrix_invalid_src_elem_type(%v: vector<16xf16>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed_matrix' op operand #0 must be}} + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf16>, vector<4xf8E8M0FNU> -> vector<16xf16> return %ret0: vector<16xf16> } // ----- -func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf64>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op result #0 must be vector}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf64> +func.func @amdgpu.scaled_ext_packed_matrix_invalid_dst_elem_type(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf64>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed_matrix' op result #0 must be vector}} + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf64> return %ret0: vector<16xf64> } diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 3260bd4a8df9a..0eccd0a7430bc 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -221,58 +221,58 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> func.return %ret : vector<2xbf16> } -// CHECK-LABEL: func.func @scaled_ext_packed816_fp4 -func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed816 - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf32> +// CHECK-LABEL: func.func @scaled_ext_packed_matrix_fp4 +func.func @scaled_ext_packed_matrix_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret1 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : 
vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed816_fp8 -func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed816 - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32> +// CHECK-LABEL: func.func @scaled_ext_packed_matrix_fp8 +func.func @scaled_ext_packed_matrix_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret1 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed816_bf8 -func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed816 - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32> +// CHECK-LABEL: func.func @scaled_ext_packed_matrix_bf8 +func.func @scaled_ext_packed_matrix_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret1 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, 
vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed816_fp6 -func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { - // CHECK: amdgpu.scaled_ext_packed816 - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32> +// CHECK-LABEL: func.func @scaled_ext_packed_matrix_fp6 +func.func @scaled_ext_packed_matrix_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret1 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32> func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed816_bf16 -func.func @scaled_ext_packed816_bf16(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { - // CHECK: amdgpu.scaled_ext_packed816 - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> - // CHECK: amdgpu.scaled_ext_packed816 - %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32> +// CHECK-LABEL: func.func @scaled_ext_packed_matrix_bf6 +func.func @scaled_ext_packed_matrix_bf6(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret0 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret1 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed_matrix + %ret2 = amdgpu.scaled_ext_packed_matrix %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32> func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> } From 
63ece8dad873a4a61abed5d4dc3144ea54e57833 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Thu, 4 Dec 2025 11:39:30 -0800
Subject: [PATCH 2/2] Fix the "rename" button missing a spot

Co-authored-by: Erick Ochoa Lopez
---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 26175b86bf262..a85973c2493ee 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1645,7 +1645,7 @@ int32_t getScaleSel(int32_t blockSize, unsigned bitWidth, int32_t scaleWaveHalf,
   int32_t bit0 = isBlock16;
   assert(llvm::is_contained({0, 1, 2}, firstScaleByte));
   int32_t bit1 = (firstScaleByte == 2) << 1;
-  assert(llvm::is_contained({0, 1}, firstScaleLane));
+  assert(llvm::is_contained({0, 1}, scaleWaveHalf));
   int32_t bit2 = scaleWaveHalf << 2;
   return bit2 | bit1 | bit0;
 }
@@ -1654,7 +1654,7 @@ int32_t getScaleSel(int32_t blockSize, unsigned bitWidth, int32_t scaleWaveHalf,
   // firstScaleByte is guaranteed to be defined by two bits.
   assert(llvm::is_contained({0, 1, 2, 3}, firstScaleByte));
   int32_t bits2and1 = firstScaleByte << 1;
-  assert(llvm::is_contained({0, 1}, firstScaleLane));
+  assert(llvm::is_contained({0, 1}, scaleWaveHalf));
   int32_t bit3 = scaleWaveHalf << 3;
   int32_t bits = bit3 | bits2and1 | bit0;
   // These are invalid cases.
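[Editor's note, not part of either patch: worked examples of the scaleSel bit packing, computed by hand from the final getScaleSel above. `scaleSelReference` is a hypothetical standalone restatement of that packing, written only to make the bit layout concrete.]

```cpp
#include <cassert>

// Restates getScaleSel's packing as it stands after PATCH 2/2:
//   fp4/fp6: (scaleWaveHalf << 2) | ((firstScaleByte == 2) << 1) | isBlock16
//   fp8:     (scaleWaveHalf << 3) | (firstScaleByte << 1) | isBlock16
int scaleSelReference(int blockSize, int bitWidth, int scaleWaveHalf,
                      int firstScaleByte) {
  int bit0 = blockSize == 16;
  if (bitWidth != 8)
    return (scaleWaveHalf << 2) | ((firstScaleByte == 2) << 1) | bit0;
  return (scaleWaveHalf << 3) | (firstScaleByte << 1) | bit0;
}

int main() {
  // fp4, blockSize 32, scales on lanes 0-15 (firstScaleLane(0)), byte 0.
  assert(scaleSelReference(32, 4, 0, 0) == 0b0000);
  // fp6, blockSize 16, scales on lanes 16-31 (firstScaleLane(16)), byte 2.
  assert(scaleSelReference(16, 6, 1, 2) == 0b0111);
  // fp8, blockSize 32, scales on lanes 16-31 (firstScaleLane(16)), byte 2.
  assert(scaleSelReference(32, 8, 1, 2) == 0b1100);
  return 0;
}
```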