From f92db34c23bcade79c941bcc8f58605ea2ab6c10 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Fri, 19 Sep 2025 14:33:14 -0400 Subject: [PATCH 1/4] [mlir][amdgpu] Add scaled_ext_packed{8,16} operations --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 42 ++++++++++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 55 +++++++++++++++++++ 2 files changed, 97 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index a24a918357f2d..d5ea737e229ff 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -112,6 +112,48 @@ def AMDGPU_ExtPackedFp8Op : }]; } +def AMDGPU_ScaledExtPacked8Op + : AMDGPU_Op<"scaled_ext_packed8", [Pure]>, + Arguments<( + ins VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>:$source, + F32:$scale, + ConfinedAttr]>:$index)>, + Results<( + outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>, + FixedVectorOfLengthAndType<[8], [F16]>, + FixedVectorOfLengthAndType<[8], [BF16]>]>:$res)> { + let summary = "Extend a vector of packed floating point values"; + + let description = [{ + Extend and scale eight packed floats in to eight floats and return them. + }]; + + let assemblyFormat = [{ + attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res) + }]; +} + +def AMDGPU_ScaledExtPacked16Op + : AMDGPU_Op<"scaled_ext_packed16", [Pure]>, + Arguments<( + ins VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>:$source, + F32:$scale, + ConfinedAttr]>:$index)>, + Results<( + outs AnyTypeOf<[FixedVectorOfLengthAndType<[16], [F32]>, + FixedVectorOfLengthAndType<[16], [F16]>, + FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> { + let summary = "Extend a vector of packed floating point values"; + + let description = [{ + Extend and scale 16 packed floats to 16 floats and return them. 
+ }]; + + let assemblyFormat = [{ + attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res) + }]; +} + def AMDGPU_ScaledExtPackedOp : AMDGPU_Op<"scaled_ext_packed", [Pure]>, Arguments<( diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 369e0fff538e1..1841c0815b435 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -221,6 +221,61 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> func.return %ret : vector<2xbf16> } +// CHECK-LABEL: func.func @scaled_ext_packed8_fp4 +func.func @scaled_ext_packed8_fp4(%v: vector<8xf4E2M1FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed8 + %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32> + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: func.func @scaled_ext_packed8_fp8 +func.func @scaled_ext_packed8_fp8(%v: vector<8xf8E4M3FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed8 + %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32> + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: func.func @scaled_ext_packed8_bf8 +func.func @scaled_ext_packed8_bf8(%v: vector<8xf8E5M2>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, 
vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed8 + %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32> + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: func.func @scaled_ext_packed16_fp6 +func.func @scaled_ext_packed16_fp6(%v: vector<16xf6E2M3FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK: amdgpu.scaled_ext_packed16 + %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed16 + %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed16 + %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32> + func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + +// CHECK-LABEL: func.func @scaled_ext_packed16_bf16 +func.func @scaled_ext_packed16_bf16(%v: vector<16xf6E3M2FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK: amdgpu.scaled_ext_packed16 + %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed16 + %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed16 + %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32> + func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + // CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32 // CHECK: amdgpu.packed_scaled_trunc func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> { From 
c3832b09bed55cbcaa40d6643ce56f0398ddcab4 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Fri, 19 Sep 2025 15:30:20 -0400 Subject: [PATCH 2/4] [mlir][amdgpu] Use existing scaled_ext_packed instead of new ops --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 62 +++++------------ .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 18 ++--- .../ArithToAMDGPU/ArithToAMDGPU.cpp | 5 +- mlir/test/Dialect/AMDGPU/ops.mlir | 66 +++++++++---------- 4 files changed, 60 insertions(+), 91 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index d5ea737e229ff..724391dbd6b94 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -112,60 +112,27 @@ def AMDGPU_ExtPackedFp8Op : }]; } -def AMDGPU_ScaledExtPacked8Op - : AMDGPU_Op<"scaled_ext_packed8", [Pure]>, - Arguments<( - ins VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>:$source, - F32:$scale, - ConfinedAttr]>:$index)>, - Results<( - outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>, - FixedVectorOfLengthAndType<[8], [F16]>, - FixedVectorOfLengthAndType<[8], [BF16]>]>:$res)> { - let summary = "Extend a vector of packed floating point values"; - - let description = [{ - Extend and scale eight packed floats in to eight floats and return them. - }]; - - let assemblyFormat = [{ - attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res) - }]; -} - -def AMDGPU_ScaledExtPacked16Op - : AMDGPU_Op<"scaled_ext_packed16", [Pure]>, - Arguments<( - ins VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>:$source, - F32:$scale, - ConfinedAttr]>:$index)>, - Results<( - outs AnyTypeOf<[FixedVectorOfLengthAndType<[16], [F32]>, - FixedVectorOfLengthAndType<[16], [F16]>, - FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> { - let summary = "Extend a vector of packed floating point values"; - - let description = [{ - Extend and scale 16 packed floats to 16 floats and return them. 
- }]; - - let assemblyFormat = [{ - attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res) - }]; -} - def AMDGPU_ScaledExtPackedOp : AMDGPU_Op<"scaled_ext_packed", [Pure]>, Arguments<( ins AnyTypeOf<[VectorOfLengthAndType<[1, 2, 3, 4], [F8E5M2, F8E4M3FN]>, - VectorOfLengthAndType<[1, 2, 3, 4, 5, 6, 7, 8], - [F4E2M1FN]>]>:$source, + VectorOfLengthAndType<[1, 2, 3, 4, 5, 6, 7, 8],[F4E2M1FN]>, + VectorOfLengthAndType<[8],[F4E2M1FN, F8E4M3FN, F8E5M2]>, + VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>]>:$source, F32:$scale, - ConfinedAttr]>:$index)>, + OptionalAttr]>>:$index, + OptionalAttr]>>:$scaleSel)>, Results<( outs AnyTypeOf<[FixedVectorOfLengthAndType<[2], [F32]>, FixedVectorOfLengthAndType<[2], [F16]>, - FixedVectorOfLengthAndType<[2], [BF16]>]>:$res)> { + FixedVectorOfLengthAndType<[2], [BF16]>, + FixedVectorOfLengthAndType<[8], [F32]>, + FixedVectorOfLengthAndType<[8], [F16]>, + FixedVectorOfLengthAndType<[8], [BF16]>, + FixedVectorOfLengthAndType<[16], [F32]>, + FixedVectorOfLengthAndType<[16], [F16]>, + FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> { + let summary = "Extend a vector of packed floating point values"; let description = [{ @@ -181,8 +148,9 @@ def AMDGPU_ScaledExtPackedOp the remaining values in the <2 x i8> will be filled with undefined values as needed. }]; + let assemblyFormat = [{ - attr-dict $source `[` $index `]` `,` $scale `:` type($source) `to` type($res) + attr-dict $source ( `[` $index^ `]` )? `,` $scale ( `[` $scaleSel^ `]` )? 
`:` type($source) `to` type($res) }]; } diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 0078eed8b7a67..e7d780b6a18b7 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -1510,31 +1510,31 @@ LogicalResult ScaledExtPackedOpLowering::matchAndRewrite( if (isa(sourceElemType) && destElemType.isF32()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, op.getIndex()); + op, destVecType, i32Source, scale, *op.getIndex()); else if (isa(sourceElemType) && destElemType.isF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, op.getIndex()); + op, destVecType, i32Source, scale, *op.getIndex()); else if (isa(sourceElemType) && destElemType.isBF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, op.getIndex()); + op, destVecType, i32Source, scale, *op.getIndex()); else if (isa(sourceElemType) && destElemType.isF32()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, op.getIndex()); + op, destVecType, i32Source, scale, *op.getIndex()); else if (isa(sourceElemType) && destElemType.isF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, op.getIndex()); + op, destVecType, i32Source, scale, *op.getIndex()); else if (isa(sourceElemType) && destElemType.isBF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, op.getIndex()); + op, destVecType, i32Source, scale, *op.getIndex()); else if (isa(sourceElemType) && destElemType.isF32()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, op.getIndex()); + op, destVecType, i32Source, scale, *op.getIndex()); else if (isa(sourceElemType) && destElemType.isF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, op.getIndex()); + op, destVecType, i32Source, scale, *op.getIndex()); else if (isa(sourceElemType) && destElemType.isBF16()) rewriter.replaceOpWithNewOp( 
- op, destVecType, i32Source, scale, op.getIndex()); + op, destVecType, i32Source, scale, *op.getIndex()); else return failure(); diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp index 3d6f6cab42244..b3c30e6814a22 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp +++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp @@ -482,7 +482,8 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op, VectorType::get(1, inType), in); // TODO: replace this with non-packed ScaledExtOp Value scaleExt = amdgpu::ScaledExtPackedOp::create( - rewriter, loc, extScaleResultType, inCast, scale, 0); + rewriter, loc, extScaleResultType, inCast, scale, + rewriter.getI32IntegerAttr(0), nullptr); scaleExt = rewriter.replaceOpWithNewOp(op, scaleExt, 0); return success(); } @@ -539,7 +540,7 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op, // TODO: replace this with non-packed ScaledExtOp for sliceWidth == 1 Value scaleExt = amdgpu::ScaledExtPackedOp::create( rewriter, loc, extScaleResultType, inSlice, uniformScale, - j / opOutWidth); + rewriter.getI32IntegerAttr(j / opOutWidth), nullptr); if (outSliceWidth < opOutWidth) { scaleExt = vector::ExtractStridedSliceOp::create( rewriter, loc, scaleExt, 0, outSliceWidth, 1); diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 1841c0815b435..3ac02069e2dbc 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -221,58 +221,58 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> func.return %ret : vector<2xbf16> } -// CHECK-LABEL: func.func @scaled_ext_packed8_fp4 +// CHECK-LABEL: func.func @scaled_ext_packed8_fp func.func @scaled_ext_packed8_fp4(%v: vector<8xf4E2M1FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed8 - %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : 
vector<8xf4E2M1FN> to vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32> + // CHECK: amdgpu.scaled_ext_packed + %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed + %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed + %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed8_fp8 +// CHECK-LABEL: func.func @scaled_ext_packed8_fp func.func @scaled_ext_packed8_fp8(%v: vector<8xf8E4M3FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed8 - %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32> + // CHECK: amdgpu.scaled_ext_packed + %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed + %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed + %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed8_bf8 +// CHECK-LABEL: func.func @scaled_ext_packed8_bf func.func @scaled_ext_packed8_bf8(%v: vector<8xf8E5M2>, %scale: f32) -> (vector<8xf16>, 
vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed8 - %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32> + // CHECK: amdgpu.scaled_ext_packed + %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed + %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed + %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } // CHECK-LABEL: func.func @scaled_ext_packed16_fp6 func.func @scaled_ext_packed16_fp6(%v: vector<16xf6E2M3FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { - // CHECK: amdgpu.scaled_ext_packed16 - %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16> - // CHECK: amdgpu.scaled_ext_packed16 - %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16> - // CHECK: amdgpu.scaled_ext_packed16 - %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32> + // CHECK: amdgpu.scaled_ext_packed + %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed + %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed + %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32> func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> } // CHECK-LABEL: func.func @scaled_ext_packed16_bf16 func.func @scaled_ext_packed16_bf16(%v: vector<16xf6E3M2FN>, 
%scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { - // CHECK: amdgpu.scaled_ext_packed16 - %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16> - // CHECK: amdgpu.scaled_ext_packed16 - %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16> - // CHECK: amdgpu.scaled_ext_packed16 - %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32> + // CHECK: amdgpu.scaled_ext_packed + %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed + %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed + %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32> func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> } From d34b02df979ab9a2531ff2be1944fcf15678a882 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Fri, 19 Sep 2025 15:45:56 -0400 Subject: [PATCH 3/4] Revert "[mlir][amdgpu] Use existing scaled_ext_packed instead of new ops" This reverts commit c3832b09bed55cbcaa40d6643ce56f0398ddcab4. 
--- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 62 ++++++++++++----- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 18 ++--- .../ArithToAMDGPU/ArithToAMDGPU.cpp | 5 +- mlir/test/Dialect/AMDGPU/ops.mlir | 66 +++++++++---------- 4 files changed, 91 insertions(+), 60 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 724391dbd6b94..d5ea737e229ff 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -112,27 +112,60 @@ def AMDGPU_ExtPackedFp8Op : }]; } +def AMDGPU_ScaledExtPacked8Op + : AMDGPU_Op<"scaled_ext_packed8", [Pure]>, + Arguments<( + ins VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>:$source, + F32:$scale, + ConfinedAttr]>:$index)>, + Results<( + outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>, + FixedVectorOfLengthAndType<[8], [F16]>, + FixedVectorOfLengthAndType<[8], [BF16]>]>:$res)> { + let summary = "Extend a vector of packed floating point values"; + + let description = [{ + Extend and scale eight packed floats in to eight floats and return them. + }]; + + let assemblyFormat = [{ + attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res) + }]; +} + +def AMDGPU_ScaledExtPacked16Op + : AMDGPU_Op<"scaled_ext_packed16", [Pure]>, + Arguments<( + ins VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>:$source, + F32:$scale, + ConfinedAttr]>:$index)>, + Results<( + outs AnyTypeOf<[FixedVectorOfLengthAndType<[16], [F32]>, + FixedVectorOfLengthAndType<[16], [F16]>, + FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> { + let summary = "Extend a vector of packed floating point values"; + + let description = [{ + Extend and scale 16 packed floats to 16 floats and return them. 
+ }]; + + let assemblyFormat = [{ + attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res) + }]; +} + def AMDGPU_ScaledExtPackedOp : AMDGPU_Op<"scaled_ext_packed", [Pure]>, Arguments<( ins AnyTypeOf<[VectorOfLengthAndType<[1, 2, 3, 4], [F8E5M2, F8E4M3FN]>, - VectorOfLengthAndType<[1, 2, 3, 4, 5, 6, 7, 8],[F4E2M1FN]>, - VectorOfLengthAndType<[8],[F4E2M1FN, F8E4M3FN, F8E5M2]>, - VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>]>:$source, + VectorOfLengthAndType<[1, 2, 3, 4, 5, 6, 7, 8], + [F4E2M1FN]>]>:$source, F32:$scale, - OptionalAttr]>>:$index, - OptionalAttr]>>:$scaleSel)>, + ConfinedAttr]>:$index)>, Results<( outs AnyTypeOf<[FixedVectorOfLengthAndType<[2], [F32]>, FixedVectorOfLengthAndType<[2], [F16]>, - FixedVectorOfLengthAndType<[2], [BF16]>, - FixedVectorOfLengthAndType<[8], [F32]>, - FixedVectorOfLengthAndType<[8], [F16]>, - FixedVectorOfLengthAndType<[8], [BF16]>, - FixedVectorOfLengthAndType<[16], [F32]>, - FixedVectorOfLengthAndType<[16], [F16]>, - FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> { - + FixedVectorOfLengthAndType<[2], [BF16]>]>:$res)> { let summary = "Extend a vector of packed floating point values"; let description = [{ @@ -148,9 +181,8 @@ def AMDGPU_ScaledExtPackedOp the remaining values in the <2 x i8> will be filled with undefined values as needed. }]; - let assemblyFormat = [{ - attr-dict $source ( `[` $index^ `]` )? `,` $scale ( `[` $scaleSel^ `]` )? 
`:` type($source) `to` type($res) + attr-dict $source `[` $index `]` `,` $scale `:` type($source) `to` type($res) }]; } diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index e7d780b6a18b7..0078eed8b7a67 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -1510,31 +1510,31 @@ LogicalResult ScaledExtPackedOpLowering::matchAndRewrite( if (isa(sourceElemType) && destElemType.isF32()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, *op.getIndex()); + op, destVecType, i32Source, scale, op.getIndex()); else if (isa(sourceElemType) && destElemType.isF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, *op.getIndex()); + op, destVecType, i32Source, scale, op.getIndex()); else if (isa(sourceElemType) && destElemType.isBF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, *op.getIndex()); + op, destVecType, i32Source, scale, op.getIndex()); else if (isa(sourceElemType) && destElemType.isF32()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, *op.getIndex()); + op, destVecType, i32Source, scale, op.getIndex()); else if (isa(sourceElemType) && destElemType.isF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, *op.getIndex()); + op, destVecType, i32Source, scale, op.getIndex()); else if (isa(sourceElemType) && destElemType.isBF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, *op.getIndex()); + op, destVecType, i32Source, scale, op.getIndex()); else if (isa(sourceElemType) && destElemType.isF32()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, *op.getIndex()); + op, destVecType, i32Source, scale, op.getIndex()); else if (isa(sourceElemType) && destElemType.isF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, *op.getIndex()); + op, destVecType, i32Source, scale, op.getIndex()); 
else if (isa(sourceElemType) && destElemType.isBF16()) rewriter.replaceOpWithNewOp( - op, destVecType, i32Source, scale, *op.getIndex()); + op, destVecType, i32Source, scale, op.getIndex()); else return failure(); diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp index b3c30e6814a22..3d6f6cab42244 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp +++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp @@ -482,8 +482,7 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op, VectorType::get(1, inType), in); // TODO: replace this with non-packed ScaledExtOp Value scaleExt = amdgpu::ScaledExtPackedOp::create( - rewriter, loc, extScaleResultType, inCast, scale, - rewriter.getI32IntegerAttr(0), nullptr); + rewriter, loc, extScaleResultType, inCast, scale, 0); scaleExt = rewriter.replaceOpWithNewOp(op, scaleExt, 0); return success(); } @@ -540,7 +539,7 @@ ScalingExtFRewritePattern::matchAndRewrite(arith::ScalingExtFOp op, // TODO: replace this with non-packed ScaledExtOp for sliceWidth == 1 Value scaleExt = amdgpu::ScaledExtPackedOp::create( rewriter, loc, extScaleResultType, inSlice, uniformScale, - rewriter.getI32IntegerAttr(j / opOutWidth), nullptr); + j / opOutWidth); if (outSliceWidth < opOutWidth) { scaleExt = vector::ExtractStridedSliceOp::create( rewriter, loc, scaleExt, 0, outSliceWidth, 1); diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 3ac02069e2dbc..1841c0815b435 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -221,58 +221,58 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> func.return %ret : vector<2xbf16> } -// CHECK-LABEL: func.func @scaled_ext_packed8_fp +// CHECK-LABEL: func.func @scaled_ext_packed8_fp4 func.func @scaled_ext_packed8_fp4(%v: vector<8xf4E2M1FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: 
amdgpu.scaled_ext_packed - %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed - %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed - %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32> + // CHECK: amdgpu.scaled_ext_packed8 + %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed8_fp +// CHECK-LABEL: func.func @scaled_ext_packed8_fp8 func.func @scaled_ext_packed8_fp8(%v: vector<8xf8E4M3FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed - %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed - %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed - %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32> + // CHECK: amdgpu.scaled_ext_packed8 + %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed8_bf +// CHECK-LABEL: func.func @scaled_ext_packed8_bf8 func.func 
@scaled_ext_packed8_bf8(%v: vector<8xf8E5M2>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed - %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed - %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed - %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32> + // CHECK: amdgpu.scaled_ext_packed8 + %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed8 + %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } // CHECK-LABEL: func.func @scaled_ext_packed16_fp6 func.func @scaled_ext_packed16_fp6(%v: vector<16xf6E2M3FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { - // CHECK: amdgpu.scaled_ext_packed - %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16> - // CHECK: amdgpu.scaled_ext_packed - %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16> - // CHECK: amdgpu.scaled_ext_packed - %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32> + // CHECK: amdgpu.scaled_ext_packed16 + %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed16 + %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed16 + %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32> func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> } // CHECK-LABEL: func.func 
@scaled_ext_packed16_bf16 func.func @scaled_ext_packed16_bf16(%v: vector<16xf6E3M2FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { - // CHECK: amdgpu.scaled_ext_packed - %ret0 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16> - // CHECK: amdgpu.scaled_ext_packed - %ret1 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16> - // CHECK: amdgpu.scaled_ext_packed - %ret2 = amdgpu.scaled_ext_packed %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32> + // CHECK: amdgpu.scaled_ext_packed16 + %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed16 + %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed16 + %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32> func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> } From 0d09fc6a218dfcb079648f62a08e4bfba47ba914 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Mon, 22 Sep 2025 16:07:48 -0400 Subject: [PATCH 4/4] merge into a single op --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 32 ++------ mlir/test/Dialect/AMDGPU/ops.mlir | 80 +++++++++---------- 2 files changed, 48 insertions(+), 64 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index d5ea737e229ff..55629a2fd95f1 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -112,41 +112,25 @@ def AMDGPU_ExtPackedFp8Op : }]; } -def AMDGPU_ScaledExtPacked8Op - : AMDGPU_Op<"scaled_ext_packed8", [Pure]>, +def AMDGPU_ScaledExtPacked816Op + : AMDGPU_Op<"scaled_ext_packed816", [Pure]>, Arguments<( - ins VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>:$source, + ins AnyTypeOf<[VectorOfLengthAndType<[8], [F4E2M1FN,F8E4M3FN,F8E5M2]>, + 
VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>]>:$source, F32:$scale, ConfinedAttr]>:$index)>, Results<( outs AnyTypeOf<[FixedVectorOfLengthAndType<[8], [F32]>, FixedVectorOfLengthAndType<[8], [F16]>, - FixedVectorOfLengthAndType<[8], [BF16]>]>:$res)> { - let summary = "Extend a vector of packed floating point values"; - - let description = [{ - Extend and scale eight packed floats in to eight floats and return them. - }]; - - let assemblyFormat = [{ - attr-dict $source `,` $scale `[` $index `]` `:` type($source) `to` type($res) - }]; -} - -def AMDGPU_ScaledExtPacked16Op - : AMDGPU_Op<"scaled_ext_packed16", [Pure]>, - Arguments<( - ins VectorOfLengthAndType<[16], [F6E2M3FN, F6E3M2FN]>:$source, - F32:$scale, - ConfinedAttr]>:$index)>, - Results<( - outs AnyTypeOf<[FixedVectorOfLengthAndType<[16], [F32]>, + FixedVectorOfLengthAndType<[8], [BF16]>, + FixedVectorOfLengthAndType<[16], [F32]>, FixedVectorOfLengthAndType<[16], [F16]>, FixedVectorOfLengthAndType<[16], [BF16]>]>:$res)> { + let summary = "Extend a vector of packed floating point values"; let description = [{ - Extend and scale 16 packed floats to 16 floats and return them. + Extend and scale 8/16 packed floats into 8/16 floats and return them. 
}]; let assemblyFormat = [{ diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 1841c0815b435..de4b9d9431a9e 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -221,58 +221,58 @@ func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> func.return %ret : vector<2xbf16> } -// CHECK-LABEL: func.func @scaled_ext_packed8_fp4 -func.func @scaled_ext_packed8_fp4(%v: vector<8xf4E2M1FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed8 - %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32> +// CHECK-LABEL: func.func @scaled_ext_packed816_fp4 +func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<8xf4E2M1FN> to vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed8_fp8 -func.func @scaled_ext_packed8_fp8(%v: vector<8xf8E4M3FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed8 - %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to 
vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32> +// CHECK-LABEL: func.func @scaled_ext_packed816_fp8 +func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<8xf8E4M3FN> to vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed8_bf8 -func.func @scaled_ext_packed8_bf8(%v: vector<8xf8E5M2>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { - // CHECK: amdgpu.scaled_ext_packed8 - %ret0 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret1 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16> - // CHECK: amdgpu.scaled_ext_packed8 - %ret2 = amdgpu.scaled_ext_packed8 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32> +// CHECK-LABEL: func.func @scaled_ext_packed816_bf8 +func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: f32) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<8xf8E5M2> to vector<8xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<8xf8E5M2> to vector<8xf32> func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> } -// CHECK-LABEL: 
func.func @scaled_ext_packed16_fp6 -func.func @scaled_ext_packed16_fp6(%v: vector<16xf6E2M3FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { - // CHECK: amdgpu.scaled_ext_packed16 - %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16> - // CHECK: amdgpu.scaled_ext_packed16 - %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16> - // CHECK: amdgpu.scaled_ext_packed16 - %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32> +// CHECK-LABEL: func.func @scaled_ext_packed816_fp6 +func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK: amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<16xf6E2M3FN> to vector<16xf32> func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> } -// CHECK-LABEL: func.func @scaled_ext_packed16_bf16 -func.func @scaled_ext_packed16_bf16(%v: vector<16xf6E3M2FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { - // CHECK: amdgpu.scaled_ext_packed16 - %ret0 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16> - // CHECK: amdgpu.scaled_ext_packed16 - %ret1 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16> - // CHECK: amdgpu.scaled_ext_packed16 - %ret2 = amdgpu.scaled_ext_packed16 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32> +// CHECK-LABEL: func.func @scaled_ext_packed816_bf16 +func.func @scaled_ext_packed816_bf16(%v: vector<16xf6E3M2FN>, %scale: f32) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK: 
amdgpu.scaled_ext_packed816 + %ret0 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret1 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xbf16> + // CHECK: amdgpu.scaled_ext_packed816 + %ret2 = amdgpu.scaled_ext_packed816 %v, %scale[0] : vector<16xf6E3M2FN> to vector<16xf32> func.return %ret0, %ret1, %ret2 : vector<16xf16>, vector<16xbf16>, vector<16xf32> }