diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 45cb67f0eee4a..4820b7a747ac2 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPacked816Op FixedVectorOfShapeAndType<[4], F8E8M0FNU>:$scale, ConfinedAttr:$blockSize, ConfinedAttr, IntMaxValue<1>]>:$firstScaleLane, - ConfinedAttr, IntMaxValue<2>]>:$firstScaleByte)>, + ConfinedAttr, IntMaxValue<3>]>:$firstScaleByte)>, Results<( outs AnyTypeOf<[FixedVectorOfShapeAndType<[8], F32>, FixedVectorOfShapeAndType<[8], F16>, @@ -139,17 +139,21 @@ def AMDGPU_ScaledExtPacked816Op let summary = "Extend a vector of packed floating point values"; let description = [{ - The scales applied to the input microfloats are stored in two bytes which + The scales applied to the input microfloats are stored in bytes which come from the `scales` input provided in a *half* of the wave identified - by `firstScaleLane`. The pair of bytes used is selected by - `firstScaleByte`. The 16 vectors in consecutive lanes starting from + by `firstScaleLane`. The bytes used is selected by `firstScaleByte` and depends + on the type of `source`. The 16 vectors in consecutive lanes starting from `firstScaleLane` (which we'll call the scale vectors) will be used by both - halves of the wave (with lane L reading from L % 16'th scale vector), but - each half will use a different byte. + halves of the wave (with lane L reading from L % 16'th scale vector). + + When `source` is either F4E2M1FN, F6E2M3FN, or F6E3M2FN each half of the + wave will use a different byte. The first one being `firstScaleByte` and + the second one being `firstScaleByte` + 1. When the block size is 32, + `firstScaleByte` can be either 0 or 2, selecting halves of the scale vectors. + Lanes 0-15 will read from `firstScaleByte` and lanes 16-31 will read + from `firstScaleByte` + 1. + - When the block size is 32, `firstScaleByte` can be either 0 or 2, - selecting halves of the scale vectors. Lanes 0-15 will read from - `firstScaleByte` and lanes 16-31 will read from `firstScaleByte` + 1. For example: ```mlir // Input: 8-element vector of F8E4M3FN, converting to F32 @@ -165,7 +169,8 @@ def AMDGPU_ScaledExtPacked816Op : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> ``` - However, when the block size is 16, `firstScaleByte` can be 0 or 1. + When `source` is either F4E2M1FN, F6E2M3FN, or F6E3M2FN and + the block size is 16, `firstScaleByte` can be 0 or 1. Lanes 0-15 read from the `firstScaleByte`th element of the scale vectors, while lanes 16-31 read from `firstScaleByte` + 2. For example: @@ -187,6 +192,16 @@ def AMDGPU_ScaledExtPacked816Op instructions use for matix scales. These selection operands allows one to choose portions of the matrix to convert. + When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 32, + then the same byte will be used by both halves of the wave. + In this case, `firstScaleByte` can be any value from 0 to 3. + + When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 16, + following combinations are allowed: + * `firstScaleLane(0), firstScaleByte(0)` + * `firstScaleLane(1), firstScaleByte(2)` + all other combinations are reserved. + Available on gfx1250+. }]; diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index df955fc90b45f..5c35823678576 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -344,14 +344,27 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns( LogicalResult ScaledExtPacked816Op::verify() { int blockSize = getBlockSize(); assert((blockSize == 16 || blockSize == 32) && "invalid block size"); + int firstScaleByte = getFirstScaleByte(); - if (blockSize == 16 && !llvm::is_contained({0, 1}, firstScaleByte)) { - return emitOpError( - "blockSize of 16 can only have firstScaleByte be 0 or 1."); + auto sourceType = cast(getSource().getType()); + Type elementType = sourceType.getElementType(); + auto floatType = cast(elementType); + int bitWidth = floatType.getWidth(); + + if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 16 && + !llvm::is_contained({0, 1}, firstScaleByte)) { + return emitOpError("blockSize of 16 can only have firstScaleByte be 0 or 1 " + "for f4 and f6."); + } + if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 32 && + !llvm::is_contained({0, 2}, firstScaleByte)) { + return emitOpError("blockSize of 32 can only have firstScaleByte be 0 or 2 " + "for f4 and f6."); } - if (blockSize == 32 && !llvm::is_contained({0, 2}, firstScaleByte)) { + if (bitWidth == 8 && blockSize == 16 && + !llvm::is_contained({0, 2}, firstScaleByte)) { return emitOpError( - "blockSize of 32 can only have firstScaleByte be 0 or 2."); + "blockSize of 16 can only have firstScaleByte be 0 or 2 for f8."); } return success(); diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 4c6f62a045405..5c8cc8b67c4b3 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -333,17 +333,25 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : // ----- -func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1.}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> +func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> func.return } // ----- -func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2.}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> +func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> func.return }