Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 16 additions & 17 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,17 @@ def AMDGPU_ExtPackedFp8Op :
}];
}

def IsValidBlockSize: AttrConstraint<
CPred<"::llvm::is_contained({16, 32}, ::llvm::cast<::mlir::IntegerAttr>($_self).getInt())">,
"whose value is 16 or 32">;

def AMDGPU_ScaledExtPacked816Op
: AMDGPU_Op<"scaled_ext_packed816", [Pure, AllShapesMatch<["source", "res"]>]>,
def AMDGPU_ScaledExtPackedMatrixOp
: AMDGPU_Op<"scaled_ext_packed_matrix", [Pure, AllShapesMatch<["source", "res"]>]>,
Arguments<(
ins AnyTypeOf<[FixedVectorOfShapeAndType<[8], F4E2M1FN>,
FixedVectorOfShapeAndType<[8], F8E4M3FN>,
FixedVectorOfShapeAndType<[8], F8E5M2>,
FixedVectorOfShapeAndType<[16], F6E2M3FN>,
FixedVectorOfShapeAndType<[16], F6E3M2FN>]>:$source,
FixedVectorOfShapeAndType<[4], F8E8M0FNU>:$scale,
ConfinedAttr<I32Attr, [IsValidBlockSize]>:$blockSize,
ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1>]>:$firstScaleLane,
ConfinedAttr<I32Attr, [IntIsOneOf<[16, 32]>]>:$blockSize,
ConfinedAttr<I32Attr, [IntIsOneOf<[0, 16]>]>:$firstScaleLane,
ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<3>]>:$firstScaleByte)>,
Results<(
outs AnyTypeOf<[FixedVectorOfShapeAndType<[8], F32>,
Expand All @@ -170,9 +166,12 @@ def AMDGPU_ScaledExtPacked816Op
FixedVectorOfShapeAndType<[16], F16>,
FixedVectorOfShapeAndType<[16], BF16>]>:$res)> {

let summary = "Extend a vector of packed floating point values";
let summary = "Extend a wave-wide matrix of packed floating point values";

let description = [{
Extend a matrix of microfloats (8 or 16 elements per lane) using a set of scales
that may be stored on other lanes.

The scales applied to the input microfloats are stored in bytes which
come from the `scale` input provided in a *half* of the wave identified
by `firstScaleLane`. The bytes used are selected by `firstScaleByte` and depend
Expand All @@ -192,14 +191,14 @@ def AMDGPU_ScaledExtPacked816Op
```mlir
// Input: 8-element vector of F8E4M3FN, converting to F32
// Lanes 0-15 read from byte 0, lanes 16-31 read from byte 1
%result = amdgpu.scaled_ext_packed816 %source scale(%scales)
%result = amdgpu.scaled_ext_packed_matrix %source scale(%scales)
blockSize(32) firstScaleLane(0) firstScaleByte(0)
: vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32>

// Input: 16-element vector of F6E2M3FN, converting to F16
// Lanes 0-15 read from byte 2, lanes 16-31 read from byte 3
%result = amdgpu.scaled_ext_packed816 %source scale(%scales)
blockSize(32) firstScaleLane(1) firstScaleByte(2)
%result = amdgpu.scaled_ext_packed_matrix %source scale(%scales)
blockSize(32) firstScaleLane(16) firstScaleByte(2)
: vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16>
```

Expand All @@ -211,19 +210,19 @@ def AMDGPU_ScaledExtPacked816Op
```mlir
// Input: 8-element vector of F8E5M2, converting to BF16
// Lanes 0-15 read from byte 0, lanes 16-31 read from byte 2 (0+2)
%result = amdgpu.scaled_ext_packed816 %source scale(%scales)
%result = amdgpu.scaled_ext_packed_matrix %source scale(%scales)
blockSize(16) firstScaleLane(0) firstScaleByte(0)
: vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16>

// Input: 16-element vector of F6E3M2FN, converting to F32
// Lanes 0-15 read from byte 1, lanes 16-31 read from byte 3 (1+2)
%result = amdgpu.scaled_ext_packed816 %source scale(%scales)
blockSize(16) firstScaleLane(1) firstScaleByte(1)
%result = amdgpu.scaled_ext_packed_matrix %source scale(%scales)
blockSize(16) firstScaleLane(16) firstScaleByte(1)
: vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32>
```

Note: the layout for the scales generally mirrors the layout the WMMA
instructions use for matix scales. These selection operands allows
instructions use for matrix scales. These selection operands allow
one to choose portions of the matrix to convert.

When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 32,
Expand All @@ -233,7 +232,7 @@ def AMDGPU_ScaledExtPacked816Op
When `source` is either F8E4M3FN or F8E5M2 and `blockSize` is 16,
following combinations are allowed:
* `firstScaleLane(0), firstScaleByte(0)`
* `firstScaleLane(1), firstScaleByte(2)`
* `firstScaleLane(16), firstScaleByte(2)`
all other combinations are reserved.

Available on gfx1250+.
Expand Down
103 changes: 53 additions & 50 deletions mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1506,16 +1506,17 @@ struct ExtPackedFp8OpLowering final
ConversionPatternRewriter &rewriter) const override;
};

struct ScaledExtPacked816OpLowering final
: public ConvertOpToLLVMPattern<ScaledExtPacked816Op> {
ScaledExtPacked816OpLowering(const LLVMTypeConverter &converter,
Chipset chipset)
: ConvertOpToLLVMPattern<amdgpu::ScaledExtPacked816Op>(converter),
struct ScaledExtPackedMatrixOpLowering final
: public ConvertOpToLLVMPattern<ScaledExtPackedMatrixOp> {
ScaledExtPackedMatrixOpLowering(const LLVMTypeConverter &converter,
Chipset chipset)
: ConvertOpToLLVMPattern<amdgpu::ScaledExtPackedMatrixOp>(converter),
chipset(chipset) {}
Chipset chipset;

LogicalResult
matchAndRewrite(ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
matchAndRewrite(ScaledExtPackedMatrixOp op,
ScaledExtPackedMatrixOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
};

Expand Down Expand Up @@ -1627,34 +1628,35 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
return success();
}

int32_t getScaleSel(int32_t blockSize, unsigned bitWidth,
int32_t firstScaleLane, int32_t firstScaleByte) {
// When lowering amdgpu.scaled_ext_packed816 to rocdl.cvt.scale.pk*.f*.f*
// operations, the attributes blockSize, sourceType, firstScaleLane and
int32_t getScaleSel(int32_t blockSize, unsigned bitWidth, int32_t scaleWaveHalf,
int32_t firstScaleByte) {
// When lowering amdgpu.scaled_ext_packed_matrix to rocdl.cvt.scale.pk*.f*.f*
// operations, the attributes blockSize, sourceType, scaleWaveHalf, and
// firstScaleByte are merged into a single attribute scaleSel. This is how
// those values are merged together.
// those values are merged together. (Note: scaleWaveHalf isn't a high-level
// attribute but is derived from firstScaleLane).
assert(llvm::is_contained({16, 32}, blockSize));
assert(llvm::is_contained(llvm::ArrayRef<unsigned>{4, 6, 8}, bitWidth));

const bool is_fp8 = bitWidth == 8;
const bool is_block_16 = blockSize == 16;
const bool isFp8 = bitWidth == 8;
const bool isBlock16 = blockSize == 16;

if (!is_fp8) {
int bit_0 = is_block_16;
if (!isFp8) {
int32_t bit0 = isBlock16;
assert(llvm::is_contained({0, 1, 2}, firstScaleByte));
int bit_1 = (firstScaleByte == 2) << 1;
assert(llvm::is_contained({0, 1}, firstScaleLane));
int bit_2 = firstScaleLane << 2;
return bit_2 | bit_1 | bit_0;
int32_t bit1 = (firstScaleByte == 2) << 1;
assert(llvm::is_contained({0, 1}, scaleWaveHalf));
int32_t bit2 = scaleWaveHalf << 2;
return bit2 | bit1 | bit0;
}

int bit_0 = is_block_16;
int32_t bit0 = isBlock16;
// firstScaleByte is guaranteed to be defined by two bits.
assert(llvm::is_contained({0, 1, 2, 3}, firstScaleByte));
int bit_2_and_1 = firstScaleByte << 1;
assert(llvm::is_contained({0, 1}, firstScaleLane));
int bit_3 = firstScaleLane << 3;
int bits = bit_3 | bit_2_and_1 | bit_0;
int32_t bits2and1 = firstScaleByte << 1;
assert(llvm::is_contained({0, 1}, scaleWaveHalf));
int32_t bit3 = scaleWaveHalf << 3;
int32_t bits = bit3 | bits2and1 | bit0;
// These are invalid cases.
assert(!llvm::is_contained(
{0b0011, 0b0101, 0b0111, 0b1000, 0b1001, 0b1011, 0b1111}, bits));
Expand Down Expand Up @@ -1717,8 +1719,8 @@ scaledExtPacked816ToIntrinsic(Type srcElemType, Type destElemType) {
"instructions");
}

LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
LogicalResult ScaledExtPackedMatrixOpLowering::matchAndRewrite(
ScaledExtPackedMatrixOp op, ScaledExtPackedMatrixOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
using fp4 = Float4E2M1FNType;
using fp8 = Float8E4M3FNType;
Expand All @@ -1732,7 +1734,9 @@ LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
"Scaled fp packed conversion instructions are not available on target "
"architecture and their emulation is not implemented");
}
int32_t firstScaleLane = op.getFirstScaleLane();
// Convert user-facing firstScaleLane (0 or 16) to the half of the wave that
// is being selected.
int32_t scaleWaveHalf = op.getFirstScaleLane() / 16;
int32_t firstScaleByte = op.getFirstScaleByte();
int32_t blockSize = op.getBlockSize();
auto sourceType = cast<VectorType>(op.getSource().getType());
Expand Down Expand Up @@ -1770,7 +1774,7 @@ LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
"no intrinsic matching packed scaled conversion on the given chipset");

int32_t scaleSel =
getScaleSel(blockSize, bitWidth, firstScaleLane, firstScaleByte);
getScaleSel(blockSize, bitWidth, scaleWaveHalf, firstScaleByte);
Value castedScale =
LLVM::BitcastOp::create(rewriter, loc, i32, adaptor.getScale());
Value castedSource =
Expand Down Expand Up @@ -2388,27 +2392,26 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns,
Chipset chipset) {
populateAMDGPUMemorySpaceAttributeConversions(converter);
patterns
.add<FatRawBufferCastLowering,
RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
RawBufferOpLowering<RawBufferAtomicFaddOp,
ROCDL::RawPtrBufferAtomicFaddOp>,
RawBufferOpLowering<RawBufferAtomicFmaxOp,
ROCDL::RawPtrBufferAtomicFmaxOp>,
RawBufferOpLowering<RawBufferAtomicSmaxOp,
ROCDL::RawPtrBufferAtomicSmaxOp>,
RawBufferOpLowering<RawBufferAtomicUminOp,
ROCDL::RawPtrBufferAtomicUminOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering,
ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
GatherToLDSOpLowering, TransposeLoadOpLowering,
AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering>(converter,
chipset);
patterns.add<
FatRawBufferCastLowering,
RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
RawBufferOpLowering<RawBufferAtomicFaddOp,
ROCDL::RawPtrBufferAtomicFaddOp>,
RawBufferOpLowering<RawBufferAtomicFmaxOp,
ROCDL::RawPtrBufferAtomicFmaxOp>,
RawBufferOpLowering<RawBufferAtomicSmaxOp,
ROCDL::RawPtrBufferAtomicSmaxOp>,
RawBufferOpLowering<RawBufferAtomicUminOp,
ROCDL::RawPtrBufferAtomicUminOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedMatrixOpLowering,
ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering,
AMDGPUMakeDmaBaseLowering>(converter, chipset);
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
}
8 changes: 4 additions & 4 deletions mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,9 +343,9 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
}

//===----------------------------------------------------------------------===//
// ScaledExtPacked816Op
// ScaledExtPackedMatrixOp
//===----------------------------------------------------------------------===//
LogicalResult ScaledExtPacked816Op::verify() {
LogicalResult ScaledExtPackedMatrixOp::verify() {
int blockSize = getBlockSize();
assert(llvm::is_contained({16, 32}, blockSize) && "invalid block size");

Expand Down Expand Up @@ -376,10 +376,10 @@ LogicalResult ScaledExtPacked816Op::verify() {
} else {
if (is_block_16) {
bool is_valid = ((firstScaleLane == 0) && (firstScaleByte == 0)) ||
((firstScaleLane == 1) && (firstScaleByte == 2));
((firstScaleLane == 16) && (firstScaleByte == 2));
if (!is_valid) {
return emitOpError("blockSize of 16 can only have (firstScaleLane, "
"firstScaleByte) be (0, 0) or (1, 2) for f8.");
"firstScaleByte) be (0, 0) or (16, 2) for f8.");
}
}
}
Expand Down
Loading
Loading