-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[mlir][amdgpu] Continue lowering make_tdm_descriptor. #171498
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[mlir][amdgpu] Continue lowering make_tdm_descriptor. #171498
Conversation
Continues the lowering of make_tdm_descriptor to support load and store operations, which require 4 descriptors.
Changes the order of operations to make lit tests more readable.
|
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir-amdgpu Author: Erick Ochoa Lopez (amd-eochoalo) Changes
Patch is 59.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171498.diff 5 Files Affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 56160d3e8fe85..6fbc90ded5824 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -110,9 +110,14 @@ def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
This type is opaque and corresponds to the two or four descriptor groups
used in tensor_load_to_lds or tensor_store_from_lds.
}];
-
}
+class AMDGPU_ConcreteVector<Type elem, int length> :
+ FixedVectorOfLengthAndType<[length], [elem]>,
+ BuildableType<
+ "::mlir::VectorType::get({" # length # "} ,"
+ # elem.builderCall # ")">;
+
//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
@@ -1296,14 +1301,14 @@ def AMDGPU_MakeDmaDescriptorOp :
DenseI64ArrayAttr: $global_static_strides,
Variadic<Index>: $shared_dynamic_sizes,
DenseI64ArrayAttr: $shared_static_sizes,
- Optional<I16>: $workgroup_mask,
+ Optional<AMDGPU_ConcreteVector<I1, 16>>: $workgroup_mask,
Optional<I1>: $early_timeout,
- Optional<Index>: $pad_amount,
- Optional<Index>: $pad_interval,
+ Optional<I32>: $pad_amount,
+ Optional<I32>: $pad_interval,
Optional<AnyMemRef>: $atomic_barrier_address,
Variadic<Index>: $atomic_barrier_indices,
Optional<Index>: $global_increment,
- Optional<Index>: $lds_increment,
+ Optional<I32>: $lds_increment,
Optional<Index>: $iteration_count)>,
Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
@@ -1335,7 +1340,7 @@ def AMDGPU_MakeDmaDescriptorOp :
2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
$global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
$lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
- $iterate_count determines how many times to iterate.
+ $iteration_count determines how many times to iterate; it must be a value in the inclusive interval [1, 256].
```mlir
// Example of moving a two-dimensional tensor to LDS.
@@ -1345,7 +1350,7 @@ def AMDGPU_MakeDmaDescriptorOp :
// Example of moving a two-dimensional tensor to LDS where padding is applied after every integer.
%base = amdgpu.make_dma_base %global[0, 0], %lds[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
- %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad_amount pad_every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
```
}];
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 7584b17075225..592731778a8e4 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2381,6 +2381,8 @@ struct AMDGPUMakeDmaDescriptorLowering
if (!mask)
return sgpr0;
+ Type i16 = rewriter.getI16Type();
+ mask = LLVM::BitcastOp::create(rewriter, loc, i16, mask);
Type i32 = rewriter.getI32Type();
Value extendedMask = LLVM::ZExtOp::create(rewriter, loc, i32, mask);
return setValueAtOffset(rewriter, loc, sgpr0, extendedMask, 0);
@@ -2389,21 +2391,19 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- // Compute data_size.
unsigned elementTypeWidthInBits = op.getElementTypeWidth();
assert(
llvm::is_contained<unsigned>({8, 16, 32, 64}, elementTypeWidthInBits) &&
"expected type width to be 8, 16, 32, or 64.");
- int64_t dataSize = llvm::Log2_32(elementTypeWidthInBits / 8);
- Value size = createI32Constant(rewriter, loc, dataSize);
+ int64_t idx = llvm::Log2_32(elementTypeWidthInBits / 8);
+ Value size = consts[idx];
return setValueAtOffset(rewriter, loc, sgpr0, size, 16);
}
Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
- if (!atomic_barrier_enable)
+ if (!adaptor.getAtomicBarrierAddress())
return sgpr0;
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 18);
@@ -2412,19 +2412,16 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool iterate_enable = adaptor.getGlobalIncrement() != nullptr;
- if (!iterate_enable)
+ if (!adaptor.getGlobalIncrement())
return sgpr0;
- // TODO: In future PR, add other required fields for iteration.
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 19);
}
Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20);
@@ -2442,13 +2439,16 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
+ // pre-condition: padInterval must be a power of two between 2 and 256.
+ // TODO: Validate that the value satisfies the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a follow-up PR, add a flag
+ // that instruments conditions that need to be checked at runtime.
IntegerType i32 = rewriter.getI32Type();
Value padInterval = adaptor.getPadInterval();
- // pre-condition: padInterval can be a power of two between 2 and 256.
padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32,
padInterval, false);
padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]);
@@ -2459,12 +2459,15 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
- Value padAmount = adaptor.getPadAmount();
// pre-condition: padAmount is a value between 1 and 128.
+ // TODO: Validate that the value satisfies the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a follow-up PR, add a flag
+ // that instruments conditions that need to be checked at runtime.
+ Value padAmount = adaptor.getPadAmount();
padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
// post-condition: padAmount is a value between 0-127.
return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25);
@@ -2474,8 +2477,7 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr1,
ArrayRef<Value> consts) const {
- bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
- if (!atomic_barrier_enable)
+ if (!adaptor.getAtomicBarrierAddress())
return sgpr1;
Value atomicBarrierAddress = adaptor.getAtomicBarrierAddress();
@@ -2488,6 +2490,9 @@ struct AMDGPUMakeDmaDescriptorLowering
IntegerType i32 = rewriter.getI32Type();
// pre-condition: atomicBarrierAddress is aligned to 8 bytes which implies
// that the 3 LSBs are zero.
+ // TODO: Validation if the value breaks the pre-condition.
+ // In a following PR add a flag that instruments conditions that need to be
+ // checked at runtime.
atomicBarrierAddress =
LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress);
atomicBarrierAddress =
@@ -2498,65 +2503,91 @@ struct AMDGPUMakeDmaDescriptorLowering
return setValueAtOffset(rewriter, loc, sgpr1, atomicBarrierAddress, 32);
}
- std::pair<Value, Value> setTensorDim0(MakeDmaDescriptorOp op,
+ std::pair<Value, Value> setTensorDimX(MakeDmaDescriptorOp op,
OpAdaptor adaptor,
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr1, Value sgpr2,
- ArrayRef<Value> consts) const {
- SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
- OpFoldResult tensorDim0OpFoldResult = mixedGlobalSizes.back();
- Value tensorDim0;
- if (auto attr = dyn_cast<Attribute>(tensorDim0OpFoldResult))
- tensorDim0 =
+ ArrayRef<Value> consts, uint64_t dimX,
+ uint32_t offset) const {
+ ArrayRef<int64_t> globalStaticSizes = adaptor.getGlobalStaticSizes();
+ ValueRange globalDynamicSizes = adaptor.getGlobalDynamicSizes();
+ SmallVector<OpFoldResult> mixedGlobalSizes =
+ getMixedValues(globalStaticSizes, globalDynamicSizes, rewriter);
+ if (mixedGlobalSizes.size() <= dimX)
+ return {sgpr1, sgpr2};
+
+ OpFoldResult tensorDimXOpFoldResult = *(mixedGlobalSizes.rbegin() + dimX);
+ // pre-condition: tensorDimX is less than 2^48-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
+ // This could also be fixed by saying that mixedGlobalSizes is a
+ // DynamicI48List.
+ Value tensorDimX;
+ if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
+ tensorDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
- tensorDim0 = cast<Value>(tensorDim0OpFoldResult);
+ else {
+ IntegerType i32 = rewriter.getI32Type();
+ tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+ tensorDimX = LLVM::TruncOp::create(rewriter, loc, i32, tensorDimX);
+ }
+
+ sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDimX, offset);
Value c16 = createI32Constant(rewriter, loc, 16);
- Value tensorDim0High = LLVM::LShrOp::create(rewriter, loc, tensorDim0, c16);
- sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDim0, 48);
- sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim0High, 48 + 16);
+ Value tensorDimXHigh = LLVM::LShrOp::create(rewriter, loc, tensorDimX, c16);
+ sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDimXHigh, offset + 16);
return {sgpr1, sgpr2};
}
+ std::pair<Value, Value> setTensorDim0(MakeDmaDescriptorOp op,
+ OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter,
+ Location loc, Value sgpr1, Value sgpr2,
+ ArrayRef<Value> consts) const {
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts, 0,
+ 48);
+ }
+
std::pair<Value, Value> setTensorDim1(MakeDmaDescriptorOp op,
OpAdaptor adaptor,
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr2, Value sgpr3,
ArrayRef<Value> consts) const {
- // TODO: Generalize to setTensorDimX.
- SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
- OpFoldResult tensorDim1OpFoldResult = *(mixedGlobalSizes.rbegin() + 1);
- Value tensorDim1;
- if (auto attr = dyn_cast<Attribute>(tensorDim1OpFoldResult))
- tensorDim1 =
- createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
- tensorDim1 = cast<Value>(tensorDim1OpFoldResult);
-
- Value c16 = createI32Constant(rewriter, loc, 16);
- Value tensorDim1High = LLVM::LShrOp::create(rewriter, loc, tensorDim1, c16);
- sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim1, 80);
- sgpr3 = setValueAtOffset(rewriter, loc, sgpr3, tensorDim1High, 80 + 16);
- return {sgpr2, sgpr3};
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr2, sgpr3, consts, 1,
+ 80);
}
Value setTileDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr, ArrayRef<Value> consts, size_t dimX,
int64_t offset) const {
- SmallVector<OpFoldResult> mixedSharedSizes = op.getMixedSharedSizes();
-
+ ArrayRef<int64_t> sharedStaticSizes = adaptor.getSharedStaticSizes();
+ ValueRange sharedDynamicSizes = adaptor.getSharedDynamicSizes();
+ SmallVector<OpFoldResult> mixedSharedSizes =
+ getMixedValues(sharedStaticSizes, sharedDynamicSizes, rewriter);
if (mixedSharedSizes.size() <= dimX)
return sgpr;
OpFoldResult tileDimXOpFoldResult = *(mixedSharedSizes.rbegin() + dimX);
+ // pre-condition: tileDimX is less than 2^16-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
+ // This could also be fixed by saying that mixedSharedSizes is a
+ // DynamicI16List.
Value tileDimX;
if (auto attr = dyn_cast<Attribute>(tileDimXOpFoldResult))
tileDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
+ else {
+ IntegerType i32 = rewriter.getI32Type();
tileDimX = cast<Value>(tileDimXOpFoldResult);
+ tileDimX = LLVM::TruncOp::create(rewriter, loc, i32, tileDimX);
+ }
return setValueAtOffset(rewriter, loc, sgpr, tileDimX, offset);
}
@@ -2584,13 +2615,20 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter, Location loc,
Value sgprY, Value sgprZ, ArrayRef<Value> consts,
size_t dimX, int64_t offset) const {
- SmallVector<OpFoldResult> mixedGlobalStrides = op.getMixedGlobalStrides();
+ ArrayRef<int64_t> globalStaticStrides = adaptor.getGlobalStaticStrides();
+ ValueRange globalDynamicStrides = adaptor.getGlobalDynamicStrides();
+ SmallVector<OpFoldResult> mixedGlobalStrides =
+ getMixedValues(globalStaticStrides, globalDynamicStrides, rewriter);
if (mixedGlobalStrides.size() <= dimX)
return {sgprY, sgprZ};
OpFoldResult tensorDimXStrideOpFoldResult =
*(mixedGlobalStrides.rbegin() + dimX);
+ // pre-condition: tensorDimXStride is less than 2^48-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // In a following PR add a flag that instruments conditions that need to be
+ // checked at runtime.
Value tensorDimXStride;
if (auto attr = dyn_cast<Attribute>(tensorDimXStrideOpFoldResult))
tensorDimXStride =
@@ -2605,6 +2643,7 @@ struct AMDGPUMakeDmaDescriptorLowering
IntegerType i32 = rewriter.getI32Type();
Value tensorDimXStrideLow =
LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStride);
+ sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset);
int64_t shift = (offset % 32) == 0 ? 32 : offset % 32;
Value shiftVal = createI64Constant(rewriter, loc, shift);
@@ -2612,8 +2651,6 @@ struct AMDGPUMakeDmaDescriptorLowering
LLVM::LShrOp::create(rewriter, loc, tensorDimXStride, shiftVal);
tensorDimXStrideHigh =
LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStrideHigh);
-
- sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset);
sgprZ = setValueAtOffset(rewriter, loc, sgprZ, tensorDimXStrideHigh,
offset + shift);
return {sgprY, sgprZ};
@@ -2680,6 +2717,221 @@ struct AMDGPUMakeDmaDescriptorLowering
return dgroup1;
}
+ Value setTensorDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, ArrayRef<Value> consts, int64_t dimX,
+ int64_t offset) const {
+ ArrayRef<int64_t> globalStaticSizes = adaptor.getGlobalStaticSizes();
+ ValueRange globalDynamicSizes = adaptor.getGlobalDynamicSizes();
+ SmallVector<OpFoldResult> mixedGlobalSizes =
+ getMixedValues(globalStaticSizes, globalDynamicSizes, rewriter);
+ if (mixedGlobalSizes.size() <= static_cast<unsigned long>(dimX))
+ return sgpr0;
+
+ OpFoldResult tensorDimXOpFoldResult = *(mixedGlobalSizes.rbegin() + dimX);
+ Value tensorDimX;
+ if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
+ tensorDimX =
+ createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
+ else {
+ IntegerType i32 = rewriter.getI32Type();
+ tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+ tensorDimX = LLVM::TruncOp::create(rewriter, loc, i32, tensorDimX);
+ }
+
+ return setValueAtOffset(rewriter, loc, sgpr0, tensorDimX, offset);
+ }
+
+ Value setTensorDim2(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, ArrayRef<Value> consts) const {
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr0, consts, 2, 0);
+ }
+
+ Value truncateAndSetValueAtOffset(ConversionPatternRewriter &rewriter,
+ Location loc, Value accumulator,
+ Value value, int64_t shift) const {
+
+ IntegerType i32 = rewriter.getI32Type();
+ value = LLVM::TruncOp::create(rewriter, loc, i32, value);
+ return setValueAtOffset(rewriter, loc, accumulator, value, shift);
+ }
+
+ Value setLDSAddrIncrement(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr1, ArrayRef<Value> consts,
+ int64_t offset) const {
+ Value ldsAddrIncrement = adaptor.getLdsIncrement();
+ return setValueAtOffset(rewriter, loc, sgpr1, ldsAddrIncrement, offset);
+ }
+
+ std::pair<Value, Value>
+ setGlobalAddrIncrement(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr2, Value sgpr3, ArrayRef<Value> consts,
+ int64_t offset) const {
+ Value globalAddrIncrement = adaptor.getGlobalIncrement();
+ sgpr2 = truncateAndSetValueAtOffset(rewriter, loc, sgpr2,
+ globalAddrIncrement, offset);
+ Value shift = createI64Constant(rewriter, loc, 32);
+ globalAddrIncrement =
+ LLVM::LShrOp::create(rewriter, loc, glo...
[truncated]
|
krzysz00
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor initial notes
krzysz00
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Approved after one change
Uh oh!
There was an error while loading. Please reload this page.