[mlir][xegpu] Support offset arguments in LoadNd, StoreNd and PrefetchNd subgroup distribution. #160417
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu

Author: Charitha Saumya (charithaintc)

Changes

Currently offsets are given as operands of the CreateNd op. Sg distribution does not support offset arguments at the consumer.

This PR adds support for offsets given at the consumer (like LoadNd). With this change, the offsets must be specified at the consumer op (LoadNd, StoreNd, PrefetchNd) of the tile, otherwise distribution will fail. This also removes the need for the UpdateNdOffset op; the PR removes support for UpdateNdOffset.

Patch is 67.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160417.diff

2 Files Affected:
- mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
- mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 449b8eb030b07..336df73e52eda 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -268,7 +268,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
/// ...
-/// %td = xegpu.create_nd_tdesc %arg0[0, 0]
+/// %td = xegpu.create_nd_tdesc %arg0
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
/// vector.yield %td
/// }
@@ -277,11 +277,11 @@ struct MoveFuncBodyToWarpExecuteOnLane0
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
/// ...
-/// %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+/// %dead = xegpu.create_nd_tdesc %arg0
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
/// vector.yield %arg0, %dead
/// }
-/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+/// %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32>
/// -> !xegpu.tensor_desc<4x8xf32>
///
/// ```
@@ -301,6 +301,10 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
if (!layout)
return rewriter.notifyMatchFailure(
descOp, "the tensor descriptor lacks layout attribute");
+ // CreateNdOp must not have offsets.
+ if (descOp.getMixedOffsets().size())
+ return rewriter.notifyMatchFailure(
+ descOp, "xegpu::CreateNdDescOp must not have offsets");
SmallVector<size_t> newRetIndices;
rewriter.setInsertionPoint(warpOp);
@@ -339,22 +343,23 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
-/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+/// xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
-/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
-/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32,
-/// #layout0>
+/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
+/// ...
+/// gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>,
+/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index
/// }
/// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
/// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
/// #layout0>
/// -> !xegpu.tensor_desc<4x8xf32>
-/// xegpu.store_nd %0, %1: vector<4xf32>,
+/// xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>,
/// !xegpu.tensor_desc<4x8xf32>
///
/// ```
@@ -368,10 +373,15 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
if (!storeOp)
return failure();
- int64_t offsetSize = static_cast<int64_t>(storeOp.getOffsets().size());
- if ((offsetSize != 0) || storeOp.getConstOffsetsAttr())
- return failure();
-
+ SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
+ // Expecting offsets to be present.
+ if (offsets.empty())
+ return rewriter.notifyMatchFailure(storeOp,
+ "the store op must have offsets");
+ SmallVector<Value> offsetsAsValues =
+ vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
+ SmallVector<Type> offsetTypes = llvm::to_vector(
+ llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));
xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
if (!layout)
@@ -387,13 +397,13 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
distributedTypeByWarpOpOrFailure.value();
SmallVector<size_t> newRetIndices;
+ SmallVector<Value> newYieldedValues = {storeOp.getValue(),
+ storeOp.getTensorDesc()};
+ SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
+ newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
+ newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp,
- /* new yielded values = */
- ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
- /* new yielded types = */
- TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
- newRetIndices);
+ rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
// Create a new store op outside the warp op with the distributed vector
// type. Tensor descriptor is not distributed.
rewriter.setInsertionPointAfter(newWarpOp);
@@ -418,6 +428,9 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
newStoreOperands.push_back(
resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
distributedTensorDescTy, rewriter));
+ // Collect offsets.
+ for (size_t i = 2; i < newRetIndices.size(); ++i)
+ newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
auto newStoreOp =
xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
@@ -491,9 +504,18 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
loadOp,
"xegpu::LoadNdOp require chip information to determine transpose "
"requirement");
- int64_t offsetSize = static_cast<int64_t>(loadOp.getOffsets().size());
- if ((offsetSize != 0) || loadOp.getConstOffsetsAttr())
- return failure();
+ // Expecting offsets to be present.
+ SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
+ if (offsets.empty())
+ return rewriter.notifyMatchFailure(loadOp,
+ "the load op must have offsets");
+ SmallVector<Value> offsetsAsValues =
+ vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
+ SmallVector<Type> offsetTypes = llvm::to_vector(
+ llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));
xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
@@ -506,10 +528,12 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
cast<VectorType>(warpOp.getResult(operandIdx).getType());
SmallVector<size_t> newRetIndices;
+ SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
+ SmallVector<Type> newYieldedTypes = {tensorDescTy};
+ newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
+ newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp,
- /* new yielded values = */ loadOp.getTensorDesc(),
- /* new yielded types = */ tensorDescTy, newRetIndices);
+ rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
// Create a new load op outside the warp op with the distributed vector
// type.
@@ -523,11 +547,15 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
// descriptor type does not
// contain layout info.
+ SmallVector<Value> newLoadOperands{
+ resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
+ distributedTensorDescTy, rewriter)};
+ // Collect offsets.
+ for (size_t i = 1; i < newRetIndices.size(); ++i)
+ newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
auto newLoadOp = xegpu::LoadNdOp::create(
rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
- resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
- distributedTensorDescTy, rewriter),
- loadOp->getAttrs());
+ newLoadOperands, loadOp->getAttrs());
xegpu::removeLayoutAttrs(newLoadOp);
// Set the packed attribute if the layout requires it.
newLoadOp.setPacked(requirePacked(layout));
@@ -677,85 +705,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern {
}
};
-/// Sink an update_nd_offset op feeding into yield op of an enclosing
-/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the
-/// original op that will not be used by the yield op (and should be cleaned
-/// up later). The yield op will bypass the updateOp's arguments. The tensor
-/// descriptor type is not distributed. Appropriate cast ops are inserted if
-/// the distributed types does not match expected xegpu SIMT types.
-/// Example:
-/// ```
-/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
-/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
-/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
-/// ...
-/// %update = xegpu.update_nd_offset %arg0, [%c32, %c16]:
-/// !xegpu.tensor_desc<4x8xf32, #layout0>
-/// gpu.yield %update
-/// }
-/// ...
-/// ```
-/// To
-/// ```
-/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (
-/// !xegpu.tensor_desc<4x8xf32, #layout0>,
-/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
-/// ...
-/// %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]:
-/// !xegpu.tensor_desc<4x8xf32, #layout0> gpu.yield %dead, %arg0
-/// gpu.yield %dead, %arg0, %c32, %c16
-/// }
-/// %0 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
-/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
-/// %1 = xegpu.update_nd_offset %0, [%r#2, %r#3]:
-/// !xegpu.tensor_desc<4x8xf32>
-/// ...
-/// ```
-struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
- using gpu::WarpDistributionPattern::WarpDistributionPattern;
- LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
- PatternRewriter &rewriter) const override {
- OpOperand *operand =
- getWarpResult(warpOp, llvm::IsaPred<xegpu::UpdateNdOffsetOp>);
- if (!operand)
- return rewriter.notifyMatchFailure(
- warpOp, "warp result is not a xegpu::UpdateNdOffset op");
- auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
- unsigned operandIdx = operand->getOperandNumber();
-
- SmallVector<size_t> newRetIndices;
- gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
- rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
- newRetIndices);
- rewriter.setInsertionPointAfter(newWarpOp);
- // new update op does not have layout attribute.
- xegpu::TensorDescType distributedTensorDescTy =
- updateOp.getTensorDescType().dropLayouts();
- SmallVector<Value> newUpdateOperands =
- llvm::map_to_vector(newRetIndices, [&](size_t i) {
- // For the tensor descriptor operand, the layout attribute is
- // dropped after distribution. Types needs to be resolved in this
- // case.
- if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
- return resolveDistributedTy(newWarpOp.getResult(i),
- distributedTensorDescTy, rewriter);
- }
- return newWarpOp.getResult(i);
- });
- // Create a new update op outside the warp op.
- auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
- rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
- newUpdateOperands, updateOp->getAttrs());
- xegpu::removeLayoutAttrs(newUpdateOp);
- Value distributedVal = newWarpOp.getResult(operandIdx);
- // Resolve the distributed type with the original type.
- Value typeResolved = resolveDistributedTy(
- newUpdateOp.getResult(), distributedVal.getType(), rewriter);
- rewriter.replaceAllUsesWith(distributedVal, typeResolved);
- return success();
- }
-};
-
/// Distribute a prefetch_nd op at the end of enclosing
/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
/// through the warp op interface they would be propagated as returned values.
@@ -769,18 +718,19 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
-/// xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #layout0>
+/// xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// ```
/// To
/// ```
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> (
-/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
-/// gpu.yield %arg0: !xegpu.tensor_desc<4x8xf32, #layout0>
+/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
+/// gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index,
+/// index
/// }
/// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
-/// xegpu.prefetch_nd %1 : !xegpu.tensor_desc<4x8xf32>
+/// xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
@@ -793,17 +743,25 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
if (!prefetchOp)
return failure();
- int64_t offsetSize = static_cast<int64_t>(prefetchOp.getOffsets().size());
- if ((offsetSize != 0) || prefetchOp.getConstOffsetsAttr())
- return failure();
+ SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
+ // PrefetchNdOp must have offsets.
+ if (offsets.empty())
+ return rewriter.notifyMatchFailure(prefetchOp,
+ "the prefetch op must have offsets");
+ SmallVector<Value> offsetsAsValues =
+ vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
+ SmallVector<Type> offsetTypes = llvm::to_vector(
+ llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); }));
xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
if (!layout)
return rewriter.notifyMatchFailure(
prefetchOp, "the source tensor descriptor lacks layout attribute");
- SmallVector<Value, 1> newYieldValues = {prefetchOp.getTensorDesc()};
- SmallVector<Type, 1> newYieldTypes = {prefetchOp.getTensorDescType()};
+ SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
+ SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
+ newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
+ newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
SmallVector<size_t> newRetIndices;
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
@@ -814,6 +772,9 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
rewriter.setInsertionPointAfter(newWarpOp);
SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
+ // Collect offsets.
+ for (size_t i = 1; i < newRetIndices.size(); ++i)
+ newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
newPrefetchOperands, prefetchOp->getAttrs());
xegpu::removeLayoutAttrs(prefetchOp);
@@ -1456,15 +1417,14 @@ struct XeGPUSubgroupDistributePass final
void xegpu::populateXeGPUSubgroupDistributePatterns(
RewritePatternSet &patterns) {
- patterns
- .add<CreateNdDescDistribution, StoreNdDistribution, LoadNdDistribution,
- DpasDistribution, PrefetchNdDistribution, UpdateNdOffsetDistribution,
- GpuBarrierDistribution, VectorMultiReductionDistribution,
- LoadDistribution, StoreDistribution, VectorTransposeDistribution,
- VectorBitcastDistribution,
- MemrefExtractAlignedPointerAsIndexDistribution>(
- patterns.getContext(),
- /*pattern benefit=*/regularPatternBenefit);
+ patterns.add<CreateNdDescDistribution, StoreNdDistribution,
+ LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
+ GpuBarrierDistribution, VectorMultiReductionDistribution,
+ LoadDistribution, StoreDistribution, VectorTransposeDistribution,
+ VectorBitcastDistribution,
+ MemrefExtractAlignedPointerAsIndexDistribution>(
+ patterns.getContext(),
+ /*pattern benefit=*/regularPatternBenefit);
patterns.add<VectorShapeCastDistribution>(
patterns.getContext(),
/*pattern benefit=*/highPatternBenefit);
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 13b0ed176eb0c..59fac26d18cf4 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -8,15 +8,15 @@
// CHECK-LABEL: gpu.func @store_nd_1d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32>
-// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
-// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
+// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
// CHECK: gpu.return
gpu.module @xevm_module{
gpu.func @store_nd_1d(%arg0: memref<16xf32>) {
%c0 = arith.constant 0 : index
%cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf32>
- %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
- xegpu.store_nd %cst, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
+ xegpu.store_nd %cst, %0 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
gpu.return
}
}
@@ -25,14 +25,14 @@ gpu.module @xevm_module{
// CHECK-LABEL: gpu.func @store_nd_2d
// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) {
// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16>
-// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
gpu.module @xevm_module{
gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) {
%c0 = arith.constant 0 : index
%cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf16>
- %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
- xegpu.store_nd %cst, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<...
[truncated]
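For illustration, here is a minimal sketch of the offsets-at-consumer form that distribution now expects, adapted from the store_nd_1d test above (the shapes and lane layout are simply those used in that test):

```mlir
gpu.module @xevm_module {
  gpu.func @store_nd_1d(%arg0: memref<16xf32>) {
    %c0 = arith.constant 0 : index
    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1.000000e+00> : vector<16xf32>
    // The tensor descriptor is created without offsets...
    %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
    // ...and the consumer op carries them instead; the distribution patterns
    // yield these offsets out of the warp op and pass them to the lane-level op.
    xegpu.store_nd %cst, %0 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
    gpu.return
  }
}
```

The same convention applies to xegpu.load_nd and xegpu.prefetch_nd, as illustrated in the updated doc comments in XeGPUSubgroupDistribute.cpp.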
LGTM
Updates to transform pass and test cases look fine to me.
LGTM. Do we have any SIMT tests at the sg level downstream which would need changes now that the other version is deprecated?
Will be fixed with pulldown.
LLVM Buildbot has detected a new failure on one of its builders. Full details are available at: https://lab.llvm.org/buildbot/#/builders/138/builds/19593
…hNd subgroup distribution. (llvm#160417)