[MLIR][XeGPU] Preserve leading unit dimension during blocking#180884
Conversation
|
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Jianhui Li (Jianhui-Li) Changes: This PR preserves leading unit dimensions during blocking. This ensures the blocking process avoids generating unnecessary insert/extract_strided_slice ops, which under certain conditions become difficult to cancel and create an extra burden for lane layout propagation and subgroup distribution. Patch is 25.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/180884.diff 4 Files Affected:
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 6faa25cf49df9..4eb6ad51ee9bf 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -152,26 +152,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
if (layout && layout.isForSubgroup()) {
if (!layout.getEffectiveInstDataAsInt().empty()) {
SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
- // Remove leading unit dimensions from inst_data for non-rank-sensitive
- // ops. For example, if the inst_data is [1, 1, 32] it will pass [32] as
- // the unroll/blocking size.
- // Skip it for rank-sensitive ops, whose semantics depend on the tensor
- // rank (and consequently its shape), and therefore must not alter the
- // input tile rank or shape, such as by dropping leading dimensions.
- bool skipLeadingUnitDimRemoval =
- ownerOp &&
- (isa<xegpu::CreateNdDescOp, xegpu::DpasOp, xegpu::ConvertLayoutOp,
- xegpu::LoadMatrixOp, xegpu::StoreMatrixOp, xegpu::AtomicRMWOp,
- xegpu::LoadNdOp, xegpu::StoreNdOp, xegpu::PrefetchNdOp,
- vector::TransposeOp, vector::ShapeCastOp,
- vector::MultiDimReductionOp, vector::BroadcastOp>(ownerOp));
- if (!skipLeadingUnitDimRemoval) {
- auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
- instData.erase(instData.begin(), it);
- }
return instData;
}
-
if (auto type = dyn_cast<ShapedType>(value.getType()))
return llvm::to_vector(type.getShape());
}
@@ -350,13 +332,6 @@ void XeGPUBlockingPass::runOnOperation() {
xegpu::doSCFStructuralTypeConversionWithTensorType(op, converter);
- // Remove leading unit dimensions from vector ops and then
- // do the unrolling.
- {
- RewritePatternSet patterns(ctx);
- vector::populateCastAwayVectorLeadingOneDimPatterns(patterns);
- (void)applyPatternsGreedily(op, std::move(patterns));
- }
xegpu::UnrollOptions options;
options.setFilterConstraint(
[&](Operation *op) -> LogicalResult { return success(needsUnroll(op)); });
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index aa1dfaa9e0fda..ce5f4f887e910 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -778,6 +778,15 @@ struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
/// To
/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
/// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+///
+/// Note that the store distribution pattern also handles leading unit
+/// dimensions in the payload, mask and offsets vectors. In this case the store
+/// distribution will only change the dimensions corresponding to the SG
+/// distribution and keep the leading unit dimensions unchanged.
+/// For example, a store with payload vector<1x16xf16> with lane layout [1, 16]
+/// will be distributed as vector<1x1xf16>. Shapecast ops are inserted for the
+/// offset/mask/payload when necessary so that the distributed store is working
+/// on 1D shape vector to match the HW capability.
struct StoreDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
@@ -792,30 +801,27 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
storeScatterOp, "Store op must have a vector of offsets argument");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
- if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
- return rewriter.notifyMatchFailure(storeScatterOp,
- "Expected 1D offsets and mask vector");
VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
- if (storeVecTy.getRank() > 2)
- return rewriter.notifyMatchFailure(
- storeScatterOp, "Expected at most 2D result at SG level");
-
- std::string layoutPayloadName =
- xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(0));
- std::string layoutOffsetsName =
- xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(2));
- std::string layoutMaskName =
- xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(3));
-
- xegpu::DistributeLayoutAttr layoutPayload =
- storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
- layoutPayloadName);
- xegpu::DistributeLayoutAttr layoutOffsets =
- storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
- layoutOffsetsName);
- xegpu::DistributeLayoutAttr layoutMask =
- storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
- layoutMaskName);
+
+ // Add handling for leading unit dimensions support
+ int chunkSize = storeScatterOp.getChunkSize().value_or(1);
+ int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
+
+ // Check that all leading dimensions are unit dimensions
+ for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
+ if (storeVecTy.getShape()[i] != 1) {
+ return rewriter.notifyMatchFailure(
+ storeScatterOp, "Only unit dimensions allowed for the leading "
+ "dimensions of the store vector!");
+ }
+ }
+
+ auto layoutPayload =
+ xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(0));
+ auto layoutOffsets =
+ xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
+ auto layoutMask =
+ xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));
FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
@@ -830,29 +836,42 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
storeScatterOp,
"Some vector operands have no layouts, using defaults instead.");
}
- // Distributed store payload type according to the lane layout.
- VectorType distPayloadTyByWarpOp = distStoreVecByWarpOpOrFailure.value();
- // Expected distributed payload type is always 1D.
- VectorType expectedPayloadTy =
- VectorType::get({distPayloadTyByWarpOp.getNumElements()},
- distPayloadTyByWarpOp.getElementType());
+
+ VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
+ VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
+ VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = storeScatterOp->getOperands();
SmallVector<Type> operandTypesToYield = {
- distPayloadTyByWarpOp, operands[1].getType(),
- distOffsetsByWarpOpOrFailure.value(),
- distMaskByWarpOpOrFailure.value()};
+ distPayloadTy, operands[1].getType(), distOffsetsTy, distMaskTy};
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
- SmallVector<Value> newStoreScatterOpOperands = llvm::map_to_vector(
- newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
- // The payload operand may need type adjustment due to mismatch between warp
- // distributed type and expected SIMT type.
+
rewriter.setInsertionPointAfter(newWarpOp);
- newStoreScatterOpOperands[0] = resolveDistributedTy(
- newStoreScatterOpOperands[0], expectedPayloadTy, rewriter);
+
+ // Distributed store payload type is always 1D without leading unit dims
+ VectorType payloadTy1D = VectorType::get({distPayloadTy.getNumElements()},
+ distPayloadTy.getElementType());
+
+ VectorType distOffsetsTy1D = VectorType::get(
+ {distOffsetsTy.getNumElements()}, distOffsetsTy.getElementType());
+ VectorType distMaskTy1D = VectorType::get({distMaskTy.getNumElements()},
+ distMaskTy.getElementType());
+
+ // Resolve distributed types to 1D for SIMT execution
+ Value distPayloadVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[0]), payloadTy1D, rewriter);
+ Value distOffsetVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[2]), distOffsetsTy1D, rewriter);
+ Value distMaskVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[3]), distMaskTy1D, rewriter);
+
+ SmallVector<Value> newStoreScatterOpOperands = {
+ distPayloadVal, newWarpOp.getResult(newRetIndices[1]), distOffsetVal,
+ distMaskVal};
+
xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
storeScatterOp->getAttrs());
@@ -1058,6 +1077,15 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
/// To
/// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+///
+/// Note that the load distribution pattern also handles leading unit dimensions
+/// in the payload, mask, and offsets vectors. The load distribution will only
+/// change the dimensions corresponding to the SG distribution and keep the
+/// leading unit dimensions unchanged. For example, a load with result type
+/// vector<1x16xf16> with lane layout [1, 16] will be distributed
+/// as result type vector<1x1xf16>. Shapecast ops are inserted for the
+/// offset/mask/payload when necessary so that the distributed load is working
+/// on 1D shape vector to match the HW capability.
struct LoadDistribution final : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
@@ -1082,19 +1110,22 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
"Load op must have a vector arguments for offsets and mask");
VectorType offsetsTy = cast<VectorType>(offsets.getType());
VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
- if (offsetsTy.getRank() != 1 || maskTy.getRank() != 1)
- return rewriter.notifyMatchFailure(loadGatherOp,
- "Expected 1D offsets and mask vector");
- // Assume offset and mask producers will be distributed as well.
- std::string layoutOffsetsName =
- xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(1));
- std::string layoutMaskName =
- xegpu::getTemporaryLayoutName(loadGatherOp->getOpOperand(2));
-
- xegpu::LayoutAttr layoutOffsets =
- loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
- xegpu::LayoutAttr layoutMask =
- loadGatherOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
+ VectorType resultVecTy =
+ cast<VectorType>(loadGatherOp.getResult().getType());
+ // Add handling for leading unit dimensions support
+ int chunkSize = loadGatherOp.getChunkSize().value_or(1);
+ int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
+ for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
+ if (resultVecTy.getShape()[i] != 1) {
+ return rewriter.notifyMatchFailure(
+ loadGatherOp, "Only unit dimensions allowed for the leading "
+ "dimensions of the load vector!");
+ }
+ }
+
+ auto layoutOffsets =
+ xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(1));
+ auto layoutMask = xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(2));
FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
@@ -1109,26 +1140,45 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern {
SmallVector<size_t> newRetIndices;
SmallVector<Value> operands = loadGatherOp->getOperands();
- SmallVector<Type> operandTypesToYield = {
- operands[0].getType(), distOffsetsByWarpOpOrFailure.value(),
- distMaskByWarpOpOrFailure.value()};
const unsigned operandIdx = producedByLastLoad->getOperandNumber();
VectorType distResultTy =
cast<VectorType>(warpOp.getResult(operandIdx).getType());
- // Distributed load op will always be 1D.
- VectorType loadVecTy = VectorType::get({distResultTy.getNumElements()},
- distResultTy.getElementType());
+ VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
+ VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
+
+ SmallVector<Type> operandTypesToYield = {operands[0].getType(),
+ distOffsetsTy, distMaskTy};
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
- SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
- newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
+ // SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector(
+ // newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
rewriter.setInsertionPointAfter(newWarpOp);
+
+ // Distributed load op will always be 1D.
+ VectorType loadVecTy1D = VectorType::get({distResultTy.getNumElements()},
+ distResultTy.getElementType());
+
+ VectorType distOffsetsTy1D =
+ VectorType::get({distOffsetsByWarpOpOrFailure.value().getNumElements()},
+ distOffsetsByWarpOpOrFailure.value().getElementType());
+ VectorType distMaskTy1D =
+ VectorType::get({distMaskByWarpOpOrFailure.value().getNumElements()},
+ distMaskByWarpOpOrFailure.value().getElementType());
+
+ Value distOffsetVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
+ Value distmaskVal = resolveDistributedTy(
+ newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);
+
+ SmallVector<Value> newLoadGatherOperands = {
+ newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distmaskVal};
+
xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
- rewriter, newWarpOp.getLoc(), loadVecTy, newLoadGatherOperands,
+ rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
loadGatherOp->getAttrs());
xegpu::removeLayoutAttrs(newOp);
Value distributedVal = newWarpOp.getResult(operandIdx);
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index c47fd92fe46d7..fa5810ad7f828 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -390,6 +390,7 @@ xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
for (SmallVector<int64_t> offsets :
StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
SmallVector<int64_t> staticStrides(offsets.size(), 1);
+
Value slice = vector::ExtractStridedSliceOp::create(
builder, loc, value, offsets, adjustedTargetShape, staticStrides);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 68f6e8e1ec955..e80a9144b9674 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -740,17 +740,17 @@ gpu.module @test_kernel {
// -----
gpu.module @test_kernel {
- // CHECK-LABEL: remove_unit_dim_inst_data
+ // CHECK-LABEL: preserve_unit_dim_of_load_inst_data
// CHECK-SAME: [[arg0:%.+]]: ui64
// CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x1x32xf32>
- // CHECK: [[cst_0:%.+]] = arith.constant dense<true> : vector<16xi1>
- // CHECK: [[cst_1:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
- // CHECK: [[cst_2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
- // CHECK: [[ld_0:%.+]] = xegpu.load [[arg0]][[[cst_1]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
- // CHECK: [[ld_1:%.+]] = xegpu.load [[arg0]][[[cst_2]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
- // CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
- // CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
- gpu.func @remove_unit_dim_inst_data(%src: ui64) -> vector<1x1x32xf32> {
+ // CHECK: [[cst_0:%.+]] = arith.constant dense<true> : vector<1x1x16xi1>
+ // CHECK: [[cst_1:%.+]] = arith.constant dense<{{.*}}> : vector<1x1x16xindex>
+ // CHECK: [[cst_2:%.+]] = arith.constant dense<{{.*}}> : vector<1x1x16xindex>
+ // CHECK: [[ld_0:%.+]] = xegpu.load [[arg0]][[[cst_1]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf32>
+ // CHECK: [[ld_1:%.+]] = xegpu.load [[arg0]][[[cst_2]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<1x1x16xindex>, vector<1x1x16xi1> -> vector<1x1x16xf32>
+ // CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x1x16xf32> into vector<1x1x32xf32>
+ // CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1, 1, 1]} : vector<1x1x16xf32> into vector<1x1x32xf32>
+ gpu.func @preserve_unit_dim_of_load_inst_data(%src: ui64) -> vector<1x1x32xf32> {
%cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<[[
[0, 8, 16, 24, 32, 40, 48, 56,
64, 72, 80, 88, 96, 104, 112, 120,
@@ -770,8 +770,6 @@ gpu.module @test_kernel {
gpu.module @test_kernel {
// CHECK-LABEL: load_store_nd_with_offsets
// CHECK-SAME: [[arg0:%.+]]: memref<1024x1024xf32>, [[arg1:%.+]]: memref<1024x1024xf32>, [[arg2:%.+]]: memref<1024x1024xf32>
- // CHECK-DAG: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32xf32>
- // CHECK-DAG: [[cst_0:%.+]] = arith.constant dense<0.000000e+00> : vector<1x32xf32>
// CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index
// CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
// CHECK: [[tdesc_a:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
@@ -779,27 +777,12 @@ gpu.module @test_kernel {
// CHECK: [[tdesc_c:%.+]] = xegpu.create_nd_tdesc [[arg2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
// CHECK: [[ld_a0:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
// CHECK: [[ld_a1:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
- // CHECK: [[ins_a0:%.+]] = vector.insert_strided_slice [[ld_a0]], [[cst_0]] {offsets = [0, 0], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32>
- // CHECK: [[ins_a1:%.+]] = vector.insert_strided_slice [[ld_a1]], [[ins_a0]] {offsets = [0, 16], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32>
// CHECK: [[ld_b0:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
// CHECK: [[ld_b1:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
- // CHECK: [[ins_b0:%.+]] = vector.insert_strided_slice [[ld_b0]], [[cst_0]] {offsets = [0, 0], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32>
- // CHECK: [[ins_b1:%.+]] = vector.insert_strided_slice [[ld_b1]], [[ins_b0]] {offsets = [0, 16], strides = [1, 1]} : vector<1x16xf32> into vector<1x32xf32>
- /...
[truncated]
|
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
| int effectiveVecRank = (chunkSize == 1) ? 1 : 2; | ||
|
|
||
| // Check that all leading dimensions are unit dimensions | ||
| for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) { |
There was a problem hiding this comment.
What about trailing unit dims? Is that support planned for the future?
|
|
||
| gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( | ||
| rewriter, warpOp, operands, operandTypesToYield, newRetIndices); | ||
|
|
||
| SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector( | ||
| newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); | ||
| // SmallVector<Value> newLoadGatherOperands = llvm::map_to_vector( |
|
|
||
| // ----- | ||
| #inst_data = #xegpu.layout<inst_data = [1, 1, 32]> | ||
| #inst_data = #xegpu.layout<inst_data = [1, 1, 16]> |
There was a problem hiding this comment.
Why this change to the test case?
There was a problem hiding this comment.
The original #inst_data doesn't trigger any blocking functionality after we preserve the leading dims.
This PR preserves leading unit dimensions during blocking. This ensures the blocking process avoids generating unnecessary insert/extract_strided_slice ops, which under certain conditions become difficult to cancel and create an extra burden for lane layout propagation and subgroup distribution.
This PR also extends subgroup distribution so that load and store can support payload/mask/offsets with leading unit dimensions. The distributed load/store works on 1D vectors only, but shape casts are inserted to remove and add the leading dimensions for the input/output vectors. Compared to the insert/extract ops inserted at subgroup level, the shape casts inserted at lane level to handle leading unit dimensions are essentially no-ops and are cheap to process.