From c5ec4d8971a2855bd51300331c2c3ec8967e40f3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 9 Dec 2025 21:03:38 +0000 Subject: [PATCH 1/4] add test --- .../Transforms/XeGPUSubgroupDistribute.cpp | 18 ++++++++++++++++-- .../XeGPU/subgroup-distribute-unit.mlir | 13 +++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index ca81c3cd7be42..bbea93101c54e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -99,6 +99,7 @@ getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout, for (auto [i, dim] : llvm::enumerate(originalType.getShape())) { if (i < distributionStart) continue; + // Check if the dimension can be distributed evenly. if (dim % effectiveLaneLayout[i - distributionStart] != 0) return failure(); @@ -1673,6 +1674,19 @@ struct VectorExtractStridedSliceDistribution extractOp.getSizes(), [](Attribute attr) { return attr; }); SmallVector<Attribute> updatedOffsets = llvm::map_to_vector( extractOp.getOffsets(), [](Attribute attr) { return attr; }); + SmallVector<Attribute> updatedStrides = llvm::map_to_vector( + extractOp.getStrides(), [](Attribute attr) { return attr; }); + // If the provided sizes, offsets, strides are less than the rank, pad them + // with full sizes, zero offsets, and unit strides. This makes it easier to + // adjust them later. + int64_t sourceRank = extractOp.getSourceVectorType().getRank(); + for (int64_t i = extractOp.getSizes().size(); i < sourceRank; ++i) { + updatedSizes.push_back(rewriter.getI64IntegerAttr( + extractOp.getSourceVectorType().getDimSize(i))); + updatedOffsets.push_back(rewriter.getI64IntegerAttr(0)); + updatedStrides.push_back( + rewriter.getI64IntegerAttr(1)); // stride is always 1. + } // If the result is distributed, it must be distributed in exactly one // dimension.
In this case, we adjust the sourceDistType, distributedSizes // and distributedOffsets accordingly. @@ -1708,7 +1722,7 @@ struct VectorExtractStridedSliceDistribution // The offsets in the distributed dimention must be a multiple of subgroup // size. int64_t distrDimOffset = - cast<IntegerAttr>(extractOp.getOffsets()[distributedDim]).getInt(); + cast<IntegerAttr>(updatedOffsets[distributedDim]).getInt(); if (distrDimOffset % subgroupSize != 0) return rewriter.notifyMatchFailure( warpOp, "Offset along distributed dimension " @@ -1737,7 +1751,7 @@ struct VectorExtractStridedSliceDistribution rewriter, extractOp.getLoc(), distributedType, source, ArrayAttr::get(rewriter.getContext(), updatedOffsets), ArrayAttr::get(rewriter.getContext(), updatedSizes), - extractOp.getStrides()); + ArrayAttr::get(rewriter.getContext(), updatedStrides)); rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp); return success(); } diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir index 216f3d19cff94..5440ef1566723 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -753,6 +753,19 @@ gpu.func @vector_extract_strided_slice_unsopported_source(%laneid: index) { gpu.return } +gpu.func @vector_extract_strided_slice_partial_offsets(%laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { + %0 = "some_def"() : () -> (vector<24x16xf32>) + %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1], + layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : vector<24x16xf32> to vector<8x16xf32> + gpu.yield %1 : vector<8x16xf32> + } + "some_use"(%r) : (vector<8x1xf32>) -> () + gpu.return +} // CHECK-LABEL: gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted // CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<16x1xf32>,
vector<64x1xf32>) { From 481996527426f1d2e7daea92af88e509b28fef75 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 9 Dec 2025 21:21:24 +0000 Subject: [PATCH 2/4] add test --- mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir index 5440ef1566723..a95b52276bba1 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -753,6 +753,14 @@ gpu.func @vector_extract_strided_slice_unsopported_source(%laneid: index) { gpu.return } +// CHECK-LABEL: gpu.func @vector_extract_strided_slice_partial_offsets +// CHECK-NEXT: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) { +// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x16xf32> +// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x16xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 +// CHECK-SAME: {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32> +// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> () gpu.func @vector_extract_strided_slice_partial_offsets(%laneid: index) { %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { %0 = "some_def"() : () -> (vector<24x16xf32>) From d414762645c52f91503ca7a5003f89c6c34d8e44 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 9 Dec 2025 21:37:58 +0000 Subject: [PATCH 3/4] add test --- .../XeGPU/subgroup-distribute-unit.mlir | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir index a95b52276bba1..7819a438057c4 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ 
-901,6 +901,31 @@ gpu.func @vector_insert_strided_slice_1d(%laneid: index) { gpu.return } +// CHECK-LABEL: gpu.func @vector_insert_strided_slice_different_ranks +// CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<1xf32>, vector<64x1xf32>) { +// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16xf32> +// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x16xf32> +// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x16xf32>, vector<16xf32>, vector<64x16xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2 +// CHECK-SAME: {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32> +// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x1xf32>) -> () +gpu.func @vector_insert_strided_slice_different_ranks(%laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) { + %0 = "some_def"() : () -> (vector<16xf32>) + %1 = "some_def"() : () -> (vector<64x16xf32>) + %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1], + layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>, + layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, + layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> + } + : vector<16xf32> into vector<64x16xf32> + gpu.yield %2 : vector<64x16xf32> + } + "some_use"(%r) : (vector<64x1xf32>) -> () + gpu.return +} + // CHECK-LABEL: gpu.func @vector_insert_strided_slice_unsupported_source // CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) { // CHECK: } From 856d7b9b9d1a7ff5d66321f6aeb2665c529e1acc Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 9 Dec 2025 21:51:31 +0000 Subject: [PATCH 4/4] fix line --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index bbea93101c54e..dbd5a50489173 100644 ---
a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -99,7 +99,6 @@ getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout, for (auto [i, dim] : llvm::enumerate(originalType.getShape())) { if (i < distributionStart) continue; - // Check if the dimension can be distributed evenly. if (dim % effectiveLaneLayout[i - distributionStart] != 0) return failure();