diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index ca81c3cd7be42..dbd5a50489173 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1673,6 +1673,19 @@ struct VectorExtractStridedSliceDistribution
         extractOp.getSizes(), [](Attribute attr) { return attr; });
     SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
         extractOp.getOffsets(), [](Attribute attr) { return attr; });
+    SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
+        extractOp.getStrides(), [](Attribute attr) { return attr; });
+    // If the provided sizes, offsets and strides cover fewer dimensions than
+    // the source rank, pad them with full sizes, zero offsets and unit
+    // strides. This makes the later adjustments uniform.
+    int64_t sourceRank = extractOp.getSourceVectorType().getRank();
+    for (int64_t i = extractOp.getSizes().size(); i < sourceRank; ++i) {
+      updatedSizes.push_back(rewriter.getI64IntegerAttr(
+          extractOp.getSourceVectorType().getDimSize(i)));
+      updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
+      updatedStrides.push_back(
+          rewriter.getI64IntegerAttr(1)); // Stride is always 1.
+    }
     // If the result is distributed, it must be distributed in exactly one
     // dimension. In this case, we adjust the sourceDistType, distributedSizes
     // and distributedOffsets accordingly.
@@ -1708,7 +1721,7 @@ struct VectorExtractStridedSliceDistribution
     // The offsets in the distributed dimention must be a multiple of subgroup
     // size.
     int64_t distrDimOffset =
-        cast<IntegerAttr>(extractOp.getOffsets()[distributedDim]).getInt();
+        cast<IntegerAttr>(updatedOffsets[distributedDim]).getInt();
     if (distrDimOffset % subgroupSize != 0)
       return rewriter.notifyMatchFailure(
           warpOp, "Offset along distributed dimension "
@@ -1737,7 +1750,7 @@ struct VectorExtractStridedSliceDistribution
         rewriter, extractOp.getLoc(), distributedType, source,
         ArrayAttr::get(rewriter.getContext(), updatedOffsets),
         ArrayAttr::get(rewriter.getContext(), updatedSizes),
-        extractOp.getStrides());
+        ArrayAttr::get(rewriter.getContext(), updatedStrides));
     rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp);
     return success();
   }
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 216f3d19cff94..7819a438057c4 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -753,6 +753,27 @@ gpu.func @vector_extract_strided_slice_unsopported_source(%laneid: index) {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @vector_extract_strided_slice_partial_offsets
+// CHECK-NEXT: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<8x1xf32>, vector<24x1xf32>) {
+// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<24x16xf32>
+// CHECK: gpu.yield %{{.*}}, %[[S]] : vector<8x16xf32>, vector<24x16xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1
+// CHECK-SAME: {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<24x1xf32> to vector<8x1xf32>
+// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<8x1xf32>) -> ()
+gpu.func @vector_extract_strided_slice_partial_offsets(%laneid: index) {
+  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) {
+    %0 = "some_def"() : () -> (vector<24x16xf32>)
+    %1 = vector.extract_strided_slice %0 { offsets = [8], sizes = [8], strides = [1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+      }
+      : vector<24x16xf32> to vector<8x16xf32>
+    gpu.yield %1 : vector<8x16xf32>
+  }
+  "some_use"(%r) : (vector<8x1xf32>) -> ()
+  gpu.return
+}
 
 // CHECK-LABEL: gpu.func @vector_insert_strided_slice_distributed_dim_fully_inserted
 // CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<16x1xf32>, vector<64x1xf32>) {
@@ -880,6 +901,31 @@ gpu.func @vector_insert_strided_slice_1d(%laneid: index) {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @vector_insert_strided_slice_different_ranks
+// CHECK-NEXT: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<64x1xf32>, vector<1xf32>, vector<64x1xf32>) {
+// CHECK-NEXT: %[[S:.*]] = "some_def"() : () -> vector<16xf32>
+// CHECK-NEXT: %[[D:.*]] = "some_def"() : () -> vector<64x16xf32>
+// CHECK: gpu.yield %{{.*}}, %[[S]], %[[D]] : vector<64x16xf32>, vector<16xf32>, vector<64x16xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = vector.insert_strided_slice %[[W]]#1, %[[W]]#2
+// CHECK-SAME: {offsets = [13, 0], strides = [1]} : vector<1xf32> into vector<64x1xf32>
+// CHECK-NEXT: "some_use"(%[[T1]]) : (vector<64x1xf32>) -> ()
+gpu.func @vector_insert_strided_slice_different_ranks(%laneid: index) {
+  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<64x1xf32>) {
+    %0 = "some_def"() : () -> (vector<16xf32>)
+    %1 = "some_def"() : () -> (vector<64x16xf32>)
+    %2 = vector.insert_strided_slice %0, %1 { offsets = [13, 0], strides = [1],
+      layout_operand_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+      layout_operand_1 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+      }
+      : vector<16xf32> into vector<64x16xf32>
+    gpu.yield %2 : vector<64x16xf32>
+  }
+  "some_use"(%r) : (vector<64x1xf32>) -> ()
+  gpu.return
+}
+
 // CHECK-LABEL: gpu.func @vector_insert_strided_slice_unsupported_source
 // CHECK: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<3xf32>) {
 // CHECK: }
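
Note for reviewers: the padding added in VectorExtractStridedSliceDistribution relies on the upstream `vector.extract_strided_slice` convention that `offsets`/`sizes`/`strides` may cover fewer leading dimensions than the source rank, with trailing dimensions taken in full. A minimal sketch of the equivalence the pattern normalizes to, assuming standard `vector` dialect semantics (the function name is illustrative and not part of the patch):

func.func @partial_vs_full(%src: vector<24x16xf32>) -> (vector<8x16xf32>, vector<8x16xf32>) {
  // Rank-reduced attributes: the trailing dimension of %src is taken in full.
  %a = vector.extract_strided_slice %src {offsets = [8], sizes = [8], strides = [1]}
      : vector<24x16xf32> to vector<8x16xf32>
  // Full-rank form, i.e. what the pattern pads the attributes to internally:
  // zero offset, full size and unit stride for the unspecified dimension.
  %b = vector.extract_strided_slice %src {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]}
      : vector<24x16xf32> to vector<8x16xf32>
  return %a, %b : vector<8x16xf32>, vector<8x16xf32>
}

Both forms yield the same vector<8x16xf32>; the padded form is what the distribution logic indexes with distributedDim (e.g. updatedOffsets[distributedDim]).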