From dbffea4eca96c94207522caac2ba2a01e647bc48 Mon Sep 17 00:00:00 2001
From: dchigarev
Date: Thu, 20 Nov 2025 17:19:29 +0000
Subject: [PATCH] [mlir][XeGPU][VectorToXeGPU] Use 'xegpu.load' to lower 1D
 'vector.transfer_read' for PVC & BMG

Signed-off-by: dchigarev
---
 .../VectorToXeGPU/VectorToXeGPU.cpp           |  7 +-
 .../VectorToXeGPU/transfer-read-to-xegpu.mlir | 82 ++++++++++++++-----
 2 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
index 1b4d1a42614ea..4358ef07da91d 100644
--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
+++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -519,8 +519,13 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
       return lowerToScatteredLoadOp(readOp, rewriter);
     }
 
-    // Perform common data transfer checks.
     VectorType vecTy = readOp.getVectorType();
+
+    // Lower using load.gather in the 1D case.
+    if (vecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim())
+      return lowerToScatteredLoadOp(readOp, rewriter);
+
+    // Perform common data transfer checks.
     if (failed(storeLoadPreconditions(rewriter, readOp, vecTy)))
       return failure();
 
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
index c87a5304babfe..8bb272b1fe5fc 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
@@ -11,14 +11,15 @@ gpu.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vector<8xf32> {
 
 // LOAD-ND-LABEL: @load_1D_vector(
 // LOAD-ND-SAME:  %[[SRC:.+]]: memref<8x16x32xf32>,
-// LOAD-ND-SAME:  %[[OFFSET:.+]]: index
-// LOAD-ND:       %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0]
-// LOAD-ND:       %[[DESC:.+]] = xegpu.create_nd_tdesc
-// LOAD-ND-SAME:    %[[COLLAPSED]]
-// LOAD-ND-SAME:    memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32,
-// LOAD-ND-SAME:    boundary_check = false
-// LOAD-ND:       %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32>
-// LOAD-ND:       return %[[VEC]]
+// LOAD-ND:       %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
+// LOAD-ND:       %[[STEP:.+]] = vector.step : vector<8xindex>
+// LOAD-ND-COUNT2: arith.muli {{.*}} : index
+// LOAD-ND-COUNT2: arith.addi {{.*}} : index
+// LOAD-ND:       %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex>
+// LOAD-ND:       %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex>
+// LOAD-ND:       %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x16x32xf32> -> index
+// LOAD-ND:       %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// LOAD-ND:       %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32>
 
 // LOAD-GATHER-LABEL: @load_1D_vector(
 // LOAD-GATHER-SAME:  %[[SRC:.+]]: memref<8x16x32xf32>,
@@ -404,7 +405,7 @@ gpu.func @no_load_unsupported_map(%source: memref<16x32x64xf32>,
 
 // -----
 gpu.module @xevm_module {
-gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> {
+gpu.func @load_from_subview_1D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> {
   %c0 = arith.constant 0.0 : f16
   %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
   %0 = vector.transfer_read %subview[%off2, %off2], %c0
@@ -412,19 +413,23 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> {
   gpu.return %0 : vector<8xf16>
 }
 
-// LOAD-ND-LABEL: @load_from_subview(
+// LOAD-ND-LABEL: @load_from_subview_1D(
 // LOAD-ND-SAME:  %[[SRC:.+]]: memref<4096x4096xf16>,
 // LOAD-ND-SAME:  %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
+// LOAD-ND:       %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
 // LOAD-ND:       %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
-// LOAD-ND:       %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0]
-// LOAD-ND:       %[[DESC:.+]] = xegpu.create_nd_tdesc
-// LOAD-ND-SAME:    %[[COLLAPSED]]
-// LOAD-ND-SAME:    memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16,
-// LOAD-ND-SAME:    boundary_check = false
-// LOAD-ND:       %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]]]{{.*}}-> vector<8xf16>
-// LOAD-ND:       return %[[VEC]]
-
-// LOAD-GATHER-LABEL: @load_from_subview(
+// LOAD-ND:       %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
+// LOAD-ND:       %[[STEP:.+]] = vector.step : vector<8xindex>
+// LOAD-ND:       arith.muli {{.*}} : index
+// LOAD-ND:       arith.addi %[[OFFSET]]{{.*}} : index
+// LOAD-ND:       arith.addi {{.*}} : index
+// LOAD-ND:       %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex>
+// LOAD-ND:       %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex>
+// LOAD-ND:       %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index
+// LOAD-ND:       %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// LOAD-ND:       %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16>
+
+// LOAD-GATHER-LABEL: @load_from_subview_1D(
 // LOAD-GATHER-SAME:  %[[SRC:.+]]: memref<4096x4096xf16>,
 // LOAD-GATHER-SAME:  %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
 // LOAD-GATHER:       %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
@@ -440,3 +445,42 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> {
 // LOAD-GATHER:       %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
 // LOAD-GATHER:       %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16>
 }
+
+// -----
+gpu.module @xevm_module {
+gpu.func @load_from_subview_2D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8x16xf16> {
+  %c0 = arith.constant 0.0 : f16
+  %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+  %0 = vector.transfer_read %subview[%off2, %off2], %c0
+    {in_bounds = [true, true]} : memref<256x256xf16, strided<[4096, 1], offset: ?>>, vector<8x16xf16>
+  gpu.return %0 : vector<8x16xf16>
+}
+
+// LOAD-ND-LABEL: @load_from_subview_2D(
+// LOAD-ND-SAME:  %[[SRC:.+]]: memref<4096x4096xf16>,
+// LOAD-ND-SAME:  %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
+// LOAD-ND:       %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+// LOAD-ND:       %[[DESC:.+]] = xegpu.create_nd_tdesc
+// LOAD-ND-SAME:    %[[SUBVIEW]]
+// LOAD-ND-SAME:    memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf16,
+// LOAD-ND-SAME:    boundary_check = false
+// LOAD-ND:       %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]], %[[OFF2]]]{{.*}}-> vector<8x16xf16>
+// LOAD-ND:       return %[[VEC]]
+
+// LOAD-GATHER-LABEL: @load_from_subview_2D(
+// LOAD-GATHER-SAME:  %[[SRC:.+]]: memref<4096x4096xf16>,
+// LOAD-GATHER-SAME:  %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
+// LOAD-GATHER:       %[[CST:.+]] = arith.constant dense<true> : vector<8x16xi1>
+// LOAD-GATHER:       %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+// LOAD-GATHER:       %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
+// LOAD-GATHER-COUNT2: vector.step
+// LOAD-GATHER-COUNT2: vector.shape_cast
+// LOAD-GATHER-COUNT2: vector.broadcast
+// LOAD-GATHER-COUNT2: arith.muli {{.*}} : index
+// LOAD-GATHER-COUNT2: arith.addi {{.*}} : index
+// LOAD-GATHER:       %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8x16xindex>
+// LOAD-GATHER:       %[[IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
+// LOAD-GATHER:       %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index
+// LOAD-GATHER:       %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// LOAD-GATHER:       %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf16>
+}
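
-- 
Note (below the signature marker, so `git am` drops it): with this change the
1D path goes through lowerToScatteredLoadOp instead of building an
`xegpu.create_nd_tdesc`/`xegpu.load_nd` pair. A minimal before/after sketch of
that lowering, reconstructed from the LOAD-ND expectations above; SSA names
are illustrative and stride multiplications are omitted for a unit-stride
source:

  // Input:
  %v = vector.transfer_read %src[%off], %pad {in_bounds = [true]}
      : memref<32xf32>, vector<8xf32>

  // After the VectorToXeGPU conversion (gather path): per-lane indices are
  // built from the scalar offset, the base is taken as a raw pointer, and a
  // masked xegpu.load performs the gather with an all-true mask.
  %mask    = arith.constant dense<true> : vector<8xi1>
  %step    = vector.step : vector<8xindex>
  %splat   = vector.broadcast %off : index to vector<8xindex>
  %indices = arith.addi %splat, %step : vector<8xindex>
  %base    = memref.extract_aligned_pointer_as_index %src : memref<32xf32> -> index
  %base64  = arith.index_cast %base : index to i64
  %v       = xegpu.load %base64[%indices], %mask
      : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32>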