diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 12270af870b3b..0ca58426ecfcb 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -37,6 +37,19 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
     propagate the layouts required for their operands to the producers. With
     this propagated layout information, pass will then update op result type
     with the layout information.
+
+    `layout-kind` option values:
+    - `inst`
+      Propagate the `inst_data` field of the layout attribute. The default is chosen to
+      maximize instruction-level granularity so that the user shape can be processed
+      with the fewest instructions. For N-D operations, this granularity depends on
+      W (width) and H (height) of the instruction shape.
+      The B (block) dimension (or array length) is not included in the default
+      configuration and must be enabled via a separate optimization pass.
+
+    - `lane`
+      Propagate the `lane_layout` and `lane_data` fields of the layout attribute.
+      Default values are selected to align with hardware.
   }];
   let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
                            "vector::VectorDialect"];
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index b3a780abd3f12..6b3ba5a5981ce 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -495,8 +495,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
   auto [bWidth, bHeight, bCount] = blockWHC.value();
   SmallVector<int> instData;
   int instWidth = xegpu::getLargestDivisor(
-      static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth,
-      bCount);
+      static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth);
   if (instWidth == -1)
     prefetch.emitWarning(
         "No suitable instruction multiple found for the given shape.");
@@ -702,8 +701,7 @@ void LayoutInfoPropagation::visitStoreNdOp(
   auto [bWidth, bHeight, bCount] = blockWHC.value();
   SmallVector<int> instData;
   int instWidth = xegpu::getLargestDivisor(
-      static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth,
-      bCount);
+      static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth);
   if (instWidth == -1)
     store.emitWarning(
         "No suitable instruction multiple found for the given shape.");
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index c31ef323a94d2..0c837e17a0afa 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -1,5 +1,29 @@
 // RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=inst" -split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func.func @load_store_no_array_len(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x32xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<8x32xf32>) {
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[TDESC_SRC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x32xf32>
+// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout<inst_data = [8, 16]>>
+gpu.module @test {
+// Although the uArch allows 8x32 inst data using block count (or array_len),
+// it is up to optimization passes to decide on the block count usage.
+func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf32>) {
+  %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+  %1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32>
+  %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32>
+  xegpu.store_nd %2, %1 : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32>
+  return
+}
+}
+
+// -----
+
 
 // CHECK-LABEL: func.func @dpas_f16(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
 // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
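
Illustration of the behavioral change: both updated call sites stop passing
`bCount` to `xegpu::getLargestDivisor`, so block-count (array_len) multiples
are no longer offered as instruction-width candidates during layout
propagation. The C++ sketch below shows the assumed largest-divisor
semantics; the helper name, signature, and candidate values are hypothetical
illustrations, not the upstream implementation.

// largest_divisor_sketch.cpp -- illustration only; the real helper lives in
// the XeGPU utilities and may differ in signature and candidate handling.
#include <cstdio>
#include <vector>

// Returns the largest candidate that evenly divides dim, or -1 if none does
// (mirroring the `instWidth == -1` warning path in the pass).
static int largestDivisorSketch(int dim, const std::vector<int> &candidates) {
  int best = -1;
  for (int c : candidates)
    if (c > 0 && dim % c == 0 && c > best)
      best = c;
  return best;
}

int main() {
  // Inner dim 32 (as in 8x32xf32) with bWidth-only candidates (assumed
  // values): picks 16, i.e. inst_data = [8, 16] as the new test expects.
  std::printf("%d\n", largestDivisorSketch(32, {8, 16}));     // prints 16
  // Pre-patch, bCount could extend the candidates with block multiples
  // (e.g. 32), allowing inst_data = [8, 32]; that decision is now deferred
  // to a separate optimization pass.
  std::printf("%d\n", largestDivisorSketch(32, {8, 16, 32})); // prints 32
  return 0;
}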