diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 0ca58426ecfcb..3ff7805263f0e 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -50,6 +50,10 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { - `lane` Propagate the `lane_layout` and `lane_data` fields of the layout attribute. Default values are selected to align with hardware. + + - `subgroup` + Propagate the `sg_layout` and `sg_data` fields of the layout attribute. + Default values are selected to align with hardware. }]; let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"]; @@ -60,7 +64,7 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { Option< "layoutKind", "layout-kind", "std::string", /*default=*/"\"lane\"", - "Propagate `inst` / `lane` level of xegpu layouts."> + "Propagate `subgroup` / `inst` / `lane` level of xegpu layouts."> ]; } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index dc9eb96c169b4..cbd91154ce0aa 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -53,7 +53,7 @@ using namespace mlir::dataflow; namespace { -enum class LayoutKind { Lane, InstData }; +enum class LayoutKind { Lane, InstData, Subgroup }; //===----------------------------------------------------------------------===// // LayoutInfo @@ -109,6 +109,12 @@ struct LayoutInfo { SmallVector getInstData() const; + SmallVector getSgLayout() const; + + SmallVector getSgData() const; + + SmallVector getOrder() const; + bool isSliceLayout() const { if (!isAssigned()) return false; @@ -127,8 +133,6 @@ struct LayoutInfo { SmallVector LayoutInfo::getLaneLayout() const { if (!isAssigned()) return {}; - assert(storage.getEffectiveLaneLayoutAsInt().size() 
&& - "Expected lane layout to be assigned"); return llvm::map_to_vector(storage.getEffectiveLaneLayoutAsInt(), [](int64_t val) { return static_cast(val); }); } @@ -136,8 +140,6 @@ SmallVector LayoutInfo::getLaneLayout() const { SmallVector LayoutInfo::getLaneData() const { if (!isAssigned()) return {}; - assert(storage.getEffectiveLaneDataAsInt().size() && - "Expected lane data to be assigned"); return llvm::map_to_vector(storage.getEffectiveLaneDataAsInt(), [](int64_t val) { return static_cast(val); }); } @@ -149,6 +151,27 @@ SmallVector LayoutInfo::getInstData() const { [](int64_t val) { return static_cast(val); }); } +SmallVector LayoutInfo::getSgLayout() const { + if (!isAssigned()) + return {}; + return llvm::map_to_vector(storage.getEffectiveSgLayoutAsInt(), + [](int64_t val) { return static_cast(val); }); +} + +SmallVector LayoutInfo::getSgData() const { + if (!isAssigned()) + return {}; + return llvm::map_to_vector(storage.getEffectiveSgDataAsInt(), + [](int64_t val) { return static_cast(val); }); +} + +SmallVector LayoutInfo::getOrder() const { + if (!isAssigned() || !storage.getOrder()) + return {}; + return llvm::map_to_vector(storage.getOrder().asArrayRef(), + [](int64_t val) { return static_cast(val); }); +} + void LayoutInfo::print(raw_ostream &os) const { if (isAssigned()) { os << storage; @@ -188,6 +211,10 @@ LayoutInfo LayoutInfo::transpose(ArrayRef permutation) const { SmallVector laneLayout; SmallVector laneData; SmallVector instData; + SmallVector sgLayout; + SmallVector sgData; + SmallVector order; + for (int64_t idx : permutation) { if (getLaneLayout().size()) { laneLayout.push_back(static_cast(getLaneLayout()[idx])); @@ -195,13 +222,30 @@ LayoutInfo LayoutInfo::transpose(ArrayRef permutation) const { } if (getInstData().size()) instData.push_back(static_cast(getInstData()[idx])); + if (getSgData().size()) { + sgLayout.push_back(static_cast(getSgLayout()[idx])); + sgData.push_back(static_cast(getSgData()[idx])); + } + if (getOrder().size()) { 
+ order.push_back(static_cast(getOrder()[idx])); + } } + auto orderAttr = order.size() + ? DenseI32ArrayAttr::get(storage.getContext(), order) + : nullptr; xegpu::LayoutAttr layoutAttr; if (getLaneLayout().size()) layoutAttr = xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData); if (getInstData().size()) layoutAttr = xegpu::LayoutAttr::get(storage.getContext(), instData); + if (getSgData().size()) + layoutAttr = xegpu::LayoutAttr::get( + storage.getContext(), + DenseI32ArrayAttr::get(storage.getContext(), sgLayout), + DenseI32ArrayAttr::get(storage.getContext(), sgData), + /*inst_data =*/nullptr, /*lane_layout =*/nullptr, + /*lane_data =*/nullptr, orderAttr); return LayoutInfo(layoutAttr); } @@ -487,6 +531,9 @@ bool LayoutInfoPropagation::hasParamsOfLayoutKind( } else if (layoutKind == LayoutKind::Lane) { return !(anchorLayout.getEffectiveLaneLayoutAsInt().empty() || anchorLayout.getEffectiveLaneDataAsInt().empty()); + } else if (layoutKind == LayoutKind::Subgroup) { + return !(anchorLayout.getEffectiveSgLayoutAsInt().empty() || + anchorLayout.getEffectiveSgDataAsInt().empty()); } return false; } @@ -1311,6 +1358,8 @@ void XeGPUPropagateLayoutPass::runOnOperation() { layoutKind = LayoutKind::Lane; } else if (this->layoutKind == "inst") { layoutKind = LayoutKind::InstData; + } else if (this->layoutKind == "subgroup") { + layoutKind = LayoutKind::Subgroup; } else { getOperation()->emitError("Unsupported layout kind option: " + this->layoutKind); diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir new file mode 100644 index 0000000000000..c7dfc9fb7b1f1 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir @@ -0,0 +1,53 @@ +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=subgroup" -split-input-file %s | FileCheck %s + +gpu.module @test { + // CHECK-LABEL: store_nd + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + 
func.func @store_nd(%src: memref<256x128xf32>) { + // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> + // CHECK-SAME: -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] <{layout = #xegpu.layout}> + // CHECK-SAME: {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + // CHECK-SAME: -> vector<256x128xf32> + // CHECK: xegpu.store_nd %[[LOAD]], %[[TDESC]] <{layout = #xegpu.layout}> + // CHECK-SAME: : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32> + %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32> -> vector<256x128xf32> + xegpu.store_nd %load, %tdesc <{layout = #xegpu.layout}> + : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32> + return + } +} + +// ----- + +gpu.module @test { + // CHECK-LABEL: vector_transpose + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + // CHECK-SAME: %[[ARG_1:.*]]: memref<128x256xf32> + func.func @vector_transpose(%src: memref<256x128xf32>, %src1: memref<128x256xf32>) { + // CHECK: %[[TDESC_LD:.*]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> -> + // CHECK-SAME: !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + // CHECK: %[[TDESC_ST:.*]] = xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf32> -> + // CHECK-SAME: !xegpu.tensor_desc<128x256xf32, #xegpu.layout> + + // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC_LD]][0, 0] <{layout = #xegpu.layout}> + // CHECK-SAME: {layout_result_0 = #xegpu.layout} : + // CHECK-SAME: !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> + + // CHECK: %[[TRANSPOSED:.*]] = vector.transpose %[[LOAD]], [1, 0] + // CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<256x128xf32> to vector<128x256xf32> + + // CHECK: xegpu.store_nd %[[TRANSPOSED]], %[[TDESC_ST]][0, 0] + // CHECK-SAME: <{layout = #xegpu.layout}> : vector<128x256xf32>, 
+ // CHECK-SAME: !xegpu.tensor_desc<128x256xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32> + %tdesc1 = xegpu.create_nd_tdesc %src1 : memref<128x256xf32> -> !xegpu.tensor_desc<128x256xf32> + %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32> -> vector<256x128xf32> + %trans = vector.transpose %load, [1, 0] : vector<256x128xf32> to vector<128x256xf32> + xegpu.store_nd %trans, %tdesc1[0, 0] <{layout = #xegpu.layout}> + : vector<128x256xf32>, !xegpu.tensor_desc<128x256xf32> + return + } +}