From 9d7d1af48238eb440dbca40179517db716e910cf Mon Sep 17 00:00:00 2001 From: Artem Kroviakov Date: Thu, 16 Oct 2025 14:01:00 +0000 Subject: [PATCH 1/3] [MLIR][XeGPU] Introduce `xegpu::uArch` usage in target-sensitive passes --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 34 ++- .../mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h | 30 --- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 7 +- .../mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h | 78 +++++- .../mlir/Dialect/XeGPU/uArch/uArchBase.h | 17 +- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 16 +- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 226 +++++++++++++----- .../Transforms/XeGPUSubgroupDistribute.cpp | 26 +- .../XeGPU/move-gpu-func-to-warp-op.mlir | 2 +- .../XeGPU/propagate-layout-inst-data.mlir | 51 ++++ mlir/test/Dialect/XeGPU/propagate-layout.mlir | 82 +++++-- 11 files changed, 418 insertions(+), 151 deletions(-) delete mode 100644 mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h create mode 100644 mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 5695d5d515d7f..ec236d702de0d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -379,29 +379,41 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { ); let builders = [ - AttrBuilder<(ins "llvm::ArrayRef": $lane_layout, + AttrBuilder<(ins "llvm::ArrayRef": $inst_data, + "llvm::ArrayRef": $lane_layout, "llvm::ArrayRef": $lane_data), [{ auto sg_layout = DenseI32ArrayAttr(); auto sg_data = DenseI32ArrayAttr(); - auto inst_data = DenseI32ArrayAttr(); auto order = DenseI32ArrayAttr(); - return $_get($_ctxt, sg_layout, sg_data, inst_data, + return $_get($_ctxt, sg_layout, sg_data, + DenseI32ArrayAttr::get($_ctxt, inst_data), DenseI32ArrayAttr::get($_ctxt, lane_layout), DenseI32ArrayAttr::get($_ctxt, lane_data), order); }]>, AttrBuilder<(ins "llvm::ArrayRef": $lane_layout, - "llvm::ArrayRef": $lane_data, - "llvm::ArrayRef": $order), + "llvm::ArrayRef": $lane_data), [{ - return $_get($_ctxt, - /*sg_layout =*/ nullptr, - /*sg_data =*/ nullptr, - /*inst_data =*/ nullptr, + auto sg_layout = DenseI32ArrayAttr(); + auto sg_data = DenseI32ArrayAttr(); + auto inst_data = DenseI32ArrayAttr(); + auto order = DenseI32ArrayAttr(); + return $_get($_ctxt, sg_layout, sg_data, inst_data, DenseI32ArrayAttr::get($_ctxt, lane_layout), - DenseI32ArrayAttr::get($_ctxt, lane_data), - DenseI32ArrayAttr::get($_ctxt, order)); + DenseI32ArrayAttr::get($_ctxt, lane_data), order); }]>, + // AttrBuilder<(ins "llvm::ArrayRef": $lane_layout, + // "llvm::ArrayRef": $lane_data, + // "llvm::ArrayRef": $order), + // [{ + // return $_get($_ctxt, + // /*sg_layout =*/ nullptr, + // /*sg_data =*/ nullptr, + // /*inst_data =*/ nullptr, + // DenseI32ArrayAttr::get($_ctxt, lane_layout), + // DenseI32ArrayAttr::get($_ctxt, lane_data), + // DenseI32ArrayAttr::get($_ctxt, order)); + // }]>, AttrBuilder<(ins "DenseI32ArrayAttr": $lane_layout, "DenseI32ArrayAttr": $lane_data, "DenseI32ArrayAttr": $order), diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h deleted file mode 100644 index 8aa9536cb67c1..0000000000000 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h +++ /dev/null @@ -1,30 +0,0 @@ -//===- XeGPUTargetInfo.h - Target constants ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
-#define MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
-
-namespace mlir {
-namespace xegpu {
-/// HW dependent constants.
-/// TODO: These constants should be queried from the target information.
-namespace targetinfo {
-constexpr unsigned subgroupSize = 16; // How many lanes in a subgroup.
-/// If DPAS A or B operands have low precision element types they must be packed
-/// according to the following sizes.
-constexpr unsigned packedSizeInBitsForDefault =
-    16; // Minimum packing size per register for DPAS A.
-constexpr unsigned packedSizeInBitsForDpasB =
-    32; // Minimum packing size per register for DPAS B.
-constexpr unsigned packedSizeInBitsForGatherScatter =
-    32; // Minimum packing size per register for Gather and Scatter ops.
-} // namespace targetinfo
-} // namespace xegpu
-} // namespace mlir
-
-#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTARGETINFO_H_
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 564d9c4d5422b..5ef1d499d618f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -43,7 +43,12 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
   let options = [Option<
     "printOnly", "print-analysis-only", "bool",
     /*default=*/"false",
-    "Print the result of layout propagation analysis and exit.">];
+    "Print the result of layout propagation analysis and exit.">,
+    Option<
+      "assumeUnrolled", "assume-unrolled", "bool",
+      /*default=*/"false",
+      "If the input IR has SG-sized tiles matching instruction sizes, omit `inst_data`.">
+  ];
 }
 
 def XeGPUWgToSgDistribute : Pass<"xegpu-wg-to-sg-distribute"> {
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
index 0519f7b2e277d..5cb6d61336391 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
@@ -42,12 +42,59 @@ struct Xe2Plus : public uArch {
            &instrs = {})
       : uArch(archName, archDescription, regInfo, cacheInfo, instrs),
         xeCore(xeCore) {}
+  int getSubgroupSize() const override { return 16; }
+  int getPackedFormatBitSizeGatherScatter() const override { return 32; }
+  int getPackedFormatBitSize() const override { return 16; }
+  std::optional getPackedFormatBitSizeDpasB() const override { return 32; }
+};
+
+//===----------------------------------------------------------------------===//
+// uArch instructions
+//===----------------------------------------------------------------------===//
+struct StoreNdInstruction : public Instruction {
+  StoreNdInstruction()
+      : Instruction(InstructionKind::STORE_ND, InstructionScope::Subgroup) {}
+
+  // Source :
+  // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups.html#_add_a_new_section_6_13_x_sub_group_read_and_write_functions
+  // Writes 1, 2, 4, or 8 uints of data for each work item in the sub-group to
+  // the specified pointer.
+  llvm::SmallVector getSortedLaneVectorLengths() { return {1, 2, 4, 8}; }
+};
+
+struct LoadNdInstruction : public Instruction {
+  LoadNdInstruction()
+      : Instruction(InstructionKind::LOAD_ND, InstructionScope::Subgroup) {}
+
+  // Source :
+  // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups.html#_add_a_new_section_6_13_x_sub_group_read_and_write_functions
+  // Reads 1, 2, 4, or 8 uints of data for each work item in the sub-group from
+  // the specified pointer.
+  llvm::SmallVector getSortedLaneVectorLengths() { return {1, 2, 4, 8}; }
+};
+
+struct PrefetchNdInstruction : public Instruction {
+  PrefetchNdInstruction()
+      : Instruction(InstructionKind::PREFETCH_ND, InstructionScope::Subgroup) {}
+
+  // Source :
+  // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_buffer_prefetch.html#_add_a_new_section_6_15_x_sub_group_prefetch_functions
+  llvm::SmallVector getSortedLaneVectorLengths(int elementBitwidth) {
+    if (elementBitwidth == 8 || elementBitwidth == 16)
+      return {1, 2, 4, 8, 16};
+    else if (elementBitwidth == 32 || elementBitwidth == 64)
+      return {1, 2, 4, 8};
+    else
+      llvm_unreachable(
+          "Unsupported element bitwidth for PrefetchNdInstruction");
+  }
+};
 
-// struct to represent DPAS instruction
 struct DPASInstruction : public Instruction, public MMAInstructionInterface {
   DPASInstruction()
       : Instruction(InstructionKind::DPAS, InstructionScope::Subgroup) {}
+  // Source:
+  // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html
 
   // Override all virtuals from MatrixOpInterface
   virtual llvm::SmallVector, 16>
@@ -72,6 +119,9 @@ struct DPASInstruction : public Instruction, public MMAInstructionInterface {
   virtual llvm::SmallVector getSupportedN(Type type) override;
 };
+//===----------------------------------------------------------------------===//
+// uArch definitions
+//===----------------------------------------------------------------------===//
 struct PVCuArch : public Xe2Plus {
   // Maintaines ownership of the instructions owned by PVUarch
   llvm::SmallVector, 8> owned_instructions;
@@ -101,9 +151,15 @@ struct PVCuArch : public Xe2Plus {
                   CacheInfo(512 * 1024, 64, CacheHierarchyLevel::L2));
 
     // Add the instructions-
-    auto dpas = std::make_shared();
-    instructions.emplace(dpas->getInstructionKind(), dpas);
-    owned_instructions.push_back(dpas);
+    llvm::SmallVector> instructionsToAdd{
+        std::make_shared(),
+        std::make_shared(),
+        std::make_shared(),
+        std::make_shared()};
+    for (auto &inst : instructionsToAdd) {
+      instructions.emplace(inst->getInstructionKind(), inst);
+      owned_instructions.push_back(inst);
+    }
   }
 };
@@ -139,10 +195,24 @@ struct BMGuArch : public Xe2Plus {
     owned_instructions.push_back(dpas);
   }
 };
+
+inline std::shared_ptr getUArch(const std::string &archName) {
+  if (archName == "pvc")
+    return std::make_shared();
+  else if (archName == "bmg")
+    return std::make_shared();
+  else
+    return nullptr;
+}
+
 } // namespace uArch
 } // namespace xegpu
 } // namespace mlir
 
+//===----------------------------------------------------------------------===//
+// Instruction implementations
+//===----------------------------------------------------------------------===//
+
 inline llvm::SmallVector, 16>
 DPASInstruction::getSupportedShapes(Type dataType, MMAOpndKind matrixType) {
   auto combineVectors = [](const llvm::SmallVector &a,
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
index 955994ea5ecf5..0f5b1282f0e24 100644
--- a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h
@@ -32,8 +32,11 @@ namespace uArch {
 // An enum class to represent the scope of an instruction
 enum class InstructionScope { Lane, Subgroup, Workgroup,
Cluster }; enum class InstructionKind { - DPAS, // Dot Product Accumulate Systolic (DPAS) is a matrix - // multiply-add operation + DPAS, // Dot Product Accumulate Systolic (DPAS) is a matrix + // multiply-add operation + STORE_ND, // Subgroup-level 2D block write instruction + LOAD_ND, // Subgroup-level 2D block load instruction + PREFETCH_ND // Subgroup-level 2D block prefetch instruction // @TODO: Add more instructions as needed }; @@ -148,6 +151,16 @@ struct uArch { const std::string &getDescription() const { return description; } + virtual int getSubgroupSize() const = 0; + virtual int getPackedFormatBitSizeGatherScatter() const = 0; + virtual int getPackedFormatBitSize() const = 0; + virtual std::optional getPackedFormatBitSizeDpasB() const = 0; + + std::shared_ptr getInstruction(InstructionKind instKind) const { + assert(instructions.find(instKind) != instructions.end()); + return instructions.at(instKind); + } + const std::map & getRegisterFileInfo() const { return registerFileInfo; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 9beb22d517473..afda04fa71105 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -11,7 +11,7 @@ #include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" @@ -226,8 +226,10 @@ LayoutAttr::verify(llvm::function_ref emitError, } if (inst_data && lane_layout && inst_data.size() != lane_layout.size()) { - return emitError() - << "expected inst_data and lane_layout to have the same rank"; + return emitError() << "expected inst_data and lane_layout to have the same " + "rank, got inst_data " + << inst_data.size() << ", lane_layout " + << lane_layout.size(); } // sg_data is optional for Workgroup layout, but its presence requires @@ -565,10 +567,10 @@ TensorDescType::verify(llvm::function_ref emitError, // for gather and scatter ops, Low-precision types are packed in 32-bit units. unsigned bitWidth = elementType.getIntOrFloatBitWidth(); - int chunkAlignmentFactor = - bitWidth < targetinfo::packedSizeInBitsForGatherScatter - ? targetinfo::packedSizeInBitsForGatherScatter / bitWidth - : 1; + constexpr int packingBitSizeGatherScatter{32}; + int chunkAlignmentFactor = bitWidth < packingBitSizeGatherScatter + ? 
packingBitSizeGatherScatter / bitWidth + : 1; auto scatterAttr = mlir::dyn_cast_if_present(encoding); if (scatterAttr) { int64_t chunkSize = scatterAttr.getChunkSizeAsInt(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 8fab255d6347f..9c09908f3547d 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -14,7 +14,6 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/IR/Attributes.h" @@ -37,6 +36,8 @@ #include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" + namespace mlir { namespace xegpu { #define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT @@ -104,6 +105,8 @@ struct LayoutInfo { SmallVector getLaneData() const; + SmallVector getInstData() const; + bool isSliceLayout() const { if (!isAssigned()) return false; @@ -137,6 +140,13 @@ SmallVector LayoutInfo::getLaneData() const { [](int64_t val) { return static_cast(val); }); } +SmallVector LayoutInfo::getInstData() const { + if (!isAssigned()) + return {}; + return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(), + [](int64_t val) { return static_cast(val); }); +} + void LayoutInfo::print(raw_ostream &os) const { if (isAssigned()) { os << storage; @@ -174,12 +184,14 @@ LayoutInfo LayoutInfo::transpose(ArrayRef permutation) const { SmallVector laneLayout; SmallVector laneData; + SmallVector instData; for (int64_t idx : permutation) { laneLayout.push_back(static_cast(getLaneLayout()[idx])); laneData.push_back(static_cast(getLaneData()[idx])); + instData.push_back(static_cast(getInstData()[idx])); } - return LayoutInfo( - xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData)); + return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData, + laneLayout, laneData)); } //===----------------------------------------------------------------------===// @@ -199,20 +211,33 @@ struct LayoutInfoLattice : public Lattice { /// Helper Function to get the default layout for uniform values like constants. /// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1]. /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. 
-static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, - unsigned rank) { +static LayoutInfo +getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, unsigned rank, + std::shared_ptr &uArch, + ArrayRef instData) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); if (rank == 1) { return LayoutInfo( - xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1})); + xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1})); } return LayoutInfo(xegpu::LayoutAttr::get( - ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1})); + ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1})); +} + +static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, + unsigned rank, int subgroupSize) { + assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); + if (rank == 1) { + return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1})); + } + return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1})); } /// Helper to get the default layout for a vector type. -static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, - bool isScattered = false) { +static LayoutInfo +getDefaultSIMTLayoutInfo(VectorType vectorTy, + std::shared_ptr &uArch, + ArrayRef instData, bool isScattered = false) { // Expecting a 1D or 2D vector. assert((vectorTy.getRank() == 1 || vectorTy.getRank() == 2) && "Expected 1D or 2D vector."); @@ -221,29 +246,31 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (vectorTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1); + return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData); // Packing factor is determined by the element type bitwidth. int packingFactor = 1; unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); if (isScattered) { packingFactor = - bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter - ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth + bitwidth < uArch->getPackedFormatBitSizeGatherScatter() + ? uArch->getPackedFormatBitSizeGatherScatter() / bitwidth : 1; - return LayoutInfo(xegpu::LayoutAttr::get( - vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, - {1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + {uArch->getSubgroupSize(), 1}, + {1, packingFactor})); } - if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) - packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth; - return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), - {1, xegpu::targetinfo::subgroupSize}, + if (bitwidth < uArch->getPackedFormatBitSize()) + packingFactor = uArch->getPackedFormatBitSize() / bitwidth; + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + {1, uArch->getSubgroupSize()}, {1, packingFactor})); } /// Helper to get the default layout for a vector type. -static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, - bool isScattered = false) { +static LayoutInfo +getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, + std::shared_ptr &uArch, + ArrayRef instData, bool isScattered = false) { // Expecting a 1D or 2D vector. 
assert((tdescTy.getRank() == 1 || tdescTy.getRank() == 2) && "Expected 1D or 2D TensorDesc."); @@ -252,27 +279,24 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (tdescTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1); + return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData); // Packing factor is determined by the element type bitwidth. unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth(); - + int subgroupSize = uArch->getSubgroupSize(); if (isScattered) { int packingFactor = - bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter - ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth + bitwidth < uArch->getPackedFormatBitSizeGatherScatter() + ? uArch->getPackedFormatBitSizeGatherScatter() / bitwidth : 1; return LayoutInfo(xegpu::LayoutAttr::get( - tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, - {1, packingFactor})); + tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor})); } - int packingFactor = - (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) - ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth - : 1; - return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), - {1, xegpu::targetinfo::subgroupSize}, - {1, packingFactor})); + int packingFactor = (bitwidth < uArch->getPackedFormatBitSize()) + ? uArch->getPackedFormatBitSize() / bitwidth + : 1; + return LayoutInfo(xegpu::LayoutAttr::get( + tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor})); } /// Helper Function to get the expected layouts for DPAS operands. `lane_data` @@ -281,25 +305,27 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, /// `packedSizeInBitsForDefault` /// * For B operand, the data must be packed in minimum /// `packedSizeInBitsForDpasB` -static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, - unsigned operandNum) { +static LayoutInfo +getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum, + std::shared_ptr &uArch, + ArrayRef instData) { Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); - SmallVector layout({1, xegpu::targetinfo::subgroupSize}); + SmallVector layout({1, uArch->getSubgroupSize()}); // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and // must have the VNNI format. - if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < - xegpu::targetinfo::packedSizeInBitsForDpasB) { + auto packSizeB = uArch->getPackedFormatBitSizeDpasB(); + assert(packSizeB.has_value() && "uArch must support dpas instructions"); + if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < *packSizeB) { SmallVector data( - {static_cast(xegpu::targetinfo::packedSizeInBitsForDpasB / - elementTy.getIntOrFloatBitWidth()), + {static_cast(*packSizeB / elementTy.getIntOrFloatBitWidth()), 1}); return LayoutInfo( - xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data)); + xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data)); } // Otherwise, return the default layout for the vector type. 
- return getDefaultSIMTLayoutInfo(vectorTy); + return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData); } //===----------------------------------------------------------------------===// @@ -456,7 +482,22 @@ void LayoutInfoPropagation::visitPrefetchNdOp( // Here we assign the default layout to the tensor descriptor operand of // prefetch. auto tdescTy = prefetch.getTensorDescType(); - auto prefetchLayout = getDefaultSIMTLayoutInfo(tdescTy); + + auto uArch = getUArch(getChipStr(prefetch).value_or("")); + int subgroupSize = uArch->getSubgroupSize(); + auto uArchInstruction = + std::static_pointer_cast( + uArch->getInstruction(xegpu::uArch::InstructionKind::STORE_ND)); + int maxVecLength = + uArchInstruction + ->getSortedLaneVectorLengths(tdescTy.getElementTypeBitWidth()) + .back(); + SmallVector instData; + if (tdescTy.getRank() == 1) + instData = {subgroupSize}; + else + instData = {maxVecLength, subgroupSize}; + auto prefetchLayout = getDefaultSIMTLayoutInfo(tdescTy, uArch, instData); // Propagate the layout to the source tensor descriptor. propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout)); } @@ -475,10 +516,11 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( reduction.emitWarning("Expecting output type to be 1D vector."); return; } + auto uArch = getUArch(xegpu::getChipStr(reduction).value_or("")); // Given that the result is 1D, the layout of the operand should be 2D with // default layout. - LayoutInfo operandLayout = - getDefaultSIMTLayoutInfo(reduction->getContext(), 2); + LayoutInfo operandLayout = getDefaultSIMTLayoutInfo( + reduction->getContext(), 2, uArch->getSubgroupSize()); propagateIfChanged(operands[0], operands[0]->meet(operandLayout)); // Accumulator should have the same layout as the result. propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); @@ -557,15 +599,33 @@ void LayoutInfoPropagation::visitDpasOp( ArrayRef results) { VectorType aTy = dpas.getLhsType(); VectorType bTy = dpas.getRhsType(); - propagateIfChanged( - operands[0], operands[0]->meet(getSIMTLayoutInfoForDPASOperand(aTy, 0))); - propagateIfChanged( - operands[1], operands[1]->meet(getSIMTLayoutInfoForDPASOperand(bTy, 1))); + + auto uArch = getUArch(getChipStr(dpas).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + auto uArchInstruction = + std::static_pointer_cast( + uArch->getInstruction(xegpu::uArch::InstructionKind::DPAS)); + const int maxALen = + uArchInstruction->getSupportedM(aTy.getElementType()).back(); + const int maxBLen = + uArchInstruction->getSupportedK(bTy.getElementType()).back(); + SmallVector instDataA = {maxALen, subgroupSize}; + SmallVector instDataB = {subgroupSize, maxBLen}; + + propagateIfChanged(operands[0], + operands[0]->meet(getSIMTLayoutInfoForDPASOperand( + aTy, 0, uArch, instDataA))); + propagateIfChanged(operands[1], + operands[1]->meet(getSIMTLayoutInfoForDPASOperand( + bTy, 1, uArch, instDataB))); if (operands.size() > 2) { VectorType cTy = dpas.getAccType(); - propagateIfChanged( - operands[2], - operands[2]->meet(getSIMTLayoutInfoForDPASOperand(cTy, 2))); + const int maxCLen = + uArchInstruction->getSupportedN(bTy.getElementType()).back(); + SmallVector instDataC = {maxALen, maxCLen}; + propagateIfChanged(operands[2], + operands[2]->meet(getSIMTLayoutInfoForDPASOperand( + cTy, 2, uArch, instDataC))); } } @@ -573,7 +633,20 @@ void LayoutInfoPropagation::visitDpasOp( void LayoutInfoPropagation::visitStoreNdOp( xegpu::StoreNdOp store, ArrayRef operands, ArrayRef results) { - LayoutInfo storeLayout = 
getDefaultSIMTLayoutInfo(store.getValueType()); + + auto uArch = getUArch(getChipStr(store).value_or("")); + int subgroupSize = uArch->getSubgroupSize(); + auto uArchInstruction = + std::static_pointer_cast( + uArch->getInstruction(xegpu::uArch::InstructionKind::STORE_ND)); + int maxVecLength = uArchInstruction->getSortedLaneVectorLengths().back(); + SmallVector instData; + if (store.getValueType().getRank() == 1) + instData = {subgroupSize}; + else + instData = {maxVecLength, subgroupSize}; + LayoutInfo storeLayout = + getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData); // Both operands should have the same layout for (LayoutInfoLattice *operand : operands) propagateIfChanged(operand, operand->meet(storeLayout)); @@ -694,10 +767,22 @@ void LayoutInfoPropagation::visitLoadGatherOp( load.emitWarning("Not propagating, non-vector payload supplied."); return; } - LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true); + auto uArch = getUArch(getChipStr(load).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + SmallVector instData{subgroupSize}; + if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1) + instData.push_back(chunkSize); + else if (auto srcTdescTy = + dyn_cast(load.getSourceType())) { + if (srcTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + LayoutInfo layout = + getDefaultSIMTLayoutInfo(payloadTy, uArch, instData, /*scattered*/ true); // Mask operand should have 1D default layout. - LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1); + LayoutInfo maskLayout = + getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize); // Propagate the new layout to the tensor descriptor operand. if (isa(load.getSourceType())) @@ -717,8 +802,10 @@ void LayoutInfoPropagation::visitCreateDescOp( // Need the layout of the descriptor to propagate to the operands. if (!descLayout.isAssigned()) return; + auto uArch = getUArch(getChipStr(createDesc).value_or("")); // For offset operand propagate 1D default layout. 
- LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1); + LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1, + uArch->getSubgroupSize()); propagateIfChanged(operands[1], operands[1]->meet(layout)); } @@ -735,18 +822,29 @@ void LayoutInfoPropagation::visitStoreScatterOp( storeScatter.emitWarning("Not propagating, non-vector payload supplied."); return; } + auto uArch = getUArch(getChipStr(storeScatter).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + auto payloadShape = payloadTy.getShape(); if (payloadShape.size() > 1) assert( - payloadShape[0] == xegpu::targetinfo::subgroupSize && + payloadShape[0] == subgroupSize && "Expected the first dimension of 2D tensor descriptor to be equal to " "subgroup size."); + SmallVector instData{subgroupSize}; + if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1) + instData.push_back(chunkSize); + else if (auto dstTdescTy = + dyn_cast(storeScatter.getDestType())) { + if (dstTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } LayoutInfo payloadLayout = - getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true); + getDefaultSIMTLayoutInfo(payloadTy, uArch, instData, /*scattered=*/true); LayoutInfo maskLayout = - getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1); + getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); // Propagate the payload operand layout propagateIfChanged(operands[0], operands[0]->meet(payloadLayout)); // Propagate the destination (if tdesc) operand layout @@ -1023,9 +1121,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() { LayoutInfo layout = analysis.getLayoutInfo(val); if (!layout.isAssigned()) return {}; + xegpu::DistributeLayoutAttr layoutAttr = + cast(layout.get()); + if (this->assumeUnrolled) + layoutAttr = layoutAttr.dropInstData(); if (layout.isSliceLayout()) - return cast(layout.get()); - return cast(layout.get()); + return cast(layoutAttr); + return cast(layoutAttr); }; mlir::OpBuilder builder(&getContext()); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 26770b3c003ea..75466f0efcd36 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -11,10 +11,10 @@ #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -159,17 +159,17 @@ static bool requirePacked(const xegpu::LayoutAttr layout) { /// Helper function to check if the layout requires a transpose effect. static bool requireTranspose(const xegpu::LayoutAttr layout, - const std::string &chipStr) { + std::shared_ptr uArch) { // Return false for unsupported targets. // TODO: Add more support or move to target info. 
- if (chipStr != "pvc" && chipStr != "bmg") + if (uArch->getName() != "pvc" && uArch->getName() != "bmg") return false; if (!layout) return false; auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); if (laneLayout.size() != 2) return false; - return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1; + return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1; } /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body @@ -228,9 +228,14 @@ struct MoveFuncBodyToWarpOp : public OpRewritePattern { rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(), /** upperBound = **/ mlir::IntegerAttr()); ArrayRef gpuFuncResultType = gpuFuncOp.getFunctionType().getResults(); + auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or("")); + if (!uArch) + return rewriter.notifyMatchFailure( + gpuFuncOp, "Subgroup distribution requires target attribute attached " + "to set the warp size"); auto warpOp = gpu::WarpExecuteOnLane0Op::create( rewriter, laneId.getLoc(), gpuFuncResultType, laneId, - xegpu::targetinfo::subgroupSize, newGpuFunc.getArguments(), + uArch->getSubgroupSize(), newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes()); Block &warpBodyBlock = warpOp.getBodyRegion().front(); // Replace the ReturnOp of the original gpu function with a YieldOp. @@ -498,11 +503,12 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { // Chip information is required to decide if the layout requires transpose // effect. auto chipStr = xegpu::getChipStr(loadOp); - if (!chipStr) + auto uArch = getUArch(chipStr.value_or("")); + if (!uArch) return rewriter.notifyMatchFailure( - loadOp, - "xegpu::LoadNdOp require chip information to determine transpose " - "requirement"); + loadOp, "xegpu::LoadNdOp require target attribute attached to " + "determine transpose " + "requirement"); // Expecting offsets to be present. SmallVector offsets = loadOp.getMixedOffsets(); if (offsets.empty()) @@ -556,7 +562,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { // Set the packed attribute if the layout requires it. newLoadOp.setPacked(requirePacked(layout)); // Set the transpose attribute if the layout requires it. 
- if (requireTranspose(layout, chipStr.value())) + if (requireTranspose(layout, uArch)) newLoadOp.setTranspose( DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0})); Value distributedVal = newWarpOp.getResult(operandIdx); diff --git a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir index d289d73e863c7..2780212d2917f 100644 --- a/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir +++ b/mlir/test/Dialect/XeGPU/move-gpu-func-to-warp-op.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -test-xegpu-move-func-to-warp-op -split-input-file --allow-unregistered-dialect %s | FileCheck %s gpu.module @test { gpu.func @empty() { diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir new file mode 100644 index 0000000000000..5020bb1450890 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -0,0 +1,51 @@ +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s + +// CHECK-LABEL: func.func @dpas_f16( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +gpu.module @test { + +func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + return +} +} + + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_chunksize( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %{{.*}} = 
arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> +// CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_chunksize(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> + : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> + : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 30f785ded975a..512879bcd1954 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -1,5 +1,6 @@ -// RUN: mlir-opt -xegpu-propagate-layout -split-input-file %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="assume-unrolled" -split-input-file %s | FileCheck %s +gpu.module @test { // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> @@ -25,8 +26,10 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { // CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout, %arg1: vector<32x16xi8>, %arg2: memre xegpu.store_nd %0, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array}> {layout_result_0 = #xegpu.layout} : @@ -55,8 +60,10 @@ func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_transpose( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %{{.*}} = vector.transpose %{{.*}}, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> @@ -73,8 +80,10 @@ func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, % xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @extf_truncf( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, %[[ARG1:[0-9a-zA-Z]+]]: 
// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) -> vector<8x16xf32> { @@ -88,8 +97,10 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> return %4 : vector<8x16xf32> } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} @@ -113,8 +124,10 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256 xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @load_gather_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} @@ -132,8 +145,9 @@ func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_with_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<128xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %{{.*}} : memref<128xf32>, vector<16xindex> -> @@ -148,8 +162,9 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) { xegpu.store %cst, %0, %cst_0 : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @store_scatter_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, @@ -161,8 +176,9 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops_chunksize( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> @@ -179,8 +195,9 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) { : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scatter_ops( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> @@ -195,8 +212,9 @@ func.func @scatter_ops(%src: memref<256xf16>) { xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout> -> vector<8x16xi16> @@ -219,8 +237,9 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( // CHECK: 
%[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> @@ -239,8 +258,9 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout> -> vector<8x32xi16> @@ -255,8 +275,9 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16 xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle( // CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> // CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} @@ -270,9 +291,10 @@ func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %a xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_one_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, @@ -291,8 +313,9 @@ func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu. xegpu.store_nd %4, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @binary_op_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, @@ -312,8 +335,9 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: ! 
xegpu.store_nd %2, %arg3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @for_op( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> @@ -353,8 +377,9 @@ func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: me xegpu.store_nd %2#2, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_single_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, @@ -381,8 +406,9 @@ func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tens xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @if_multiple_uses( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, @@ -411,8 +437,9 @@ func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t xegpu.store_nd %1, %arg4 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_outer_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout} [0] : vector<16x16xf32> to vector<16xf32> @@ -422,8 +449,9 @@ func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_inner_reduction( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16x16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = vector.multi_reduction , %[[ARG0]], %{{.*}} {layout_result_0 = #xegpu.layout} [1] : vector<16x16xf32> to vector<16xf32> @@ -433,8 +461,9 @@ func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> @@ -448,8 +477,9 @@ func.func @update_nd_offset_1d(%arg0: memref<256xf32>){ xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @update_nd_offset_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> @@ -463,8 +493,9 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { // CHECK: 
%[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -475,8 +506,9 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @prefetch_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> @@ -487,8 +519,9 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @scf_while_and_condition( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) @@ -520,8 +553,9 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32 } return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { @@ -541,8 +575,9 @@ func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } - +} // ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { @@ -563,3 +598,4 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } +} From a95b7cff2039650f9830d3c933631e94c7a9fba0 Mon Sep 17 00:00:00 2001 From: Artem Kroviakov Date: Thu, 16 Oct 2025 15:18:03 +0000 Subject: [PATCH 2/3] Default virtual dtor --- mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h index 0f5b1282f0e24..168f0abb17af5 100644 --- a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h +++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h @@ -145,7 +145,7 @@ struct uArch { : name(name), description(description), registerFileInfo(registerFileInfo), cacheInfo(cacheInfo), instructions(instructions) {} - + virtual ~uArch() = default; // Get methods const std::string &getName() const { return name; } From 4b99cddf50acaeab5e23fa5194106ded78e8660a Mon Sep 17 00:00:00 2001 From: Artem Kroviakov Date: Thu, 16 Oct 2025 16:57:47 +0000 Subject: [PATCH 3/3] Fix warnings --- mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h | 10 +++++----- mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h | 12 +++++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h index 5cb6d61336391..d0d9e82dfbc38 100644 --- a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h +++ b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h @@ 
-23,8 +23,6 @@ #include #include -#define DEBUG_TYPE "xegpu-uarch" - using namespace mlir; using namespace mlir::xegpu::uArch; @@ -43,9 +41,11 @@ struct Xe2Plus : public uArch { : uArch(archName, archDescription, regInfo, cacheInfo, instrs), xeCore(xeCore) {} int getSubgroupSize() const override { return 16; } - int getPackedFormatBitSizeGatherScatter() const override { return 32; } - int getPackedFormatBitSize() const override { return 16; } - std::optional getPackedFormatBitSizeDpasB() const override { return 32; } + unsigned getPackedFormatBitSizeGatherScatter() const override { return 32; } + unsigned getPackedFormatBitSize() const override { return 16; } + std::optional getPackedFormatBitSizeDpasB() const override { + return 32; + } }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h index 168f0abb17af5..09137a5547aab 100644 --- a/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h +++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArchBase.h @@ -57,6 +57,12 @@ struct Instruction { switch (instKind) { case InstructionKind::DPAS: return "dpas"; + case InstructionKind::STORE_ND: + return "store_nd"; + case InstructionKind::LOAD_ND: + return "load_nd"; + case InstructionKind::PREFETCH_ND: + return "prefetch_nd"; } llvm_unreachable("Unknown InstructionKind"); } @@ -152,9 +158,9 @@ struct uArch { const std::string &getDescription() const { return description; } virtual int getSubgroupSize() const = 0; - virtual int getPackedFormatBitSizeGatherScatter() const = 0; - virtual int getPackedFormatBitSize() const = 0; - virtual std::optional getPackedFormatBitSizeDpasB() const = 0; + virtual unsigned getPackedFormatBitSizeGatherScatter() const = 0; + virtual unsigned getPackedFormatBitSize() const = 0; + virtual std::optional getPackedFormatBitSizeDpasB() const = 0; std::shared_ptr getInstruction(InstructionKind instKind) const { assert(instructions.find(instKind) != instructions.end());