From 28c5c4c5f29a23dee72e9397e0f93063dc167e75 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 21 Aug 2025 16:11:50 +0000 Subject: [PATCH 01/36] pull changes --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 159 ++++++++++++++- .../Transforms/XeGPUSubgroupDistribute.cpp | 188 +++++++++++++++++- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 17 ++ 3 files changed, 353 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index bef88042fc663..10c2759493477 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -62,10 +62,17 @@ struct Layout { SmallVector layout; Layout() = default; Layout(std::initializer_list list) : layout(list) {} + Layout(SmallVector &list) : layout(list) {} void print(llvm::raw_ostream &os) const; size_t size() const { return layout.size(); } + int64_t operator[](size_t idx) const; }; +int64_t Layout::operator[](size_t idx) const { + assert(idx < layout.size() && "Index out of bounds"); + return layout[idx]; +} + void Layout::print(llvm::raw_ostream &os) const { os << llvm::interleaved_array(layout); } @@ -324,6 +331,13 @@ class LayoutInfoPropagation ArrayRef operands, ArrayRef results); + void visitVectorBroadCastOp(vector::BroadcastOp broadcast, + ArrayRef operands, + ArrayRef results); + void visitShapeCastOp(vector::ShapeCastOp shapeCast, + ArrayRef operands, + ArrayRef results); + public: LayoutInfoPropagation(DataFlowSolver &solver, SymbolTableCollection &symbolTable) @@ -383,6 +397,12 @@ LogicalResult LayoutInfoPropagation::visitOperation( .Case([&](auto reductionOp) { visitVectorMultiReductionOp(reductionOp, operands, results); }) + .Case([&](auto broadcastOp) { + visitVectorBroadCastOp(broadcastOp, operands, results); + }) + .Case([&](auto shapeCastOp) { + visitShapeCastOp(shapeCastOp, operands, results); + }) // All other ops. .Default([&](Operation *op) { for (const LayoutInfoLattice *resultInfo : results) { @@ -437,6 +457,83 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); } +void LayoutInfoPropagation::visitVectorBroadCastOp( + vector::BroadcastOp broadcast, ArrayRef operands, + ArrayRef results) { + // The layout of the result must be present. + LayoutInfo resultLayout = results[0]->getValue(); + if (!resultLayout.isAssigned()) + return; + // Only consider 1D -> 2D broadcasts or 2D -> 2D broadcasts. + VectorType resultTy = broadcast.getResultVectorType(); + VectorType sourceTy = dyn_cast(broadcast.getSourceType()); + if (!sourceTy) { + broadcast.emitWarning("Expecting source type to be a vector type."); + return; + } + + // Only conside 2D -> 2D broadcast. + if (sourceTy.getRank() != 2 || resultTy.getRank() != 2) { + broadcast.emitWarning("Expecting source type to be 2D vector and " + "result type to be 2D vector."); + return; + } + SetVector broadcastUnitDims = broadcast.computeBroadcastedUnitDims(); + if (broadcastUnitDims.size() != 1) { + broadcast.emitWarning("Expecting source type to be 2D vector only with " + "one broadcasted dimension."); + return; + } + // Propagate the result layout to the source operand. + propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); +} + +void LayoutInfoPropagation::visitShapeCastOp( + vector::ShapeCastOp shapeCast, ArrayRef operands, + ArrayRef results) { + // The layout of the result must be present. 
+ LayoutInfo resultLayout = results[0]->getValue(); + if (!resultLayout.isAssigned()) + return; + VectorType sourceTy = shapeCast.getSourceVectorType(); + VectorType resultTy = shapeCast.getResultVectorType(); + // Expecting source rank to be 1D or 2D. + if (sourceTy.getRank() != 1 && sourceTy.getRank() != 2) { + shapeCast.emitWarning("Expecting source type to be 1D or 2D vector."); + return; + } + // Expecting result rank to be 1D or 2D. + if (resultTy.getRank() != 1 && resultTy.getRank() != 2) { + shapeCast.emitWarning("Expecting result type to be 1D or 2D vector."); + return; + } + // For 2D -> 2D shape cast, propagate the result layout to the source. + if (sourceTy.getRank() == 2 && resultTy.getRank() == 2) { + // Propagate the result layout to the source operand. + propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); + return; + } + auto resultLayoutArray = resultLayout.getLayoutAsArrayRef(); + if (resultLayoutArray[0] != 1 && resultLayoutArray[1] != 1) { + shapeCast.emitWarning( + "Expecting result layout to be of form [1, subgroupSize] " + "or [subgroupSize, 1]."); + return; + } + int64_t distributedDim = resultLayoutArray[0] == 1 ? 1 : 0; + // If the result shape can be evenly distributed in the distributed dimension, + // then the source layout should be [subgroupSize][1]. Otherwise, data is + // shared accross lanes (broadcasted). In that case, just assign [1][1] for + // now (TODO: Use slice for this case) + LayoutInfo sourceLayout = + resultTy.getShape()[distributedDim] % xegpu::targetinfo::subgroupSize == 0 + ? LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize}), + LaneData({1})) + : LayoutInfo(LaneLayout({1}), LaneData({1})); + // Propagate the source layout to the source operand. + propagateIfChanged(operands[0], operands[0]->meet(sourceLayout)); +} + /// Propagate the layout of the result tensor to the source tensor descriptor in /// UpdateNdOffsetOp. void LayoutInfoPropagation::visitUpdateNdOffsetOp( @@ -529,16 +626,64 @@ void LayoutInfoPropagation::visitVectorBitcastOp( bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth(); int outElemTyBitWidth = bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth(); - - // NOTE: We do not expect widening or narrowing bitcasts at this stage. Emit - // a warning and return. - if (inElemTyBitWidth != outElemTyBitWidth) { - bitcast.emitWarning("Widening or narrowing bitcasts are not expected at " - "layout propagation stage."); + // If the element bit widths are the same, then the layout does not change. + if (inElemTyBitWidth == outElemTyBitWidth) { + propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); return; } + int64_t rank = bitcast.getSourceVectorType().getRank(); + // Bitcast is a `narrowing` if the input element type bit width larger than + // the output element type bit width. eg. f32 -> f16 is a narrowing bitcast. + bool isNarrowing = inElemTyBitWidth > outElemTyBitWidth; + int bitCastRatio = isNarrowing ? inElemTyBitWidth / outElemTyBitWidth + : outElemTyBitWidth / inElemTyBitWidth; + const LaneLayout &sourceLaneLayout = + resultLayout.getLayout(); // source lane layout is unchanged. + ArrayRef currData = resultLayout.getDataAsArrayRef(); + + // TODO: Currently we assume that bitcasts does not require cross lane + // communication. So each lane must own the required number of elements to + // perform the bitcast locally without cross-lane communication. 
+ // For 1D vectors, decide how many elements each lane owns based on whether + // the bitcast is narrowing or widening. + if (rank == 1) { + if ((currData[0] * outElemTyBitWidth) % inElemTyBitWidth != 0) { + bitcast.emitWarning( + "Narrowing bitcast with cross lane communication is not supported."); + return; + } + LaneData sourceLaneData = isNarrowing + ? LaneData({currData[0] / bitCastRatio}) + : LaneData({currData[0] * bitCastRatio}); - propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); + propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo( + sourceLaneLayout, sourceLaneData))); + } + // For nD vectors, Each lane is not allowed to own multiple elements in any + // dimension other than the innermost dimension. + // TODO: Add support for other case depending on the use case. + SmallVector sourceLaneDataStorage(currData.begin(), + currData.end() - 1); + if (llvm::any_of(sourceLaneDataStorage, [](int64_t d) { return d != 1; })) { + bitcast.emitWarning( + "Each lane must not own multiple elements in any dimension other than " + "the innermost dimension."); + return; + } + // Check if the bitcast requires cross lane communication. + if ((currData[rank - 1] * outElemTyBitWidth) % inElemTyBitWidth != 0) { + bitcast.emitWarning( + "Narrowing bitcast with cross lane communication is not supported."); + return; + } + // Decide lane data based on whether the bitcast is narrowing or widening. + int64_t innerMostLaneData = isNarrowing ? currData[rank - 1] / bitCastRatio + : currData[rank - 1] * bitCastRatio; + sourceLaneDataStorage.push_back(innerMostLaneData); + LaneData sourceLaneData(sourceLaneDataStorage); + + propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo( + sourceLaneLayout, sourceLaneData))); } /// Propagate the layout of the result to the tensor descriptor and mask diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 2088c3c7fc5ec..61eece55a9bac 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -27,6 +27,7 @@ #include "mlir/IR/Value.h" #include "mlir/IR/Visitors.h" #include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -34,6 +35,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallVectorExtras.h" +#include "llvm/Support/LogicalResult.h" +#include namespace mlir { namespace xegpu { @@ -146,6 +150,15 @@ static bool hasPackedLayout(xegpu::LayoutAttr layout) { return laneData.asArrayRef()[0] != 1; } +static bool hasTransposedLayout(xegpu::LayoutAttr layout) { + if (layout == xegpu::LayoutAttr()) + return false; + DenseI32ArrayAttr laneLayout = layout.getLaneLayout(); + if (!laneLayout || laneLayout.size() != 2) + return false; + return laneLayout.asArrayRef()[0] > 1 && laneLayout.asArrayRef()[1] == 1; +} + /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body /// of the original GPUFuncOp to the new GPUFuncOp such that entire body is /// contained within a WarpExecuteOnLane0Op. @@ -500,6 +513,9 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { xegpu::removeLayoutAttrs(newLoadOp); // Set the packed attribute if the layout requires it. 
newLoadOp.setPacked(hasPackedLayout(layout)); + if (hasTransposedLayout(layout)) + newLoadOp.setTranspose( + DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0})); Value distributedVal = newWarpOp.getResult(operandIdx); // There can be a conflict between the vector type distributed by the // warp op and (xegpu-specific) distributed type supported by the load @@ -811,6 +827,135 @@ struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern { } }; +struct MemrefExtractAlignedPointerAsIndexDistribution final + : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = getWarpResult( + warpOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure( + warpOp, + "warp result is not a xegpu::MemrefExtractAlignedPointerAsIndex op"); + auto extractOp = + operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, extractOp.getSource(), + TypeRange{extractOp.getSource().getType()}, newRetIndices); + rewriter.setInsertionPointAfter(newWarpOp); + auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create( + rewriter, newWarpOp.getLoc(), extractOp.getType(), + newWarpOp.getResult(newRetIndices[0])); + Value distributedVal = newWarpOp.getResult(operandIdx); + rewriter.replaceAllUsesWith(distributedVal, newExtractOp.getResult()); + return success(); + } +}; + +struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = + getWarpResult(warpOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure( + warpOp, "warp result is not a vector::BitCast op"); + auto bitcastOp = operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + VectorType distributedSourceType = + getDistVecTypeBasedOnLaneLayout( + xegpu::getLayoutAttr(bitcastOp.getSource()), + bitcastOp.getSourceVectorType()) + .value_or(VectorType()); + if (!distributedSourceType) + return rewriter.notifyMatchFailure( + bitcastOp, "Failed to distribute the source vector type in " + "vector::BitCast op"); + VectorType distributedResultType = + cast(warpOp.getResult(operandIdx).getType()); + if (distributedSourceType.getRank() != 2 || + distributedResultType.getRank() != 2) + return rewriter.notifyMatchFailure( + bitcastOp, "the source or result vector of the bitcast op " + "are not 2D vectors"); + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, bitcastOp.getSource(), + TypeRange{distributedSourceType}, newRetIndices); + rewriter.setInsertionPointAfter(newWarpOp); + auto newBitcastOp = vector::BitCastOp::create( + rewriter, newWarpOp.getLoc(), distributedResultType, + newWarpOp.getResult(newRetIndices[0])); + Value distributedVal = newWarpOp.getResult(operandIdx); + rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult()); + return success(); + } +}; + +struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult 
matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = + getWarpResult(warpOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure( + warpOp, "warp result is not a vector::Transpose op"); + auto transposeOp = operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + xegpu::LayoutAttr sourceLayout = + xegpu::getLayoutAttr(transposeOp.getVector()); + xegpu::LayoutAttr resultLayout = + xegpu::getLayoutAttr(transposeOp.getResult()); + if (!sourceLayout || !resultLayout) + return rewriter.notifyMatchFailure( + transposeOp, + "the source or result vector of the transpose op lacks layout " + "attribute"); + ArrayRef sourceLaneLayout = sourceLayout.getLaneLayout().asArrayRef(); + ArrayRef resultLaneLayout = resultLayout.getLaneLayout().asArrayRef(); + ArrayRef sourceLaneData = sourceLayout.getLaneData().asArrayRef(); + ArrayRef resultLaneData = resultLayout.getLaneData().asArrayRef(); + if (sourceLaneLayout.size() != 2 || resultLaneLayout.size() != 2) + return rewriter.notifyMatchFailure( + transposeOp, "the source or result vector of the transpose op " + "does not have 2D layout"); + auto is2DTranspose = [](ArrayRef input, ArrayRef output) { + return input.size() == 2 && output.size() == 2 && input[0] == output[1] && + input[1] == output[0]; + }; + + if (!is2DTranspose(sourceLaneLayout, resultLaneLayout) || + !is2DTranspose(sourceLaneData, resultLaneData)) + return rewriter.notifyMatchFailure( + transposeOp, + "the source or result vector layouts must be transposes of each " + "other"); + FailureOr distributedSourceTypeOrFailure = + getDistVecTypeBasedOnLaneLayout(sourceLayout, + transposeOp.getSourceVectorType()); + if (failed(distributedSourceTypeOrFailure)) + return rewriter.notifyMatchFailure( + transposeOp, "Failed to distribute the source vector type in " + "vector::Transpose op"); + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, transposeOp.getVector(), + TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices); + rewriter.setInsertionPointAfter(newWarpOp); + auto newTransposeOp = vector::TransposeOp::create( + rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]), + transposeOp.getPermutation()); + Value distributedVal = newWarpOp.getResult(operandIdx); + rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult()); + return success(); + } +}; + } // namespace namespace { @@ -825,7 +970,9 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( RewritePatternSet &patterns) { patterns.add( + UpdateNdOffsetDistribution, GpuBarrierDistribution, + VectorTransposeDistribution, VectorBitcastDistribution, + MemrefExtractAlignedPointerAsIndexDistribution>( patterns.getContext()); } @@ -903,14 +1050,47 @@ void XeGPUSubgroupDistributePass::runOnOperation() { int64_t warpSz) { return Value(); }; vector::populatePropagateWarpVectorDistributionPatterns( patterns, distributionFn, shuffleFn); + + auto warpReduction = [](Location loc, OpBuilder &builder, Value input, + vector::CombiningKind kind, uint32_t size) { + // First reduce on a single thread to get per lane reduction value. + Value laneVal = builder.create(loc, kind, input); + // Parallel reduction using butterfly shuffles. 
+ for (uint64_t i = 1; i < size; i <<= 1) { + Value shuffled = + builder + .create(loc, laneVal, i, + /*width=*/size, + /*mode=*/gpu::ShuffleMode::XOR) + .getShuffleResult(); + laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled); + } + return laneVal; + }; + + vector::populateDistributeReduction(patterns, warpReduction); if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { signalPassFailure(); return; } - // Step 4: Finllay, clean up UnrealizedConversionCastOps that were inserted + // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted // due to tensor desc type mismatches created by using upstream distribution - // patterns (scf.for) + // patterns (scf.for). This cleanup should only be done if all the ops are + // distributed successfully, if some ops are still not distributed and remains + // inside any WarpExecuteOnLane0Op we avoid this simplication step to avoid + // breaking the IR. + bool foundWarpOp = false; + getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) { + // Look for WarpOps that are not trivially dead. + if (isOpTriviallyDead(warpOp)) + return WalkResult::advance(); + foundWarpOp = true; + return WalkResult::interrupt(); + }); + if (foundWarpOp) + return; + getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) { // We are only interested in UnrealizedConversionCastOps there were added // for resolving SIMT type mismatches. @@ -929,7 +1109,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() { "Unrealized conversion cast must have tensor descriptor types"); // tensor_desc -> tensor_desc Type of conversions. - // This occurs iside scf.for body to resolve the block argument type to + // This occurs inside scf.for body to resolve the block argument type to // SIMT type. 
if (inputDescType.getLayout()) { auto argument = mlir::dyn_cast(input); diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 0214d84f2c16f..4cbe4db271ad6 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -181,6 +181,23 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 return } +// ----- +// CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( +// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x16xf16> +func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x8xi32> -> vector<16x8xi32> + %4 = vector.bitcast %3 : vector<16x8xi32> to vector<16x16xf16> + %5 = vector.transpose %4, [1, 0] : vector<16x16xf16> to vector<16x16xf16> + %6 = xegpu.dpas %2, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + return +} + // ----- // CHECK-LABEL: func.func @binary_op_one_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, From ad5d0a88a4f065dc3720d977c8e3d125c5b768b8 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Thu, 21 Aug 2025 17:58:25 +0000 Subject: [PATCH 02/36] rename getLayoutAttr util --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 66 +++++++++++++++++++ .../mlir/Dialect/XeGPU/IR/XeGPUDialect.td | 2 +- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 27 ++++---- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 25 ++++--- .../XeGPU/Transforms/XeGPUBlocking.cpp | 16 ++--- .../Transforms/XeGPUSubgroupDistribute.cpp | 5 +- .../Transforms/XeGPUWgToSgDistribute.cpp | 26 ++++---- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 30 ++++----- 8 files changed, 132 insertions(+), 65 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index b4d696444cc44..5b4b376157c00 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -185,6 +185,9 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Check the availability of workgroup level layouts", "bool", "isForWorkgroup">, + InterfaceMethod<"Check the availability of subgroup level layouts", + "bool", + "isForSubgroup">, InterfaceMethod<"Get the rank of attribute", "int64_t", "getRank">, @@ -202,6 +205,15 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Get the SgData field of the attribute as integer array", "std::optional>", "getSgDataAsInt">, + InterfaceMethod<"Get the InstData field of the attribute as integer array", + "std::optional>", + "getInstDataAsInt">, + InterfaceMethod<"Get the LaneLayout field of the attribute as integer array", + "std::optional>", + "getLaneLayoutAsInt">, + InterfaceMethod<"Get the LaneData field of the attribute as integer array", + "std::optional>", + "getLaneDataAsInt">, InterfaceMethod<"Derive a new layout by dropping 
sgLayout and sgData", "xegpu::DistributeLayoutAttr", "dropSgLayoutAndData">, @@ -388,6 +400,24 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { return std::nullopt; } + std::optional> getInstDataAsInt() const { + if (DenseI32ArrayAttr inst = getInstData()) + return llvm::to_vector_of(inst.asArrayRef()); + return std::nullopt; + } + + std::optional> getLaneLayoutAsInt() const { + if (DenseI32ArrayAttr layout = getLaneLayout()) + return llvm::to_vector_of(layout.asArrayRef()); + return std::nullopt; + } + + std::optional> getLaneDataAsInt() const { + if (DenseI32ArrayAttr data = getLaneData()) + return llvm::to_vector_of(data.asArrayRef()); + return std::nullopt; + } + /// Delinearizes a linear subgroup ID into its multidimensional indices /// based on the effective subgroup layout. FailureOr> @@ -488,6 +518,42 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { return std::nullopt; } + /// Returns the InstData of the attribute, computed by applying + /// the slice dimensions to the underlying LayoutAttr. + std::optional> getInstDataAsInt() const { + SliceAttr attr = flatten(); + auto parent = dyn_cast(attr.getParent()); + if (auto inst = parent.getInstDataAsInt()) { + ArrayRef dims = attr.getDims().asArrayRef(); + return XeGPUDialect::slice(llvm::ArrayRef(*inst), dims); + } + return std::nullopt; + } + + /// Returns the LaneLayout of the attribute, computed by applying + /// the slice dimensions to the underlying LayoutAttr. + std::optional> getLaneLayoutAsInt() const { + SliceAttr attr = flatten(); + auto parent = dyn_cast(attr.getParent()); + if (auto layout = parent.getLaneLayoutAsInt()) { + ArrayRef dims = attr.getDims().asArrayRef(); + return XeGPUDialect::slice(llvm::ArrayRef(*layout), dims); + } + return std::nullopt; + } + + /// Returns the LaneData of the attribute, computed by applying + /// the slice dimensions to the underlying LayoutAttr. + std::optional> getLaneDataAsInt() const { + SliceAttr attr = flatten(); + auto parent = dyn_cast(attr.getParent()); + if (auto data = parent.getLaneDataAsInt()) { + ArrayRef dims = attr.getDims().asArrayRef(); + return XeGPUDialect::slice(llvm::ArrayRef(*data), dims); + } + return std::nullopt; + } + SliceAttr dropSgLayoutAndData() { SliceAttr attr = flatten(); auto parent = dyn_cast(attr.getParent()); diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td index 76d58e5ea2424..c173b93face98 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td @@ -40,7 +40,7 @@ def XeGPU_Dialect : Dialect { let extraClassDeclaration = [{ /// Checks if the given shape can be evenly distributed based on the layout /// and data factors provided by the LayoutAttr. - static bool isEvenlyDistributable(llvm::ArrayRef shape, xegpu::LayoutAttr attr); + static bool isEvenlyDistributable(llvm::ArrayRef shape, xegpu::DistributeLayoutAttr attr); /// drops/slices the shape in the specified dims, and return the rest. 
e.g., /// for shape = [32, 64, 8], dims = [0, 2], it will return [64] diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index b2b2d3ab85231..010199083add9 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -21,6 +21,7 @@ class ValueRange; class TypeConverter; namespace xegpu { +class DistributeLayoutAttr; class LayoutAttr; class TensorDescType; } // namespace xegpu @@ -60,22 +61,22 @@ FailureOr getDistributedVectorType(xegpu::TensorDescType tdescTy); FailureOr getDistributedVectorType(VectorType originalType, LayoutAttr layout); -/// Return the attribute name for the OpOperand to attach LayoutAttr +/// Return the attribute name for the OpOperand to attach DistributeLayoutAttr std::string getLayoutName(const OpOperand &operand); -/// Return the attribute name for the OpResult to attach LayoutAttr +/// Return the attribute name for the OpResult to attach DistributeLayoutAttr std::string getLayoutName(const OpResult result); -/// Retrieves the LayoutAttr associated with a given Value. For TensorDescType -/// values, the LayoutAttr is extracted from the TensorDescType itself. For +/// Retrieves the DistributeLayoutAttr associated with a given Value. For TensorDescType +/// values, the DistributeLayoutAttr is extracted from the TensorDescType itself. For /// other values, it is obtained from the attributes of the defining operation. -/// Returns nullptr if no LayoutAttr is found. -LayoutAttr getLayoutAttr(const Value value); +/// Returns nullptr if no DistributeLayoutAttr is found. +DistributeLayoutAttr getDistributeLayoutAttr(const Value value); -/// Retrieves the LayoutAttr associated with a given OpOperand. It will +/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It will /// first check the operand_layout_{id} of the owner operation. If not found, /// it will check the operand itself and its defining op. -LayoutAttr getLayoutAttr(const OpOperand &opr); +DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr); /// Removes the LayoutAttr for a given OpOperand or OpResult if it exists. template >> void removeLayoutAttr(const T &operandOrResult); -/// Removes the LayoutAttr for each OpOperand and OpResult of the given +/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the given /// operation if they exist. If the operation contains regions, it is also /// applied recursively to the contained operations void removeLayoutAttrs(Operation *op); -/// Sets the LayoutAttr for a given OpOperand or OpResult by attaching +/// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching /// it to the owner's dictionary attributes template || std::is_same_v>> -void setLayoutAttr(const T &operandOrResult, const LayoutAttr layout); +void setLayoutAttr(const T &operandOrResult, const DistributeLayoutAttr layout); -/// Set the LayoutAttr for each OpOperand and OpResult of the given operation. +/// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given operation. 
/// If the operation contains regions, it is also applied recursively to the /// contained operations void setLayoutAttrs(Operation *op, - function_ref getLayoutImpl); + function_ref getLayoutImpl); /// Extract a set of small vectors from a value with a given shape using /// vector.extract_stride_slice diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index a2d708be0e937..2079848c878a3 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -91,7 +91,7 @@ genOffsetsComputingInsts(OpBuilder &builder, Location loc, // Checks if the given shape can be evenly distributed based on the layout // and data factors provided by the LayoutAttr. bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef shape, - xegpu::LayoutAttr attr) { + xegpu::DistributeLayoutAttr attr) { assert(attr && "Layout attribute is missing."); // Checks whether the given shape can be evenly distributed using the @@ -104,52 +104,51 @@ bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef shape, // smaller than `layout[i] * data[i]`, allowing multiple compute units to // share the data. auto tryDistribute = [&](llvm::ArrayRef shape, - DenseI32ArrayAttr layout, DenseI32ArrayAttr data, + std::optional> layout, + std::optional> data, bool rr = true) -> optional> { llvm::SmallVector newShape(shape); if (layout) { - auto vec = llvm::to_vector_of(layout.asArrayRef()); - if (vec.size() != shape.size()) + if ((*layout).size() != shape.size()) return std::nullopt; - auto ratio = computeShapeRatio(shape, vec); + auto ratio = computeShapeRatio(shape, *layout); if (!ratio.has_value()) return std::nullopt; newShape = ratio.value(); } if (data) { - auto vec = llvm::to_vector_of(data.asArrayRef()); - if (vec.size() != shape.size()) + if ((*data).size() != shape.size()) return std::nullopt; - auto ratio = computeShapeRatio(newShape, vec); + auto ratio = computeShapeRatio(newShape, *data); if (!ratio.has_value() && rr) - ratio = computeShapeRatio(vec, newShape); + ratio = computeShapeRatio(*data, newShape); if (!ratio.has_value()) return std::nullopt; // if data is not null, we always return it for next phase. 
- newShape = vec; + newShape = *data; } return newShape; }; // check the sgLayout and sgData auto maybeSgShape = - tryDistribute(shape, attr.getSgLayout(), attr.getSgData()); + tryDistribute(shape, attr.getSgLayoutAsInt(), attr.getSgDataAsInt()); if (!maybeSgShape) return false; auto sgShape = maybeSgShape.value(); // check InstData, it neither have layout nor need round-robin auto maybeInstShape = - tryDistribute(sgShape, nullptr, attr.getInstData(), false); + tryDistribute(sgShape, std::nullopt, attr.getInstDataAsInt(), false); if (!maybeInstShape) return false; auto instShape = maybeInstShape.value(); // check LaneLayout and LaneData auto maybeLaneShape = - tryDistribute(instShape, attr.getLaneLayout(), attr.getLaneData(), false); + tryDistribute(instShape, attr.getLaneLayoutAsInt(), attr.getLaneDataAsInt(), false); return maybeLaneShape.has_value(); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index b3144e4c1e55d..c62597df1f895 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -140,10 +140,10 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { else value = (Value)operandOrResult; - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operandOrResult); + xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(operandOrResult); if (layout && layout.isForSubgroup()) { - if (auto inst_data = layout.getInstData()) - return llvm::to_vector_of(inst_data.asArrayRef()); + if (auto inst_data = layout.getInstDataAsInt()) + return inst_data.value(); if (auto type = dyn_cast(value.getType())) return llvm::to_vector(type.getShape()); @@ -204,12 +204,12 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const { // skip the op if any of its operands or results has workgroup level layouts bool hasWgLayoutOperands = llvm::any_of(op->getOpOperands(), [](OpOperand &opr) { - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(opr); + xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(opr); return layout && layout.isForWorkgroup(); }); bool hasWgLayoutResults = llvm::any_of(op->getOpResults(), [](OpResult result) { - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result); + xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(result); return layout && layout.isForWorkgroup(); }); if (hasWgLayoutOperands || hasWgLayoutResults) { @@ -220,8 +220,8 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const { auto isUnrollable = [](Value value, ArrayRef tileShape) { Type valTy = value.getType(); if (auto tdescTy = dyn_cast(valTy)) { - xegpu::LayoutAttr layout = tdescTy.getLayoutAttr(); - return layout && layout.getInstData(); + xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr(); + return layout && layout.getInstDataAsInt(); } auto shapedType = dyn_cast(valTy); return shapedType && !llvm::equal(tileShape, shapedType.getShape()); @@ -247,7 +247,7 @@ void XeGPUBlockingPass::runOnOperation() { // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr. // This ensures that the LayoutAttr remains accessible even if the defining // operation is replaced. 
- xegpu::setLayoutAttrs(op, [](Value v) { return xegpu::getLayoutAttr(v); }); + xegpu::setLayoutAttrs(op, [](Value v) { return xegpu::getDistributeLayoutAttr(v); }); auto getTileShapeAndCount = [](llvm::ArrayRef shape, xegpu::LayoutAttr layout) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 2088c3c7fc5ec..de9378bd7a6f6 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -841,7 +841,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() { if (!isa(operand.get().getType())) continue; - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand); + auto layout = dyn_cast(xegpu::getDistributeLayoutAttr(operand)); if (!layout) { op->emitError("Could not find layout attribute for operand ") << operand.getOperandNumber() << " of operation " << op->getName(); @@ -882,7 +882,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() { if (vecRank == 0) return AffineMap::get(val.getContext()); // Get the layout of the vector type. - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(val); + // TODO: support more layout types + auto layout = dyn_cast(xegpu::getDistributeLayoutAttr(val)); // If no layout is specified, assume the inner most dimension is distributed // for now. if (!layout) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 93b4efcd125ec..c60f9e361bf8e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -406,7 +406,7 @@ struct WgToSgDpasOp : public OpConversionPattern { if (resultTy.getRank() != 2) return failure(); - auto originalLayout = xegpu::getLayoutAttr(op.getResult()); + auto originalLayout = xegpu::getDistributeLayoutAttr(op.getResult()); if (!originalLayout) return failure(); @@ -470,8 +470,8 @@ struct WgToSgVectorBroadcastOp VectorType resultType = op.getResult().getType(); ArrayRef wgShape = resultType.getShape(); - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op.getResult()); - if (!layout || !layout.getSgLayout()) + xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(op.getResult()); + if (!layout || !layout.isForWorkgroup()) return failure(); // TODO: Currently only supports cases where the source and result ranks @@ -487,8 +487,8 @@ struct WgToSgVectorBroadcastOp // Check if the output layout is distributable SmallVector sgLayout; - if (auto sgLayoutAttr = layout.getSgLayout()) - sgLayout = llvm::to_vector_of(sgLayoutAttr.asArrayRef()); + if (auto maybeSgLayout = layout.getSgLayoutAsInt()) + sgLayout = *maybeSgLayout; else return failure(); @@ -535,8 +535,8 @@ struct WgToSgElementwiseOp : public ConversionPattern { ArrayRef wgShape = resultType.getShape(); - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op->getResult(0)); - if (!layout || !layout.getSgLayout()) + xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(op->getResult(0)); + if (!layout || !layout.isForWorkgroup()) return failure(); SmallVector sgShape = getSgShapeAndCount(wgShape, layout).first; @@ -737,8 +737,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern { if (!vecAttr || !vecAttr.isSplat() || !vecType) return failure(); - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op.getResult()); - if (!layout || !layout.getSgLayout()) + xegpu::DistributeLayoutAttr layout = 
xegpu::getDistributeLayoutAttr(op.getResult()); + if (!layout || !layout.isForWorkgroup()) return failure(); ArrayRef wgShape = vecType.getShape(); @@ -928,7 +928,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { }); target.addDynamicallyLegalOp([=](xegpu::DpasOp op) -> bool { - auto layout = xegpu::getLayoutAttr(op.getResult()); + auto layout = xegpu::getDistributeLayoutAttr(op.getResult()); return isLegal(layout); }); @@ -947,12 +947,12 @@ void XeGPUWgToSgDistributePass::runOnOperation() { auto vecType = dyn_cast(op.getType()); if (!vecType) return true; - return isLegal(xegpu::getLayoutAttr(op.getResult())); + return isLegal(xegpu::getDistributeLayoutAttr(op.getResult())); }); target.addDynamicallyLegalOp( [=](vector::BroadcastOp op) -> bool { - return isLegal(xegpu::getLayoutAttr(op.getResult())); + return isLegal(xegpu::getDistributeLayoutAttr(op.getResult())); }); target.addDynamicallyLegalOp( @@ -980,7 +980,7 @@ void XeGPUWgToSgDistributePass::runOnOperation() { } } - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op->getResult(0)); + xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(op->getResult(0)); return isLegal(layout); }); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 6835f64ad8ef7..5ae025ef34739 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -114,7 +114,7 @@ std::string xegpu::getLayoutName(const OpResult result) { return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str(); } -xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { +xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { if (!value) return nullptr; @@ -132,11 +132,11 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { // for LoadNdOp, the layout is stored in the tensor descriptor if (auto loadNd = dyn_cast(defOp)) - return getLayoutAttr(loadNd.getTensorDesc()); + return getDistributeLayoutAttr(loadNd.getTensorDesc()); std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) - return defOp->getAttrOfType(layoutName); + return defOp->getAttrOfType(layoutName); } if (auto arg = dyn_cast(value)) { @@ -144,41 +144,41 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { if (auto loop = dyn_cast(parentOp)) { OpOperand *tiedInit = loop.getTiedLoopInit(arg); if (tiedInit) - return getLayoutAttr(tiedInit->get()); + return getDistributeLayoutAttr(tiedInit->get()); } } return nullptr; } -xegpu::LayoutAttr xegpu::getLayoutAttr(const OpOperand &opr) { +xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); std::string layoutName = xegpu::getLayoutName(opr); if (op->hasAttr(layoutName)) - return op->getAttrOfType(layoutName); - return getLayoutAttr(opr.get()); + return op->getAttrOfType(layoutName); + return getDistributeLayoutAttr(opr.get()); } template -void xegpu::setLayoutAttr(const T &operandOrResult, const LayoutAttr layout) { +void xegpu::setLayoutAttr(const T &operandOrResult, const DistributeLayoutAttr layout) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getLayoutName(operandOrResult); - if (layout && !owner->hasAttrOfType(name)) + if (layout && !owner->hasAttrOfType(name)) owner->setAttr(name, layout); } // Explicit instantiation for OpResult template void xegpu::setLayoutAttr(const mlir::OpResult &result, - const mlir::xegpu::LayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr 
layout); // Explicit instantiation for OpOperand template void xegpu::setLayoutAttr(const mlir::OpOperand &operand, - const mlir::xegpu::LayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr layout); void xegpu::setLayoutAttrs(Operation *op, - function_ref getLayoutImpl) { + function_ref getLayoutImpl) { op->walk([&](Operation *nestOp) { for (OpOperand &opr : nestOp->getOpOperands()) { auto layout = getLayoutImpl(opr.get()); @@ -195,7 +195,7 @@ template void xegpu::removeLayoutAttr(const T &operandOrResult) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getLayoutName(operandOrResult); - if (owner->hasAttrOfType(name)) + if (owner->hasAttrOfType(name)) owner->removeAttr(name); } @@ -306,7 +306,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( if (!inputTy || !resultTy) return WalkResult::skip(); - xegpu::LayoutAttr layout = xegpu::getLayoutAttr(input); + xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(input); if (!layout) return WalkResult::skip(); @@ -344,7 +344,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( } { // perform the conversion from RankedTensorType to VectorType based on the - // LayoutAttr + // DistributeLayoutAttr // Handle the UnrealizedConversionCastOp introduced by the first step. // For vector->RankedTensorType, it will simply forward the inputs. From 0e34f36690a34f071afd181649b8f86c90dde9b4 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Thu, 21 Aug 2025 18:10:49 +0000 Subject: [PATCH 03/36] refine --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 17 +++++++++++--- .../XeGPU/Transforms/XeGPUBlocking.cpp | 5 ++-- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 4 ++-- .../Transforms/XeGPUSubgroupDistribute.cpp | 7 +++--- .../Transforms/XeGPUWgToSgDistribute.cpp | 10 ++++---- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 23 ++++++++++--------- 6 files changed, 40 insertions(+), 26 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 010199083add9..7089559d0c51b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -73,11 +73,21 @@ std::string getLayoutName(const OpResult result); /// Returns nullptr if no DistributeLayoutAttr is found. DistributeLayoutAttr getDistributeLayoutAttr(const Value value); +template +AttrTy getDistributeLayoutAttrOfType(const Value value) { + return dyn_cast_if_present(getDistributeLayoutAttr(value)); +} + /// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It will /// first check the operand_layout_{id} of the owner operation. If not found, /// it will check the operand itself and its defining op. DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr); +template +AttrTy getDistributeLayoutAttrOfType(const OpOperand &opr) { + return dyn_cast_if_present(getDistributeLayoutAttr(opr)); +} + /// Removes the LayoutAttr for a given OpOperand or OpResult if it exists. template || @@ -94,13 +104,14 @@ void removeLayoutAttrs(Operation *op); template || std::is_same_v>> -void setLayoutAttr(const T &operandOrResult, const DistributeLayoutAttr layout); +void setDistributeLayoutAttr(const T &operandOrResult, + const DistributeLayoutAttr layout); /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given operation. 
/// If the operation contains regions, it is also applied recursively to the /// contained operations -void setLayoutAttrs(Operation *op, - function_ref getLayoutImpl); +void setDistributeLayoutAttrs( + Operation *op, function_ref getLayoutImpl); /// Extract a set of small vectors from a value with a given shape using /// vector.extract_stride_slice diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index c62597df1f895..2e3e40ed2d457 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -247,7 +247,8 @@ void XeGPUBlockingPass::runOnOperation() { // Preserve the LayoutAttr for each operand to the owner's DictionaryAttr. // This ensures that the LayoutAttr remains accessible even if the defining // operation is replaced. - xegpu::setLayoutAttrs(op, [](Value v) { return xegpu::getDistributeLayoutAttr(v); }); + xegpu::setDistributeLayoutAttrs( + op, [](Value v) { return xegpu::getDistributeLayoutAttr(v); }); auto getTileShapeAndCount = [](llvm::ArrayRef shape, xegpu::LayoutAttr layout) { @@ -377,7 +378,7 @@ void XeGPUBlockingPass::runOnOperation() { if (auto layout = op->getAttrOfType(name)) { op->removeAttr(name); if (!isa(op)) - xegpu::setLayoutAttr(result, layout.dropInstData()); + xegpu::setDistributeLayoutAttr(result, layout.dropInstData()); } } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index bef88042fc663..5cb47b2accd68 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -718,7 +718,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a vector type, add a temporary layout attribute to the // op. - xegpu::setLayoutAttr(result, layout); + xegpu::setDistributeLayoutAttr(result, layout); } return success(); } @@ -800,7 +800,7 @@ updateControlFlowOps(mlir::OpBuilder &builder, // If the type is a vector type and this region argument is an OpResult, // set the layout attribute on the OpResult. if (auto result = dyn_cast(successorInput)) - xegpu::setLayoutAttr(result, successorOperandLayout); + xegpu::setDistributeLayoutAttr(result, successorOperandLayout); } } return success(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index de9378bd7a6f6..e48e2180197ec 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -841,14 +841,15 @@ void XeGPUSubgroupDistributePass::runOnOperation() { if (!isa(operand.get().getType())) continue; - auto layout = dyn_cast(xegpu::getDistributeLayoutAttr(operand)); + auto layout = + xegpu::getDistributeLayoutAttrOfType(operand); if (!layout) { op->emitError("Could not find layout attribute for operand ") << operand.getOperandNumber() << " of operation " << op->getName(); signalPassFailure(); return; } - xegpu::setLayoutAttr(operand, layout); + xegpu::setDistributeLayoutAttr(operand, layout); } }); // Step 2: Move all operations of a GPU function inside @@ -883,7 +884,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() { return AffineMap::get(val.getContext()); // Get the layout of the vector type. 
// TODO: support more layout types - auto layout = dyn_cast(xegpu::getDistributeLayoutAttr(val)); + auto layout = xegpu::getDistributeLayoutAttrOfType(val); // If no layout is specified, assume the inner most dimension is distributed // for now. if (!layout) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index c60f9e361bf8e..a8700ca73efc4 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -429,8 +429,8 @@ struct WgToSgDpasOp : public OpConversionPattern { VectorType resTy = VectorType::get({aVecShape[0], bVecShape[1]}, resultTy.getElementType()); tmpC = xegpu::DpasOp::create(rewriter, loc, resTy, operands); - xegpu::setLayoutAttr(cast(tmpC), - originalLayout.dropSgLayoutAndData()); + xegpu::setDistributeLayoutAttr(cast(tmpC), + originalLayout.dropSgLayoutAndData()); newDpasOps.push_back(tmpC); } @@ -508,8 +508,8 @@ struct WgToSgVectorBroadcastOp for (auto operand : adaptor.getOperands().front()) { auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(), newResultType, operand); - xegpu::setLayoutAttr(newBroadcast->getResult(0), - layout.dropSgLayoutAndData()); + xegpu::setDistributeLayoutAttr(newBroadcast->getResult(0), + layout.dropSgLayoutAndData()); newBroadcastOps.push_back(newBroadcast.getResult()); } @@ -755,7 +755,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern { auto cstOp = arith::ConstantOp::create(rewriter, op.getLoc(), newType, sgAttr); if (auto newLayout = layout.dropSgLayoutAndData()) - xegpu::setLayoutAttr(cstOp->getResult(0), newLayout); + xegpu::setDistributeLayoutAttr(cstOp->getResult(0), newLayout); SmallVector newConsts(count, cstOp); rewriter.replaceOpWithMultiple(op, {newConsts}); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 5ae025ef34739..1d4de68754c20 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -160,7 +160,8 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const OpOperand &opr) } template -void xegpu::setLayoutAttr(const T &operandOrResult, const DistributeLayoutAttr layout) { +void xegpu::setDistributeLayoutAttr(const T &operandOrResult, + const DistributeLayoutAttr layout) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getLayoutName(operandOrResult); if (layout && !owner->hasAttrOfType(name)) @@ -168,25 +169,25 @@ void xegpu::setLayoutAttr(const T &operandOrResult, const DistributeLayoutAttr l } // Explicit instantiation for OpResult -template void -xegpu::setLayoutAttr(const mlir::OpResult &result, - const mlir::xegpu::DistributeLayoutAttr layout); +template void xegpu::setDistributeLayoutAttr( + const mlir::OpResult &result, + const mlir::xegpu::DistributeLayoutAttr layout); // Explicit instantiation for OpOperand -template void -xegpu::setLayoutAttr(const mlir::OpOperand &operand, - const mlir::xegpu::DistributeLayoutAttr layout); +template void xegpu::setDistributeLayoutAttr( + const mlir::OpOperand &operand, + const mlir::xegpu::DistributeLayoutAttr layout); -void xegpu::setLayoutAttrs(Operation *op, - function_ref getLayoutImpl) { +void xegpu::setDistributeLayoutAttrs( + Operation *op, function_ref getLayoutImpl) { op->walk([&](Operation *nestOp) { for (OpOperand &opr : nestOp->getOpOperands()) { auto layout = getLayoutImpl(opr.get()); - setLayoutAttr(opr, layout); + 
setDistributeLayoutAttr(opr, layout); } for (OpResult result : nestOp->getOpResults()) { auto layout = getLayoutImpl(result); - setLayoutAttr(result, layout); + setDistributeLayoutAttr(result, layout); } }); } From a84014ff42002dc5b036558c62e5387536e74019 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Thu, 21 Aug 2025 18:12:17 +0000 Subject: [PATCH 04/36] format --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 25 ++++++++++--------- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 4 +-- .../XeGPU/Transforms/XeGPUBlocking.cpp | 9 ++++--- .../Transforms/XeGPUWgToSgDistribute.cpp | 12 ++++++--- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 6 +++-- 5 files changed, 33 insertions(+), 23 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 7089559d0c51b..82fd70571c022 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -67,10 +67,11 @@ std::string getLayoutName(const OpOperand &operand); /// Return the attribute name for the OpResult to attach DistributeLayoutAttr std::string getLayoutName(const OpResult result); -/// Retrieves the DistributeLayoutAttr associated with a given Value. For TensorDescType -/// values, the DistributeLayoutAttr is extracted from the TensorDescType itself. For -/// other values, it is obtained from the attributes of the defining operation. -/// Returns nullptr if no DistributeLayoutAttr is found. +/// Retrieves the DistributeLayoutAttr associated with a given Value. For +/// TensorDescType values, the DistributeLayoutAttr is extracted from the +/// TensorDescType itself. For other values, it is obtained from the attributes +/// of the defining operation. Returns nullptr if no DistributeLayoutAttr is +/// found. DistributeLayoutAttr getDistributeLayoutAttr(const Value value); template @@ -78,9 +79,9 @@ AttrTy getDistributeLayoutAttrOfType(const Value value) { return dyn_cast_if_present(getDistributeLayoutAttr(value)); } -/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It will -/// first check the operand_layout_{id} of the owner operation. If not found, -/// it will check the operand itself and its defining op. +/// Retrieves the DistributeLayoutAttr associated with a given OpOperand. It +/// will first check the operand_layout_{id} of the owner operation. If not +/// found, it will check the operand itself and its defining op. DistributeLayoutAttr getDistributeLayoutAttr(const OpOperand &opr); template @@ -94,8 +95,8 @@ template >> void removeLayoutAttr(const T &operandOrResult); -/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the given -/// operation if they exist. If the operation contains regions, it is also +/// Removes the DistributeLayoutAttr for each OpOperand and OpResult of the +/// given operation if they exist. 
If the operation contains regions, it is also /// applied recursively to the contained operations void removeLayoutAttrs(Operation *op); @@ -107,9 +108,9 @@ template getLayoutImpl); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 2079848c878a3..6de6049facfc6 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -147,8 +147,8 @@ bool XeGPUDialect::isEvenlyDistributable(llvm::ArrayRef shape, auto instShape = maybeInstShape.value(); // check LaneLayout and LaneData - auto maybeLaneShape = - tryDistribute(instShape, attr.getLaneLayoutAsInt(), attr.getLaneDataAsInt(), false); + auto maybeLaneShape = tryDistribute(instShape, attr.getLaneLayoutAsInt(), + attr.getLaneDataAsInt(), false); return maybeLaneShape.has_value(); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 2e3e40ed2d457..45fed8e548a89 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -140,7 +140,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { else value = (Value)operandOrResult; - xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(operandOrResult); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(operandOrResult); if (layout && layout.isForSubgroup()) { if (auto inst_data = layout.getInstDataAsInt()) return inst_data.value(); @@ -204,12 +205,14 @@ bool XeGPUBlockingPass::needsUnroll(Operation *op) const { // skip the op if any of its operands or results has workgroup level layouts bool hasWgLayoutOperands = llvm::any_of(op->getOpOperands(), [](OpOperand &opr) { - xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(opr); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(opr); return layout && layout.isForWorkgroup(); }); bool hasWgLayoutResults = llvm::any_of(op->getOpResults(), [](OpResult result) { - xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(result); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(result); return layout && layout.isForWorkgroup(); }); if (hasWgLayoutOperands || hasWgLayoutResults) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index a8700ca73efc4..518c7817a516e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -470,7 +470,8 @@ struct WgToSgVectorBroadcastOp VectorType resultType = op.getResult().getType(); ArrayRef wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getResult()); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -535,7 +536,8 @@ struct WgToSgElementwiseOp : public ConversionPattern { ArrayRef wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(op->getResult(0)); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op->getResult(0)); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -737,7 +739,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern { if (!vecAttr || !vecAttr.isSplat() || !vecType) return failure(); - xegpu::DistributeLayoutAttr layout = 
xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getResult()); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -980,7 +983,8 @@ void XeGPUWgToSgDistributePass::runOnOperation() { } } - xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(op->getResult(0)); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op->getResult(0)); return isLegal(layout); }); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 1d4de68754c20..cac1ffe4d3bc3 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -151,7 +151,8 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { return nullptr; } -xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const OpOperand &opr) { +xegpu::DistributeLayoutAttr +xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); std::string layoutName = xegpu::getLayoutName(opr); if (op->hasAttr(layoutName)) @@ -307,7 +308,8 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType( if (!inputTy || !resultTy) return WalkResult::skip(); - xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(input); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(input); if (!layout) return WalkResult::skip(); From f3af2c307597bf13a04579b3235b45af7ea10392 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Thu, 21 Aug 2025 18:59:45 +0000 Subject: [PATCH 05/36] update convert_layout --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 3 +++ mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 4 ++-- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 6 +++--- mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 5 +++-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 5b4b376157c00..77e3c257f234e 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -217,6 +217,9 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Derive a new layout by dropping sgLayout and sgData", "xegpu::DistributeLayoutAttr", "dropSgLayoutAndData">, + InterfaceMethod<"Derive a new layout by dropping InstData", + "xegpu::DistributeLayoutAttr", + "dropInstData">, InterfaceMethod<[{Delinearizes a linear subgroup ID into its multidimensional indices based on the effective subgroup layout.}], "FailureOr>", diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index ab471a1f33ef9..2f6671c5e37cc 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -1162,8 +1162,8 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou the IR is lowered to WI level because that is the end result of all distributions. 
}]; let arguments = (ins XeGPU_VectorType: $source, - XeGPU_LayoutAttr: $input_layout, - XeGPU_LayoutAttr: $target_layout); + DistributeLayoutAttr: $input_layout, + DistributeLayoutAttr: $target_layout); let results = (outs XeGPU_VectorType: $result); let assemblyFormat = [{ $source prop-dict attr-dict `:` type($source) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 45fed8e548a89..80e9d4d25b06c 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -84,9 +84,9 @@ struct ConvertLayoutOpPattern using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, PatternRewriter &rewriter) const override { - xegpu::LayoutAttr input_layout = op.getInputLayoutAttr(); - xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr(); - if (!input_layout.getInstData() || !target_layout.getInstData()) + xegpu::DistributeLayoutAttr input_layout = op.getInputLayoutAttr(); + xegpu::DistributeLayoutAttr target_layout = op.getTargetLayoutAttr(); + if (!input_layout.getInstDataAsInt() || !target_layout.getInstDataAsInt()) return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp."); input_layout = input_layout.dropInstData(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 518c7817a516e..4fb962908793f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -613,8 +613,9 @@ struct WgToSgConvertLayoutOp LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - xegpu::LayoutAttr input = op.getInputLayout(); - xegpu::LayoutAttr target = op.getTargetLayout(); + // TODO: currently, we only support LayoutAttr + auto input = dyn_cast(op.getInputLayout()); + auto target = dyn_cast(op.getTargetLayout()); if (!input || !target || !input.isForWorkgroup() || !target.isForWorkgroup()) From ee5baca1ccae6549aca46693814f9c8ea8b995e7 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 21 Aug 2025 22:54:47 +0000 Subject: [PATCH 06/36] save work --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 42 ++++++---------- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 48 +++++++++++++++++-- 2 files changed, 58 insertions(+), 32 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 10c2759493477..8dce63b80f373 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -639,46 +639,32 @@ void LayoutInfoPropagation::visitVectorBitcastOp( : outElemTyBitWidth / inElemTyBitWidth; const LaneLayout &sourceLaneLayout = resultLayout.getLayout(); // source lane layout is unchanged. - ArrayRef currData = resultLayout.getDataAsArrayRef(); + ArrayRef outData = resultLayout.getDataAsArrayRef(); // TODO: Currently we assume that bitcasts does not require cross lane // communication. So each lane must own the required number of elements to // perform the bitcast locally without cross-lane communication. - // For 1D vectors, decide how many elements each lane owns based on whether - // the bitcast is narrowing or widening. 
- if (rank == 1) { - if ((currData[0] * outElemTyBitWidth) % inElemTyBitWidth != 0) { - bitcast.emitWarning( - "Narrowing bitcast with cross lane communication is not supported."); - return; - } - LaneData sourceLaneData = isNarrowing - ? LaneData({currData[0] / bitCastRatio}) - : LaneData({currData[0] * bitCastRatio}); - - propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo( - sourceLaneLayout, sourceLaneData))); + int outInnerBitsPerLane = outData[rank - 1] * outElemTyBitWidth; + if (outInnerBitsPerLane < inElemTyBitWidth) { + bitcast.emitWarning( + "Narrowing bitcast with cross lane communication is not supported."); + return; } - // For nD vectors, Each lane is not allowed to own multiple elements in any - // dimension other than the innermost dimension. - // TODO: Add support for other case depending on the use case. - SmallVector sourceLaneDataStorage(currData.begin(), - currData.end() - 1); + // Check if each lane owns a single element in all dimensions except the + // innermost dimension. For example, if the result layout is [1, 16][2, 1], we + // are not allowed to bitcast such vectors. + // TODO: Relax this based on use cases. + SmallVector sourceLaneDataStorage(outData.begin(), + outData.end() - 1); if (llvm::any_of(sourceLaneDataStorage, [](int64_t d) { return d != 1; })) { bitcast.emitWarning( "Each lane must not own multiple elements in any dimension other than " "the innermost dimension."); return; } - // Check if the bitcast requires cross lane communication. - if ((currData[rank - 1] * outElemTyBitWidth) % inElemTyBitWidth != 0) { - bitcast.emitWarning( - "Narrowing bitcast with cross lane communication is not supported."); - return; - } // Decide lane data based on whether the bitcast is narrowing or widening. - int64_t innerMostLaneData = isNarrowing ? currData[rank - 1] / bitCastRatio - : currData[rank - 1] * bitCastRatio; + int64_t innerMostLaneData = isNarrowing ? 
outData[rank - 1] / bitCastRatio + : outData[rank - 1] * bitCastRatio; sourceLaneDataStorage.push_back(innerMostLaneData); LaneData sourceLaneData(sourceLaneDataStorage); diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 4cbe4db271ad6..994fa44cab0b6 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -164,9 +164,14 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { // ----- // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xi16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xi16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout} : vector<8x16xi16> to vector<8x16xf16> -// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xi16> to vector<16x16xf16> +// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout> -> vector<8x16xi16> +// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout> -> vector<16x16xi16> +// CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<8x16xi16> to vector<8x16xf16> +// CHECK: %{{.*}} = vector.bitcast %[[LOAD1]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<16x16xi16> to vector<16x16xf16> func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x16xi16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> @@ -183,7 +188,10 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 // ----- // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( -// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x16xf16> +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<16x8xi32> to vector<16x16xf16> func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -198,6 +206,38 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 return } +// ----- +// CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout> -> vector<8x32xi16> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<8x32xi16> to vector<8x16xi32> +func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16xi32>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xi16> -> vector<8x32xi16> + %3 = vector.bitcast %2 : vector<8x32xi16> to 
vector<8x16xi32> + xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> + return +} + +// ----- +// CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle( +// CHECK-NOT: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = {{.*}}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> +// CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<8x16xi32> to vector<8x32xi16> +func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %arg1: memref<8x32xi16>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> + %3 = vector.bitcast %2 : vector<8x16xi32> to vector<8x32xi16> + xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16> + return +} + + // ----- // CHECK-LABEL: func.func @binary_op_one_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, From 621122c50d7df5adb6ed33d94b8055fdc480ecdd Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 21 Aug 2025 23:14:40 +0000 Subject: [PATCH 07/36] save work --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 8dce63b80f373..d8c447dd46338 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -107,7 +107,6 @@ struct LayoutInfo { private: LaneLayout laneLayout; LaneData laneData; - xegpu::LayoutAttr layoutAttr; public: LayoutInfo() = default; @@ -464,7 +463,7 @@ void LayoutInfoPropagation::visitVectorBroadCastOp( LayoutInfo resultLayout = results[0]->getValue(); if (!resultLayout.isAssigned()) return; - // Only consider 1D -> 2D broadcasts or 2D -> 2D broadcasts. + // Only consider vector to vector broadcasts for now. VectorType resultTy = broadcast.getResultVectorType(); VectorType sourceTy = dyn_cast(broadcast.getSourceType()); if (!sourceTy) { @@ -472,7 +471,7 @@ void LayoutInfoPropagation::visitVectorBroadCastOp( return; } - // Only conside 2D -> 2D broadcast. + // Only consider 2D -> 2D broadcast. 
if (sourceTy.getRank() != 2 || resultTy.getRank() != 2) { broadcast.emitWarning("Expecting source type to be 2D vector and " "result type to be 2D vector."); From 35c64895111db5d7019a64078fbe719dce317b95 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Fri, 22 Aug 2025 14:45:35 +0000 Subject: [PATCH 08/36] fix compilation error in clang --- mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 82fd70571c022..bad734dbfd9f0 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -9,6 +9,7 @@ #ifndef MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_ #define MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_ +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OpDefinition.h" namespace mlir { From b912c21cf84eee0b574f4acc8db036270d9efb36 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 25 Aug 2025 22:08:11 +0000 Subject: [PATCH 09/36] save work --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 289 +++++++++++------- 1 file changed, 173 insertions(+), 116 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index d8c447dd46338..5bba85dd4d3bc 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -21,6 +21,7 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" #include "mlir/IR/Visitors.h" @@ -29,6 +30,7 @@ #include "mlir/Support/LLVM.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Casting.h" @@ -36,6 +38,7 @@ #include "llvm/Support/InterleavedRange.h" #include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" +#include namespace mlir { namespace xegpu { @@ -58,30 +61,32 @@ namespace { /// Helper class to store the ND layout of lanes within a subgroup and data /// owned by each lane. -struct Layout { - SmallVector layout; - Layout() = default; - Layout(std::initializer_list list) : layout(list) {} - Layout(SmallVector &list) : layout(list) {} - void print(llvm::raw_ostream &os) const; - size_t size() const { return layout.size(); } - int64_t operator[](size_t idx) const; -}; - -int64_t Layout::operator[](size_t idx) const { - assert(idx < layout.size() && "Index out of bounds"); - return layout[idx]; -} - -void Layout::print(llvm::raw_ostream &os) const { - os << llvm::interleaved_array(layout); -} - -/// LaneLayout represents the logical layout of lanes within a subgroup when it -/// accesses some value. LaneData represents the logical layout of data owned by -/// each work item. 
-using LaneLayout = Layout; -using LaneData = Layout; +// struct Layout { +// SmallVector layout; +// Layout() = default; +// Layout(std::initializer_list list) : layout(list) {} +// Layout(SmallVector &list) : layout(list) {} +// void print(llvm::raw_ostream &os) const; +// size_t size() const { return layout.size(); } +// int64_t operator[](size_t idx) const; +// }; + +// int64_t Layout::operator[](size_t idx) const { +// assert(idx < layout.size() && "Index out of bounds"); +// return layout[idx]; +// } + +// void Layout::print(llvm::raw_ostream &os) const { +// os << llvm::interleaved_array(layout); +// } + +// /// LaneLayout represents the logical layout of lanes within a subgroup when +// it +// /// accesses some value. LaneData represents the logical layout of data owned +// by +// /// each work item. +// using LaneLayout = Layout; +// using LaneData = Layout; //===----------------------------------------------------------------------===// // LayoutInfo @@ -105,13 +110,14 @@ using LaneData = Layout; struct LayoutInfo { private: - LaneLayout laneLayout; - LaneData laneData; + mlir::Attribute storage = nullptr; public: LayoutInfo() = default; - LayoutInfo(const LaneLayout &layout, const LaneData &data) - : laneLayout(layout), laneData(data) {} + LayoutInfo(const xegpu::LayoutAttr &layout) : storage(layout) {} + LayoutInfo(const xegpu::SliceAttr &slice) : storage(slice) { + storage = slice.flatten(); + } // Two lattice values are equal if they have `some` layout. The actual // content of the layout does not matter. @@ -125,24 +131,44 @@ struct LayoutInfo { void print(raw_ostream &os) const; - bool isAssigned() const { - return laneLayout.size() > 0 && laneData.size() > 0; - } + bool isAssigned() const { return storage != nullptr; } + + LayoutInfo transpose(ArrayRef permutation) const; - LayoutInfo getTransposedLayout(ArrayRef permutation) const; + ArrayRef getLaneLayout() const { + if (!isAssigned()) + return {}; + if (isa(storage)) + return cast(storage).getLaneLayout().asArrayRef(); + xegpu::SliceAttr slice = cast(storage); + assert(isa(slice.getParent()) && + "Slice parent must be a LayoutAttr"); + auto parent = cast(slice.getParent()); + return parent.getLaneLayout().asArrayRef(); + } + ArrayRef getLaneData() const { + if (!isAssigned()) + return {}; + if (isa(storage)) + return cast(storage).getLaneData().asArrayRef(); + xegpu::SliceAttr slice = cast(storage); + assert(isa(slice.getParent()) && + "Slice parent must be a LayoutAttr"); + auto parent = cast(slice.getParent()); + return parent.getLaneData().asArrayRef(); + } + bool isSliceLayout() const { + if (!isAssigned()) + return false; + return isa(storage); + } - const LaneLayout &getLayout() const { return laneLayout; } - const LaneData &getData() const { return laneData; } - ArrayRef getLayoutAsArrayRef() const { return laneLayout.layout; } - ArrayRef getDataAsArrayRef() const { return laneData.layout; } + Attribute get() { return storage; } }; void LayoutInfo::print(raw_ostream &os) const { if (isAssigned()) { - os << "lane_layout: "; - laneLayout.print(os); - os << ", lane_data: "; - laneData.print(os); + os << storage; } else { os << "Not assigned."; } @@ -159,18 +185,30 @@ LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) { llvm_unreachable("Join should not be triggered by layout propagation."); } -/// Get the transposed layout according to the given permutation. 
-LayoutInfo -LayoutInfo::getTransposedLayout(ArrayRef permutation) const { +/// Construct a new layout with the transposed lane layout and lane data. +LayoutInfo LayoutInfo::transpose(ArrayRef permutation) const { if (!isAssigned()) return {}; - LaneLayout newLayout; - LaneData newData; + // Check if the permutation is valid. + llvm::SmallSet seen(permutation.begin(), permutation.end()); + bool hasDuplicates = seen.size() != permutation.size(); + bool withinRange = llvm::all_of(permutation, [&](size_t idx) { + return idx >= 0 && idx < permutation.size(); + }); + + if (!withinRange || hasDuplicates) { + assert(false && "Invalid permutation for transpose."); + return {}; + } + + SmallVector laneLayout; + SmallVector laneData; for (int64_t idx : permutation) { - newLayout.layout.push_back(laneLayout.layout[idx]); - newData.layout.push_back(laneData.layout[idx]); + laneLayout.push_back(static_cast(getLaneLayout()[idx])); + laneData.push_back(static_cast(getLaneData()[idx])); } - return LayoutInfo(newLayout, newData); + return LayoutInfo( + xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData)); } //===----------------------------------------------------------------------===// @@ -190,13 +228,15 @@ struct LayoutInfoLattice : public Lattice { /// Helper Function to get the default layout for uniform values like constants. /// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1]. /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. -static LayoutInfo getDefaultSIMTLayoutInfo(unsigned rank) { +static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, + unsigned rank) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); - if (rank == 1) - return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize}), - LaneData({1})); - return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}), - LaneData({1, 1})); + if (rank == 1) { + return LayoutInfo( + xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1})); + } + return LayoutInfo(xegpu::LayoutAttr::get( + ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1})); } /// Helper to get the default layout for a vector type. @@ -209,14 +249,15 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy) { "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (vectorTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(1); + return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1); // Packing factor is determined by the element type bitwidth. int packingFactor = 1; unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth; - return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}), - LaneData({1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), + {1, xegpu::targetinfo::subgroupSize}, + {1, packingFactor})); } /// Helper to get the default layout for a vector type. @@ -229,7 +270,7 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy) { "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (tdescTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(1); + return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1); // Packing factor is determined by the element type bitwidth. 
unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth(); @@ -238,16 +279,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy) { bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth : 1; - return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize, 1}), - LaneData({1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get( + tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, + {1, packingFactor})); } int packingFactor = (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth : 1; - return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}), - LaneData({1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), + {1, xegpu::targetinfo::subgroupSize}, + {1, packingFactor})); } /// Helper Function to get the expected layouts for DPAS operands. `lane_data` @@ -261,15 +304,17 @@ static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); - LaneLayout layout({1, xegpu::targetinfo::subgroupSize}); + SmallVector layout({1, xegpu::targetinfo::subgroupSize}); // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and // must have the VNNI format. if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < xegpu::targetinfo::packedSizeInBitsForDpasB) { - LaneData data({xegpu::targetinfo::packedSizeInBitsForDpasB / - elementTy.getIntOrFloatBitWidth(), - 1}); - return LayoutInfo(layout, data); + SmallVector data( + {static_cast(xegpu::targetinfo::packedSizeInBitsForDpasB / + elementTy.getIntOrFloatBitWidth()), + 1}); + return LayoutInfo( + xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data)); } // Otherwise, return the default layout for the vector type. return getDefaultSIMTLayoutInfo(vectorTy); @@ -450,7 +495,8 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( } // Given that the result is 1D, the layout of the operand should be 2D with // default layout. - LayoutInfo operandLayout = getDefaultSIMTLayoutInfo(2); + LayoutInfo operandLayout = + getDefaultSIMTLayoutInfo(reduction->getContext(), 2); propagateIfChanged(operands[0], operands[0]->meet(operandLayout)); // Accumulator should have the same layout as the result. propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); @@ -494,43 +540,55 @@ void LayoutInfoPropagation::visitShapeCastOp( LayoutInfo resultLayout = results[0]->getValue(); if (!resultLayout.isAssigned()) return; - VectorType sourceTy = shapeCast.getSourceVectorType(); - VectorType resultTy = shapeCast.getResultVectorType(); + int64_t sourceRank = shapeCast.getSourceVectorType().getRank(); + int64_t resultRank = shapeCast.getResultVectorType().getRank(); // Expecting source rank to be 1D or 2D. - if (sourceTy.getRank() != 1 && sourceTy.getRank() != 2) { + if (sourceRank != 1 && sourceRank != 2) { shapeCast.emitWarning("Expecting source type to be 1D or 2D vector."); return; } // Expecting result rank to be 1D or 2D. - if (resultTy.getRank() != 1 && resultTy.getRank() != 2) { + if (resultRank != 1 && resultRank != 2) { shapeCast.emitWarning("Expecting result type to be 1D or 2D vector."); return; } // For 2D -> 2D shape cast, propagate the result layout to the source. - if (sourceTy.getRank() == 2 && resultTy.getRank() == 2) { - // Propagate the result layout to the source operand. 
+ if (sourceRank == 2 && resultRank == 2) { propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); return; } - auto resultLayoutArray = resultLayout.getLayoutAsArrayRef(); - if (resultLayoutArray[0] != 1 && resultLayoutArray[1] != 1) { + auto resultLaneLayout = resultLayout.getLaneLayout(); + if (resultLaneLayout[0] != 1 && resultLaneLayout[1] != 1) { shapeCast.emitWarning( "Expecting result layout to be of form [1, subgroupSize] " "or [subgroupSize, 1]."); return; } - int64_t distributedDim = resultLayoutArray[0] == 1 ? 1 : 0; - // If the result shape can be evenly distributed in the distributed dimension, - // then the source layout should be [subgroupSize][1]. Otherwise, data is - // shared accross lanes (broadcasted). In that case, just assign [1][1] for - // now (TODO: Use slice for this case) - LayoutInfo sourceLayout = - resultTy.getShape()[distributedDim] % xegpu::targetinfo::subgroupSize == 0 - ? LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize}), - LaneData({1})) - : LayoutInfo(LaneLayout({1}), LaneData({1})); - // Propagate the source layout to the source operand. - propagateIfChanged(operands[0], operands[0]->meet(sourceLayout)); + ArrayRef resultShape = shapeCast.getResultVectorType().getShape(); + // For 2D -> 1D case, source gets the reusult's lane layout and lane data. + if (sourceRank == 2 && resultRank == 1) { + propagateIfChanged(operands[0], + operands[0]->meet(LayoutInfo(xegpu::LayoutAttr::get( + shapeCast->getContext(), resultLaneLayout, + resultLayout.getLaneData())))); + return; + } + + // For 1D -> 2D case, If the result shape can be evenly distributed in the + // distributed dimension, then the source layout should be [subgroupSize][1]. + // Otherwise, data is shared accross lanes (broadcasted). We use slice + // attribute for the broadcast case. + int64_t distributedDim = resultLaneLayout[0] == 1 ? 1 : 0; + xegpu::LayoutAttr plainLayout = xegpu::LayoutAttr::get( + shapeCast->getContext(), resultLaneLayout, resultLayout.getLaneData()); + if (resultShape[distributedDim] % xegpu::targetinfo::subgroupSize != 0) { + xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get( + shapeCast->getContext(), plainLayout, + DenseI64ArrayAttr::get(shapeCast->getContext(), {distributedDim})); + propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout))); + return; + } + propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(plainLayout))); } /// Propagate the layout of the result tensor to the source tensor descriptor in @@ -591,7 +649,7 @@ void LayoutInfoPropagation::visitLoadNdOp( if (auto transpose = load.getTranspose()) { load.emitWarning("Transpose effect is not expected for LoadNdOp at " "LayoutInfoPropagation stage."); - tensorDescLayout = valueLayout.getTransposedLayout(transpose.value()); + tensorDescLayout = valueLayout.transpose(transpose.value()); } // Propagate the new layout to the tensor descriptor operand. propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout)); @@ -606,8 +664,7 @@ void LayoutInfoPropagation::visitTransposeOp( LayoutInfo resultLayout = results[0]->getValue(); if (!resultLayout.isAssigned()) return; - LayoutInfo newLayout = - resultLayout.getTransposedLayout(transpose.getPermutation()); + LayoutInfo newLayout = resultLayout.transpose(transpose.getPermutation()); // Propagate the new layout to the vector operand. 
propagateIfChanged(operands[0], operands[0]->meet(newLayout)); } @@ -636,9 +693,9 @@ void LayoutInfoPropagation::visitVectorBitcastOp( bool isNarrowing = inElemTyBitWidth > outElemTyBitWidth; int bitCastRatio = isNarrowing ? inElemTyBitWidth / outElemTyBitWidth : outElemTyBitWidth / inElemTyBitWidth; - const LaneLayout &sourceLaneLayout = - resultLayout.getLayout(); // source lane layout is unchanged. - ArrayRef outData = resultLayout.getDataAsArrayRef(); + ArrayRef sourceLaneLayout = + resultLayout.getLaneLayout(); // Lane layout does not change for bitcast. + ArrayRef outData = resultLayout.getLaneData(); // TODO: Currently we assume that bitcasts does not require cross lane // communication. So each lane must own the required number of elements to @@ -650,12 +707,9 @@ void LayoutInfoPropagation::visitVectorBitcastOp( return; } // Check if each lane owns a single element in all dimensions except the - // innermost dimension. For example, if the result layout is [1, 16][2, 1], we - // are not allowed to bitcast such vectors. - // TODO: Relax this based on use cases. - SmallVector sourceLaneDataStorage(outData.begin(), - outData.end() - 1); - if (llvm::any_of(sourceLaneDataStorage, [](int64_t d) { return d != 1; })) { + // innermost dimension. + SmallVector sourceLaneData(outData.begin(), outData.end() - 1); + if (llvm::any_of(sourceLaneData, [](int64_t d) { return d != 1; })) { bitcast.emitWarning( "Each lane must not own multiple elements in any dimension other than " "the innermost dimension."); @@ -664,11 +718,12 @@ void LayoutInfoPropagation::visitVectorBitcastOp( // Decide lane data based on whether the bitcast is narrowing or widening. int64_t innerMostLaneData = isNarrowing ? outData[rank - 1] / bitCastRatio : outData[rank - 1] * bitCastRatio; - sourceLaneDataStorage.push_back(innerMostLaneData); - LaneData sourceLaneData(sourceLaneDataStorage); + sourceLaneData.push_back(innerMostLaneData); - propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo( - sourceLaneLayout, sourceLaneData))); + propagateIfChanged( + operands[0], + operands[0]->meet(LayoutInfo(xegpu::LayoutAttr::get( + bitcast->getContext(), sourceLaneLayout, sourceLaneData)))); } /// Propagate the layout of the result to the tensor descriptor and mask @@ -680,7 +735,7 @@ void LayoutInfoPropagation::visitLoadGatherOp( LayoutInfo layout = getDefaultSIMTLayoutInfo(load.getTensorDescType()); // Mask operand should have 1D default layout. - LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1); + LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1); // Propagate the new layout to the tensor descriptor operand. propagateIfChanged(operands[0], operands[0]->meet(layout)); @@ -698,7 +753,7 @@ void LayoutInfoPropagation::visitCreateDescOp( if (!descLayout.isAssigned()) return; // For offset operand propagate 1D default layout. - LayoutInfo layout = getDefaultSIMTLayoutInfo(1); + LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1); propagateIfChanged(operands[1], operands[1]->meet(layout)); } @@ -725,7 +780,8 @@ void LayoutInfoPropagation::visitStoreScatterOp( // Propagate the tensor descriptor layout. propagateIfChanged(operands[1], operands[1]->meet(layout)); // Use default 1D layout for mask operand. 
- LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1); + LayoutInfo maskLayout = + getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1); propagateIfChanged(operands[2], operands[2]->meet(maskLayout)); } @@ -813,7 +869,7 @@ void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { printFunctionResult(funcOp); } -using GetLayoutFnTy = function_ref; +using GetLayoutFnTy = function_ref; /// Update an operation with the layout of its results. If the result type is a /// vector type, a temporary layout attribute is added to the operation. If the /// result type is a tensor descriptor type, the type is updated with the layout @@ -832,7 +888,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, if (!isa(resultType)) continue; // If the result has no layout but has users, emit a warning and continue. - xegpu::LayoutAttr layout = getLayoutOfValue(result); + xegpu::LayoutTrait layout = getLayoutOfValue(result); if (!layout && result.getNumUses() > 0) { op->emitWarning("op has users but no layout assigned for its result"); continue; @@ -898,8 +954,9 @@ updateControlFlowOps(mlir::OpBuilder &builder, // We only need to operate on tensor descriptor or vector types. if (!isa(inputType)) continue; - xegpu::LayoutAttr successorInputLayout = getLayoutOfValue(successorInput); - xegpu::LayoutAttr successorOperandLayout = + xegpu::LayoutTrait successorInputLayout = + getLayoutOfValue(successorInput); + xegpu::LayoutTrait successorOperandLayout = getLayoutOfValue(successorOperand); // If either of the layouts is not assigned, we cannot proceed. @@ -947,7 +1004,7 @@ static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder, newArgTypes.push_back(argType); if (!isa(argType)) continue; - xegpu::LayoutAttr layout = getLayoutOfValue(arg); + xegpu::LayoutTrait layout = getLayoutOfValue(arg); if (!layout) { LLVM_DEBUG(DBGS() << "Expecting layout for function argument: " << arg << " but got none.\n"); @@ -989,13 +1046,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() { return; } // Helper to convert LayoutInfo to xegpu::LayoutAttr. 
- auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr { + auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutTrait { LayoutInfo layout = analysis.getLayoutInfo(val); if (!layout.isAssigned()) return {}; - return xegpu::LayoutAttr::get( - val.getContext(), llvm::to_vector_of(layout.getLayoutAsArrayRef()), - llvm::to_vector_of(layout.getDataAsArrayRef())); + if (layout.isSliceLayout()) + return cast(layout.get()); + return cast(layout.get()); }; mlir::OpBuilder builder(&getContext()); From 5a683b443e3160c2c81449338beeedebfe6ac229 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 25 Aug 2025 23:53:39 +0000 Subject: [PATCH 10/36] save work --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 142 ++++++++++-------- .../Transforms/XeGPUSubgroupDistribute.cpp | 26 ++-- 2 files changed, 94 insertions(+), 74 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 9a7c9570af6b6..0434566e21f4e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -110,14 +110,11 @@ namespace { struct LayoutInfo { private: - mlir::Attribute storage = nullptr; + xegpu::DistributeLayoutAttr storage = nullptr; public: LayoutInfo() = default; - LayoutInfo(const xegpu::LayoutAttr &layout) : storage(layout) {} - LayoutInfo(const xegpu::SliceAttr &slice) : storage(slice) { - storage = slice.flatten(); - } + LayoutInfo(const xegpu::DistributeLayoutAttr &layout) : storage(layout) {} // Two lattice values are equal if they have `some` layout. The actual // content of the layout does not matter. @@ -135,28 +132,26 @@ struct LayoutInfo { LayoutInfo transpose(ArrayRef permutation) const; - ArrayRef getLaneLayout() const { + SmallVector getLaneLayout() const { if (!isAssigned()) return {}; - if (isa(storage)) - return cast(storage).getLaneLayout().asArrayRef(); - xegpu::SliceAttr slice = cast(storage); - assert(isa(slice.getParent()) && - "Slice parent must be a LayoutAttr"); - auto parent = cast(slice.getParent()); - return parent.getLaneLayout().asArrayRef(); + assert(storage.getLaneLayoutAsInt().has_value() && + "Expected lane layout to be assigned"); + return llvm::map_to_vector( + storage.getLaneLayoutAsInt().value(), + [](int64_t val) { return static_cast(val); }); } - ArrayRef getLaneData() const { + + SmallVector getLaneData() const { if (!isAssigned()) return {}; - if (isa(storage)) - return cast(storage).getLaneData().asArrayRef(); - xegpu::SliceAttr slice = cast(storage); - assert(isa(slice.getParent()) && - "Slice parent must be a LayoutAttr"); - auto parent = cast(slice.getParent()); - return parent.getLaneData().asArrayRef(); + assert(storage.getLaneDataAsInt().has_value() && + "Expected lane data to be assigned"); + return llvm::map_to_vector( + storage.getLaneDataAsInt().value(), + [](int64_t val) { return static_cast(val); }); } + bool isSliceLayout() const { if (!isAssigned()) return false; @@ -558,26 +553,49 @@ void LayoutInfoPropagation::visitShapeCastOp( return; } auto resultLaneLayout = resultLayout.getLaneLayout(); - if (resultLaneLayout[0] != 1 && resultLaneLayout[1] != 1) { + if (resultRank == 2 && resultLaneLayout[0] != 1 && resultLaneLayout[1] != 1) { shapeCast.emitWarning( - "Expecting result layout to be of form [1, subgroupSize] " + "Expecting 2D result layout to be of form [1, subgroupSize] " "or [subgroupSize, 1]."); return; } ArrayRef resultShape = 
shapeCast.getResultVectorType().getShape(); - // For 2D -> 1D case, source gets the reusult's lane layout and lane data. + ArrayRef sourceShape = shapeCast.getSourceVectorType().getShape(); + // For 2D -> 1D case. if (sourceRank == 2 && resultRank == 1) { - propagateIfChanged(operands[0], - operands[0]->meet(LayoutInfo(xegpu::LayoutAttr::get( - shapeCast->getContext(), resultLaneLayout, - resultLayout.getLaneData())))); - return; + // If the result had slice layout, simply assign the parent layout of the + // slice. + if (resultLayout.isSliceLayout()) { + auto sliceAttr = cast(resultLayout.get()); + propagateIfChanged(operands[0], + operands[0]->meet(LayoutInfo(sliceAttr.getParent()))); + return; + } + // If the result has a regular 1D layout, then we find the first dimension + // that can be fully evenly distributed to lanes. This dimension becomes + // the distributed dimension for deciding the lane layout. + int sourceDistributedDim = + sourceShape[0] % xegpu::targetinfo::subgroupSize == 0 + ? 0 + : (sourceShape[1] % xegpu::targetinfo::subgroupSize ? 1 : -1); + if (sourceDistributedDim == -1) { + shapeCast.emitWarning( + "Source vector can not be evenly distributed across lanes."); + return; + } + SmallVector sourceLaneLayout = {1, 1}, + laneData = {1, resultLayout.getLaneData()[0]}; + sourceLaneLayout[sourceDistributedDim] = xegpu::targetinfo::subgroupSize; + propagateIfChanged( + operands[0], + operands[0]->meet(LayoutInfo(xegpu::LayoutAttr::get( + shapeCast->getContext(), sourceLaneLayout, laneData)))); } // For 1D -> 2D case, If the result shape can be evenly distributed in the - // distributed dimension, then the source layout should be [subgroupSize][1]. - // Otherwise, data is shared accross lanes (broadcasted). We use slice - // attribute for the broadcast case. + // distributed dimension, then the source layout should be + // [subgroupSize][1]. Otherwise, data is shared accross lanes (broadcasted). + // We use slice attribute for the broadcast case. int64_t distributedDim = resultLaneLayout[0] == 1 ? 1 : 0; xegpu::LayoutAttr plainLayout = xegpu::LayoutAttr::get( shapeCast->getContext(), resultLaneLayout, resultLayout.getLaneData()); @@ -591,8 +609,8 @@ void LayoutInfoPropagation::visitShapeCastOp( propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(plainLayout))); } -/// Propagate the layout of the result tensor to the source tensor descriptor in -/// UpdateNdOffsetOp. +/// Propagate the layout of the result tensor to the source tensor descriptor +/// in UpdateNdOffsetOp. void LayoutInfoPropagation::visitUpdateNdOffsetOp( xegpu::UpdateNdOffsetOp updateNdOffset, ArrayRef operands, @@ -710,9 +728,9 @@ void LayoutInfoPropagation::visitVectorBitcastOp( // innermost dimension. SmallVector sourceLaneData(outData.begin(), outData.end() - 1); if (llvm::any_of(sourceLaneData, [](int64_t d) { return d != 1; })) { - bitcast.emitWarning( - "Each lane must not own multiple elements in any dimension other than " - "the innermost dimension."); + bitcast.emitWarning("Each lane must not own multiple elements in any " + "dimension other than " + "the innermost dimension."); return; } // Decide lane data based on whether the bitcast is narrowing or widening. @@ -869,15 +887,16 @@ void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) { printFunctionResult(funcOp); } -using GetLayoutFnTy = function_ref; -/// Update an operation with the layout of its results. If the result type is a -/// vector type, a temporary layout attribute is added to the operation. 
If the -/// result type is a tensor descriptor type, the type is updated with the layout -/// attribute. The users of the result are also updated with the layout +using GetLayoutFnTy = function_ref; +/// Update an operation with the layout of its results. If the result type is +/// a vector type, a temporary layout attribute is added to the operation. If +/// the result type is a tensor descriptor type, the type is updated with the +/// layout attribute. The users of the result are also updated with the layout /// attribute. static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, GetLayoutFnTy getLayoutOfValue) { - // Region ops (like scf.for) are already handled by the updateControlFlowOps. + // Region ops (like scf.for) are already handled by the + // updateControlFlowOps. if (mlir::isa(op)) return success(); @@ -888,7 +907,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, if (!isa(resultType)) continue; // If the result has no layout but has users, emit a warning and continue. - xegpu::LayoutTrait layout = getLayoutOfValue(result); + xegpu::DistributeLayoutAttr layout = getLayoutOfValue(result); if (!layout && result.getNumUses() > 0) { op->emitWarning("op has users but no layout assigned for its result"); continue; @@ -910,14 +929,14 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } /// Region ops like scf.for need special handling because they have blocks -/// inside. If the blocks have tensor descriptor type as block arguments, thier -/// types must be updated. Also region op can have results that may not have any -/// users (e.g. A and B tiles). They are not assigned a layout by layout -/// analysis because they have no users. However inside the region op -/// corresponding block arguments for these results do have layouts. Therefore, -/// in this case we still need to update the result types with the layout -/// attribute. This function function updates the internal block arguments and -/// the result types of the region op with the assigned layouts. +/// inside. If the blocks have tensor descriptor type as block arguments, +/// thier types must be updated. Also region op can have results that may not +/// have any users (e.g. A and B tiles). They are not assigned a layout by +/// layout analysis because they have no users. However inside the region op +/// corresponding block arguments for these results do have layouts. +/// Therefore, in this case we still need to update the result types with the +/// layout attribute. This function function updates the internal block +/// arguments and the result types of the region op with the assigned layouts. /// clang-format off /// Example: scf.for ... iter_args(...) -> (out types) { /// ^bb0(block types): @@ -929,8 +948,8 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, /// regions. One is the ^bb0 (for loop body) and the other is the scf.for op /// itself (yield the results). So we update both the block arguments of the /// successor region (i.e. block types) and the result types of the scf.for op -/// (i.e. out types). Note that yield types are updated by respective producers -/// inside bb0. +/// (i.e. out types). Note that yield types are updated by respective +/// producers inside bb0. 
static LogicalResult updateControlFlowOps(mlir::OpBuilder &builder, mlir::RegionBranchTerminatorOpInterface terminator, @@ -954,17 +973,16 @@ updateControlFlowOps(mlir::OpBuilder &builder, // We only need to operate on tensor descriptor or vector types. if (!isa(inputType)) continue; - xegpu::LayoutTrait successorInputLayout = + xegpu::DistributeLayoutAttr successorInputLayout = getLayoutOfValue(successorInput); - xegpu::LayoutTrait successorOperandLayout = + xegpu::DistributeLayoutAttr successorOperandLayout = getLayoutOfValue(successorOperand); // If either of the layouts is not assigned, we cannot proceed. if (!successorOperandLayout) { - LLVM_DEBUG( - DBGS() - << "No layout assigned for forwarded operand in branch terminator: " - << successorOperand << "\n"); + LLVM_DEBUG(DBGS() << "No layout assigned for forwarded operand in " + "branch terminator: " + << successorOperand << "\n"); return failure(); } // We expect the layouts to match. @@ -1004,7 +1022,7 @@ static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder, newArgTypes.push_back(argType); if (!isa(argType)) continue; - xegpu::LayoutTrait layout = getLayoutOfValue(arg); + xegpu::DistributeLayoutAttr layout = getLayoutOfValue(arg); if (!layout) { LLVM_DEBUG(DBGS() << "Expecting layout for function argument: " << arg << " but got none.\n"); @@ -1046,7 +1064,7 @@ void XeGPUPropagateLayoutPass::runOnOperation() { return; } // Helper to convert LayoutInfo to xegpu::LayoutAttr. - auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutTrait { + auto getXeGPULayoutForValue = [&](Value val) -> xegpu::DistributeLayoutAttr { LayoutInfo layout = analysis.getLayoutInfo(val); if (!layout.isAssigned()) return {}; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 27b8fc1c2919d..31821ee07d418 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -76,12 +76,12 @@ namespace { /// | 32x16 | [2, 8] | 16x2 | /// | 2x32x16 | [1, 16] | 2x32x1 | static FailureOr -getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout, +getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout, VectorType originalType) { if (!layout) return failure(); - auto laneLayout = layout.getLaneLayout().asArrayRef(); + auto laneLayout = layout.getLaneLayoutAsInt().value(); assert(originalType.getShape().size() >= laneLayout.size() && "Rank of the original vector type should be greater or equal to the " "size of the lane layout to distribute the vector type."); @@ -868,7 +868,7 @@ struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { unsigned operandIdx = operand->getOperandNumber(); VectorType distributedSourceType = getDistVecTypeBasedOnLaneLayout( - xegpu::getLayoutAttr(bitcastOp.getSource()), + xegpu::getDistributeLayoutAttr(bitcastOp.getSource()), bitcastOp.getSourceVectorType()) .value_or(VectorType()); if (!distributedSourceType) @@ -907,24 +907,26 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { warpOp, "warp result is not a vector::Transpose op"); auto transposeOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); - xegpu::LayoutAttr sourceLayout = - xegpu::getLayoutAttr(transposeOp.getVector()); - xegpu::LayoutAttr resultLayout = - xegpu::getLayoutAttr(transposeOp.getResult()); + xegpu::DistributeLayoutAttr sourceLayout = + 
xegpu::getDistributeLayoutAttr(transposeOp.getVector()); + xegpu::DistributeLayoutAttr resultLayout = + xegpu::getDistributeLayoutAttr(transposeOp.getResult()); if (!sourceLayout || !resultLayout) return rewriter.notifyMatchFailure( transposeOp, "the source or result vector of the transpose op lacks layout " "attribute"); - ArrayRef sourceLaneLayout = sourceLayout.getLaneLayout().asArrayRef(); - ArrayRef resultLaneLayout = resultLayout.getLaneLayout().asArrayRef(); - ArrayRef sourceLaneData = sourceLayout.getLaneData().asArrayRef(); - ArrayRef resultLaneData = resultLayout.getLaneData().asArrayRef(); + ArrayRef sourceLaneLayout = + sourceLayout.getLaneLayoutAsInt().value(); + ArrayRef resultLaneLayout = + resultLayout.getLaneLayoutAsInt().value(); + ArrayRef sourceLaneData = sourceLayout.getLaneDataAsInt().value(); + ArrayRef resultLaneData = resultLayout.getLaneDataAsInt().value(); if (sourceLaneLayout.size() != 2 || resultLaneLayout.size() != 2) return rewriter.notifyMatchFailure( transposeOp, "the source or result vector of the transpose op " "does not have 2D layout"); - auto is2DTranspose = [](ArrayRef input, ArrayRef output) { + auto is2DTranspose = [](ArrayRef input, ArrayRef output) { return input.size() == 2 && output.size() == 2 && input[0] == output[1] && input[1] == output[0]; }; From 2da2c6de6f3043462d871b8083a19e09738cc509 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 26 Aug 2025 19:54:35 +0000 Subject: [PATCH 11/36] save work --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 19 ++--- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 80 ++++++++++++++++++- 2 files changed, 88 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 0434566e21f4e..3f30751875679 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -187,8 +187,8 @@ LayoutInfo LayoutInfo::transpose(ArrayRef permutation) const { // Check if the permutation is valid. llvm::SmallSet seen(permutation.begin(), permutation.end()); bool hasDuplicates = seen.size() != permutation.size(); - bool withinRange = llvm::all_of(permutation, [&](size_t idx) { - return idx >= 0 && idx < permutation.size(); + bool withinRange = llvm::all_of(permutation, [&](int64_t idx) { + return idx >= 0 && idx < static_cast(permutation.size()); }); if (!withinRange || hasDuplicates) { @@ -577,7 +577,7 @@ void LayoutInfoPropagation::visitShapeCastOp( int sourceDistributedDim = sourceShape[0] % xegpu::targetinfo::subgroupSize == 0 ? 0 - : (sourceShape[1] % xegpu::targetinfo::subgroupSize ? 1 : -1); + : (sourceShape[1] % xegpu::targetinfo::subgroupSize == 0 ? 1 : -1); if (sourceDistributedDim == -1) { shapeCast.emitWarning( "Source vector can not be evenly distributed across lanes."); @@ -597,16 +597,17 @@ void LayoutInfoPropagation::visitShapeCastOp( // [subgroupSize][1]. Otherwise, data is shared accross lanes (broadcasted). // We use slice attribute for the broadcast case. int64_t distributedDim = resultLaneLayout[0] == 1 ? 
1 : 0; - xegpu::LayoutAttr plainLayout = xegpu::LayoutAttr::get( - shapeCast->getContext(), resultLaneLayout, resultLayout.getLaneData()); if (resultShape[distributedDim] % xegpu::targetinfo::subgroupSize != 0) { + xegpu::LayoutAttr parentLayout = xegpu::LayoutAttr::get( + shapeCast->getContext(), resultLaneLayout, resultLayout.getLaneData()); xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get( - shapeCast->getContext(), plainLayout, + shapeCast->getContext(), parentLayout, DenseI64ArrayAttr::get(shapeCast->getContext(), {distributedDim})); propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout))); return; } - propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(plainLayout))); + propagateIfChanged(operands[0], operands[0]->meet(getDefaultSIMTLayoutInfo( + shapeCast.getSourceVectorType()))); } /// Propagate the layout of the result tensor to the source tensor descriptor @@ -711,9 +712,9 @@ void LayoutInfoPropagation::visitVectorBitcastOp( bool isNarrowing = inElemTyBitWidth > outElemTyBitWidth; int bitCastRatio = isNarrowing ? inElemTyBitWidth / outElemTyBitWidth : outElemTyBitWidth / inElemTyBitWidth; - ArrayRef sourceLaneLayout = + SmallVector sourceLaneLayout = resultLayout.getLaneLayout(); // Lane layout does not change for bitcast. - ArrayRef outData = resultLayout.getLaneData(); + SmallVector outData = resultLayout.getLaneData(); // TODO: Currently we assume that bitcasts does not require cross lane // communication. So each lane must own the required number of elements to diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 994fa44cab0b6..25d237c58e2ce 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -455,7 +455,7 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ } // ----- -// CHECK-LABEL: func.func @test_scf_while_and_condition( +// CHECK-LABEL: func.func @scf_while_and_condition( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) // CHECK-SAME: -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) { @@ -464,7 +464,7 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ // CHECK-NEXT: ^bb0(%{{.*}}: vector<16xf32>, %{{.*}}: i32, %{{.*}}: !xegpu.tensor_desc<16xf32, #xegpu.layout>): // CHECK: scf.yield {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout> // CHECK-NEXT: } attributes {layout_result_0 = #xegpu.layout} -func.func @test_scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) { +func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) { %c0 = arith.constant 0 : i32 %c16 = arith.constant 16 : i32 %c256 = arith.constant 256 : i32 @@ -486,3 +486,79 @@ func.func @test_scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<25 } return } + +// ----- +// CHECK-LABEL: func.func @vector_shape_cast_2d_to_1d_dim0_distributed( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x1xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf16, #xegpu.layout>) { +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x1xf16, #xegpu.layout> -> vector<16x1xf16> +// CHECK-NEXT: %{{.*}} = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: : vector<16x1xf16> to 
vector<16xf16> +func.func @vector_shape_cast_2d_to_1d_dim0_distributed(%arg0: !xegpu.tensor_desc<16x1xf16>, %arg1: !xegpu.tensor_desc<16xf16>) { + %c0 = arith.constant 0 : index + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x1xf16> -> vector<16x1xf16> + %2 = vector.shape_cast %3 : vector<16x1xf16> to vector<16xf16> + xegpu.store_nd %2, %arg1 : vector<16xf16>, !xegpu.tensor_desc<16xf16> + return +} + +// ----- +// CHECK-LABEL: func.func @vector_shape_cast_2d_to_1d_dim1_distributed( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<1x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf16, #xegpu.layout>) { +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.layout> -> vector<1x16xf16> +// CHECK: %{{.*}} = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<1x16xf16> to vector<16xf16> +func.func @vector_shape_cast_2d_to_1d_dim1_distributed(%arg0: !xegpu.tensor_desc<1x16xf16>, %arg1: !xegpu.tensor_desc<16xf16>) { + %c0 = arith.constant 0 : index + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<1x16xf16> -> vector<1x16xf16> + %2 = vector.shape_cast %3 : vector<1x16xf16> to vector<16xf16> + xegpu.store_nd %2, %arg1 : vector<16xf16>, !xegpu.tensor_desc<16xf16> + return +} + +// ----- +// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} +// CHECK-SAME: {layout_result_0 = #xegpu.layout} [0] : vector<16x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[REDUCE]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<16xf16> to vector<1x16xf16> +func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.0000> : vector<16xf16> + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = vector.multi_reduction , %3, %cst [0] : vector<16x16xf16> to vector<16xf16> + %2 = vector.shape_cast %4 : vector<16xf16> to vector<1x16xf16> + %5 = vector.broadcast %2 : vector<1x16xf16> to vector<16x16xf16> + xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + return +} + +// ----- +// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] +// CHECK-SAME: vector<16x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[REDUCE]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<16xf16> to vector<16x1xf16> +func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: 
!xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.0000> : vector<16xf16> + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = vector.multi_reduction , %3, %cst [1] : vector<16x16xf16> to vector<16xf16> + %2 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16> + %5 = vector.broadcast %2 : vector<16x1xf16> to vector<16x16xf16> + xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + return +} From 7eabad47a70eaac1c15207a62d844f01c4205b62 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 26 Aug 2025 23:10:04 +0000 Subject: [PATCH 12/36] save work --- .../Transforms/XeGPUSubgroupDistribute.cpp | 3 + .../Dialect/XeGPU/subgroup-distribute.mlir | 96 +++++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 31821ee07d418..3e67e6406b956 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -827,6 +827,9 @@ struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern { } }; +/// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an +/// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op +/// outside of the warp op. struct MemrefExtractAlignedPointerAsIndexDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 54ef56e013abb..690b13f5a2973 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -319,3 +319,99 @@ gpu.module @test { gpu.return } } + +// ----- +// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( +// CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index +gpu.module @test { + gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16> + %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index + %ptr_i64 = arith.index_cast %ptr : index to i64 + %tdesc = xegpu.create_nd_tdesc %ptr_i64[%c0], shape: [16], strides: [16] : i64 + -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + xegpu.store_nd %cst, %tdesc : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> + gpu.return + } +} + + +// ----- +// CHECK-LABEL: gpu.func @vector_transpose( +// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> +// CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> +// CHECK: xegpu.store_nd %[[CST]], %[[DEST]] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> +gpu.module @test { + gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> + : vector<16x2xf32> + %c0 = arith.constant 0 : index + %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout} + : vector<16x2xf32> to vector<2x16xf32> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<2x16xf32> + -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> + xegpu.store_nd %transpose, %0 : 
vector<2x16xf32>, + !xegpu.tensor_desc<2x16xf32, #xegpu.layout> + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @vector_bitcast( +// CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16> +// CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> +// CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> +// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> +gpu.module @test { + gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { + %cst = "some_op"() {layout_result_0 = #xegpu.layout} + : () -> (vector<4x32xi8>) + %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout} + : vector<4x32xi8> to vector<4x16xi16> + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x16xi16> + -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout> + xegpu.store_nd %bitcast, %0 : vector<4x16xi16>, + !xegpu.tensor_desc<4x16xi16, #xegpu.layout> + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @mma_transpose_b( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK-NEXT: %[[A:.*]] = xegpu.load_nd %[[ADESC]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK-NEXT: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> +// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> +// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> +// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> +// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> +gpu.module @test { + gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> + -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32> + -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> + %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> + %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} + : vector<16x8xi32> to vector<16x16xf16> + %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} + : vector<16x16xf16> to vector<16x16xf16> + %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} + : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> + -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %6, %7 : vector<8x16xf32>, + !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + gpu.return + + } +} From 635a00679d1287f23a18594d8643811bbc6297f5 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 26 Aug 2025 23:20:54 +0000 Subject: [PATCH 13/36] save work --- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 25d237c58e2ce..29592ec76f918 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -224,7 +224,7 @@ func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16 // ----- // CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle( -// CHECK-NOT: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = {{.*}}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> // CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<8x16xi32> to vector<8x32xi16> func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %arg1: memref<8x32xi16>) { From 74ab5a37ee0acee4d564c5eecb1fb0b564a5157b Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 26 Aug 2025 23:38:59 +0000 Subject: [PATCH 14/36] save work --- mlir/test/Dialect/XeGPU/subgroup-distribute.mlir | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 690b13f5a2973..8ecd080c96922 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -382,10 +382,10 @@ gpu.module @test { // CHECK-LABEL: gpu.func @mma_transpose_b( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-NEXT: %[[A:.*]] = xegpu.load_nd %[[ADESC]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK-NEXT: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> -// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> +// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> +// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> // CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> From b36e109eb628a9262000ddfe4eb5e9c1e0d9bc5b Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 27 Aug 2025 00:00:59 +0000 Subject: [PATCH 15/36] save work --- mlir/test/Dialect/XeGPU/subgroup-distribute.mlir | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 8ecd080c96922..d2af6d064bb03 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -380,8 +380,7 @@ gpu.module @test { // ----- // CHECK-LABEL: gpu.func 
@mma_transpose_b( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, -// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> // CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> From 4e871d71045384755630e14d4f04eed375615c21 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 27 Aug 2025 01:10:12 +0000 Subject: [PATCH 16/36] save work --- .../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 9d55be57a30ea..73a9ebaa6a89a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -915,12 +915,14 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { transposeOp, "the source or result vector of the transpose op lacks layout " "attribute"); - ArrayRef sourceLaneLayout = + SmallVector sourceLaneLayout = sourceLayout.getLaneLayoutAsInt().value(); - ArrayRef resultLaneLayout = + SmallVector resultLaneLayout = resultLayout.getLaneLayoutAsInt().value(); - ArrayRef sourceLaneData = sourceLayout.getLaneDataAsInt().value(); - ArrayRef resultLaneData = resultLayout.getLaneDataAsInt().value(); + SmallVector sourceLaneData = + sourceLayout.getLaneDataAsInt().value(); + SmallVector resultLaneData = + resultLayout.getLaneDataAsInt().value(); if (sourceLaneLayout.size() != 2 || resultLaneLayout.size() != 2) return rewriter.notifyMatchFailure( transposeOp, "the source or result vector of the transpose op " From 6bf4c688adaa6dc71bacb18566fa1690225101db Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 27 Aug 2025 18:49:46 +0000 Subject: [PATCH 17/36] fix --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 38 +------------------ 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 20d73902ac20a..cfe3e800484ce 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -421,42 +421,6 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { return {}; } - std::optional> getInstDataAsInt() const { - if (DenseI32ArrayAttr inst = getInstData()) - return llvm::to_vector_of(inst.asArrayRef()); - return std::nullopt; - } - - std::optional> getLaneLayoutAsInt() const { - if (DenseI32ArrayAttr layout = getLaneLayout()) - return llvm::to_vector_of(layout.asArrayRef()); - return {}; - } - - SmallVector getSgDataAsInt() const { - if (DenseI32ArrayAttr data = getSgData()) - return llvm::to_vector_of(data.asArrayRef()); - return {}; - } - - SmallVector getInstDataAsInt() const { - if (DenseI32ArrayAttr inst = getInstData()) - return llvm::to_vector_of(inst.asArrayRef()); - return {}; - } - - SmallVector getLaneLayoutAsInt() const { - if (DenseI32ArrayAttr layout = getLaneLayout()) - return llvm::to_vector_of(layout.asArrayRef()); - 
return {}; - } - - SmallVector getLaneDataAsInt() const { - if (DenseI32ArrayAttr data = getLaneData()) - return llvm::to_vector_of(data.asArrayRef()); - return {}; - } - /// Delinearizes a linear subgroup ID into its multidimensional indices /// based on the effective subgroup layout. FailureOr> @@ -546,7 +510,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { return {}; } - /// Returns the LaneData of the attribute, computed by applying + /// Returns the SgData of the attribute, computed by applying /// the slice dimensions to the underlying LayoutAttr. SmallVector getSgDataAsInt() const { SliceAttr attr = flatten(); From 1a1ef3227381794fe26d7d4d925af11f4ac88248 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 27 Aug 2025 19:08:08 +0000 Subject: [PATCH 18/36] fix --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 49 +++---------------- .../Transforms/XeGPUSubgroupDistribute.cpp | 17 +++---- 2 files changed, 16 insertions(+), 50 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 3f30751875679..56b8600e533f6 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -55,39 +55,6 @@ using namespace mlir::dataflow; namespace { -//===----------------------------------------------------------------------===// -// Layout -//===----------------------------------------------------------------------===// - -/// Helper class to store the ND layout of lanes within a subgroup and data -/// owned by each lane. -// struct Layout { -// SmallVector layout; -// Layout() = default; -// Layout(std::initializer_list list) : layout(list) {} -// Layout(SmallVector &list) : layout(list) {} -// void print(llvm::raw_ostream &os) const; -// size_t size() const { return layout.size(); } -// int64_t operator[](size_t idx) const; -// }; - -// int64_t Layout::operator[](size_t idx) const { -// assert(idx < layout.size() && "Index out of bounds"); -// return layout[idx]; -// } - -// void Layout::print(llvm::raw_ostream &os) const { -// os << llvm::interleaved_array(layout); -// } - -// /// LaneLayout represents the logical layout of lanes within a subgroup when -// it -// /// accesses some value. LaneData represents the logical layout of data owned -// by -// /// each work item. 
-// using LaneLayout = Layout; -// using LaneData = Layout; - //===----------------------------------------------------------------------===// // LayoutInfo //===----------------------------------------------------------------------===// @@ -135,21 +102,21 @@ struct LayoutInfo { SmallVector getLaneLayout() const { if (!isAssigned()) return {}; - assert(storage.getLaneLayoutAsInt().has_value() && + assert(storage.getLaneLayoutAsInt().size() && "Expected lane layout to be assigned"); - return llvm::map_to_vector( - storage.getLaneLayoutAsInt().value(), - [](int64_t val) { return static_cast(val); }); + return llvm::map_to_vector(storage.getLaneLayoutAsInt(), [](int64_t val) { + return static_cast(val); + }); } SmallVector getLaneData() const { if (!isAssigned()) return {}; - assert(storage.getLaneDataAsInt().has_value() && + assert(storage.getLaneDataAsInt().size() && "Expected lane data to be assigned"); - return llvm::map_to_vector( - storage.getLaneDataAsInt().value(), - [](int64_t val) { return static_cast(val); }); + return llvm::map_to_vector(storage.getLaneDataAsInt(), [](int64_t val) { + return static_cast(val); + }); } bool isSliceLayout() const { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 73a9ebaa6a89a..5f74e376bcb26 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -81,7 +81,10 @@ getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout, if (!layout) return failure(); - auto laneLayout = layout.getLaneLayoutAsInt().value(); + SmallVector laneLayout = layout.getLaneLayoutAsInt(); + // We expect non-empty lane layout. + if (!laneLayout.size()) + return failure(); assert(originalType.getShape().size() >= laneLayout.size() && "Rank of the original vector type should be greater or equal to the " "size of the lane layout to distribute the vector type."); @@ -915,14 +918,10 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { transposeOp, "the source or result vector of the transpose op lacks layout " "attribute"); - SmallVector sourceLaneLayout = - sourceLayout.getLaneLayoutAsInt().value(); - SmallVector resultLaneLayout = - resultLayout.getLaneLayoutAsInt().value(); - SmallVector sourceLaneData = - sourceLayout.getLaneDataAsInt().value(); - SmallVector resultLaneData = - resultLayout.getLaneDataAsInt().value(); + SmallVector sourceLaneLayout = sourceLayout.getLaneLayoutAsInt(); + SmallVector resultLaneLayout = resultLayout.getLaneLayoutAsInt(); + SmallVector sourceLaneData = sourceLayout.getLaneDataAsInt(); + SmallVector resultLaneData = resultLayout.getLaneDataAsInt(); if (sourceLaneLayout.size() != 2 || resultLaneLayout.size() != 2) return rewriter.notifyMatchFailure( transposeOp, "the source or result vector of the transpose op " From 34f1703ef1cd753ccf72a36f416bf2e43b19ef45 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 27 Aug 2025 19:18:22 +0000 Subject: [PATCH 19/36] fix --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 50 +++++++++++-------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 56b8600e533f6..b785285100328 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -60,10 +60,14 @@ namespace { 
//===----------------------------------------------------------------------===// /// Helper class for tracking the analysis state of an mlir value. For layout -/// propagation, the analysis state is simply the lane_layout and lane_data of -/// each value. Purpose of this analysis to propagate some unique layout for -/// each value in the program starting from a set of anchor operations (like -/// DPAS, StoreNd, etc.). +/// propagation, the analysis state is simply the distribution layout of +/// each value. The distribution layout information is encapsulated using +/// xegpu::DistributeLayoutAttr class which can hold information about any type +/// of distribution layout that XeGPU dialect supports. Purpose of this analysis +/// to propagate some unique distribution layout for each value in the program +/// starting from a set of anchor operations (like DPAS, StoreNd, etc.). Note +/// that analysis will reach a fixed point when all values are reached some +/// layout and, analysis does not try to modify any already assigned layouts. /// /// Given this, LayoutInfo satisifies the following properties: /// 1) A LayoutInfo value can be in one of two states - `assigned` or `not @@ -99,25 +103,9 @@ struct LayoutInfo { LayoutInfo transpose(ArrayRef permutation) const; - SmallVector getLaneLayout() const { - if (!isAssigned()) - return {}; - assert(storage.getLaneLayoutAsInt().size() && - "Expected lane layout to be assigned"); - return llvm::map_to_vector(storage.getLaneLayoutAsInt(), [](int64_t val) { - return static_cast(val); - }); - } + SmallVector getLaneLayout() const; - SmallVector getLaneData() const { - if (!isAssigned()) - return {}; - assert(storage.getLaneDataAsInt().size() && - "Expected lane data to be assigned"); - return llvm::map_to_vector(storage.getLaneDataAsInt(), [](int64_t val) { - return static_cast(val); - }); - } + SmallVector getLaneData() const; bool isSliceLayout() const { if (!isAssigned()) @@ -128,6 +116,24 @@ struct LayoutInfo { Attribute get() { return storage; } }; +SmallVector LayoutInfo::getLaneLayout() const { + if (!isAssigned()) + return {}; + assert(storage.getLaneLayoutAsInt().size() && + "Expected lane layout to be assigned"); + return llvm::map_to_vector(storage.getLaneLayoutAsInt(), + [](int64_t val) { return static_cast(val); }); +} + +SmallVector LayoutInfo::getLaneData() const { + if (!isAssigned()) + return {}; + assert(storage.getLaneDataAsInt().size() && + "Expected lane data to be assigned"); + return llvm::map_to_vector(storage.getLaneDataAsInt(), + [](int64_t val) { return static_cast(val); }); +} + void LayoutInfo::print(raw_ostream &os) const { if (isAssigned()) { os << storage; From d7169defd78e88d8ba4b858dfd25afecce20c4b9 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 27 Aug 2025 19:22:55 +0000 Subject: [PATCH 20/36] fix --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 3 --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 4 ---- 2 files changed, 7 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index b785285100328..15878de1562fb 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -21,7 +21,6 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" #include "mlir/IR/Visitors.h" 
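Every visit hook in this pass reduces to the same backward flow described in the comment above: read the already-assigned layout of the result and meet it into the operand lattices, while anchor operations (DPAS, StoreNd, and friends) seed layouts via getDefaultSIMTLayoutInfo. As a point of reference only, a minimal sketch of that shape using the LayoutInfoLattice / meet / propagateIfChanged API of this pass; the op kind and hook name are hypothetical and this is not part of the patch:

void LayoutInfoPropagation::visitSomeElementwiseOp(
    Operation *op, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  // Layouts flow from results to operands. If the result layout has not been
  // assigned yet (no anchor op has reached it), there is nothing to propagate.
  LayoutInfo resultLayout = results[0]->getValue();
  if (!resultLayout.isAssigned())
    return;
  // Meet the result layout into every operand lattice; propagateIfChanged
  // only re-enqueues dependent work when a lattice value actually changes.
  for (LayoutInfoLattice *operand : operands)
    propagateIfChanged(operand, operand->meet(resultLayout));
}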
@@ -35,10 +34,8 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/InterleavedRange.h" #include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" -#include namespace mlir { namespace xegpu { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 5f74e376bcb26..c9c5ea955962c 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -27,7 +27,6 @@ #include "mlir/IR/Value.h" #include "mlir/IR/Visitors.h" #include "mlir/Interfaces/FunctionInterfaces.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -35,9 +34,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SmallVectorExtras.h" -#include "llvm/Support/LogicalResult.h" -#include namespace mlir { namespace xegpu { From 13a2137902504c0412fd8658d4101287495c86b9 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Tue, 9 Sep 2025 22:22:44 +0000 Subject: [PATCH 21/36] save work --- .../lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 8c53c7d5c2bbc..47c070badd6dc 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -224,8 +224,9 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter ? 
xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth : 1; - return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize, 1}), - LaneData({1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get( + vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, + {1, packingFactor})); } if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth; @@ -787,7 +788,8 @@ void LayoutInfoPropagation::visitStoreScatterOp( LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true); - LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1); + LayoutInfo maskLayout = + getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1); // Propagate the payload operand layout propagateIfChanged(operands[0], operands[0]->meet(payloadLayout)); // Propagate the destination (if tdesc) operand layout From 93f07e7eb645772bbd8add33d8407242e62dbc97 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 10 Sep 2025 16:41:54 +0000 Subject: [PATCH 22/36] remove restriction --- .../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 7619dcdc83692..46cd58e314acd 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1047,6 +1047,11 @@ struct MemrefExtractAlignedPointerAsIndexDistribution final } }; +/// Distribute a vector::BitCastOp feeding into yield op of an enclosing +/// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost +/// diemension of the source/result vectors. Equivalent vector::BitCastOp is +/// created outside of the warp op with distributed source vector type (computed +/// using assigned layout). 
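/// As an illustration (a sketch mirroring the vector_bitcast test added
/// earlier in this series, where the innermost dimension is distributed
/// across the 16 lanes of the subgroup), the subgroup-level op
///   vector.bitcast %v : vector<4x32xi8> to vector<4x16xi16>
/// is rewritten to the per-lane op
///   vector.bitcast %v : vector<4x2xi8> to vector<4x1xi16>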
struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, @@ -1069,11 +1074,6 @@ struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { "vector::BitCast op"); VectorType distributedResultType = cast(warpOp.getResult(operandIdx).getType()); - if (distributedSourceType.getRank() != 2 || - distributedResultType.getRank() != 2) - return rewriter.notifyMatchFailure( - bitcastOp, "the source or result vector of the bitcast op " - "are not 2D vectors"); SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( rewriter, warpOp, bitcastOp.getSource(), From be1c00cc486c3b2fe69c13b5477df5be8bd1c70e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 10 Sep 2025 17:51:57 +0000 Subject: [PATCH 23/36] add transpose function --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index cfe3e800484ce..24756318e4339 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -231,7 +231,51 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { multiple blocks according to round-robin distribution rules.}], "FailureOr>>", "getOffsets", - (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef":$shape)> + (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef":$shape)>, + InterfaceMethod": $perm), + /*methodBody=*/[{ + if (!other) + return false; + if ($_self.getRank() != other.getRank() || perm.size() != static_cast($_self.getRank())) + return false; + // check if the permutation is valid + int64_t rank = $_self.getRank(); + SmallVector seen(rank, false); + for (const auto &ta : llvm::enumerate(perm)) { + if (ta.value() < 0 || ta.value() >= rank) + return false; + if (seen[ta.value()]) + return false; + seen[ta.value()] = true; + } + auto checkTranspose = [](ArrayRef dst, ArrayRef src, ArrayRef perm) { + for (const auto &ta : llvm::enumerate(perm)) { + if (src[ta.index()] != dst[ta.value()]) + return false; + } + return true; + }; + // check sgLayout + if (!checkTranspose($_self.getSgLayoutAsInt(), other.getSgLayoutAsInt(), perm)) + return false; + // check sgData + if (!checkTranspose($_self.getSgDataAsInt(), other.getSgDataAsInt(), perm)) + return false; + // check instData + if (!checkTranspose($_self.getInstDataAsInt(), other.getInstDataAsInt(), perm)) + return false; + // check laneLayout + if (!checkTranspose($_self.getLaneLayoutAsInt(), other.getLaneLayoutAsInt(), perm)) + return false; + // check laneData + if (!checkTranspose($_self.getLaneDataAsInt(), other.getLaneDataAsInt(), perm)) + return false; + return true; + }]> ]; } From 916c75f12298f76b2f8c6e2b5645125e75d34a73 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 10 Sep 2025 23:15:18 +0000 Subject: [PATCH 24/36] add slice attribute utils --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 12 ++++++++++- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 21 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 24756318e4339..aa3e3c5cddc05 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ 
b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -275,7 +275,11 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { if (!checkTranspose($_self.getLaneDataAsInt(), other.getLaneDataAsInt(), perm)) return false; return true; - }]> + }]>, + InterfaceMethod ]; } @@ -477,6 +481,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { FailureOr>> getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef shape); + /// Check if this is slice of some other layout. + bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; } + }]; let assemblyFormat = "`<` struct(params) `>`"; @@ -638,6 +645,9 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { FailureOr>> getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef shape); + /// Check if this is slice of some other layout. + bool isSliceOf(const xegpu::DistributeLayoutAttr &other); + }]; let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`"; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 7f3be7f91c56b..a3783d5e05df6 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" @@ -409,6 +410,26 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, shape); } +bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) { + auto flattenedThis = flatten(); + // If other is a LayoutAttr, just compare directly with parent of + // flattenedThis. + if (auto otherLayout = dyn_cast(other)) + return flattenedThis.getParent() == otherLayout; + // If other is a SliceAttr, flatten it first before comparing. + auto otherFlattened = dyn_cast(other).flatten(); + // Both must have common parent LayoutAttr. + if (flattenedThis.getParent() != otherFlattened.getParent()) + return false; + // otherFlattened's sliced dims must be a subset of flattenedThis's sliced + // dims. + llvm::SmallDenseSet thisDims( + flattenedThis.getDims().asArrayRef().begin(), + flattenedThis.getDims().asArrayRef().end()); + return llvm::all_of(otherFlattened.getDims().asArrayRef(), + [&](int64_t dim) { return thisDims.contains(dim); }); +} + //===----------------------------------------------------------------------===// // XeGPU_RangeAttr //===----------------------------------------------------------------------===// From 77e8a9477dbd76bf95e5d142a0a6e6a4596ab3d2 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 10 Sep 2025 23:54:57 +0000 Subject: [PATCH 25/36] fix name --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index a3783d5e05df6..cc133b110c95a 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -417,16 +417,16 @@ bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) { if (auto otherLayout = dyn_cast(other)) return flattenedThis.getParent() == otherLayout; // If other is a SliceAttr, flatten it first before comparing. - auto otherFlattened = dyn_cast(other).flatten(); + auto flattenedOther = dyn_cast(other).flatten(); // Both must have common parent LayoutAttr. 
- if (flattenedThis.getParent() != otherFlattened.getParent()) + if (flattenedThis.getParent() != flattenedOther.getParent()) return false; // otherFlattened's sliced dims must be a subset of flattenedThis's sliced // dims. llvm::SmallDenseSet thisDims( flattenedThis.getDims().asArrayRef().begin(), flattenedThis.getDims().asArrayRef().end()); - return llvm::all_of(otherFlattened.getDims().asArrayRef(), + return llvm::all_of(flattenedOther.getDims().asArrayRef(), [&](int64_t dim) { return thisDims.contains(dim); }); } From dc3a25006ca2e56629d50537840863582ca8bd8d Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 11 Sep 2025 21:43:29 +0000 Subject: [PATCH 26/36] use isTransposeOf --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 36 ++++++++++--------- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 20 ----------- .../Transforms/XeGPUSubgroupDistribute.cpp | 19 +++------- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index aa3e3c5cddc05..8fee28985736a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -191,6 +191,9 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Get the rank of attribute", "int64_t", "getRank">, + InterfaceMethod<"Get the order field of the attribute as integer array", + "DenseI32ArrayAttr", + "getOrder">, InterfaceMethod<"Get the num of effective subgroups", "int64_t", "getNumSubgroups", (ins), [{ @@ -253,33 +256,40 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { seen[ta.value()] = true; } auto checkTranspose = [](ArrayRef dst, ArrayRef src, ArrayRef perm) { + // If both `dst` and `src` are empty, conservatively return true + // here because some layout fields can be empty. + if (dst.empty() && src.empty()) + return true; for (const auto &ta : llvm::enumerate(perm)) { if (src[ta.index()] != dst[ta.value()]) return false; } return true; }; - // check sgLayout + // Check sgLayout if (!checkTranspose($_self.getSgLayoutAsInt(), other.getSgLayoutAsInt(), perm)) return false; - // check sgData + // Check sgData if (!checkTranspose($_self.getSgDataAsInt(), other.getSgDataAsInt(), perm)) return false; - // check instData + // Check instData if (!checkTranspose($_self.getInstDataAsInt(), other.getInstDataAsInt(), perm)) return false; - // check laneLayout + // Check laneLayout if (!checkTranspose($_self.getLaneLayoutAsInt(), other.getLaneLayoutAsInt(), perm)) return false; - // check laneData + // Check laneData if (!checkTranspose($_self.getLaneDataAsInt(), other.getLaneDataAsInt(), perm)) return false; + // Check order if both sides have order field. + if ($_self.getOrder() && other.getOrder()) { + auto thisOrderAsInt = llvm::to_vector_of($_self.getOrder().asArrayRef()); + auto otherOrderAsInt = llvm::to_vector_of(other.getOrder().asArrayRef()); + if (!checkTranspose(thisOrderAsInt, otherOrderAsInt, perm)) + return false; + } return true; - }]>, - InterfaceMethod + }]> ]; } @@ -481,9 +491,6 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { FailureOr>> getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef shape); - /// Check if this is slice of some other layout. 
- bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; } - }]; let assemblyFormat = "`<` struct(params) `>`"; @@ -645,9 +652,6 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { FailureOr>> getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef shape); - /// Check if this is slice of some other layout. - bool isSliceOf(const xegpu::DistributeLayoutAttr &other); - }]; let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`"; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index cc133b110c95a..29496784eb333 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -410,26 +410,6 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, shape); } -bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) { - auto flattenedThis = flatten(); - // If other is a LayoutAttr, just compare directly with parent of - // flattenedThis. - if (auto otherLayout = dyn_cast(other)) - return flattenedThis.getParent() == otherLayout; - // If other is a SliceAttr, flatten it first before comparing. - auto flattenedOther = dyn_cast(other).flatten(); - // Both must have common parent LayoutAttr. - if (flattenedThis.getParent() != flattenedOther.getParent()) - return false; - // otherFlattened's sliced dims must be a subset of flattenedThis's sliced - // dims. - llvm::SmallDenseSet thisDims( - flattenedThis.getDims().asArrayRef().begin(), - flattenedThis.getDims().asArrayRef().end()); - return llvm::all_of(flattenedOther.getDims().asArrayRef(), - [&](int64_t dim) { return thisDims.contains(dim); }); -} - //===----------------------------------------------------------------------===// // XeGPU_RangeAttr //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 46cd58e314acd..20934bb928bb8 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1108,24 +1108,15 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { transposeOp, "the source or result vector of the transpose op lacks layout " "attribute"); - SmallVector sourceLaneLayout = sourceLayout.getLaneLayoutAsInt(); - SmallVector resultLaneLayout = resultLayout.getLaneLayoutAsInt(); - SmallVector sourceLaneData = sourceLayout.getLaneDataAsInt(); - SmallVector resultLaneData = resultLayout.getLaneDataAsInt(); - if (sourceLaneLayout.size() != 2 || resultLaneLayout.size() != 2) + if (sourceLayout.getRank() != 2 || resultLayout.getRank() != 2) return rewriter.notifyMatchFailure( transposeOp, "the source or result vector of the transpose op " "does not have 2D layout"); - auto is2DTranspose = [](ArrayRef input, ArrayRef output) { - return input.size() == 2 && output.size() == 2 && input[0] == output[1] && - input[1] == output[0]; - }; - - if (!is2DTranspose(sourceLaneLayout, resultLaneLayout) || - !is2DTranspose(sourceLaneData, resultLaneData)) + ArrayRef perm = transposeOp.getPermutation(); + if (!resultLayout.isTransposeOf(sourceLayout, perm)) return rewriter.notifyMatchFailure( transposeOp, - "the source or result vector layouts must be transposes of each " + "the source or result vector layouts must be 2D transposes of each " "other"); FailureOr distributedSourceTypeOrFailure 
= getDistVecTypeBasedOnLaneLayout(sourceLayout, @@ -1141,7 +1132,7 @@ struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { rewriter.setInsertionPointAfter(newWarpOp); auto newTransposeOp = vector::TransposeOp::create( rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]), - transposeOp.getPermutation()); + perm); Value distributedVal = newWarpOp.getResult(operandIdx); rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult()); return success(); From 2f8341720c0364ad8f28724d3192b9571d823d53 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 11 Sep 2025 22:21:50 +0000 Subject: [PATCH 27/36] cleanup --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 29496784eb333..7f3be7f91c56b 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -14,7 +14,6 @@ #include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h" #include "mlir/IR/Builders.h" #include "mlir/IR/DialectImplementation.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" From 081948913fed9c4ae35496da86d3fb620af91706 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 11 Sep 2025 22:37:35 +0000 Subject: [PATCH 28/36] cleanup --- .../Transforms/XeGPUSubgroupDistribute.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 20934bb928bb8..6c4aa2d31f6b3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1237,24 +1237,6 @@ void XeGPUSubgroupDistributePass::runOnOperation() { vector::populatePropagateWarpVectorDistributionPatterns( patterns, distributionFn, shuffleFn); - auto warpReduction = [](Location loc, OpBuilder &builder, Value input, - vector::CombiningKind kind, uint32_t size) { - // First reduce on a single thread to get per lane reduction value. - Value laneVal = builder.create(loc, kind, input); - // Parallel reduction using butterfly shuffles. 
- for (uint64_t i = 1; i < size; i <<= 1) { - Value shuffled = - builder - .create(loc, laneVal, i, - /*width=*/size, - /*mode=*/gpu::ShuffleMode::XOR) - .getShuffleResult(); - laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled); - } - return laneVal; - }; - - vector::populateDistributeReduction(patterns, warpReduction); if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { signalPassFailure(); return; From 90b6d8e087e4e6e344e8fb22c8885a522009ef7d Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 12 Sep 2025 22:16:41 +0000 Subject: [PATCH 29/36] address comments --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 14 +- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 8 +- .../Transforms/XeGPUSubgroupDistribute.cpp | 176 ++++++++++++++++-- 3 files changed, 173 insertions(+), 25 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 9476c79d2732a..3d4ccd84d8c2d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -191,9 +191,6 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Get the rank of attribute", "int64_t", "getRank">, - InterfaceMethod<"Get the order field of the attribute as integer array", - "DenseI32ArrayAttr", - "getOrder">, InterfaceMethod<"Get the num of effective subgroups", "int64_t", "getNumSubgroups", (ins), [{ @@ -270,19 +267,19 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { return true; }; // Check sgLayout - if (!checkTranspose($_self.getSgLayoutAsInt(), other.getSgLayoutAsInt(), perm)) + if (!checkTranspose($_self.getEffectiveSgLayoutAsInt(), other.getEffectiveSgLayoutAsInt(), perm)) return false; // Check sgData - if (!checkTranspose($_self.getSgDataAsInt(), other.getSgDataAsInt(), perm)) + if (!checkTranspose($_self.getEffectiveSgDataAsInt(), other.getEffectiveSgDataAsInt(), perm)) return false; // Check instData - if (!checkTranspose($_self.getInstDataAsInt(), other.getInstDataAsInt(), perm)) + if (!checkTranspose($_self.getEffectiveInstDataAsInt(), other.getEffectiveInstDataAsInt(), perm)) return false; // Check laneLayout - if (!checkTranspose($_self.getLaneLayoutAsInt(), other.getLaneLayoutAsInt(), perm)) + if (!checkTranspose($_self.getEffectiveLaneLayoutAsInt(), other.getEffectiveLaneLayoutAsInt(), perm)) return false; // Check laneData - if (!checkTranspose($_self.getLaneDataAsInt(), other.getLaneDataAsInt(), perm)) + if (!checkTranspose($_self.getEffectiveLaneDataAsInt(), other.getEffectiveLaneDataAsInt(), perm)) return false; // Check order if both sides have order field. 
if ($_self.getOrder() && other.getOrder()) { @@ -293,7 +290,6 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { } return true; }]>, - InterfaceMethod LayoutInfo::getLaneLayout() const { if (!isAssigned()) return {}; - assert(storage.getLaneLayoutAsInt().size() && + assert(storage.getEffectiveLaneLayoutAsInt().size() && "Expected lane layout to be assigned"); - return llvm::map_to_vector(storage.getLaneLayoutAsInt(), + return llvm::map_to_vector(storage.getEffectiveLaneLayoutAsInt(), [](int64_t val) { return static_cast(val); }); } SmallVector LayoutInfo::getLaneData() const { if (!isAssigned()) return {}; - assert(storage.getLaneDataAsInt().size() && + assert(storage.getEffectiveLaneDataAsInt().size() && "Expected lane data to be assigned"); - return llvm::map_to_vector(storage.getLaneDataAsInt(), + return llvm::map_to_vector(storage.getEffectiveLaneDataAsInt(), [](int64_t val) { return static_cast(val); }); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index de6687c2b7965..60fc02e18260f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -21,6 +21,7 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" #include "mlir/IR/Operation.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeRange.h" @@ -34,6 +35,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include namespace mlir { namespace xegpu { @@ -147,22 +149,29 @@ static Value resolveDistributedTy(Value orig, T expected, /// Helper function to check if the layout is packed. Layout is packed if it is /// 2D and lane_data[0] != 1 (data packed from col dimension). -static bool hasPackedLayout(xegpu::LayoutAttr layout) { - if (layout == xegpu::LayoutAttr()) +/// TODO: Move to target info. +static bool requirePacked(const xegpu::LayoutAttr layout) { + if (!layout) return false; - DenseI32ArrayAttr laneData = layout.getLaneData(); - if (!laneData || laneData.size() != 2) + auto laneData = layout.getEffectiveLaneDataAsInt(); + if (laneData.size() != 2) return false; - return laneData.asArrayRef()[0] != 1; + return laneData[0] != 1; } -static bool hasTransposedLayout(xegpu::LayoutAttr layout) { - if (layout == xegpu::LayoutAttr()) +/// Helper function to check if the layout requires a transpose effect. +static bool requireTranspose(const xegpu::LayoutAttr layout, + const std::string &chipStr) { + // Return false for unsupported targets. + // TODO: Add more support or move to target info. + if (chipStr != "pvc" && chipStr != "bmg") + return false; + if (!layout) return false; - DenseI32ArrayAttr laneLayout = layout.getLaneLayout(); - if (!laneLayout || laneLayout.size() != 2) + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + if (laneLayout.size() != 2) return false; - return laneLayout.asArrayRef()[0] > 1 && laneLayout.asArrayRef()[1] == 1; + return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1; } /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body @@ -516,8 +525,15 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { loadOp->getAttrs()); xegpu::removeLayoutAttrs(newLoadOp); // Set the packed attribute if the layout requires it. 
- newLoadOp.setPacked(hasPackedLayout(layout)); - if (hasTransposedLayout(layout)) + newLoadOp.setPacked(requirePacked(layout)); + // Decide if this load op requires a transpose effect. + auto chipStr = xegpu::getChipStr(loadOp); + if (!chipStr) + return rewriter.notifyMatchFailure( + loadOp, + "xegpu::LoadNdOp require chip information to determine transpose " + "requirement"); + if (requireTranspose(layout, chipStr.value())) newLoadOp.setTranspose( DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0})); Value distributedVal = newWarpOp.getResult(operandIdx); @@ -1288,6 +1304,142 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { } }; +/// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an +/// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op +/// outside of the warp op. +struct MemrefExtractAlignedPointerAsIndexDistribution final + : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = getWarpResult( + warpOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure( + warpOp, + "warp result is not a xegpu::MemrefExtractAlignedPointerAsIndex op"); + auto extractOp = + operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, extractOp.getSource(), + TypeRange{extractOp.getSource().getType()}, newRetIndices); + rewriter.setInsertionPointAfter(newWarpOp); + auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create( + rewriter, newWarpOp.getLoc(), extractOp.getType(), + newWarpOp.getResult(newRetIndices[0])); + Value distributedVal = newWarpOp.getResult(operandIdx); + rewriter.replaceAllUsesWith(distributedVal, newExtractOp.getResult()); + return success(); + } +}; + +/// Distribute a vector::BitCastOp feeding into yield op of an enclosing +/// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost +/// diemension of the source/result vectors. Equivalent vector::BitCastOp is +/// created outside of the warp op with distributed source vector type (computed +/// using assigned layout). 
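/// Schematically (a simplified sketch only; types are borrowed from the
/// vector_bitcast test and the producer op is left abstract), the rewrite
/// moves the bitcast past the warp region; the new warp op yields the
/// original source while its result carries the distributed source type:
///
///   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) {
///     %v = "producer"() : () -> vector<4x32xi8>
///     %b = vector.bitcast %v : vector<4x32xi8> to vector<4x16xi16>
///     gpu.yield %b : vector<4x16xi16>
///   }
///
/// becomes
///
///   %s = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x2xi8>) {
///     %v = "producer"() : () -> vector<4x32xi8>
///     gpu.yield %v : vector<4x32xi8>
///   }
///   %r = vector.bitcast %s : vector<4x2xi8> to vector<4x1xi16>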
+struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = + getWarpResult(warpOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure( + warpOp, "warp result is not a vector::BitCast op"); + auto bitcastOp = operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + VectorType distributedSourceType = + getDistVecTypeBasedOnLaneLayout( + xegpu::getDistributeLayoutAttr(bitcastOp.getSource()), + bitcastOp.getSourceVectorType()) + .value_or(VectorType()); + if (!distributedSourceType) + return rewriter.notifyMatchFailure( + bitcastOp, "Failed to distribute the source vector type in " + "vector::BitCast op"); + VectorType distributedResultType = + cast(warpOp.getResult(operandIdx).getType()); + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, bitcastOp.getSource(), + TypeRange{distributedSourceType}, newRetIndices); + rewriter.setInsertionPointAfter(newWarpOp); + auto newBitcastOp = vector::BitCastOp::create( + rewriter, newWarpOp.getLoc(), distributedResultType, + newWarpOp.getResult(newRetIndices[0])); + Value distributedVal = newWarpOp.getResult(operandIdx); + rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult()); + return success(); + } +}; + +/// Distribute a vector::TransposeOp feeding into yield op of an enclosing +/// `gpu.warp_execute_on_lane_0` region. Currently only 2D transposes are +/// supported. In most cases, transpose is a no op because it is entirely +/// handled using the layouts (e.g. 16x1 -> 1x16). However, if each lane owns +/// multiple slices of data after distribution (e.g. 16x2 -> 2x16), a lane-local +/// transpose (i.e. shuffle) is needed. Therefore, we create an equivalent +/// vector::TransposeOp outside of the warp op with distributed source vector +/// type (computed using assigned layout). +struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = + getWarpResult(warpOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure( + warpOp, "warp result is not a vector::Transpose op"); + auto transposeOp = operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + xegpu::DistributeLayoutAttr sourceLayout = + xegpu::getDistributeLayoutAttr(transposeOp.getVector()); + xegpu::DistributeLayoutAttr resultLayout = + xegpu::getDistributeLayoutAttr(transposeOp.getResult()); + if (!sourceLayout || !resultLayout) + return rewriter.notifyMatchFailure( + transposeOp, + "the source or result vector of the transpose op lacks layout " + "attribute"); + int64_t sourceRank = transposeOp.getSourceVectorType().getRank(); + int64_t resultRank = transposeOp.getResultVectorType().getRank(); + // Only 2D transposes are supported for now. + // TODO: Support nD transposes. + if (sourceRank != 2 || resultRank != 2) + return rewriter.notifyMatchFailure( + transposeOp, "the source or result vector of the transpose op " + "does not have 2D layout"); + ArrayRef perm = transposeOp.getPermutation(); + // Result layout must be a transpose of source layout. 
+ if (!resultLayout.isTransposeOf(sourceLayout, perm)) + return rewriter.notifyMatchFailure( + transposeOp, + "the source or result vector layouts must be 2D transposes of each " + "other"); + FailureOr distributedSourceTypeOrFailure = + getDistVecTypeBasedOnLaneLayout(sourceLayout, + transposeOp.getSourceVectorType()); + if (failed(distributedSourceTypeOrFailure)) + return rewriter.notifyMatchFailure( + transposeOp, "Failed to distribute the source vector type in " + "vector::Transpose op"); + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, transposeOp.getVector(), + TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices); + rewriter.setInsertionPointAfter(newWarpOp); + auto newTransposeOp = vector::TransposeOp::create( + rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]), + perm); + Value distributedVal = newWarpOp.getResult(operandIdx); + rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult()); + return success(); + } +}; + } // namespace namespace { From 9c2a7ed8d24f963c7808d25f767ab37169bc55a4 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 12 Sep 2025 22:37:28 +0000 Subject: [PATCH 30/36] address comments --- .../Transforms/XeGPUSubgroupDistribute.cpp | 17 +++--- .../Dialect/XeGPU/subgroup-distribute.mlir | 60 ++++++++++--------- 2 files changed, 40 insertions(+), 37 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 60fc02e18260f..973dc66627b09 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -485,7 +485,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { warpOp, "warp result is not a xegpu::LoadNd op"); auto loadOp = operand->get().getDefiningOp(); - + // Chip information is required to decide if the layout requires transpose + // effect. + auto chipStr = xegpu::getChipStr(loadOp); + if (!chipStr) + return rewriter.notifyMatchFailure( + loadOp, + "xegpu::LoadNdOp require chip information to determine transpose " + "requirement"); int64_t offsetSize = static_cast(loadOp.getOffsets().size()); if ((offsetSize != 0) || loadOp.getConstOffsetsAttr()) return failure(); @@ -526,13 +533,7 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { xegpu::removeLayoutAttrs(newLoadOp); // Set the packed attribute if the layout requires it. newLoadOp.setPacked(requirePacked(layout)); - // Decide if this load op requires a transpose effect. - auto chipStr = xegpu::getChipStr(loadOp); - if (!chipStr) - return rewriter.notifyMatchFailure( - loadOp, - "xegpu::LoadNdOp require chip information to determine transpose " - "requirement"); + // Set the transpose attribute if the layout requires it. 
if (requireTranspose(layout, chipStr.value())) newLoadOp.setTranspose( DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0})); diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 3fa9d90a0047e..13b0ed176eb0c 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -1,6 +1,8 @@ -// RUN: mlir-opt -xegpu-subgroup-distribute -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \ +// RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s -// RUN: mlir-opt -xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \ +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ +// RUN: -xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \ // RUN: -canonicalize -cse -split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION // CHECK-LABEL: gpu.func @store_nd_1d @@ -9,7 +11,7 @@ // CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> // CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> // CHECK: gpu.return -gpu.module @test { +gpu.module @xevm_module{ gpu.func @store_nd_1d(%arg0: memref<16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> @@ -25,7 +27,7 @@ gpu.module @test { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> // CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf16> @@ -44,7 +46,7 @@ gpu.module @test { // CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> // CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> @@ -62,7 +64,7 @@ gpu.module @test { // CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -83,7 +85,7 @@ gpu.module @test { // CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to 
vector<16xf16> // CHECK: xegpu.store_nd %[[T5]], %[[T4]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> @@ -105,7 +107,7 @@ gpu.module @test { // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> // CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> @@ -133,7 +135,7 @@ gpu.module @test { // CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> // CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[T8]], %[[T7]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> @@ -157,7 +159,7 @@ gpu.module @test { // CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -191,7 +193,7 @@ gpu.module @test { // CHECK-NEXT: } // CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> // CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index @@ -223,7 +225,7 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> // CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32> // CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>) { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index @@ -242,7 +244,7 @@ gpu.module @test { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : 
memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32> // CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32> // CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<16xf32>, !xegpu.tensor_desc<16x16xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>) { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index @@ -259,7 +261,7 @@ gpu.module @test { // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -279,7 +281,7 @@ gpu.module @test { // CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]] : // CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout> to !xegpu.tensor_desc<16x16xf32> {resolve_simt_type_mismatch} // CHECK: xegpu.update_nd_offset %[[T1]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @check_update_nd_offset_distributed_tensor_desc() { %c32 = arith.constant 32 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf32> @@ -295,7 +297,7 @@ gpu.module @test { // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> // CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @prefetch_1d(%arg0: memref<256xf16>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> @@ -311,7 +313,7 @@ gpu.module @test { // CHECK-NEXT: gpu.barrier // CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> // CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf16>, !xegpu.tensor_desc<16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> @@ -337,7 +339,7 @@ gpu.module @test { // CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> // CHECK-NEXT: %[[RED1:.*]] = vector.reduction , %[[CAST1]], %{{.*}} : vector<16xf32> into f32 // CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x32xf32>) @@ -363,7 +365,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { // CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout>, f32, f32 // CHECK-REDUCTION-NEXT: } // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func 
@vector_multi_reduction_dim1_distributed_dim1_reduction() { %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<2x16xf32>) @@ -390,7 +392,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { // CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> // CHECK-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 // CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<32x16xf32>) @@ -418,7 +420,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { // CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout>, f32, f32 // CHECK-REDUCTION-NEXT: } // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x2xf32>) @@ -439,7 +441,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> @@ -464,7 +466,7 @@ gpu.module @test { // CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16> // CHECK-NEXT: } // CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) { %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> @@ -493,7 +495,7 @@ gpu.module @test { // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> // CHECK-NEXT: } -gpu.module @test { +gpu.module @xevm_module{ gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) { %pred = llvm.mlir.poison : i1 %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> @@ -514,7 +516,7 @@ gpu.module @test { // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, 
vector<1xi1> -> vector<1xf16> // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @scatter_ops(%src: memref<256xf16>) { %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> @@ -529,7 +531,7 @@ gpu.module @test { // ----- // CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( // CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index -gpu.module @test { +gpu.module @xevm_module{ gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16> @@ -548,7 +550,7 @@ gpu.module @test { // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> // CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> // CHECK: xegpu.store_nd %[[CST]], %[[DEST]] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x2xf32> @@ -569,7 +571,7 @@ gpu.module @test { // CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> // CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> // CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { %cst = "some_op"() {layout_result_0 = #xegpu.layout} : () -> (vector<4x32xi8>) @@ -595,7 +597,7 @@ gpu.module @test { // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> // CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> From d55bce804a908c57f1c2f2f5f055256737080cd2 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Sat, 13 Sep 2025 00:04:52 +0000 Subject: [PATCH 31/36] address comments --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 124 ++++++++---------- 1 file changed, 54 insertions(+), 70 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 1b9969dc39bcd..328bcc3df104b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -31,6 +31,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -110,6 +111,12 @@ struct LayoutInfo { return isa(storage); } + int64_t getRank() const { + if (!isAssigned()) + return -1; + return storage.getRank(); + } + Attribute get() { return storage; } }; @@ 
-493,15 +500,14 @@ void LayoutInfoPropagation::visitVectorBroadCastOp( return; } - // Only consider 2D -> 2D broadcast. - if (sourceTy.getRank() != 2 || resultTy.getRank() != 2) { - broadcast.emitWarning("Expecting source type to be 2D vector and " - "result type to be 2D vector."); + // Only consider nD -> nD broadcast. + if (sourceTy.getRank() != resultTy.getRank()) { + broadcast.emitWarning("Expecting source and result to have same rank."); return; } SetVector broadcastUnitDims = broadcast.computeBroadcastedUnitDims(); if (broadcastUnitDims.size() != 1) { - broadcast.emitWarning("Expecting source type to be 2D vector only with " + broadcast.emitWarning("Expecting source type to be nD vector only with " "one broadcasted dimension."); return; } @@ -516,79 +522,46 @@ void LayoutInfoPropagation::visitShapeCastOp( LayoutInfo resultLayout = results[0]->getValue(); if (!resultLayout.isAssigned()) return; - int64_t sourceRank = shapeCast.getSourceVectorType().getRank(); - int64_t resultRank = shapeCast.getResultVectorType().getRank(); - // Expecting source rank to be 1D or 2D. - if (sourceRank != 1 && sourceRank != 2) { - shapeCast.emitWarning("Expecting source type to be 1D or 2D vector."); - return; - } - // Expecting result rank to be 1D or 2D. - if (resultRank != 1 && resultRank != 2) { - shapeCast.emitWarning("Expecting result type to be 1D or 2D vector."); + VectorType sourceTy = shapeCast.getSourceVectorType(); + VectorType resultTy = shapeCast.getResultVectorType(); + // Shape cast layout propagation has following restrictions: + // 1) nD -> nD shape cast is not supported. + // 2) Shape cast must always expand the rank (e.g. 1D -> 2D). + // 3) Newly expanded dimensions must be 1. + // 4) Result layout can not be a slice layout. + if (sourceTy.getRank() == resultTy.getRank()) { + shapeCast.emitWarning("nD -> nD shape cast is not supported."); return; } - // For 2D -> 2D shape cast, propagate the result layout to the source. - if (sourceRank == 2 && resultRank == 2) { - propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); + if (sourceTy.getRank() > resultTy.getRank()) { + shapeCast.emitWarning("Expecting shape cast to expand the rank."); return; } - auto resultLaneLayout = resultLayout.getLaneLayout(); - if (resultRank == 2 && resultLaneLayout[0] != 1 && resultLaneLayout[1] != 1) { - shapeCast.emitWarning( - "Expecting 2D result layout to be of form [1, subgroupSize] " - "or [subgroupSize, 1]."); + if (resultLayout.getRank() != resultTy.getRank() || + resultLayout.isSliceLayout()) { + shapeCast.emitWarning("Expecting result layout to have same rank as the " + "result type and not be a slice layout."); return; } ArrayRef resultShape = shapeCast.getResultVectorType().getShape(); ArrayRef sourceShape = shapeCast.getSourceVectorType().getShape(); - // For 2D -> 1D case. - if (sourceRank == 2 && resultRank == 1) { - // If the result had slice layout, simply assign the parent layout of the - // slice. - if (resultLayout.isSliceLayout()) { - auto sliceAttr = cast(resultLayout.get()); - propagateIfChanged(operands[0], - operands[0]->meet(LayoutInfo(sliceAttr.getParent()))); - return; - } - // If the result has a regular 1D layout, then we find the first dimension - // that can be fully evenly distributed to lanes. This dimension becomes - // the distributed dimension for deciding the lane layout. - int sourceDistributedDim = - sourceShape[0] % xegpu::targetinfo::subgroupSize == 0 - ? 0 - : (sourceShape[1] % xegpu::targetinfo::subgroupSize == 0 ? 
1 : -1); - if (sourceDistributedDim == -1) { - shapeCast.emitWarning( - "Source vector can not be evenly distributed across lanes."); - return; - } - SmallVector sourceLaneLayout = {1, 1}, - laneData = {1, resultLayout.getLaneData()[0]}; - sourceLaneLayout[sourceDistributedDim] = xegpu::targetinfo::subgroupSize; - propagateIfChanged( - operands[0], - operands[0]->meet(LayoutInfo(xegpu::LayoutAttr::get( - shapeCast->getContext(), sourceLaneLayout, laneData)))); - } - - // For 1D -> 2D case, If the result shape can be evenly distributed in the - // distributed dimension, then the source layout should be - // [subgroupSize][1]. Otherwise, data is shared accross lanes (broadcasted). - // We use slice attribute for the broadcast case. - int64_t distributedDim = resultLaneLayout[0] == 1 ? 1 : 0; - if (resultShape[distributedDim] % xegpu::targetinfo::subgroupSize != 0) { - xegpu::LayoutAttr parentLayout = xegpu::LayoutAttr::get( - shapeCast->getContext(), resultLaneLayout, resultLayout.getLaneData()); - xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get( - shapeCast->getContext(), parentLayout, - DenseI64ArrayAttr::get(shapeCast->getContext(), {distributedDim})); - propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout))); - return; - } - propagateIfChanged(operands[0], operands[0]->meet(getDefaultSIMTLayoutInfo( - shapeCast.getSourceVectorType()))); + auto findUnitDims = [](ArrayRef shape) { + SmallVector unitDims; + for (int i = 0, e = shape.size(); i < e; ++i) + if (shape[i] == 1) + unitDims.push_back(i); + return unitDims; + }; + SmallVector resultUnitDims = findUnitDims(resultShape); + SmallVector sourceUnitDims = findUnitDims(sourceShape); + // Remove first `sourceUnitDims.size()` unit dims from resultUnitDims. + auto sliceDims = + ArrayRef(resultUnitDims).drop_front(sourceUnitDims.size()); + // Source layout is obtained by removing the slice dims from result layout. + xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get( + shapeCast->getContext(), cast(resultLayout.get()), + DenseI64ArrayAttr::get(shapeCast->getContext(), sliceDims)); + propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout))); } /// Propagate the layout of the result tensor to the source tensor descriptor @@ -687,6 +660,17 @@ void LayoutInfoPropagation::visitVectorBitcastOp( propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); return; } + // Check if the result layout is valid. i.e. result vector can be distributed. + auto resultLaneLayout = resultLayout.getLaneLayout(); + auto resultLaneData = resultLayout.getLaneData(); + if (failed(xegpu::getDistributedVectorType( + bitcast.getResultVectorType(), + xegpu::LayoutAttr::get(bitcast->getContext(), resultLaneLayout, + resultLaneData)))) { + bitcast.emitWarning( + "Result vector type can not be evenly distributed across lanes."); + return; + } int64_t rank = bitcast.getSourceVectorType().getRank(); // Bitcast is a `narrowing` if the input element type bit width larger than // the output element type bit width. eg. f32 -> f16 is a narrowing bitcast. 
From 74df1befbd0d470861f2be1b9fdb155a8931f33e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 15 Sep 2025 22:05:11 +0000 Subject: [PATCH 32/36] remove invalid shape cast tests --- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 35 +------------------ 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index c650c4737670c..30f785ded975a 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -521,39 +521,6 @@ func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32 return } -// ----- -// CHECK-LABEL: func.func @vector_shape_cast_2d_to_1d_dim0_distributed( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x1xf16, #xegpu.layout>, -// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : -// CHECK-SAME: !xegpu.tensor_desc<16x1xf16, #xegpu.layout> -> vector<16x1xf16> -// CHECK-NEXT: %{{.*}} = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout} -// CHECK-SAME: : vector<16x1xf16> to vector<16xf16> -func.func @vector_shape_cast_2d_to_1d_dim0_distributed(%arg0: !xegpu.tensor_desc<16x1xf16>, %arg1: !xegpu.tensor_desc<16xf16>) { - %c0 = arith.constant 0 : index - %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x1xf16> -> vector<16x1xf16> - %2 = vector.shape_cast %3 : vector<16x1xf16> to vector<16xf16> - xegpu.store_nd %2, %arg1 : vector<16xf16>, !xegpu.tensor_desc<16xf16> - return -} - -// ----- -// CHECK-LABEL: func.func @vector_shape_cast_2d_to_1d_dim1_distributed( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<1x16xf16, #xegpu.layout>, -// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout} -// CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.layout> -> vector<1x16xf16> -// CHECK: %{{.*}} = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout} -// CHECK-SAME: vector<1x16xf16> to vector<16xf16> -func.func @vector_shape_cast_2d_to_1d_dim1_distributed(%arg0: !xegpu.tensor_desc<1x16xf16>, %arg1: !xegpu.tensor_desc<16xf16>) { - %c0 = arith.constant 0 : index - %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<1x16xf16> -> vector<1x16xf16> - %2 = vector.shape_cast %3 : vector<1x16xf16> to vector<16xf16> - xegpu.store_nd %2, %arg1 : vector<16xf16>, !xegpu.tensor_desc<16xf16> - return -} - // ----- // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, @@ -561,7 +528,7 @@ func.func @vector_shape_cast_2d_to_1d_dim1_distributed(%arg0: !xegpu.tensor_desc // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} -// CHECK-SAME: {layout_result_0 = #xegpu.layout} [0] : vector<16x16xf16> to vector<16xf16> +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> // CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[REDUCE]] {layout_result_0 = #xegpu.layout} // CHECK-SAME: vector<16xf16> to vector<1x16xf16> func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: 
!xegpu.tensor_desc<16x16xf16>) { From d1ca356fd5c83023f8ae5e71574a89bd42a1eeaa Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 18 Sep 2025 17:59:26 +0000 Subject: [PATCH 33/36] address comments --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 13 +++---------- .../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 2 +- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 3d4ccd84d8c2d..5695d5d515d7f 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -245,16 +245,9 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { return false; if ($_self.getRank() != other.getRank() || perm.size() != static_cast($_self.getRank())) return false; - // check if the permutation is valid - int64_t rank = $_self.getRank(); - SmallVector seen(rank, false); - for (const auto &ta : llvm::enumerate(perm)) { - if (ta.value() < 0 || ta.value() >= rank) - return false; - if (seen[ta.value()]) - return false; - seen[ta.value()] = true; - } + // Check if the permutation is valid + if (!isPermutationVector(perm)) + return false; auto checkTranspose = [](ArrayRef dst, ArrayRef src, ArrayRef perm) { // If both `dst` and `src` are empty, conservatively return true // here because some layout fields can be empty. diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 973dc66627b09..69efca0d42c0b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1318,7 +1318,7 @@ struct MemrefExtractAlignedPointerAsIndexDistribution final if (!operand) return rewriter.notifyMatchFailure( warpOp, - "warp result is not a xegpu::MemrefExtractAlignedPointerAsIndex op"); + "warp result is not a memref::MemrefExtractAlignedPointerAsIndex op"); auto extractOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); From 80e930fc693cfeb99ee4c4547f953e388735919e Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 18 Sep 2025 21:41:53 +0000 Subject: [PATCH 34/36] address comments --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 58 +++++++++++++++---- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 328bcc3df104b..1f5a06cfb450c 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -37,6 +37,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" +#include namespace mlir { namespace xegpu { @@ -545,19 +546,54 @@ void LayoutInfoPropagation::visitShapeCastOp( } ArrayRef resultShape = shapeCast.getResultVectorType().getShape(); ArrayRef sourceShape = shapeCast.getSourceVectorType().getShape(); - auto findUnitDims = [](ArrayRef shape) { - SmallVector unitDims; + + auto findNonUnitDims = [](ArrayRef shape) { + SmallVector nonUnitDims; for (int i = 0, e = shape.size(); i < e; ++i) - if (shape[i] == 1) - unitDims.push_back(i); - return unitDims; + if (shape[i] != 1) + nonUnitDims.push_back(i); + return nonUnitDims; }; - SmallVector resultUnitDims = findUnitDims(resultShape); - SmallVector sourceUnitDims = findUnitDims(sourceShape); - // Remove first 
`sourceUnitDims.size()` unit dims from resultUnitDims. - auto sliceDims = - ArrayRef(resultUnitDims).drop_front(sourceUnitDims.size()); - // Source layout is obtained by removing the slice dims from result layout. + SmallVector resultNonUnitDims = findNonUnitDims(resultShape); + SmallVector sourceNonUnitDims = findNonUnitDims(sourceShape); + // Source and result must have the same number of non-unit dimensions and + // thier values must match. + if (resultNonUnitDims.size() != sourceNonUnitDims.size()) { + shapeCast.emitWarning("Expecting source and result shapes to have same " + "number of non-unit dimensions."); + return; + } + auto reesultNonUnitDimShapes = llvm::map_to_vector( + resultNonUnitDims, [&](int64_t idx) { return resultShape[idx]; }); + auto sourceNonUnitDimShapes = llvm::map_to_vector( + sourceNonUnitDims, [&](int64_t idx) { return sourceShape[idx]; }); + if (llvm::any_of( + llvm::zip(sourceNonUnitDimShapes, reesultNonUnitDimShapes), + [](auto pair) { return std::get<0>(pair) != std::get<1>(pair); })) { + shapeCast.emitWarning("Expecting non-unit dimensions of source and result " + "shapes to match."); + return; + } + // Slice dims are unit dims that exist in the result shape but not in the + // source shape. + SmallVector sliceDims; + int64_t srcPrev, resPrev = 0; + // Add a dummy non unit dim at the end to handle trailing unit dims. + sourceNonUnitDims.push_back(sourceShape.size()); + resultNonUnitDims.push_back(resultShape.size()); + for (auto [s, r] : llvm::zip_equal(sourceNonUnitDims, resultNonUnitDims)) { + int unitDimDiff = (r - resPrev) - (s - srcPrev); + // Negative unitDimDiff means source shape has more unit dims in this range. + if (unitDimDiff < 0) { + shapeCast.emitWarning("Unsupported shape cast. Source shape has more " + "unit dims in between two non-unit dims."); + return; + } + for (auto it : llvm::seq(0, unitDimDiff)) + sliceDims.push_back(resPrev + it); + srcPrev = s + 1; + resPrev = r + 1; + } xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get( shapeCast->getContext(), cast(resultLayout.get()), DenseI64ArrayAttr::get(shapeCast->getContext(), sliceDims)); From b1bb16b7c2560c80a2ee42a0b7ce657ada04b7a0 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 18 Sep 2025 23:21:02 +0000 Subject: [PATCH 35/36] simplify shape cast handling --- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 75 ++----------------- 1 file changed, 7 insertions(+), 68 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 1f5a06cfb450c..b63acd67f7813 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -525,78 +525,17 @@ void LayoutInfoPropagation::visitShapeCastOp( return; VectorType sourceTy = shapeCast.getSourceVectorType(); VectorType resultTy = shapeCast.getResultVectorType(); - // Shape cast layout propagation has following restrictions: - // 1) nD -> nD shape cast is not supported. - // 2) Shape cast must always expand the rank (e.g. 1D -> 2D). - // 3) Newly expanded dimensions must be 1. - // 4) Result layout can not be a slice layout. - if (sourceTy.getRank() == resultTy.getRank()) { - shapeCast.emitWarning("nD -> nD shape cast is not supported."); + // Shape cast layout propagation only supports 1D -> 2D shape casts. + // TODO: Support kD -> nD shape casts (k < n, n >= 2) where expanded dims are + // unit dimensions and non-unit dims match. 
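+  // Illustrative example (layout values are an assumption for exposition,
+  // not introduced by this change): for
+  //   %r = vector.shape_cast %v : vector<16xf16> to vector<1x16xf16>
+  // with result layout <lane_layout = [1, 16], lane_data = [1, 1]>, the unit
+  // dim 0 is sliced away, so the 1D source is assigned
+  //   #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>.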
+ if (sourceTy.getRank() != 1 || resultTy.getRank() != 2) { + shapeCast.emitWarning("Expecting shape cast to be 1D -> 2D."); return; } - if (sourceTy.getRank() > resultTy.getRank()) { - shapeCast.emitWarning("Expecting shape cast to expand the rank."); - return; - } - if (resultLayout.getRank() != resultTy.getRank() || - resultLayout.isSliceLayout()) { - shapeCast.emitWarning("Expecting result layout to have same rank as the " - "result type and not be a slice layout."); - return; - } - ArrayRef resultShape = shapeCast.getResultVectorType().getShape(); - ArrayRef sourceShape = shapeCast.getSourceVectorType().getShape(); - - auto findNonUnitDims = [](ArrayRef shape) { - SmallVector nonUnitDims; - for (int i = 0, e = shape.size(); i < e; ++i) - if (shape[i] != 1) - nonUnitDims.push_back(i); - return nonUnitDims; - }; - SmallVector resultNonUnitDims = findNonUnitDims(resultShape); - SmallVector sourceNonUnitDims = findNonUnitDims(sourceShape); - // Source and result must have the same number of non-unit dimensions and - // thier values must match. - if (resultNonUnitDims.size() != sourceNonUnitDims.size()) { - shapeCast.emitWarning("Expecting source and result shapes to have same " - "number of non-unit dimensions."); - return; - } - auto reesultNonUnitDimShapes = llvm::map_to_vector( - resultNonUnitDims, [&](int64_t idx) { return resultShape[idx]; }); - auto sourceNonUnitDimShapes = llvm::map_to_vector( - sourceNonUnitDims, [&](int64_t idx) { return sourceShape[idx]; }); - if (llvm::any_of( - llvm::zip(sourceNonUnitDimShapes, reesultNonUnitDimShapes), - [](auto pair) { return std::get<0>(pair) != std::get<1>(pair); })) { - shapeCast.emitWarning("Expecting non-unit dimensions of source and result " - "shapes to match."); - return; - } - // Slice dims are unit dims that exist in the result shape but not in the - // source shape. - SmallVector sliceDims; - int64_t srcPrev, resPrev = 0; - // Add a dummy non unit dim at the end to handle trailing unit dims. - sourceNonUnitDims.push_back(sourceShape.size()); - resultNonUnitDims.push_back(resultShape.size()); - for (auto [s, r] : llvm::zip_equal(sourceNonUnitDims, resultNonUnitDims)) { - int unitDimDiff = (r - resPrev) - (s - srcPrev); - // Negative unitDimDiff means source shape has more unit dims in this range. - if (unitDimDiff < 0) { - shapeCast.emitWarning("Unsupported shape cast. Source shape has more " - "unit dims in between two non-unit dims."); - return; - } - for (auto it : llvm::seq(0, unitDimDiff)) - sliceDims.push_back(resPrev + it); - srcPrev = s + 1; - resPrev = r + 1; - } + int64_t slicedDim = resultTy.getShape()[0] == 1 ? 
0 : 1; xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get( shapeCast->getContext(), cast(resultLayout.get()), - DenseI64ArrayAttr::get(shapeCast->getContext(), sliceDims)); + DenseI64ArrayAttr::get(shapeCast->getContext(), {slicedDim})); propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout))); } From 1376ca2de32f2acfd1fd251ad3fcf022ddf2c6c1 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 19 Sep 2025 16:31:47 +0000 Subject: [PATCH 36/36] remove headers --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp | 2 -- mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index b63acd67f7813..8fab255d6347f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -31,13 +31,11 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" -#include namespace mlir { namespace xegpu { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 69efca0d42c0b..449b8eb030b07 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -21,7 +21,6 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Diagnostics.h" #include "mlir/IR/Operation.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeRange.h" @@ -35,7 +34,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include namespace mlir { namespace xegpu {