diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 1f1d367118365..5695d5d515d7f 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -235,6 +235,54 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { "FailureOr>>", "getOffsets", (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef":$shape)>, + InterfaceMethod": $perm), + /*methodBody=*/[{ + if (!other) + return false; + if ($_self.getRank() != other.getRank() || perm.size() != static_cast($_self.getRank())) + return false; + // Check if the permutation is valid + if (!isPermutationVector(perm)) + return false; + auto checkTranspose = [](ArrayRef dst, ArrayRef src, ArrayRef perm) { + // If both `dst` and `src` are empty, conservatively return true + // here because some layout fields can be empty. + if (dst.empty() && src.empty()) + return true; + for (const auto &ta : llvm::enumerate(perm)) { + if (src[ta.index()] != dst[ta.value()]) + return false; + } + return true; + }; + // Check sgLayout + if (!checkTranspose($_self.getEffectiveSgLayoutAsInt(), other.getEffectiveSgLayoutAsInt(), perm)) + return false; + // Check sgData + if (!checkTranspose($_self.getEffectiveSgDataAsInt(), other.getEffectiveSgDataAsInt(), perm)) + return false; + // Check instData + if (!checkTranspose($_self.getEffectiveInstDataAsInt(), other.getEffectiveInstDataAsInt(), perm)) + return false; + // Check laneLayout + if (!checkTranspose($_self.getEffectiveLaneLayoutAsInt(), other.getEffectiveLaneLayoutAsInt(), perm)) + return false; + // Check laneData + if (!checkTranspose($_self.getEffectiveLaneDataAsInt(), other.getEffectiveLaneDataAsInt(), perm)) + return false; + // Check order if both sides have order field. + if ($_self.getOrder() && other.getOrder()) { + auto thisOrderAsInt = llvm::to_vector_of($_self.getOrder().asArrayRef()); + auto otherOrderAsInt = llvm::to_vector_of(other.getOrder().asArrayRef()); + if (!checkTranspose(thisOrderAsInt, otherOrderAsInt, perm)) + return false; + } + return true; + }]>, InterfaceMethod layout; - Layout() = default; - Layout(std::initializer_list list) : layout(list) {} - void print(llvm::raw_ostream &os) const; - size_t size() const { return layout.size(); } -}; - -void Layout::print(llvm::raw_ostream &os) const { - os << llvm::interleaved_array(layout); -} - -/// LaneLayout represents the logical layout of lanes within a subgroup when it -/// accesses some value. LaneData represents the logical layout of data owned by -/// each work item. -using LaneLayout = Layout; -using LaneData = Layout; - //===----------------------------------------------------------------------===// // LayoutInfo //===----------------------------------------------------------------------===// /// Helper class for tracking the analysis state of an mlir value. For layout -/// propagation, the analysis state is simply the lane_layout and lane_data of -/// each value. Purpose of this analysis to propagate some unique layout for -/// each value in the program starting from a set of anchor operations (like -/// DPAS, StoreNd, etc.). +/// propagation, the analysis state is simply the distribution layout of +/// each value. The distribution layout information is encapsulated using +/// xegpu::DistributeLayoutAttr class which can hold information about any type +/// of distribution layout that XeGPU dialect supports. 
Purpose of this analysis +/// to propagate some unique distribution layout for each value in the program +/// starting from a set of anchor operations (like DPAS, StoreNd, etc.). Note +/// that analysis will reach a fixed point when all values are reached some +/// layout and, analysis does not try to modify any already assigned layouts. /// /// Given this, LayoutInfo satisifies the following properties: /// 1) A LayoutInfo value can be in one of two states - `assigned` or `not @@ -98,14 +78,11 @@ using LaneData = Layout; struct LayoutInfo { private: - LaneLayout laneLayout; - LaneData laneData; - xegpu::LayoutAttr layoutAttr; + xegpu::DistributeLayoutAttr storage = nullptr; public: LayoutInfo() = default; - LayoutInfo(const LaneLayout &layout, const LaneData &data) - : laneLayout(layout), laneData(data) {} + LayoutInfo(const xegpu::DistributeLayoutAttr &layout) : storage(layout) {} // Two lattice values are equal if they have `some` layout. The actual // content of the layout does not matter. @@ -119,24 +96,50 @@ struct LayoutInfo { void print(raw_ostream &os) const; - bool isAssigned() const { - return laneLayout.size() > 0 && laneData.size() > 0; + bool isAssigned() const { return storage != nullptr; } + + LayoutInfo transpose(ArrayRef permutation) const; + + SmallVector getLaneLayout() const; + + SmallVector getLaneData() const; + + bool isSliceLayout() const { + if (!isAssigned()) + return false; + return isa(storage); } - LayoutInfo getTransposedLayout(ArrayRef permutation) const; + int64_t getRank() const { + if (!isAssigned()) + return -1; + return storage.getRank(); + } - const LaneLayout &getLayout() const { return laneLayout; } - const LaneData &getData() const { return laneData; } - ArrayRef getLayoutAsArrayRef() const { return laneLayout.layout; } - ArrayRef getDataAsArrayRef() const { return laneData.layout; } + Attribute get() { return storage; } }; +SmallVector LayoutInfo::getLaneLayout() const { + if (!isAssigned()) + return {}; + assert(storage.getEffectiveLaneLayoutAsInt().size() && + "Expected lane layout to be assigned"); + return llvm::map_to_vector(storage.getEffectiveLaneLayoutAsInt(), + [](int64_t val) { return static_cast(val); }); +} + +SmallVector LayoutInfo::getLaneData() const { + if (!isAssigned()) + return {}; + assert(storage.getEffectiveLaneDataAsInt().size() && + "Expected lane data to be assigned"); + return llvm::map_to_vector(storage.getEffectiveLaneDataAsInt(), + [](int64_t val) { return static_cast(val); }); +} + void LayoutInfo::print(raw_ostream &os) const { if (isAssigned()) { - os << "lane_layout: "; - laneLayout.print(os); - os << ", lane_data: "; - laneData.print(os); + os << storage; } else { os << "Not assigned."; } @@ -153,18 +156,30 @@ LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) { llvm_unreachable("Join should not be triggered by layout propagation."); } -/// Get the transposed layout according to the given permutation. -LayoutInfo -LayoutInfo::getTransposedLayout(ArrayRef permutation) const { +/// Construct a new layout with the transposed lane layout and lane data. +LayoutInfo LayoutInfo::transpose(ArrayRef permutation) const { if (!isAssigned()) return {}; - LaneLayout newLayout; - LaneData newData; + // Check if the permutation is valid. 
+ llvm::SmallSet seen(permutation.begin(), permutation.end()); + bool hasDuplicates = seen.size() != permutation.size(); + bool withinRange = llvm::all_of(permutation, [&](int64_t idx) { + return idx >= 0 && idx < static_cast(permutation.size()); + }); + + if (!withinRange || hasDuplicates) { + assert(false && "Invalid permutation for transpose."); + return {}; + } + + SmallVector laneLayout; + SmallVector laneData; for (int64_t idx : permutation) { - newLayout.layout.push_back(laneLayout.layout[idx]); - newData.layout.push_back(laneData.layout[idx]); + laneLayout.push_back(static_cast(getLaneLayout()[idx])); + laneData.push_back(static_cast(getLaneData()[idx])); } - return LayoutInfo(newLayout, newData); + return LayoutInfo( + xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData)); } //===----------------------------------------------------------------------===// @@ -184,13 +199,15 @@ struct LayoutInfoLattice : public Lattice { /// Helper Function to get the default layout for uniform values like constants. /// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1]. /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. -static LayoutInfo getDefaultSIMTLayoutInfo(unsigned rank) { +static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, + unsigned rank) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); - if (rank == 1) - return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize}), - LaneData({1})); - return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}), - LaneData({1, 1})); + if (rank == 1) { + return LayoutInfo( + xegpu::LayoutAttr::get(ctx, {xegpu::targetinfo::subgroupSize}, {1})); + } + return LayoutInfo(xegpu::LayoutAttr::get( + ctx, {1, xegpu::targetinfo::subgroupSize}, {1, 1})); } /// Helper to get the default layout for a vector type. @@ -204,7 +221,7 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (vectorTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(1); + return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1); // Packing factor is determined by the element type bitwidth. int packingFactor = 1; unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); @@ -213,13 +230,15 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth : 1; - return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize, 1}), - LaneData({1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get( + vectorTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, + {1, packingFactor})); } if (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) packingFactor = xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth; - return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}), - LaneData({1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), + {1, xegpu::targetinfo::subgroupSize}, + {1, packingFactor})); } /// Helper to get the default layout for a vector type. @@ -233,7 +252,7 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. 
if (tdescTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(1); + return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1); // Packing factor is determined by the element type bitwidth. unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth(); @@ -242,16 +261,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, bitwidth < xegpu::targetinfo::packedSizeInBitsForGatherScatter ? xegpu::targetinfo::packedSizeInBitsForGatherScatter / bitwidth : 1; - return LayoutInfo(LaneLayout({xegpu::targetinfo::subgroupSize, 1}), - LaneData({1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get( + tdescTy.getContext(), {xegpu::targetinfo::subgroupSize, 1}, + {1, packingFactor})); } int packingFactor = (bitwidth < xegpu::targetinfo::packedSizeInBitsForDefault) ? xegpu::targetinfo::packedSizeInBitsForDefault / bitwidth : 1; - return LayoutInfo(LaneLayout({1, xegpu::targetinfo::subgroupSize}), - LaneData({1, packingFactor})); + return LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), + {1, xegpu::targetinfo::subgroupSize}, + {1, packingFactor})); } /// Helper Function to get the expected layouts for DPAS operands. `lane_data` @@ -265,15 +286,17 @@ static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); - LaneLayout layout({1, xegpu::targetinfo::subgroupSize}); + SmallVector layout({1, xegpu::targetinfo::subgroupSize}); // For B operand, data must be packed in minimum `packedDpasBSizeInBits` and // must have the VNNI format. if (operandNum == 1 && elementTy.getIntOrFloatBitWidth() < xegpu::targetinfo::packedSizeInBitsForDpasB) { - LaneData data({xegpu::targetinfo::packedSizeInBitsForDpasB / - elementTy.getIntOrFloatBitWidth(), - 1}); - return LayoutInfo(layout, data); + SmallVector data( + {static_cast(xegpu::targetinfo::packedSizeInBitsForDpasB / + elementTy.getIntOrFloatBitWidth()), + 1}); + return LayoutInfo( + xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data)); } // Otherwise, return the default layout for the vector type. return getDefaultSIMTLayoutInfo(vectorTy); @@ -334,6 +357,13 @@ class LayoutInfoPropagation ArrayRef operands, ArrayRef results); + void visitVectorBroadCastOp(vector::BroadcastOp broadcast, + ArrayRef operands, + ArrayRef results); + void visitShapeCastOp(vector::ShapeCastOp shapeCast, + ArrayRef operands, + ArrayRef results); + public: LayoutInfoPropagation(DataFlowSolver &solver, SymbolTableCollection &symbolTable) @@ -393,6 +423,12 @@ LogicalResult LayoutInfoPropagation::visitOperation( .Case([&](auto reductionOp) { visitVectorMultiReductionOp(reductionOp, operands, results); }) + .Case([&](auto broadcastOp) { + visitVectorBroadCastOp(broadcastOp, operands, results); + }) + .Case([&](auto shapeCastOp) { + visitShapeCastOp(shapeCastOp, operands, results); + }) // All other ops. .Default([&](Operation *op) { for (const LayoutInfoLattice *resultInfo : results) { @@ -441,14 +477,68 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( } // Given that the result is 1D, the layout of the operand should be 2D with // default layout. - LayoutInfo operandLayout = getDefaultSIMTLayoutInfo(2); + LayoutInfo operandLayout = + getDefaultSIMTLayoutInfo(reduction->getContext(), 2); propagateIfChanged(operands[0], operands[0]->meet(operandLayout)); // Accumulator should have the same layout as the result. 
propagateIfChanged(operands[1], operands[1]->meet(resultLayout)); } -/// Propagate the layout of the result tensor to the source tensor descriptor in -/// UpdateNdOffsetOp. +void LayoutInfoPropagation::visitVectorBroadCastOp( + vector::BroadcastOp broadcast, ArrayRef operands, + ArrayRef results) { + // The layout of the result must be present. + LayoutInfo resultLayout = results[0]->getValue(); + if (!resultLayout.isAssigned()) + return; + // Only consider vector to vector broadcasts for now. + VectorType resultTy = broadcast.getResultVectorType(); + VectorType sourceTy = dyn_cast(broadcast.getSourceType()); + if (!sourceTy) { + broadcast.emitWarning("Expecting source type to be a vector type."); + return; + } + + // Only consider nD -> nD broadcast. + if (sourceTy.getRank() != resultTy.getRank()) { + broadcast.emitWarning("Expecting source and result to have same rank."); + return; + } + SetVector broadcastUnitDims = broadcast.computeBroadcastedUnitDims(); + if (broadcastUnitDims.size() != 1) { + broadcast.emitWarning("Expecting source type to be nD vector only with " + "one broadcasted dimension."); + return; + } + // Propagate the result layout to the source operand. + propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); +} + +void LayoutInfoPropagation::visitShapeCastOp( + vector::ShapeCastOp shapeCast, ArrayRef operands, + ArrayRef results) { + // The layout of the result must be present. + LayoutInfo resultLayout = results[0]->getValue(); + if (!resultLayout.isAssigned()) + return; + VectorType sourceTy = shapeCast.getSourceVectorType(); + VectorType resultTy = shapeCast.getResultVectorType(); + // Shape cast layout propagation only supports 1D -> 2D shape casts. + // TODO: Support kD -> nD shape casts (k < n, n >= 2) where expanded dims are + // unit dimensions and non-unit dims match. + if (sourceTy.getRank() != 1 || resultTy.getRank() != 2) { + shapeCast.emitWarning("Expecting shape cast to be 1D -> 2D."); + return; + } + int64_t slicedDim = resultTy.getShape()[0] == 1 ? 0 : 1; + xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get( + shapeCast->getContext(), cast(resultLayout.get()), + DenseI64ArrayAttr::get(shapeCast->getContext(), {slicedDim})); + propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout))); +} + +/// Propagate the layout of the result tensor to the source tensor descriptor +/// in UpdateNdOffsetOp. void LayoutInfoPropagation::visitUpdateNdOffsetOp( xegpu::UpdateNdOffsetOp updateNdOffset, ArrayRef operands, @@ -505,7 +595,7 @@ void LayoutInfoPropagation::visitLoadNdOp( if (auto transpose = load.getTranspose()) { load.emitWarning("Transpose effect is not expected for LoadNdOp at " "LayoutInfoPropagation stage."); - tensorDescLayout = valueLayout.getTransposedLayout(transpose.value()); + tensorDescLayout = valueLayout.transpose(transpose.value()); } // Propagate the new layout to the tensor descriptor operand. propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout)); @@ -520,8 +610,7 @@ void LayoutInfoPropagation::visitTransposeOp( LayoutInfo resultLayout = results[0]->getValue(); if (!resultLayout.isAssigned()) return; - LayoutInfo newLayout = - resultLayout.getTransposedLayout(transpose.getPermutation()); + LayoutInfo newLayout = resultLayout.transpose(transpose.getPermutation()); // Propagate the new layout to the vector operand. 
propagateIfChanged(operands[0], operands[0]->meet(newLayout)); } @@ -539,16 +628,59 @@ void LayoutInfoPropagation::visitVectorBitcastOp( bitcast.getSourceVectorType().getElementType().getIntOrFloatBitWidth(); int outElemTyBitWidth = bitcast.getResultVectorType().getElementType().getIntOrFloatBitWidth(); - - // NOTE: We do not expect widening or narrowing bitcasts at this stage. Emit - // a warning and return. - if (inElemTyBitWidth != outElemTyBitWidth) { - bitcast.emitWarning("Widening or narrowing bitcasts are not expected at " - "layout propagation stage."); + // If the element bit widths are the same, then the layout does not change. + if (inElemTyBitWidth == outElemTyBitWidth) { + propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); return; } + // Check if the result layout is valid. i.e. result vector can be distributed. + auto resultLaneLayout = resultLayout.getLaneLayout(); + auto resultLaneData = resultLayout.getLaneData(); + if (failed(xegpu::getDistributedVectorType( + bitcast.getResultVectorType(), + xegpu::LayoutAttr::get(bitcast->getContext(), resultLaneLayout, + resultLaneData)))) { + bitcast.emitWarning( + "Result vector type can not be evenly distributed across lanes."); + return; + } + int64_t rank = bitcast.getSourceVectorType().getRank(); + // Bitcast is a `narrowing` if the input element type bit width larger than + // the output element type bit width. eg. f32 -> f16 is a narrowing bitcast. + bool isNarrowing = inElemTyBitWidth > outElemTyBitWidth; + int bitCastRatio = isNarrowing ? inElemTyBitWidth / outElemTyBitWidth + : outElemTyBitWidth / inElemTyBitWidth; + SmallVector sourceLaneLayout = + resultLayout.getLaneLayout(); // Lane layout does not change for bitcast. + SmallVector outData = resultLayout.getLaneData(); + + // TODO: Currently we assume that bitcasts does not require cross lane + // communication. So each lane must own the required number of elements to + // perform the bitcast locally without cross-lane communication. + int outInnerBitsPerLane = outData[rank - 1] * outElemTyBitWidth; + if (outInnerBitsPerLane < inElemTyBitWidth) { + bitcast.emitWarning( + "Narrowing bitcast with cross lane communication is not supported."); + return; + } + // Check if each lane owns a single element in all dimensions except the + // innermost dimension. + SmallVector sourceLaneData(outData.begin(), outData.end() - 1); + if (llvm::any_of(sourceLaneData, [](int64_t d) { return d != 1; })) { + bitcast.emitWarning("Each lane must not own multiple elements in any " + "dimension other than " + "the innermost dimension."); + return; + } + // Decide lane data based on whether the bitcast is narrowing or widening. + int64_t innerMostLaneData = isNarrowing ? outData[rank - 1] / bitCastRatio + : outData[rank - 1] * bitCastRatio; + sourceLaneData.push_back(innerMostLaneData); - propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); + propagateIfChanged( + operands[0], + operands[0]->meet(LayoutInfo(xegpu::LayoutAttr::get( + bitcast->getContext(), sourceLaneLayout, sourceLaneData)))); } /// Propagate the layout of the result to the tensor descriptor, mask and offset @@ -565,7 +697,7 @@ void LayoutInfoPropagation::visitLoadGatherOp( LayoutInfo layout = getDefaultSIMTLayoutInfo(payloadTy, /*scattered*/ true); // Mask operand should have 1D default layout. - LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1); + LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1); // Propagate the new layout to the tensor descriptor operand. 
 if (isa(load.getSourceType()))
   propagateIfChanged(operands[0], operands[0]->meet(layout));
@@ -586,7 +718,7 @@ void LayoutInfoPropagation::visitCreateDescOp(
   if (!descLayout.isAssigned())
     return;
   // For offset operand propagate 1D default layout.
-  LayoutInfo layout = getDefaultSIMTLayoutInfo(1);
+  LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1);
   propagateIfChanged(operands[1], operands[1]->meet(layout));
 }
@@ -613,7 +745,8 @@ void LayoutInfoPropagation::visitStoreScatterOp(
   LayoutInfo payloadLayout =
       getDefaultSIMTLayoutInfo(payloadTy, /*scattered=*/true);
-  LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(1);
+  LayoutInfo maskLayout =
+      getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1);
   // Propagate the payload operand layout
   propagateIfChanged(operands[0], operands[0]->meet(payloadLayout));
   // Propagate the destination (if tdesc) operand layout
@@ -709,15 +842,16 @@ void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
     printFunctionResult(funcOp);
 }

-using GetLayoutFnTy = function_ref;
-/// Update an operation with the layout of its results. If the result type is a
-/// vector type, a temporary layout attribute is added to the operation. If the
-/// result type is a tensor descriptor type, the type is updated with the layout
-/// attribute. The users of the result are also updated with the layout
+using GetLayoutFnTy = function_ref;
+/// Update an operation with the layout of its results. If the result type is
+/// a vector type, a temporary layout attribute is added to the operation. If
+/// the result type is a tensor descriptor type, the type is updated with the
+/// layout attribute. The users of the result are also updated with the layout
 /// attribute.
 static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
                               GetLayoutFnTy getLayoutOfValue) {
-  // Region ops (like scf.for) are already handled by the updateControlFlowOps.
+  // Region ops (like scf.for) are already handled by the
+  // updateControlFlowOps.
   if (mlir::isa(op))
     return success();

@@ -728,7 +862,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
     if (!isa(resultType))
       continue;
     // If the result has no layout but has users, emit a warning and continue.
-    xegpu::LayoutAttr layout = getLayoutOfValue(result);
+    xegpu::DistributeLayoutAttr layout = getLayoutOfValue(result);
     if (!layout && result.getNumUses() > 0) {
       op->emitWarning("op has users but no layout assigned for its result");
       continue;
@@ -750,14 +884,14 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
 }

 /// Region ops like scf.for need special handling because they have blocks
-/// inside. If the blocks have tensor descriptor type as block arguments, thier
-/// types must be updated. Also region op can have results that may not have any
-/// users (e.g. A and B tiles). They are not assigned a layout by layout
-/// analysis because they have no users. However inside the region op
-/// corresponding block arguments for these results do have layouts. Therefore,
-/// in this case we still need to update the result types with the layout
-/// attribute. This function function updates the internal block arguments and
-/// the result types of the region op with the assigned layouts.
+/// inside. If the blocks have tensor descriptor type as block arguments,
+/// their types must be updated. Also region op can have results that may not
+/// have any users (e.g. A and B tiles). They are not assigned a layout by
+/// layout analysis because they have no users. However inside the region op
+/// corresponding block arguments for these results do have layouts.
+/// Therefore, in this case we still need to update the result types with the
+/// layout attribute. This function updates the internal block
+/// arguments and the result types of the region op with the assigned layouts.
 /// clang-format off
 /// Example: scf.for ... iter_args(...) -> (out types) {
 ///   ^bb0(block types):
 ///     ...
@@ -769,8 +903,8 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
 /// regions. One is the ^bb0 (for loop body) and the other is the scf.for op
 /// itself (yield the results). So we update both the block arguments of the
 /// successor region (i.e. block types) and the result types of the scf.for op
-/// (i.e. out types). Note that yield types are updated by respective producers
-/// inside bb0.
+/// (i.e. out types). Note that yield types are updated by respective
+/// producers inside bb0.
 static LogicalResult
 updateControlFlowOps(mlir::OpBuilder &builder,
                      mlir::RegionBranchTerminatorOpInterface terminator,
@@ -794,16 +928,16 @@ updateControlFlowOps(mlir::OpBuilder &builder,
     // We only need to operate on tensor descriptor or vector types.
     if (!isa(inputType))
       continue;
-    xegpu::LayoutAttr successorInputLayout = getLayoutOfValue(successorInput);
-    xegpu::LayoutAttr successorOperandLayout =
+    xegpu::DistributeLayoutAttr successorInputLayout =
+        getLayoutOfValue(successorInput);
+    xegpu::DistributeLayoutAttr successorOperandLayout =
         getLayoutOfValue(successorOperand);
     // If either of the layouts is not assigned, we cannot proceed.
     if (!successorOperandLayout) {
-      LLVM_DEBUG(
-          DBGS()
-          << "No layout assigned for forwarded operand in branch terminator: "
-          << successorOperand << "\n");
+      LLVM_DEBUG(DBGS() << "No layout assigned for forwarded operand in "
+                           "branch terminator: "
+                        << successorOperand << "\n");
       return failure();
     }
     // We expect the layouts to match.
@@ -843,7 +977,7 @@ static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder,
     newArgTypes.push_back(argType);
     if (!isa(argType))
       continue;
-    xegpu::LayoutAttr layout = getLayoutOfValue(arg);
+    xegpu::DistributeLayoutAttr layout = getLayoutOfValue(arg);
     if (!layout) {
       LLVM_DEBUG(DBGS() << "Expecting layout for function argument: " << arg
                         << " but got none.\n");
@@ -885,13 +1019,13 @@ void XeGPUPropagateLayoutPass::runOnOperation() {
     return;
   }
   // Helper to convert LayoutInfo to xegpu::LayoutAttr.
-  auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
+  auto getXeGPULayoutForValue = [&](Value val) -> xegpu::DistributeLayoutAttr {
     LayoutInfo layout = analysis.getLayoutInfo(val);
     if (!layout.isAssigned())
       return {};
-    return xegpu::LayoutAttr::get(
-        val.getContext(), llvm::to_vector_of(layout.getLayoutAsArrayRef()),
-        llvm::to_vector_of(layout.getDataAsArrayRef()));
+    if (layout.isSliceLayout())
+      return cast(layout.get());
+    return cast(layout.get());
   };

   mlir::OpBuilder builder(&getContext());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 21c1583bf2633..449b8eb030b07 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -147,13 +147,29 @@ static Value resolveDistributedTy(Value orig, T expected,
 /// Helper function to check if the layout is packed. Layout is packed if it is
 /// 2D and lane_data[0] != 1 (data packed from col dimension).
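+/// For instance (an illustration derived from the DPAS B-operand layout
+/// computed by getSIMTLayoutInfoForDPASOperand, not an exhaustive rule): a
+/// 16-bit B operand typically gets lane_layout = [1, 16] with
+/// lane_data = [2, 1]; since lane_data[0] != 1, the corresponding load must
+/// use the packed (VNNI) form.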
-static bool hasPackedLayout(xegpu::LayoutAttr layout) { - if (layout == xegpu::LayoutAttr()) +/// TODO: Move to target info. +static bool requirePacked(const xegpu::LayoutAttr layout) { + if (!layout) return false; - DenseI32ArrayAttr laneData = layout.getLaneData(); - if (!laneData || laneData.size() != 2) + auto laneData = layout.getEffectiveLaneDataAsInt(); + if (laneData.size() != 2) return false; - return laneData.asArrayRef()[0] != 1; + return laneData[0] != 1; +} + +/// Helper function to check if the layout requires a transpose effect. +static bool requireTranspose(const xegpu::LayoutAttr layout, + const std::string &chipStr) { + // Return false for unsupported targets. + // TODO: Add more support or move to target info. + if (chipStr != "pvc" && chipStr != "bmg") + return false; + if (!layout) + return false; + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + if (laneLayout.size() != 2) + return false; + return laneLayout[0] == xegpu::targetinfo::subgroupSize && laneLayout[1] == 1; } /// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body @@ -467,7 +483,14 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { warpOp, "warp result is not a xegpu::LoadNd op"); auto loadOp = operand->get().getDefiningOp(); - + // Chip information is required to decide if the layout requires transpose + // effect. + auto chipStr = xegpu::getChipStr(loadOp); + if (!chipStr) + return rewriter.notifyMatchFailure( + loadOp, + "xegpu::LoadNdOp require chip information to determine transpose " + "requirement"); int64_t offsetSize = static_cast(loadOp.getOffsets().size()); if ((offsetSize != 0) || loadOp.getConstOffsetsAttr()) return failure(); @@ -507,7 +530,11 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { loadOp->getAttrs()); xegpu::removeLayoutAttrs(newLoadOp); // Set the packed attribute if the layout requires it. - newLoadOp.setPacked(hasPackedLayout(layout)); + newLoadOp.setPacked(requirePacked(layout)); + // Set the transpose attribute if the layout requires it. + if (requireTranspose(layout, chipStr.value())) + newLoadOp.setTranspose( + DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0})); Value distributedVal = newWarpOp.getResult(operandIdx); // There can be a conflict between the vector type distributed by the // warp op and (xegpu-specific) distributed type supported by the load @@ -1276,6 +1303,142 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { } }; +/// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an +/// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op +/// outside of the warp op. 
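+/// A minimal sketch of the intended rewrite (illustrative only; the types,
+/// value names and warp size below are assumptions, not taken from a test):
+///   %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) {
+///     %p = memref.extract_aligned_pointer_as_index %src : memref<256xf16> -> index
+///     gpu.yield %p : index
+///   }
+/// becomes (the source memref is yielded from the warp op and the extract op
+/// is re-created after it):
+///   %w = gpu.warp_execute_on_lane_0(%laneid)[16] -> (memref<256xf16>) {
+///     gpu.yield %src : memref<256xf16>
+///   }
+///   %r = memref.extract_aligned_pointer_as_index %w : memref<256xf16> -> index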
+struct MemrefExtractAlignedPointerAsIndexDistribution final + : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = getWarpResult( + warpOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure( + warpOp, + "warp result is not a memref::MemrefExtractAlignedPointerAsIndex op"); + auto extractOp = + operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, extractOp.getSource(), + TypeRange{extractOp.getSource().getType()}, newRetIndices); + rewriter.setInsertionPointAfter(newWarpOp); + auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create( + rewriter, newWarpOp.getLoc(), extractOp.getType(), + newWarpOp.getResult(newRetIndices[0])); + Value distributedVal = newWarpOp.getResult(operandIdx); + rewriter.replaceAllUsesWith(distributedVal, newExtractOp.getResult()); + return success(); + } +}; + +/// Distribute a vector::BitCastOp feeding into yield op of an enclosing +/// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost +/// diemension of the source/result vectors. Equivalent vector::BitCastOp is +/// created outside of the warp op with distributed source vector type (computed +/// using assigned layout). +struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = + getWarpResult(warpOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure( + warpOp, "warp result is not a vector::BitCast op"); + auto bitcastOp = operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + VectorType distributedSourceType = + getDistVecTypeBasedOnLaneLayout( + xegpu::getDistributeLayoutAttr(bitcastOp.getSource()), + bitcastOp.getSourceVectorType()) + .value_or(VectorType()); + if (!distributedSourceType) + return rewriter.notifyMatchFailure( + bitcastOp, "Failed to distribute the source vector type in " + "vector::BitCast op"); + VectorType distributedResultType = + cast(warpOp.getResult(operandIdx).getType()); + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, bitcastOp.getSource(), + TypeRange{distributedSourceType}, newRetIndices); + rewriter.setInsertionPointAfter(newWarpOp); + auto newBitcastOp = vector::BitCastOp::create( + rewriter, newWarpOp.getLoc(), distributedResultType, + newWarpOp.getResult(newRetIndices[0])); + Value distributedVal = newWarpOp.getResult(operandIdx); + rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult()); + return success(); + } +}; + +/// Distribute a vector::TransposeOp feeding into yield op of an enclosing +/// `gpu.warp_execute_on_lane_0` region. Currently only 2D transposes are +/// supported. In most cases, transpose is a no op because it is entirely +/// handled using the layouts (e.g. 16x1 -> 1x16). However, if each lane owns +/// multiple slices of data after distribution (e.g. 16x2 -> 2x16), a lane-local +/// transpose (i.e. shuffle) is needed. 
Therefore, we create an equivalent +/// vector::TransposeOp outside of the warp op with distributed source vector +/// type (computed using assigned layout). +struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *operand = + getWarpResult(warpOp, llvm::IsaPred); + if (!operand) + return rewriter.notifyMatchFailure( + warpOp, "warp result is not a vector::Transpose op"); + auto transposeOp = operand->get().getDefiningOp(); + unsigned operandIdx = operand->getOperandNumber(); + xegpu::DistributeLayoutAttr sourceLayout = + xegpu::getDistributeLayoutAttr(transposeOp.getVector()); + xegpu::DistributeLayoutAttr resultLayout = + xegpu::getDistributeLayoutAttr(transposeOp.getResult()); + if (!sourceLayout || !resultLayout) + return rewriter.notifyMatchFailure( + transposeOp, + "the source or result vector of the transpose op lacks layout " + "attribute"); + int64_t sourceRank = transposeOp.getSourceVectorType().getRank(); + int64_t resultRank = transposeOp.getResultVectorType().getRank(); + // Only 2D transposes are supported for now. + // TODO: Support nD transposes. + if (sourceRank != 2 || resultRank != 2) + return rewriter.notifyMatchFailure( + transposeOp, "the source or result vector of the transpose op " + "does not have 2D layout"); + ArrayRef perm = transposeOp.getPermutation(); + // Result layout must be a transpose of source layout. + if (!resultLayout.isTransposeOf(sourceLayout, perm)) + return rewriter.notifyMatchFailure( + transposeOp, + "the source or result vector layouts must be 2D transposes of each " + "other"); + FailureOr distributedSourceTypeOrFailure = + getDistVecTypeBasedOnLaneLayout(sourceLayout, + transposeOp.getSourceVectorType()); + if (failed(distributedSourceTypeOrFailure)) + return rewriter.notifyMatchFailure( + transposeOp, "Failed to distribute the source vector type in " + "vector::Transpose op"); + SmallVector newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, transposeOp.getVector(), + TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices); + rewriter.setInsertionPointAfter(newWarpOp); + auto newTransposeOp = vector::TransposeOp::create( + rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]), + perm); + Value distributedVal = newWarpOp.getResult(operandIdx); + rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult()); + return success(); + } +}; + } // namespace namespace { @@ -1297,7 +1460,9 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( .add( + LoadDistribution, StoreDistribution, VectorTransposeDistribution, + VectorBitcastDistribution, + MemrefExtractAlignedPointerAsIndexDistribution>( patterns.getContext(), /*pattern benefit=*/regularPatternBenefit); patterns.add( @@ -1406,9 +1571,23 @@ void XeGPUSubgroupDistributePass::runOnOperation() { return; } - // Step 4: Finllay, clean up UnrealizedConversionCastOps that were inserted + // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted // due to tensor desc type mismatches created by using upstream distribution - // patterns (scf.for) + // patterns (scf.for). 
This cleanup should only be done if all the ops are
+  // distributed successfully; if some ops are still not distributed and remain
+  // inside any WarpExecuteOnLane0Op, we skip this simplification step to avoid
+  // breaking the IR.
+  bool foundWarpOp = false;
+  getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
+    // Look for WarpOps that are not trivially dead.
+    if (isOpTriviallyDead(warpOp))
+      return WalkResult::advance();
+    foundWarpOp = true;
+    return WalkResult::interrupt();
+  });
+  if (foundWarpOp)
+    return;
+
   getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
     // We are only interested in UnrealizedConversionCastOps there were added
     // for resolving SIMT type mismatches.
@@ -1427,7 +1606,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
          "Unrealized conversion cast must have tensor descriptor types");

     // tensor_desc -> tensor_desc Type of conversions.
-    // This occurs iside scf.for body to resolve the block argument type to
+    // This occurs inside scf.for body to resolve the block argument type to
     // SIMT type.
     if (inputDescType.getLayout()) {
       auto argument = mlir::dyn_cast(input);
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index cba3f0bd690c3..30f785ded975a 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -198,9 +198,14 @@ func.func @scatter_ops(%src: memref<256xf16>) {
 // -----
 // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16(
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xi16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xi16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout} : vector<8x16xi16> to vector<8x16xf16>
-// CHECK: %{{.*}} = vector.bitcast %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xi16> to vector<16x16xf16>
+// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout}
+// CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout> -> vector<8x16xi16>
+// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout}
+// CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout> -> vector<16x16xi16>
+// CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout}
+// CHECK-SAME: vector<8x16xi16> to vector<8x16xf16>
+// CHECK: %{{.*}} = vector.bitcast %[[LOAD1]] {layout_result_0 = #xegpu.layout}
+// CHECK-SAME: vector<16x16xi16> to vector<16x16xf16>
 func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x16xi16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
@@ -215,6 +220,58 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1
   return
 }

+// -----
+// CHECK-LABEL: func.func @vector_bitcast_i32_to_f16(
+// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout}
+// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32>
+// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout}
+// CHECK-SAME: vector<16x8xi32> to vector<16x16xf16>
+func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32> -> 
!xegpu.tensor_desc<16x8xi32> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x8xi32> -> vector<16x8xi32> + %4 = vector.bitcast %3 : vector<16x8xi32> to vector<16x16xf16> + %5 = vector.transpose %4, [1, 0] : vector<16x16xf16> to vector<16x16xf16> + %6 = xegpu.dpas %2, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + return +} + +// ----- +// CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout> -> vector<8x32xi16> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<8x32xi16> to vector<8x16xi32> +func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16xi32>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xi16> -> vector<8x32xi16> + %3 = vector.bitcast %2 : vector<8x32xi16> to vector<8x16xi32> + xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> + return +} + +// ----- +// CHECK-LABEL: func.func @vector_bitcast_require_cross_lane_shuffle( +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> +// CHECK: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<8x16xi32> to vector<8x32xi16> +func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %arg1: memref<8x32xi16>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> + %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16> + %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> + %3 = vector.bitcast %2 : vector<8x16xi32> to vector<8x32xi16> + xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16> + return +} + + // ----- // CHECK-LABEL: func.func @binary_op_one_use( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, @@ -432,7 +489,7 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ } // ----- -// CHECK-LABEL: func.func @test_scf_while_and_condition( +// CHECK-LABEL: func.func @scf_while_and_condition( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { // CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) // CHECK-SAME: -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) { @@ -441,7 +498,7 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ // CHECK-NEXT: ^bb0(%{{.*}}: vector<16xf32>, %{{.*}}: i32, %{{.*}}: !xegpu.tensor_desc<16xf32, #xegpu.layout>): // CHECK: scf.yield {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout> // CHECK-NEXT: } attributes {layout_result_0 = #xegpu.layout} -func.func @test_scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) { +func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) { %c0 = arith.constant 0 : i32 %c16 = arith.constant 16 : 
i32 %c256 = arith.constant 256 : i32 @@ -463,3 +520,46 @@ func.func @test_scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<25 } return } + +// ----- +// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[REDUCE]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<16xf16> to vector<1x16xf16> +func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.0000> : vector<16xf16> + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = vector.multi_reduction , %3, %cst [0] : vector<16x16xf16> to vector<16xf16> + %2 = vector.shape_cast %4 : vector<16xf16> to vector<1x16xf16> + %5 = vector.broadcast %2 : vector<1x16xf16> to vector<16x16xf16> + xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + return +} + +// ----- +// CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] +// CHECK-SAME: vector<16x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[REDUCE]] {layout_result_0 = #xegpu.layout} +// CHECK-SAME: vector<16xf16> to vector<16x1xf16> +func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.0000> : vector<16xf16> + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = vector.multi_reduction , %3, %cst [1] : vector<16x16xf16> to vector<16xf16> + %2 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16> + %5 = vector.broadcast %2 : vector<16x1xf16> to vector<16x16xf16> + xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + return +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 30ca9816df5bc..13b0ed176eb0c 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -1,6 +1,8 @@ -// RUN: mlir-opt -xegpu-subgroup-distribute -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \ +// RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s -// RUN: mlir-opt 
-xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \ +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ +// RUN: -xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \ // RUN: -canonicalize -cse -split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION // CHECK-LABEL: gpu.func @store_nd_1d @@ -9,7 +11,7 @@ // CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> // CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> // CHECK: gpu.return -gpu.module @test { +gpu.module @xevm_module{ gpu.func @store_nd_1d(%arg0: memref<16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> @@ -25,7 +27,7 @@ gpu.module @test { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> // CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf16> @@ -44,7 +46,7 @@ gpu.module @test { // CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> // CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> @@ -62,7 +64,7 @@ gpu.module @test { // CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> @@ -83,7 +85,7 @@ gpu.module @test { // CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> // CHECK: xegpu.store_nd %[[T5]], %[[T4]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @test { +gpu.module @xevm_module{ gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> @@ -105,7 +107,7 @@ gpu.module @test { // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> // CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, 
!xegpu.tensor_desc<8x16xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout>
@@ -133,7 +135,7 @@ gpu.module @test {
 // CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
 // CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK: xegpu.store_nd %[[T8]], %[[T7]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout>
@@ -157,7 +159,7 @@ gpu.module @test {
 // CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
 // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16>
 // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
@@ -191,7 +193,7 @@ gpu.module @test {
 // CHECK-NEXT: }
 // CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
 // CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
   %c0 = arith.constant 0 : index
   %c16 = arith.constant 16 : index
@@ -223,7 +225,7 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32>
 // CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32>
 // CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<1xf32>, !xegpu.tensor_desc<16xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>) {
   %c0 = arith.constant 0 : index
   %c32 = arith.constant 32 : index
@@ -242,7 +244,7 @@ gpu.module @test {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32>
 // CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32>
 // CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<16xf32>, !xegpu.tensor_desc<16x16xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>) {
   %c0 = arith.constant 0 : index
   %c32 = arith.constant 32 : index
@@ -259,7 +261,7 @@ gpu.module @test {
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
 // CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
@@ -279,7 +281,7 @@ gpu.module @test {
 // CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]] :
 // CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout> to !xegpu.tensor_desc<16x16xf32> {resolve_simt_type_mismatch}
 // CHECK: xegpu.update_nd_offset %[[T1]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @check_update_nd_offset_distributed_tensor_desc() {
   %c32 = arith.constant 32 : index
   %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf32>
@@ -295,7 +297,7 @@ gpu.module @test {
 // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
 // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
 // CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @prefetch_1d(%arg0: memref<256xf16>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout>
@@ -311,7 +313,7 @@ gpu.module @test {
 // CHECK-NEXT: gpu.barrier
 // CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16>
 // CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf16>, !xegpu.tensor_desc<16xf16>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout>
@@ -337,7 +339,7 @@ gpu.module @test {
 // CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32>
 // CHECK-NEXT: %[[RED1:.*]] = vector.reduction , %[[CAST1]], %{{.*}} : vector<16xf32> into f32
 // CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() {
   %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout>
   %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x32xf32>)
@@ -363,7 +365,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() {
 // CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout>, f32, f32
 // CHECK-REDUCTION-NEXT: }
 // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() {
   %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout>
   %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<2x16xf32>)
@@ -390,7 +392,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() {
 // CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32>
 // CHECK-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32
 // CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() {
   %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout>
   %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<32x16xf32>)
@@ -418,7 +420,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() {
 // CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout>, f32, f32
 // CHECK-REDUCTION-NEXT: }
 // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() {
   %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout>
   %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x2xf32>)
@@ -439,7 +441,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() {
 // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
 // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
 // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) {
   %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1>
   %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex>
@@ -464,7 +466,7 @@ gpu.module @test {
 // CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16>
 // CHECK-NEXT: }
 // CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) {
   %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1>
   %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex>
@@ -493,7 +495,7 @@ gpu.module @test {
 // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
 // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
 // CHECK-NEXT: }
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) {
   %pred = llvm.mlir.poison : i1
   %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1>
@@ -514,7 +516,7 @@ gpu.module @test {
 // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
 // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
 // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
-gpu.module @test {
+gpu.module @xevm_module{
 gpu.func @scatter_ops(%src: memref<256xf16>) {
   %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1>
   %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex>
@@ -525,3 +527,98 @@ gpu.module @test {
   gpu.return
 }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
+// CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index
+gpu.module @xevm_module{
+  gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16>
+    %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index
+    %ptr_i64 = arith.index_cast %ptr : index to i64
+    %tdesc = xegpu.create_nd_tdesc %ptr_i64[%c0], shape: [16], strides: [16] : i64
+      -> !xegpu.tensor_desc<16xf16, #xegpu.layout>
+    xegpu.store_nd %cst, %tdesc : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout>
+    gpu.return
+  }
+}
+
+
+// -----
+// CHECK-LABEL: gpu.func @vector_transpose(
+// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32>
+// CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32>
+// CHECK: xegpu.store_nd %[[CST]], %[[DEST]] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32>
+gpu.module @xevm_module{
+  gpu.func @vector_transpose(%arg0: memref<2x16xf32>) {
+    %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00>
+      : vector<16x2xf32>
+    %c0 = arith.constant 0 : index
+    %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout}
+      : vector<16x2xf32> to vector<2x16xf32>
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<2x16xf32>
+      -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout>
+    xegpu.store_nd %transpose, %0 : vector<2x16xf32>,
+      !xegpu.tensor_desc<2x16xf32, #xegpu.layout>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_bitcast(
+// CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16>
+// CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16>
+// CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16>
+// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16>
+gpu.module @xevm_module{
+  gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) {
+    %cst = "some_op"() {layout_result_0 = #xegpu.layout}
+      : () -> (vector<4x32xi8>)
+    %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout}
+      : vector<4x32xi8> to vector<4x16xi16>
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x16xi16>
+      -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout>
+    xegpu.store_nd %bitcast, %0 : vector<4x16xi16>,
+      !xegpu.tensor_desc<4x16xi16, #xegpu.layout>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @mma_transpose_b(
+// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
+// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32>
+// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32>
+// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16>
+// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16>
+// CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+gpu.module @xevm_module{
+  gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
+    %c0 = arith.constant 0 : index
+    %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16>
+      -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout>
+    %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout}
+      : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16>
+    %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32>
+      -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout>
+    %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout}
+      : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32>
+    %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout}
+      : vector<16x8xi32> to vector<16x16xf16>
+    %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout}
+      : vector<16x16xf16> to vector<16x16xf16>
+    %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout}
+      : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+    %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32>
+      -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout>
+    xegpu.store_nd %6, %7 : vector<8x16xf32>,
+      !xegpu.tensor_desc<8x16xf32, #xegpu.layout>
+    gpu.return
+
+  }
+}