diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 7ca1b957bbd01..6faa25cf49df9 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -287,13 +287,10 @@ void XeGPUBlockingPass::runOnOperation() { MLIRContext *ctx = &getContext(); Operation *op = getOperation(); - // TODO-LayoutRefactor: unify the local propagation for layout preprocessing - // replace the function with recoverTemporaryLayouts - // if (!xegpu::recoverTemporaryLayouts(op)) { - // signalPassFailure(); - // return; - // } - xegpu::recoverTemporaryLayoutsDeprecated(op); + if (!xegpu::recoverTemporaryLayouts(op)) { + signalPassFailure(); + return; + } auto getTileShapeAndCount = [](llvm::ArrayRef shape, xegpu::LayoutAttr layout) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp index 738ff3f8aa915..a4e47fca96d34 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp @@ -89,6 +89,10 @@ bool xegpu::recoverTemporaryLayouts(Operation *rootOp) { // Layouts are needed for vector type only. if (!isa(operand.get().getType())) continue; + // Skip block arguments since they don't have defining ops to attach + // layout attributes to + if (isa(operand.get())) + continue; auto layout = xegpu::getDistributeLayoutAttr(operand.get()); if (!layout) { op->emitWarning("Could not find layout attribute for operand ") diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index f37d25108dbcb..cd6bc9ac4b8e0 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -511,8 +511,6 @@ struct WgToSgVectorBroadcastOp for (auto operand : adaptor.getOperands().front()) { auto newBroadcast = vector::BroadcastOp::create(rewriter, op.getLoc(), newResultType, operand); - xegpu::setTemporaryLayout(newBroadcast->getResult(0), - layout.dropSgLayoutAndData()); newBroadcastOps.push_back(newBroadcast.getResult()); } @@ -564,10 +562,9 @@ struct WgToSgElementwiseOp : public ConversionPattern { OperationState state(op->getLoc(), op->getName()); state.addOperands(opOperands); state.addTypes(newResultType); - // Copy all attributes, but update "layout_result_0" to drop - // sgLayout/sgData - state.addAttributes(xegpu::dropSgLayoutAndDataOnAttrs(op->getAttrs())); + state.addAttributes(op->getAttrs()); Operation *newOp = rewriter.create(state); + xegpu::removeLayoutAttrs(newOp); newResults.push_back(newOp->getResult(0)); } @@ -749,24 +746,17 @@ struct WgToSgArithConstantOp : public OpConversionPattern { Location loc = op.getLoc(); auto eltType = vecType.getElementType(); - auto setLayout = [&](Value val) { - xegpu::setTemporaryLayout(llvm::dyn_cast(val), - layout.dropSgLayoutAndData()); - }; - if (vecAttr.isSplat()) { // Splat: single value for all subgroups Attribute singleVal = vecAttr.getSplatValue(); auto sgAttr = DenseElementsAttr::get(newType, singleVal); auto cstOp = arith::ConstantOp::create(rewriter, loc, newType, sgAttr); - setLayout(cstOp->getResult(0)); rewriter.replaceOp(op, cstOp); return success(); } else if (sgShape == wgShape) { // if the entire vector is shared by all // subgroups, don't distribute auto newConstOp = arith::ConstantOp::create(rewriter, op.getLoc(), vecType, vecAttr); - setLayout(newConstOp->getResult(0)); rewriter.replaceOp(op, newConstOp); return success(); } else { @@ -868,9 +858,6 @@ struct WgToSgArithConstantOp : public OpConversionPattern { rewriter, loc, baseConstVec.getType(), mulOffset); auto finalConst = arith::AddIOp::create(rewriter, loc, baseConstVec, bcastOffset); - setLayout(baseConstVec); - setLayout(bcastOffset); - setLayout(finalConst); newConstOps.push_back(finalConst); } rewriter.replaceOpWithMultiple(op, {newConstOps}); @@ -926,7 +913,6 @@ struct WgToSgLoadGatherOpWithOffset rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr, op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), newLayout); - newLoadOp.setAnchorLayout(newLayout); newLoadOps.push_back(newLoadOp); } rewriter.replaceOpWithMultiple(op, {newLoadOps}); @@ -972,17 +958,10 @@ struct WgToSgStoreScatterOpWithOffset auto chunkSizeAttr = rewriter.getI64IntegerAttr(chunkSize); for (auto [val, offs, mask] : llvm::zip( adaptor.getValue(), adaptor.getOffsets(), adaptor.getMask())) { - auto store = xegpu::StoreScatterOp::create( - rewriter, loc, val, op.getDest(), offs, mask, chunkSizeAttr, - op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), - layout.dropSgLayoutAndData()); - // Update the layout attribute to drop sg_layout and sg_data. - for (OpOperand &operand : store->getOpOperands()) { - // Skip for operand one (memref) - if (operand.getOperandNumber() == 1) - continue; - xegpu::setTemporaryLayout(operand, layout.dropSgLayoutAndData()); - } + xegpu::StoreScatterOp::create(rewriter, loc, val, op.getDest(), offs, + mask, chunkSizeAttr, op.getL1HintAttr(), + op.getL2HintAttr(), op.getL3HintAttr(), + layout.dropSgLayoutAndData()); } rewriter.eraseOp(op); return success(); @@ -1074,12 +1053,6 @@ struct WgToSgVectorStepOp : public OpConversionPattern { vector::BroadcastOp::create(rewriter, loc, newTy, offsets[0]); auto finalSteps = arith::AddIOp::create(rewriter, loc, steps, bcastOffset); - xegpu::setTemporaryLayout(steps->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setTemporaryLayout(bcastOffset->getResult(0), - layout.dropSgLayoutAndData()); - xegpu::setTemporaryLayout(finalSteps->getResult(0), - layout.dropSgLayoutAndData()); newOps.push_back(finalSteps); } @@ -1150,8 +1123,6 @@ struct WgToSgVectorShapeCastOp for (auto src : adaptor.getSource()) { auto newShapeCast = vector::ShapeCastOp::create(rewriter, op.getLoc(), newResultType, src); - xegpu::setTemporaryLayout(newShapeCast->getResult(0), - layout.dropSgLayoutAndData()); newShapeCastOps.push_back(newShapeCast.getResult()); } @@ -1386,9 +1357,6 @@ struct WgToSgMultiDimReductionOp for (auto localResult : localReductions) { auto finalResult = vector::makeArithReduction( rewriter, loc, op.getKind(), localResult, adaptor.getAcc()[0]); - if (auto defOp = finalResult.getDefiningOp()) - xegpu::setDistributeLayoutAttr(defOp->getResult(0), - layout.dropSgLayoutAndData()); results.push_back(finalResult); } rewriter.replaceOpWithMultiple(op, {results}); @@ -1525,10 +1493,6 @@ struct WgToSgMultiDimReductionOp auto finalResult = vector::makeArithReduction( rewriter, loc, op.getKind(), finalReduce.getResult(), accToAdd); - if (auto defOp = finalResult.getDefiningOp()) - xegpu::setDistributeLayoutAttr(defOp->getResult(0), - layout.dropSgLayoutAndData()); - rewriter.replaceOp(op, finalResult); return success(); } @@ -1588,8 +1552,6 @@ struct WgToSgVectorTransposeOp for (auto src : adaptor.getVector()) { auto newTranspose = vector::TransposeOp::create( rewriter, op.getLoc(), newResultType, src, permutation); - xegpu::setTemporaryLayout(newTranspose->getResult(0), - layout.dropSgLayoutAndData()); newTransposeOps.push_back(newTranspose.getResult()); } @@ -1658,8 +1620,6 @@ struct WgToSgVectorMaskOp : public OpConversionPattern { auto newCreateMaskOp = vector::CreateMaskOp::create(rewriter, loc, resultType, maskOperands); - xegpu::setTemporaryLayout(newCreateMaskOp->getResult(0), - layout.dropSgLayoutAndData()); newCreateMaskOps.push_back(newCreateMaskOp.getResult()); } @@ -1700,12 +1660,11 @@ struct XeGPUWgToSgDistributePass void XeGPUWgToSgDistributePass::runOnOperation() { - // TODO-LayoutRefactor: unify the local propagation for layout preprocessing - // Operation *op = getOperation(); - // if (!xegpu::recoverTemporaryLayouts(op)) { - // signalPassFailure(); - // return; - // } + Operation *op = getOperation(); + if (!xegpu::recoverTemporaryLayouts(op)) { + signalPassFailure(); + return; + } // Track existing UnrealizedConversionCastOps SmallVector existingCastOps; @@ -1888,22 +1847,4 @@ void XeGPUWgToSgDistributePass::runOnOperation() { if (failed( applyPartialConversion(getOperation(), target, std::move(patterns)))) return signalPassFailure(); - - // Remove sg_layout and sg_data attributes from the Layout - // attribute for each VectorType result of the operation. - // For Structured Control Flow ops, the layout is simply removed, - // since in 1:N case, the layout for new results are missing. - // Layout propagation pass will activated. - getOperation()->walk([](Operation *op) { - for (OpResult result : op->getOpResults()) { - std::string name = xegpu::getTemporaryLayoutName(result); - if (auto layout = op->getAttrOfType(name)) { - op->removeAttr(name); - if (!isa(op)) { - if (auto newLayout = layout.dropSgLayoutAndData()) - op->setAttr(name, newLayout); - } - } - } - }); } diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index 0b6e30e6f95f0..68f6e8e1ec955 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -34,7 +34,9 @@ gpu.module @test_kernel { %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b> scf.yield %a_next_tdesc, %b_next_tdesc, %c : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32> - } + } {layout_result_0 = #a, + layout_result_1 = #b, + layout_result_2 = #c} //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> gpu.return @@ -75,7 +77,9 @@ gpu.module @test_kernel { %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #l2> scf.yield %a_next_tdesc, %b_next_tdesc, %c : !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32> - } + } {layout_result_0 = #l1, + layout_result_1 = #l2, + layout_result_2 = #l1} //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1> gpu.return @@ -118,7 +122,9 @@ gpu.module @test_kernel { %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<16x32xf16, #l2> scf.yield %a_next_tdesc, %b_next_tdesc, %c : !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32> - } + } {layout_result_0 = #l1, + layout_result_1 = #l2, + layout_result_2 = #l1} //CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1> gpu.return @@ -162,7 +168,9 @@ gpu.module @test_kernel { %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b> scf.yield %a_next_tdesc, %b_next_tdesc, %c : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32> - } + } {layout_result_0 = #a, + layout_result_1 = #b, + layout_result_2 = #c} //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> gpu.return @@ -252,7 +260,7 @@ gpu.module @test_kernel { #r = #xegpu.layout gpu.module @test_kernel { gpu.func @reduce_dim_0(%a: memref<16x512xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %acc = arith.constant dense<0.0> : vector<64xf32> + %acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<64xf32> %c64 = arith.constant 64 : index %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c64 : index @@ -274,7 +282,7 @@ gpu.module @test_kernel { gpu.func @reduce_dim_1(%a: memref<512x32xf32>, %b: memref<512xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c1 = arith.constant 1 : index %c32 = arith.constant 32 : index - %acc = arith.constant dense<0.0> : vector<32xf32> + %acc = arith.constant {layout_result_0 = #r} dense<0.0> : vector<32xf32> %block_id_x = gpu.block_id x %block_id_y = gpu.block_id y @@ -324,7 +332,7 @@ gpu.module @test_kernel { %m = arith.muli %block_id_x, %c32 : index %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r> %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32> - %11 = vector.shape_cast %1 : vector<32xf32> to vector<32x1xf32> + %11 = vector.shape_cast %1 {layout_result_0 = #l} : vector<32xf32> to vector<32x1xf32> // CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32> %2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32> %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<32x64xf32, #l> @@ -358,7 +366,7 @@ gpu.module @test_kernel { gpu.func @test_vector_constant_mask(%src: ui64, %dst: ui64) { //CHECK: arith.constant dense : vector<16xi1> %mask = vector.constant_mask [32] {layout_result_0 = #l} : vector<32xi1> - %cst = arith.constant dense<[ + %cst = arith.constant {layout_result_0 = #l} dense<[ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, @@ -377,7 +385,7 @@ gpu.module @test_kernel { %c16 = arith.constant 16 : index //CHECK-COUNT-2: vector.create_mask {{.*}} : vector<16xi1> %mask = vector.create_mask %c16 {layout_result_0 = #l} : vector<32xi1> - %cst = arith.constant dense<[ + %cst = arith.constant {layout_result_0 = #l} dense<[ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, @@ -418,7 +426,7 @@ gpu.module @test_kernel { gpu.func @test_prefetch_load_store_update(%src: ui64) { - %cst = arith.constant dense<[ + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, @@ -428,7 +436,7 @@ gpu.module @test_kernel { %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> xegpu.prefetch %tdesc {layout = #xegpu.layout}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - %delta = arith.constant dense<[ + %delta = arith.constant {layout_result_0 = #xegpu.layout} dense<[ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 128, 128, 128, 128, 128, 128, 128, 128, @@ -438,11 +446,11 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xindex> %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> + %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout} : vector<32xi1> %ld_vec = xegpu.load %new_tdesc, %mask {layout = #xegpu.layout}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> -> vector<32xf32> - %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32> + %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #xegpu.layout} : vector<32xf32> xegpu.store %st_vec, %tdesc, %mask {layout = #xegpu.layout}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, @@ -465,7 +473,7 @@ gpu.module @test_kernel { gpu.func @test_prefetch_load_store_update_chunk(%src: ui64) { - %cst = arith.constant dense<[ + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, @@ -475,7 +483,7 @@ gpu.module @test_kernel { %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> xegpu.prefetch %tdesc {layout = #xegpu.layout}: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %delta = arith.constant dense<[ + %delta = arith.constant {layout_result_0 = #xegpu.layout} dense<[ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 128, 128, 128, 128, 128, 128, 128, 128, @@ -485,11 +493,11 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xindex> %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> + %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout} : vector<32xi1> %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> -> vector<32x4xf32> - %st_vec = arith.addf %ld_vec, %ld_vec : vector<32x4xf32> + %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #xegpu.layout} : vector<32x4xf32> xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}>: vector<32x4xf32>, !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, @@ -521,7 +529,7 @@ gpu.module @test_kernel { gpu.func @test_3d_scattered_tensor_desc(%src: ui64) { - %cst = arith.constant dense<[ + %cst = arith.constant {layout_result_0 = #l} dense<[ [0, 8, 16, 24, 32, 40, 48, 56], [64, 72, 80, 88, 96, 104, 112, 120], [128, 136, 144, 152, 160, 168, 176, 184], @@ -531,7 +539,7 @@ gpu.module @test_kernel { %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<4x8xindex> -> !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> xegpu.prefetch %tdesc {layout = #l}: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> - %delta = arith.constant dense<[ + %delta = arith.constant {layout_result_0 = #l} dense<[ [32, 32, 32, 32, 32, 32, 32, 32], [32, 32, 32, 32, 32, 32, 32, 64], [128, 128, 128, 128, 128, 128, 128, 128], @@ -541,7 +549,7 @@ gpu.module @test_kernel { : !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xindex> %c4 = arith.constant 4: index - %mask = vector.create_mask %c4, %c4: vector<4x8xi1> + %mask = vector.create_mask %c4, %c4 {layout_result_0 = #l}: vector<4x8xi1> %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #l}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xi1> -> vector<4x8x4xf32> @@ -643,7 +651,7 @@ gpu.module @test_kernel { // CHECK-LABEL: load_with_offsets // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32> gpu.func @load_with_offsets(%src: ui64) -> vector<32xf32> { - %cst = arith.constant dense<[ + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, @@ -651,7 +659,7 @@ gpu.module @test_kernel { ]> : vector<32xindex> %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> + %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout}: vector<32xi1> %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> gpu.return %ld : vector<32xf32> @@ -663,7 +671,7 @@ gpu.module @test_kernel { // CHECK-LABEL: store_with_offsets // CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1> gpu.func @store_with_offsets(%src: ui64) { - %cst = arith.constant dense<[ + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, @@ -671,9 +679,9 @@ gpu.module @test_kernel { ]> : vector<32xindex> %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> + %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout}: vector<32xi1> - %st_vec = arith.constant dense<1023.0>: vector<32xf32> + %st_vec = arith.constant {layout_result_0 = #xegpu.layout} dense<1023.0>: vector<32xf32> xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1> gpu.return @@ -690,7 +698,7 @@ gpu.module @test_kernel { // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> // CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32> gpu.func @load_with_offsets_chunk(%src: ui64) -> vector<32x4xf32> { - %cst = arith.constant dense<[ + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, @@ -698,7 +706,7 @@ gpu.module @test_kernel { ]> : vector<32xindex> %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> + %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout}: vector<32xi1> %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> gpu.return %ld : vector<32x4xf32> } @@ -714,7 +722,7 @@ gpu.module @test_kernel { // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> // CHECK-COUNT-4: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1> gpu.func @store_with_offsets_chunk(%src: ui64) { - %cst = arith.constant dense<[ + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, @@ -722,9 +730,9 @@ gpu.module @test_kernel { ]> : vector<32xindex> %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> + %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout}: vector<32xi1> - %st_vec = arith.constant dense<1023.>: vector<32x4xf32> + %st_vec = arith.constant {layout_result_0 = #xegpu.layout} dense<1023.>: vector<32x4xf32> xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir index 6e9711442b92d..762530e5d189f 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir @@ -27,11 +27,11 @@ gpu.module @test_elementwise_ops { %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - // CHECK: math.exp {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x8xf32> + // CHECK: math.exp {{.*}} : vector<12x8xf32> %exp = math.exp %load_a {layout_result_0 = #xegpu.layout} : vector<24x32xf32> - // CHECK: arith.negf {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x8xf32> + // CHECK: arith.negf {{.*}} : vector<12x8xf32> %negf = arith.negf %load_a {layout_result_0 = #xegpu.layout} : vector<24x32xf32> @@ -50,13 +50,11 @@ gpu.module @test_elementwise_ops { %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - // CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<12x8xf32> + // CHECK: arith.addf {{.*}}, {{.*}} : vector<12x8xf32> %addf = arith.addf %load_a, %load_b {layout_result_0 = #xegpu.layout} : vector<24x32xf32> - // CHECK: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<12x8xf32> + // CHECK: math.powf {{.*}}, {{.*}} : vector<12x8xf32> %powf = math.powf %load_a, %load_b {layout_result_0 = #xegpu.layout} : vector<24x32xf32> @@ -80,13 +78,11 @@ gpu.module @test_elementwise_ops { %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi1, #xegpu.layout> -> vector<24x32xi1> - // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<12x8xi1>, vector<12x8xf32> + // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} : vector<12x8xi1>, vector<12x8xf32> %select = arith.select %load_c, %load_a, %load_b {layout_result_0 = #xegpu.layout} : vector<24x32xi1>, vector<24x32xf32> - // CHECK: math.fma {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<12x8xf32> + // CHECK: math.fma {{.*}}, {{.*}}, {{.*}} : vector<12x8xf32> %fma = math.fma %load_a, %load_b, %load_a {layout_result_0 = #xegpu.layout} : vector<24x32xf32> @@ -105,13 +101,11 @@ gpu.module @test_elementwise_ops { %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> - // CHECK: arith.truncf {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<12x8xf32> to vector<12x8xf16> + // CHECK: arith.truncf {{.*}} : vector<12x8xf32> to vector<12x8xf16> %truncf = arith.truncf %load_a {layout_result_0 = #xegpu.layout} : vector<24x32xf32> to vector<24x32xf16> - // CHECK: arith.bitcast {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<12x8xi32> to vector<12x8xf32> + // CHECK: arith.bitcast {{.*}} : vector<12x8xi32> to vector<12x8xf32> %bitcast = arith.bitcast %load_b {layout_result_0 = #xegpu.layout} : vector<24x32xi32> to vector<24x32xf32> @@ -140,13 +134,11 @@ gpu.module @test_elementwise_ops { %load_d = xegpu.load_nd %tdesc_d {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> - // CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<12x8xf32> + // CHECK: arith.cmpf ult, {{.*}}, {{.*}} : vector<12x8xf32> %cmpf = arith.cmpf ult, %load_a, %load_b {layout_result_0 = #xegpu.layout} : vector<24x32xf32> - // CHECK: arith.cmpi eq, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<12x8xi32> + // CHECK: arith.cmpi eq, {{.*}}, {{.*}} : vector<12x8xi32> %cmpi = arith.cmpi eq, %load_c, %load_d {layout_result_0 = #xegpu.layout} : vector<24x32xi32> @@ -166,12 +158,12 @@ gpu.module @test_elementwise_ops { %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout} : vector<2x2xf32> + // CHECK-COUNT-12: arith.negf {{.*}} : vector<2x2xf32> // CHECK-NOT: arith.negf %negf = arith.negf %load_a {layout_result_0 = #xegpu.layout} : vector<24x32xf32> - // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} : vector<2x2xf32> + // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} : vector<2x2xf32> // CHECK-NOT: math.powf %powf = math.powf %load_a, %load_b {layout_result_0 = #xegpu.layout} @@ -179,3 +171,4 @@ gpu.module @test_elementwise_ops { gpu.return } } + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index 6b8b4f282b744..e89cb52ee02f5 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -116,7 +116,7 @@ gpu.module @test_round_robin_assignment { %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<128x1xf32, #xegpu.layout> -> vector<128x1xf32> - // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} : vector<16x1xf32> to vector<16x32xf32> + // CHECK-COUNT-2: vector.broadcast {{.*}} : vector<16x1xf32> to vector<16x32xf32> // CHECK-NOT: vector.broadcast %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout} diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index 4b11270373f95..ecdfdb9ad34c5 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -26,14 +26,14 @@ gpu.module @test_distribution { // CHECK-LABEL: store_nd_with_offset gpu.func @store_nd_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> // CHECK-NOT: xegpu.store_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> - xegpu.store_nd %load, %tdesc[0, 0] + xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout} : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> gpu.return } @@ -126,7 +126,7 @@ gpu.module @test_distribution { %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> - // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout} : vector<32x16xf32> to vector<16x32xf32> + // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] : vector<32x16xf32> to vector<16x32xf32> // CHECK-NOT: vector.transpose %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout} : vector<256x128xf32> to vector<128x256xf32> gpu.return @@ -149,13 +149,13 @@ gpu.module @test_distribution { } // CHECK-LABEL: distribute_shapecast_expandunitdims_broadcast - // CHECK: %[[CAST:.*]] = vector.shape_cast %[[REDUCE:.*]] {layout_result_0 = #xegpu.layout} : vector<8xf32> to vector<8x1xf32> - // CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] {layout_result_0 = #xegpu.layout} : vector<8x1xf32> to vector<8x128xf32> + // CHECK: %[[CAST:.*]] = vector.shape_cast %[[REDUCE:.*]] : vector<8xf32> to vector<8x1xf32> + // CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<8x1xf32> to vector<8x128xf32> gpu.func @distribute_shapecast_expandunitdims_broadcast(%arg0: memref<4096x128xf32>, %arg1: memref<4096x128xf32>) { %cst_0 = arith.constant {layout_result_0=#xegpu.slice<#xegpu.layout, dims = [1]>} dense<0xFF800000> : vector<256xf32> %block_id_x = gpu.block_id x %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.load_nd %0[%block_id_x, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<256x128xf32> + %1 = xegpu.load_nd %0[%block_id_x, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<256x128xf32> %2 = vector.multi_reduction , %1, %cst_0 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] : vector<256x128xf32> to vector<256xf32> %3 = vector.shape_cast %2 {layout_result_0 = #xegpu.layout, layout_operand_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} : vector<256xf32> to vector<256x1xf32> %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<256x1xf32>to vector<256x128xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 9cb96775b4ee4..69a2ca7c49c2d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -58,7 +58,7 @@ gpu.module @test_distribution { // CHECK-LABEL: store_nd_with_offsets // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @store_nd_with_offsets(%src: memref<256x128xf32>) { - //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout> %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} @@ -130,17 +130,6 @@ gpu.module @test_distribution { gpu.return } - // CHECK-LABEL: dpas_with_no_create_nd_desc - gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) { - // CHECK-NOT: vector<32x32xf32> - %dpas = xegpu.dpas %a, %b - {layout_a = #xegpu.layout, - layout_b = #xegpu.layout, - layout_cd = #xegpu.layout} - : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32> - gpu.return - } - // CHECK-LABEL: broadcast_dim1 // CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32> gpu.func @broadcast_dim1(%src: memref<256x1xf32>) { @@ -149,8 +138,7 @@ gpu.module @test_distribution { %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x1xf32, #xegpu.layout> -> vector<256x1xf32> - // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<32x1xf32> to vector<32x32xf32> + // CHECK: vector.broadcast {{.*}} : vector<32x1xf32> to vector<32x32xf32> %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout} : vector<256x1xf32> to vector<256x32xf32> @@ -165,8 +153,7 @@ gpu.module @test_distribution { %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<1x128xf32, #xegpu.layout> -> vector<1x128xf32> - // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<1x32xf32> to vector<32x32xf32> + // CHECK: vector.broadcast {{.*}} : vector<1x32xf32> to vector<32x32xf32> %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout} : vector<1x128xf32> to vector<32x128xf32> @@ -204,7 +191,7 @@ gpu.module @test_distribution { // CHECK: [[b:%.+]] = xegpu.load_nd [[DESC_B]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> // CHECK: scf.yield [[a]], [[b]], [[c]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> %8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5) - -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) { + -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) { // load_nd with offset inside loop %9 = xegpu.dpas %arg4, %arg5, %arg6 {layout_a = #xegpu.layout, @@ -214,9 +201,11 @@ gpu.module @test_distribution { %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> - } + } {layout_result_0 = #xegpu.layout, + layout_result_1 = #xegpu.layout, + layout_result_2 = #xegpu.layout} // store_nd with offset - xegpu.store_nd %8#2, %2[%0, %1] : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + xegpu.store_nd %8#2, %2[%0, %1] {layout = #xegpu.layout} : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> gpu.return } @@ -303,9 +292,9 @@ gpu.module @test_distribution { // CHECK-LABEL: @store_scatter // CHECK-SAME: %[[ARG0:.*]]: memref<256xf16> gpu.func @store_scatter(%dest : memref<256xf16>) { - // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<2.550000e+01> : vector<8xf16> - // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<8xindex> - // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<8xi1> + // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16> + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex> + // CHECK: %[[MASK:.*]] = arith.constant dense : vector<8xi1> // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint, layout = #xegpu.layout}> // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> %val = arith.constant {layout_result_0 = #xegpu.layout} dense<25.5> : vector<256xf16> @@ -489,7 +478,7 @@ gpu.module @test_distribution { %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x32xf32, #xegpu.layout> -> vector<256x32xf32> - //CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout} : vector<64x32xf32> to vector<32x64xf32> + //CHECK: vector.transpose {{.*}}, [1, 0] : vector<64x32xf32> to vector<32x64xf32> %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout} : vector<256x32xf32> to vector<32x256xf32> gpu.return } @@ -645,7 +634,7 @@ gpu.module @test_distribution { // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> - // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<32x32xf32> + // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] : vector<32xf32> to vector<32x32xf32> %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<256xf32> to vector<256x256xf32> gpu.return @@ -726,7 +715,7 @@ gpu.module @test_distribution { %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<128xf32> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index 4f29a686d301f..467c53fa20f94 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -63,7 +63,7 @@ gpu.module @test_1_1_assignment { // CHECK-SAME: : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> xegpu.store_nd %load, %tdesc @@ -141,27 +141,15 @@ gpu.module @test_1_1_assignment { gpu.return } - // CHECK-LABEL: dpas_with_no_create_nd_desc - gpu.func @dpas_with_no_create_nd_desc(%a: vector<256x128xf32>, %b: vector<128x256xf32>) { - // CHECK-NOT: vector<32x32xf32> - %dpas = xegpu.dpas %a, %b - {layout_a = #xegpu.layout, - layout_b = #xegpu.layout, - layout_cd = #xegpu.layout} - : vector<256x128xf32>, vector<128x256xf32> -> vector<256x256xf32> - gpu.return - } - // CHECK-LABEL: broadcast_dim1 // CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32> gpu.func @broadcast_dim1(%src: memref<256x1xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x1xf32> -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<256x1xf32, #xegpu.layout> -> vector<256x1xf32> - // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<32x1xf32> to vector<32x32xf32> + // CHECK: vector.broadcast {{.*}} : vector<32x1xf32> to vector<32x32xf32> %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout} : vector<256x1xf32> to vector<256x32xf32> @@ -173,11 +161,10 @@ gpu.module @test_1_1_assignment { gpu.func @broadcast_dim0(%src: memref<1x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<1x128xf32> -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<1x128xf32, #xegpu.layout> -> vector<1x128xf32> - // CHECK: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout} - // CHECK-SAME: : vector<1x32xf32> to vector<32x32xf32> + // CHECK: vector.broadcast {{.*}} : vector<1x32xf32> to vector<32x32xf32> %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout} : vector<1x128xf32> to vector<32x128xf32> @@ -222,7 +209,9 @@ gpu.module @test_1_1_assignment { %12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> scf.yield %11, %12, %10 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, vector<128x128xf32> - } + } {layout_result_0 = #xegpu.layout, + layout_result_1 = #xegpu.layout, + layout_result_2 = #xegpu.layout} %7 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> xegpu.store_nd %6#2, %7 {layout = #xegpu.layout } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> @@ -288,7 +277,7 @@ gpu.module @test_1_1_assignment { %id = gpu.subgroup_id : index %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %d = xegpu.load_nd %t {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> %0 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if @@ -340,7 +329,7 @@ gpu.module @test_1_1_assignment { // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] %tdesc = xegpu.create_nd_tdesc %src2[0, 0] : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> -> vector<128x64xf32> %exp = math.exp %load {layout_result_0 = #xegpu.layout} : vector<128x64xf32> @@ -369,7 +358,7 @@ gpu.module @test_1_1_assignment { // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]] %td = xegpu.create_nd_tdesc %src1[0, 0] : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %ld = xegpu.load_nd %td + %ld = xegpu.load_nd %td {layout = #xegpu.layout} : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> -> vector<128x64xf32> %exp = math.exp %ld {layout_result_0 = #xegpu.layout} : vector<128x64xf32>