diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 446f64fffa468..5e56854c254a0 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -520,9 +520,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { /// Check if this is slice of some other layout. bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; } - + /// Check if this is identical to some other layout. - bool isEqualTo(const xegpu::DistributeLayoutAttr &other); + bool isEqualTo(const xegpu::DistributeLayoutAttr &other); }]; @@ -698,9 +698,12 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Check if this is slice of some other layout. bool isSliceOf(const xegpu::DistributeLayoutAttr &other); - + + /// Return this slice with `sliceDimsToDrop` removed from its dims. + SliceAttr dropSliceDims(ArrayRef sliceDimsToDrop); + /// Check if this is identical to some other layout. 
- bool isEqualTo(const xegpu::DistributeLayoutAttr &other); + bool isEqualTo(const xegpu::DistributeLayoutAttr &other); }]; let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`"; @@ -782,13 +785,13 @@ def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> { let methods = [ InterfaceMethod< /*desc=*/"Get the anchor layout attribute.", - /*retTy=*/"xegpu::DistributeLayoutAttr", + /*retTy=*/"xegpu::DistributeLayoutAttr", /*methodName=*/"getAnchorLayout", /*args=*/(ins) >, InterfaceMethod< /*desc=*/"Set the anchor layout attribute.", - /*retTy=*/"void", + /*retTy=*/"void", /*methodName=*/"setAnchorLayout", /*args=*/(ins "xegpu::DistributeLayoutAttr":$layout) >, diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 3ff7805263f0e..e25adbd1673d9 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -102,7 +102,7 @@ def XeGPUVectorLinearize : Pass<"xegpu-vector-linearize"> { "scf::SCFDialect", "ub::UBDialect", "vector::VectorDialect"]; } -def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> { +def XeGPUPeepHoleOptimizer : Pass<"xegpu-optimize-peephole"> { let summary = "Optimize XeGPU block load operations"; let description = [{ This pass rewrites XeGPU loadNd operations into more optimal forms diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h index 1776a209d0bf1..5942f69b4a66d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h @@ -62,7 +62,7 @@ struct UnrollOptions { /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`. void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns); /// Appends patterns for optimizing block load operations into `patterns`. 
-void populateXeGPUOptimizeBlockLoadsPatterns(RewritePatternSet &patterns); +void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns); /// Appends patterns for XeGPU SIMT distribution into `patterns`. void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns); /// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op. diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp index 38313dc3c01d5..f7fff8e1fd4cf 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp @@ -75,6 +75,9 @@ void buildGPUPassPipeline(OpPassManager &pm, options.xegpuOpLevel == "workgroup") { xegpu::XeGPUPropagateLayoutOptions layoutOptions; layoutOptions.layoutKind = "lane"; + pm.addNestedPass( + xegpu::createXeGPUPropagateLayout(layoutOptions)); + pm.addNestedPass(xegpu::createXeGPUPeepHoleOptimizer()); pm.addNestedPass( xegpu::createXeGPUPropagateLayout(layoutOptions)); pm.addNestedPass(xegpu::createXeGPUSubgroupDistribute()); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index ccf17da26c942..53ca17f4f99bc 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -592,6 +592,24 @@ bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) { [&](int64_t dim) { return thisDims.contains(dim); }); } +xegpu::SliceAttr SliceAttr::dropSliceDims(ArrayRef sliceDimsToDrop) { + if (sliceDimsToDrop.empty()) + return *this; + SmallVector sliceDims{getDims().asArrayRef()}; + for (auto dim : sliceDimsToDrop) { + auto foundIt = std::find(sliceDims.begin(), sliceDims.end(), dim); + assert(foundIt != sliceDims.end() && + "Expected to find the specified reduction dim in slice dims"); + sliceDims.erase(foundIt); + } + + auto sliceWithoutDims = xegpu::SliceAttr::get( + this->getContext(), getParent(), + 
DenseI64ArrayAttr::get(this->getContext(), sliceDims)); + + return sliceWithoutDims; +} + bool SliceAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) { if (dyn_cast(other)) return false; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt index 29b645feab2c6..15d31eadcb6df 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt @@ -6,7 +6,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms XeGPUWgToSgDistribute.cpp XeGPUPropagateLayout.cpp XeGPUVectorLinearize.cpp - XeGPUOptimizeBlockLoads.cpp + XeGPUPeepHoleOptimizer.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp similarity index 82% rename from mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp rename to mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp index bb80df197d45b..6a3e533fb2df4 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp @@ -1,4 +1,4 @@ -//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===// +//===- XeGPUPeepHoleOptimizer.cpp - XeGPU peephole optimizer ----*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -30,12 +30,12 @@ namespace mlir { namespace xegpu { -#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS +#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc" } // namespace xegpu } // namespace mlir -#define DEBUG_TYPE "xegpu-optimize-block-loads" +#define DEBUG_TYPE "xegpu-optimize-peephole" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") using namespace mlir; @@ -416,19 +416,104 @@ class VectorExtractOpPattern final } }; +/// Performs a reduction over 2 dimensions by decomposing it into two 1D +/// reductions ordered based on layout to minimize cross-lane communication. +class MultiRed2dOpPattern + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto sourceVecType = reductionOp.getSourceVectorType(); + if (reductionOp.getReductionDims().size() != 2 || + sourceVecType.getRank() != 2) + return rewriter.notifyMatchFailure( + reductionOp, "Expected 2D multi reduction of a 2D source"); + auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult()); + // Retrieve and order dims for 1D decomposition (prefer intra-lane first). + auto dims = llvm::to_vector(reductionOp.getReductionDims()); + auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout); + // Order does not matter + if (intraLaneDim == -1 || crossLaneDim == -1) { + intraLaneDim = dims[0]; + crossLaneDim = dims[1]; + } + auto loc = reductionOp.getLoc(); + auto acc = reductionOp.getAcc(); + + // The first reduction's dist attribute does not have the cross lane dim. 
+ auto resSliceLayoutAttr = cast(resLayout); + SmallVector dropDims{crossLaneDim}; + auto intraLaneRedResLayout = resSliceLayoutAttr.dropSliceDims(dropDims); + + SmallVector accShape(sourceVecType.getShape()); + accShape.erase(accShape.begin() + intraLaneDim); + if (acc) { + acc = vector::BroadcastOp::create( + rewriter, loc, + VectorType::get(accShape, sourceVecType.getElementType()), acc); + xegpu::setDistributeLayoutAttr( + llvm::dyn_cast(acc), + cast(intraLaneRedResLayout)); + } + Value intraLaneReduced = vector::MultiDimReductionOp::create( + rewriter, loc, reductionOp.getKind(), reductionOp.getSource(), acc, + ArrayRef(intraLaneDim)); + xegpu::setDistributeLayoutAttr( + llvm::dyn_cast(intraLaneReduced), + cast(intraLaneRedResLayout)); + + Value crossLaneReduced = vector::ReductionOp::create( + rewriter, loc, reductionOp.getKind(), intraLaneReduced, nullptr); + xegpu::setDistributeLayoutAttr( + llvm::dyn_cast(crossLaneReduced), + cast(resLayout)); + assert(crossLaneReduced.getType() == reductionOp.getResult().getType() && + "Type mismatch"); + rewriter.replaceOp(reductionOp, crossLaneReduced); + return success(); + } + +private: + std::pair + getReductionDimOrder(ArrayRef reductionDims, + xegpu::DistributeLayoutAttr layout) const { + assert(layout.isForSubgroup() && "Must know the lane layout"); + assert(reductionDims.size() == 2 && "Expected 2D reduction"); + int64_t intra = -1, cross = -1; + xegpu::LayoutAttr layoutAttr = dyn_cast(layout); + if (auto layoutSliceAttr = dyn_cast(layout)) + layoutAttr = + dyn_cast(layoutSliceAttr.flatten().getParent()); + assert(layoutAttr); + SmallVector laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt(); + + assert(laneLayout.size() && "Expected a non-empty layout"); + // A dim with lane layout 1 is intra-lane; -1 means no such dim was found. + for (auto dim : reductionDims) { + if (laneLayout[dim] == 1) + intra = dim; + else + cross = dim; + } + return {intra, cross}; + } +}; + } // namespace -void xegpu::populateXeGPUOptimizeBlockLoadsPatterns( +void 
xegpu::populateXeGPUPeepHoleOptimizerPatterns( RewritePatternSet &patterns) { patterns.add(patterns.getContext()); + VectorExtractOpPattern, MultiRed2dOpPattern>( + patterns.getContext()); } namespace { -struct XeGPUOptimizeBlockLoadsPass final - : public xegpu::impl::XeGPUOptimizeBlockLoadsBase< - XeGPUOptimizeBlockLoadsPass> { +struct XeGPUPeepHoleOptimizerPass final + : public xegpu::impl::XeGPUPeepHoleOptimizerBase< + XeGPUPeepHoleOptimizerPass> { void runOnOperation() override { MLIRContext &context = getContext(); TypeConverter converter; @@ -445,7 +530,7 @@ struct XeGPUOptimizeBlockLoadsPass final }); if (!isTargetSupported) { - DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets." + DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC and BMG targets." << "\n"; return; } @@ -473,13 +558,27 @@ struct XeGPUOptimizeBlockLoadsPass final auto laneData = layout.getEffectiveLaneDataAsInt(); return !canBeOptimizedForTranspose(laneLayout, laneData); }); + + target.addDynamicallyLegalOp( + [=](Operation *op) -> bool { + auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0)); + if (!layout || !layout.isForSubgroup()) + return true; + // Must mirror MultiRed2dOpPattern: only a 2D reduction of a rank-2 + // source is rewritten; anything else stays legal and untouched. + if (auto reductionOp = dyn_cast(op)) + return reductionOp.getReductionDims().size() != 2 || + reductionOp.getSourceVectorType().getRank() != 2; + return true; + }); + converter.addConversion([](Type type) { return type; }); target.addLegalDialect(); scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, target); - xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns); + xegpu::populateXeGPUPeepHoleOptimizerPatterns(patterns); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) { DBGS() << "Optimize block loads pass failed.\n"; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index a1c0656d0bdb5..9113f00ac39f0 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ 
b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -875,12 +875,15 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { std::string layoutMaskName = xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(3)); - xegpu::LayoutAttr layoutPayload = - storeScatterOp->getAttrOfType(layoutPayloadName); - xegpu::LayoutAttr layoutOffsets = - storeScatterOp->getAttrOfType(layoutOffsetsName); - xegpu::LayoutAttr layoutMask = - storeScatterOp->getAttrOfType(layoutMaskName); + xegpu::DistributeLayoutAttr layoutPayload = + storeScatterOp->getAttrOfType( + layoutPayloadName); + xegpu::DistributeLayoutAttr layoutOffsets = + storeScatterOp->getAttrOfType( + layoutOffsetsName); + xegpu::DistributeLayoutAttr layoutMask = + storeScatterOp->getAttrOfType( + layoutMaskName); FailureOr distStoreVecByWarpOpOrFailure = getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy); diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/peephole-optimize.mlir similarity index 89% rename from mlir/test/Dialect/XeGPU/optimize-transpose.mlir rename to mlir/test/Dialect/XeGPU/peephole-optimize.mlir index c748c1ca5ef88..56a4b263255e8 100644 --- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir +++ b/mlir/test/Dialect/XeGPU/peephole-optimize.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ -// RUN: --xegpu-optimize-block-loads --canonicalize --cse --split-input-file %s | FileCheck %s +// RUN: --xegpu-optimize-peephole --canonicalize --cse --split-input-file %s | FileCheck %s // CHECK-LABEL: gpu.func @no_scf( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> { @@ -278,3 +278,36 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg gpu.return } } + +// ----- +// CHECK-LABEL: gpu.func @vector_reduce_2d( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<256xf32>) { +// 
CHECK: %[[ACC_VEC:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32> +// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout> +// CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32, #xegpu.layout> -> vector<4x16xf32> +// CHECK: %[[LOADED_REDUCED:.*]] = vector.multi_reduction , %[[LOADED]], %[[ACC_VEC]] +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32> +// CHECK: %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.reduction , %[[LOADED_REDUCED]] +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} : vector<16xf32> into f32 +gpu.module @xevm_test { + gpu.func @vector_reduce_2d(%src: memref<4x16xf32>, %dst: memref<256xf32>) { + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} 1.0 : f32 + %tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32> + -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<4x16xf32, #xegpu.layout> + -> vector<4x16xf32> + %reduce = vector.multi_reduction , %load, %cst + {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} + [0, 1] : vector<4x16xf32> to f32 + %reduce_bcast = vector.broadcast %reduce + {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} + : f32 to vector<16xf32> + + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<16xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<16xi1> + + xegpu.store %reduce_bcast, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.layout, dims = [0]>} : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1> + gpu.return + } +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 87c67ba6bf324..dae00838fdcb6 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ 
b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -208,7 +208,7 @@ gpu.module @xevm_module{ : vector<16x8xi32> to vector<16x16xf16> %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf16> - %6 = xegpu.dpas %1, %5 + %6 = xegpu.dpas %1, %5 {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} @@ -364,12 +364,12 @@ gpu.module @xevm_module{ %c0 = arith.constant 0 : index %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}: vector<16xi1> %1 = xegpu.load %arg0[%c0], %mask {layout = #xegpu.slice<#xegpu.layout, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16> - + %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x1xf16> %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout} : vector<16x1xf16> to vector<16x16xf16> // CHECK-NOT: vector.broadcast // CHECK-NOT: vector.shape_cast - + %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> // CHECK: xegpu.store_nd {{.*}}, {{.*}}[{{.*}}, {{.*}}] @@ -397,4 +397,42 @@ gpu.module @xevm_module{ } } - +// ----- +gpu.module @xevm_test { + // CHECK-LABEL: gpu.func @vector_reduce_2d + // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : i32 + // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : i32 + // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32 + // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32 + // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : i32 + // CHECK-DAG: %[[CST_1:.*]] = arith.constant 1.000000e+00 : f32 + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense : vector<1xi1> + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex> + // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32> + // CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32> -> 
vector<4xf32> + // CHECK: %[[LOADED_REDUCED:.*]] = vector.reduction , %[[LOADED]], %[[CST_1]] : vector<4xf32> into f32 + // CHECK: %[[SHUFFLE_0:.*]], %{{.*}} = gpu.shuffle xor %[[LOADED_REDUCED]], %[[C1]], %[[C16]] : f32 + // CHECK: %[[VEC_RED_0:.*]] = arith.addf %[[LOADED_REDUCED]], %[[SHUFFLE_0]] : f32 + // CHECK: %[[SHUFFLE_1:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_0]], %[[C2]], %[[C16]] : f32 + // CHECK: %[[VEC_RED_1:.*]] = arith.addf %[[VEC_RED_0]], %[[SHUFFLE_1]] : f32 + // CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_1]], %[[C4]], %[[C16]] : f32 + // CHECK: %[[VEC_RED_2:.*]] = arith.addf %[[VEC_RED_1]], %[[SHUFFLE_2]] : f32 + // CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_2]], %[[C8]], %[[C16]] : f32 + // CHECK: %[[VEC_RED_3:.*]] = arith.addf %[[VEC_RED_2]], %[[SHUFFLE_3]] : f32 + // CHECK: %[[VEC_RED:.*]] = vector.broadcast %[[VEC_RED_3]] : f32 to vector<1xf32> + // CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]] : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1> + gpu.func @vector_reduce_2d(%arg0: memref<4x16xf32>, %arg1: memref<256xf32>) { + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} 1.000000e+00 : f32 + %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout> + %1 = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<4x16xf32, #xegpu.layout> -> vector<4x16xf32> + %2 = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : f32 to vector<16xf32> + %3 = vector.multi_reduction , %1, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32> + %4 = vector.reduction , %3 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} : vector<16xf32> into f32 + %5 = vector.broadcast %4 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : f32 to vector<16xf32> + %cst_0 = arith.constant {layout_result_0 = 
#xegpu.layout} dense<0> : vector<16xindex> + %cst_1 = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> + xegpu.store %5, %arg1[%cst_0], %cst_1 <{layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1> + gpu.return + } +}