diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 446f64fffa468..5e56854c254a0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -520,9 +520,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
 
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; }
-    
+
     /// Check if this is identical to some other layout.
-    bool isEqualTo(const xegpu::DistributeLayoutAttr &other); 
+    bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
 
   }];
 
@@ -698,9 +698,12 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
 
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
-    
+
+    /// Drop the slice dims to get the original layout.
+    SliceAttr dropSliceDims(ArrayRef<int64_t> sliceDimsToDrop);
+
     /// Check if this is identical to some other layout.
-    bool isEqualTo(const xegpu::DistributeLayoutAttr &other); 
+    bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
   }];
 
   let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`";
@@ -782,13 +785,13 @@ def AnchorLayoutInterface : OpInterface<"AnchorLayoutInterface"> {
   let methods = [
     InterfaceMethod<
       /*desc=*/"Get the anchor layout attribute.",
-      /*retTy=*/"xegpu::DistributeLayoutAttr", 
+      /*retTy=*/"xegpu::DistributeLayoutAttr",
       /*methodName=*/"getAnchorLayout",
       /*args=*/(ins)
     >,
     InterfaceMethod<
       /*desc=*/"Set the anchor layout attribute.",
-      /*retTy=*/"void", 
+      /*retTy=*/"void",
       /*methodName=*/"setAnchorLayout",
       /*args=*/(ins "xegpu::DistributeLayoutAttr":$layout)
     >,
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 3ff7805263f0e..e25adbd1673d9 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -102,7 +102,7 @@ def XeGPUVectorLinearize : Pass<"xegpu-vector-linearize"> {
                            "scf::SCFDialect", "ub::UBDialect", "vector::VectorDialect"];
 }
 
-def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> {
+def XeGPUPeepHoleOptimizer : Pass<"xegpu-optimize-peephole"> {
   let summary = "Optimize XeGPU block load operations";
   let description = [{
     This pass rewrites XeGPU loadNd operations into more optimal forms
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index 1776a209d0bf1..5942f69b4a66d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -62,7 +62,7 @@ struct UnrollOptions {
 /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
 void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
 /// Appends patterns for optimizing block load operations into `patterns`.
-void populateXeGPUOptimizeBlockLoadsPatterns(RewritePatternSet &patterns);
+void populateXeGPUPeepHoleOptimizerPatterns(RewritePatternSet &patterns);
 /// Appends patterns for XeGPU SIMT distribution into `patterns`.
 void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
 /// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op.
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index 38313dc3c01d5..f7fff8e1fd4cf 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -75,6 +75,9 @@ void buildGPUPassPipeline(OpPassManager &pm,
       options.xegpuOpLevel == "workgroup") {
     xegpu::XeGPUPropagateLayoutOptions layoutOptions;
     layoutOptions.layoutKind = "lane";
+    pm.addNestedPass<gpu::GPUModuleOp>(
+        xegpu::createXeGPUPropagateLayout(layoutOptions));
+    pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUPeepHoleOptimizer());
     pm.addNestedPass<gpu::GPUModuleOp>(
         xegpu::createXeGPUPropagateLayout(layoutOptions));
     pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSubgroupDistribute());
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index ccf17da26c942..53ca17f4f99bc 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -592,6 +592,24 @@ bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
                       [&](int64_t dim) { return thisDims.contains(dim); });
 }
 
+xegpu::SliceAttr SliceAttr::dropSliceDims(ArrayRef<int64_t> sliceDimsToDrop) {
+  if (sliceDimsToDrop.empty())
+    return *this;
+  SmallVector<int64_t> sliceDims{getDims().asArrayRef()};
+  for (auto dim : sliceDimsToDrop) {
+    auto foundIt = std::find(sliceDims.begin(), sliceDims.end(), dim);
+    assert(foundIt != sliceDims.end() &&
+           "Expected to find the specified reduction dim in slice dims");
+    sliceDims.erase(foundIt);
+  }
+
+  auto sliceWithoutDims = xegpu::SliceAttr::get(
+      this->getContext(), getParent(),
+      DenseI64ArrayAttr::get(this->getContext(), sliceDims));
+
+  return sliceWithoutDims;
+}
+
 bool SliceAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
   if (dyn_cast<xegpu::LayoutAttr>(other))
     return false;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index 29b645feab2c6..15d31eadcb6df 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -6,7 +6,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUWgToSgDistribute.cpp
   XeGPUPropagateLayout.cpp
   XeGPUVectorLinearize.cpp
-  XeGPUOptimizeBlockLoads.cpp
+  XeGPUPeepHoleOptimizer.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
similarity index 82%
rename from mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
rename to mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
index bb80df197d45b..6a3e533fb2df4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPeepHoleOptimizer.cpp
@@ -1,4 +1,4 @@
-//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===//
+//===- XeGPUPeepHoleOptimizer.cpp - XeGPU optimize block loads -*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -30,12 +30,12 @@
 
 namespace mlir {
 namespace xegpu {
-#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS
+#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
 } // namespace xegpu
 } // namespace mlir
 
-#define DEBUG_TYPE "xegpu-optimize-block-loads"
+#define DEBUG_TYPE "xegpu-optimize-peephole"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 
 using namespace mlir;
@@ -416,19 +416,104 @@ class VectorExtractOpPattern final
   }
 };
 
+/// Performs a reduction over 2 dimensions by decomposing it into two 1D
+/// reductions ordered based on layout to minimize cross-lane communication.
+class MultiRed2dOpPattern
+    : public OpConversionPattern<vector::MultiDimReductionOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto sourceVecType = reductionOp.getSourceVectorType();
+    if (reductionOp.getReductionDims().size() != 2 ||
+        sourceVecType.getRank() != 2)
+      return rewriter.notifyMatchFailure(
+          reductionOp, "Expected 2D multi reduction of a 2D source");
+    auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
+    // Retrieve and order dims for 1D decomposition (prefer intra-lane first).
+    auto dims = llvm::to_vector(reductionOp.getReductionDims());
+    auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
+    // Order does not matter
+    if (intraLaneDim == -1 || crossLaneDim == -1) {
+      intraLaneDim = dims[0];
+      crossLaneDim = dims[1];
+    }
+    auto loc = reductionOp.getLoc();
+    auto acc = reductionOp.getAcc();
+
+    // The first reduction's dist attribute does not have the cross lane dim.
+    auto resSliceLayoutAttr = cast<xegpu::SliceAttr>(resLayout);
+    SmallVector<int64_t> dropDims{crossLaneDim};
+    auto intraLaneRedResLayout = resSliceLayoutAttr.dropSliceDims(dropDims);
+
+    SmallVector<int64_t> accShape(sourceVecType.getShape());
+    accShape.erase(accShape.begin() + intraLaneDim);
+    if (acc) {
+      acc = vector::BroadcastOp::create(
+          rewriter, loc,
+          VectorType::get(accShape, sourceVecType.getElementType()), acc);
+      xegpu::setDistributeLayoutAttr(
+          llvm::dyn_cast<OpResult>(acc),
+          cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
+    }
+    Value intraLaneReduced = vector::MultiDimReductionOp::create(
+        rewriter, loc, reductionOp.getKind(), reductionOp.getSource(), acc,
+        ArrayRef<int64_t>(intraLaneDim));
+    xegpu::setDistributeLayoutAttr(
+        llvm::dyn_cast<OpResult>(intraLaneReduced),
+        cast<xegpu::DistributeLayoutAttr>(intraLaneRedResLayout));
+
+    Value crossLaneReduced = vector::ReductionOp::create(
+        rewriter, loc, reductionOp.getKind(), intraLaneReduced, nullptr);
+    xegpu::setDistributeLayoutAttr(
+        llvm::dyn_cast<OpResult>(crossLaneReduced),
+        cast<xegpu::DistributeLayoutAttr>(resLayout));
+    assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
+           "Type mismatch");
+    rewriter.replaceOp(reductionOp, crossLaneReduced);
+    return success();
+  }
+
+private:
+  std::pair<int64_t, int64_t>
+  getReductionDimOrder(ArrayRef<int64_t> reductionDims,
+                       xegpu::DistributeLayoutAttr layout) const {
+    assert(layout.isForSubgroup() && "Must know the lane layout");
+    assert(reductionDims.size() == 2 && "Expected 2D reduction");
+    int64_t intra, cross = -1;
+    xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
+    if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout))
+      layoutAttr =
+          dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.flatten().getParent());
+    assert(layoutAttr);
+    SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
+
+    assert(laneLayout.size() && "Expected a non-empty layout");
+    // try to pick a dim that does not communicate
+    for (auto dim : reductionDims) {
+      if (laneLayout[dim] == 1)
+        intra = dim;
+      else
+        cross = dim;
+    }
+    return {intra, cross};
+  }
+};
+
 } // namespace
 
-void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
+void xegpu::populateXeGPUPeepHoleOptimizerPatterns(
     RewritePatternSet &patterns) {
   patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
-               VectorExtractOpPattern>(patterns.getContext());
+               VectorExtractOpPattern, MultiRed2dOpPattern>(
+      patterns.getContext());
 }
 
 namespace {
 
-struct XeGPUOptimizeBlockLoadsPass final
-    : public xegpu::impl::XeGPUOptimizeBlockLoadsBase<
-          XeGPUOptimizeBlockLoadsPass> {
+struct XeGPUPeepHoleOptimizerPass final
+    : public xegpu::impl::XeGPUPeepHoleOptimizerBase<
+          XeGPUPeepHoleOptimizerPass> {
   void runOnOperation() override {
     MLIRContext &context = getContext();
     TypeConverter converter;
@@ -445,7 +530,7 @@ struct XeGPUOptimizeBlockLoadsPass final
     });
 
     if (!isTargetSupported) {
-      DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets."
+      DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC and BMG targets."
              << "\n";
       return;
     }
@@ -473,13 +558,24 @@ struct XeGPUOptimizeBlockLoadsPass final
           auto laneData = layout.getEffectiveLaneDataAsInt();
           return !canBeOptimizedForTranspose(laneLayout, laneData);
         });
+
+    target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
+        [=](Operation *op) -> bool {
+          auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
+          if (!layout || !layout.isForSubgroup())
+            return true;
+          if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
+            return reductionOp.getReductionDims().size() != 2;
+          return true;
+        });
+
     converter.addConversion([](Type type) { return type; });
 
     target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
                            vector::VectorDialect>();
     scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                          target);
-    xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns);
+    xegpu::populateXeGPUPeepHoleOptimizerPatterns(patterns);
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns)))) {
       DBGS() << "Optimize block loads pass failed.\n";
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index a1c0656d0bdb5..9113f00ac39f0 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -875,12 +875,15 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
     std::string layoutMaskName =
         xegpu::getTemporaryLayoutName(storeScatterOp->getOpOperand(3));
 
-    xegpu::LayoutAttr layoutPayload =
-        storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutPayloadName);
-    xegpu::LayoutAttr layoutOffsets =
-        storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutOffsetsName);
-    xegpu::LayoutAttr layoutMask =
-        storeScatterOp->getAttrOfType<xegpu::LayoutAttr>(layoutMaskName);
+    xegpu::DistributeLayoutAttr layoutPayload =
+        storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
+            layoutPayloadName);
+    xegpu::DistributeLayoutAttr layoutOffsets =
+        storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
+            layoutOffsetsName);
+    xegpu::DistributeLayoutAttr layoutMask =
+        storeScatterOp->getAttrOfType<xegpu::DistributeLayoutAttr>(
+            layoutMaskName);
 
     FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
         getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/peephole-optimize.mlir
similarity index 89%
rename from mlir/test/Dialect/XeGPU/optimize-transpose.mlir
rename to mlir/test/Dialect/XeGPU/peephole-optimize.mlir
index c748c1ca5ef88..56a4b263255e8 100644
--- a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir
+++ b/mlir/test/Dialect/XeGPU/peephole-optimize.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc'  \
-// RUN:   --xegpu-optimize-block-loads --canonicalize --cse --split-input-file %s | FileCheck %s
+// RUN:   --xegpu-optimize-peephole --canonicalize --cse --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: gpu.func @no_scf(
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> {
@@ -278,3 +278,36 @@ gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg
   gpu.return
 }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @vector_reduce_2d(
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<4x16xf32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<256xf32>) {
+// CHECK:      %[[ACC_VEC:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
+// CHECK:      %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK:      %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0]  : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
+// CHECK:      %[[LOADED_REDUCED:.*]] = vector.multi_reduction <add>, %[[LOADED]], %[[ACC_VEC]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
+// CHECK:      %[[LOADED_REDUCED_FOR_CROSS:.*]] = vector.reduction <add>, %[[LOADED_REDUCED]]
+// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
+gpu.module @xevm_test {
+  gpu.func @vector_reduce_2d(%src: memref<4x16xf32>, %dst: memref<256xf32>) {
+    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.0 : f32
+    %tdesc = xegpu.create_nd_tdesc %src : memref<4x16xf32>
+      -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %load =  xegpu.load_nd %tdesc[0, 0]
+      : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      -> vector<4x16xf32>
+    %reduce = vector.multi_reduction <add>, %load, %cst
+     {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>}
+     [0, 1] : vector<4x16xf32> to f32
+    %reduce_bcast = vector.broadcast %reduce
+     {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}
+     : f32 to vector<16xf32>
+
+    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
+    %mask = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1> : vector<16xi1>
+
+    xegpu.store %reduce_bcast, %dst[%offset], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
+    gpu.return
+  }
+}
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 87c67ba6bf324..dae00838fdcb6 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -208,7 +208,7 @@ gpu.module @xevm_module{
       : vector<16x8xi32> to vector<16x16xf16>
     %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
       : vector<16x16xf16> to vector<16x16xf16>
-    %6 = xegpu.dpas %1, %5 
+    %6 = xegpu.dpas %1, %5
       {layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
        layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
        layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
@@ -364,12 +364,12 @@ gpu.module @xevm_module{
     %c0 = arith.constant 0 : index
     %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: vector<16xi1>
     %1 = xegpu.load %arg0[%c0], %mask {layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
-    
+
     %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
     %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
     // CHECK-NOT: vector.broadcast
     // CHECK-NOT: vector.shape_cast
- 
+
     %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
       -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
     // CHECK: xegpu.store_nd {{.*}}, {{.*}}[{{.*}}, {{.*}}]
@@ -397,4 +397,42 @@ gpu.module @xevm_module{
   }
 }
 
-
+// -----
+gpu.module @xevm_test {
+    // CHECK-LABEL: gpu.func @vector_reduce_2d
+    // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : i32
+    // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : i32
+    // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
+    // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
+    // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : i32
+    // CHECK-DAG: %[[CST_1:.*]] = arith.constant 1.000000e+00 : f32
+    // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+    // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<true> : vector<1xi1>
+    // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex>
+    // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32>
+    // CHECK: %[[LOADED:.*]] = xegpu.load_nd %[[TDESC]][0, 0] : !xegpu.tensor_desc<4x16xf32> -> vector<4xf32>
+    // CHECK: %[[LOADED_REDUCED:.*]] = vector.reduction <add>, %[[LOADED]], %[[CST_1]] : vector<4xf32> into f32
+    // CHECK: %[[SHUFFLE_0:.*]], %{{.*}} = gpu.shuffle xor %[[LOADED_REDUCED]], %[[C1]], %[[C16]] : f32
+    // CHECK: %[[VEC_RED_0:.*]] = arith.addf %[[LOADED_REDUCED]], %[[SHUFFLE_0]] : f32
+    // CHECK: %[[SHUFFLE_1:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_0]], %[[C2]], %[[C16]] : f32
+    // CHECK: %[[VEC_RED_1:.*]] = arith.addf %[[VEC_RED_0]], %[[SHUFFLE_1]] : f32
+    // CHECK: %[[SHUFFLE_2:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_1]], %[[C4]], %[[C16]] : f32
+    // CHECK: %[[VEC_RED_2:.*]] = arith.addf %[[VEC_RED_1]], %[[SHUFFLE_2]] : f32
+    // CHECK: %[[SHUFFLE_3:.*]], %{{.*}} = gpu.shuffle xor %[[VEC_RED_2]], %[[C8]], %[[C16]] : f32
+    // CHECK: %[[VEC_RED_3:.*]] = arith.addf %[[VEC_RED_2]], %[[SHUFFLE_3]] : f32
+    // CHECK: %[[VEC_RED:.*]] = vector.broadcast %[[VEC_RED_3]] : f32 to vector<1xf32>
+    // CHECK: xegpu.store %[[VEC_RED]], %arg1[%[[CST]]], %[[CST_0]] : vector<1xf32>, memref<256xf32>, vector<1xindex>, vector<1xi1>
+  gpu.func @vector_reduce_2d(%arg0: memref<4x16xf32>, %arg1: memref<256xf32>) {
+      %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} 1.000000e+00 : f32
+      %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xf32> -> !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+      %1 = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : !xegpu.tensor_desc<4x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<4x16xf32>
+      %2 = vector.broadcast %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
+      %3 = vector.multi_reduction <add>, %1, %2 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<4x16xf32> to vector<16xf32>
+      %4 = vector.reduction <add>, %3 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0, 1]>} : vector<16xf32> into f32
+      %5 = vector.broadcast %4 {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} : f32 to vector<16xf32>
+      %cst_0 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<0> : vector<16xindex>
+      %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+      xegpu.store %5, %arg1[%cst_0], %cst_1 <{layout = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>}> : vector<16xf32>, memref<256xf32>, vector<16xindex>, vector<16xi1>
+    gpu.return
+  }
+}