From ec59156d4af11cfd2aec0c4754cfc3ca6ec932ee Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Mon, 22 Sep 2025 22:15:47 +0000 Subject: [PATCH 1/9] save work --- .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index e1ba45c60ac36..1be7a5085aff1 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -199,6 +199,35 @@ class TestStepOpPattern : public OpConversionPattern { } }; +struct TestXeGPUSGDistribute + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestXeGPUSGDistribute) + + StringRef getArgument() const final { return "test-xegpu-sg-distribute"; } + + StringRef getDescription() const final { + return "Test the implementation of XeGPU Subgroup Distribution"; + } + + void getDependentDialects(::mlir::DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + } + + TestXeGPUSGDistribute() = default; + TestXeGPUSGDistribute(const TestXeGPUSGDistribute &pass) = default; + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + xegpu::populateXeGPUSubgroupDistributePatterns(patterns); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); + } +}; + struct TestXeGPULayoutInterface : public PassWrapper> { @@ -263,6 +292,7 @@ namespace test { void registerTestXeGPULowerings() { PassRegistration(); PassRegistration(); + PassRegistration(); } } // namespace test } // namespace mlir From 2d124ee35cdb61770d23d9fc376fbfca3818fe88 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 24 Sep 2025 00:08:47 +0000 Subject: [PATCH 2/9] fix test --- .../Transforms/XeGPUSubgroupDistribute.cpp | 208 +++++-------- .../Dialect/XeGPU/subgroup-distribute.mlir | 292 +++++++----------- 2 files changed, 202 insertions(+), 298 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 449b8eb030b07..336df73e52eda 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -268,7 +268,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0 /// %r = gpu.warp_execute_on_lane_0(%laneid) -> /// (!xegpu.tensor_desc<4x8xf32, #layout0>) { /// ... -/// %td = xegpu.create_nd_tdesc %arg0[0, 0] +/// %td = xegpu.create_nd_tdesc %arg0 /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0> /// vector.yield %td /// } @@ -277,11 +277,11 @@ struct MoveFuncBodyToWarpExecuteOnLane0 /// ``` /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) { /// ... -/// %dead = xegpu.create_nd_tdesc %arg0[0, 0] +/// %dead = xegpu.create_nd_tdesc %arg0 /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0> /// vector.yield %arg0, %dead /// } -/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32> +/// %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32> /// -> !xegpu.tensor_desc<4x8xf32> /// /// ``` @@ -301,6 +301,10 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { if (!layout) return rewriter.notifyMatchFailure( descOp, "the tensor descriptor lacks layout attribute"); + // CreateNdOp must not have offsets. 
+ if (descOp.getMixedOffsets().size()) + return rewriter.notifyMatchFailure( + descOp, "xegpu::CreateNdDescOp must not have offsets"); SmallVector newRetIndices; rewriter.setInsertionPoint(warpOp); @@ -339,22 +343,23 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { /// #layout0 = #xegpu.layout /// gpu.warp_execute_on_lane_0(%laneid) -> () { /// ... -/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>, +/// xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>, /// !xegpu.tensor_desc<4x8xf32, #layout0> /// } /// ``` /// To /// ``` /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>, -/// !xegpu.tensor_desc<4x8xf32, #layout0>) { -/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32, -/// #layout0> +/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { +/// ... +/// gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>, +/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index /// } /// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32> /// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, /// #layout0> /// -> !xegpu.tensor_desc<4x8xf32> -/// xegpu.store_nd %0, %1: vector<4xf32>, +/// xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>, /// !xegpu.tensor_desc<4x8xf32> /// /// ``` @@ -368,10 +373,15 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { if (!storeOp) return failure(); - int64_t offsetSize = static_cast(storeOp.getOffsets().size()); - if ((offsetSize != 0) || storeOp.getConstOffsetsAttr()) - return failure(); - + SmallVector offsets = storeOp.getMixedOffsets(); + // Expecting offsets to be present. + if (offsets.empty()) + return rewriter.notifyMatchFailure(storeOp, + "the store op must have offsets"); + SmallVector offsetsAsValues = + vector::getAsValues(rewriter, storeOp.getLoc(), offsets); + SmallVector offsetTypes = llvm::to_vector( + llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); })); xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType(); xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr(); if (!layout) @@ -387,13 +397,13 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { distributedTypeByWarpOpOrFailure.value(); SmallVector newRetIndices; + SmallVector newYieldedValues = {storeOp.getValue(), + storeOp.getTensorDesc()}; + SmallVector newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy}; + newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end()); + newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end()); gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, warpOp, - /* new yielded values = */ - ValueRange{storeOp.getValue(), storeOp.getTensorDesc()}, - /* new yielded types = */ - TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()}, - newRetIndices); + rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices); // Create a new store op outside the warp op with the distributed vector // type. Tensor descriptor is not distributed. rewriter.setInsertionPointAfter(newWarpOp); @@ -418,6 +428,9 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { newStoreOperands.push_back( resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]), distributedTensorDescTy, rewriter)); + // Collect offsets. 
+ for (size_t i = 2; i < newRetIndices.size(); ++i) + newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i])); auto newStoreOp = xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{}, @@ -491,9 +504,18 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { loadOp, "xegpu::LoadNdOp require chip information to determine transpose " "requirement"); - int64_t offsetSize = static_cast(loadOp.getOffsets().size()); - if ((offsetSize != 0) || loadOp.getConstOffsetsAttr()) - return failure(); + // int64_t offsetSize = static_cast(loadOp.getOffsets().size()); + // if ((offsetSize != 0) || loadOp.getConstOffsetsAttr()) + // return failure(); + // Expecting offsets to be present. + SmallVector offsets = loadOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(loadOp, + "the load op must have offsets"); + SmallVector offsetsAsValues = + vector::getAsValues(rewriter, loadOp.getLoc(), offsets); + SmallVector offsetTypes = llvm::to_vector( + llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); })); xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType(); xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr(); @@ -506,10 +528,12 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { cast(warpOp.getResult(operandIdx).getType()); SmallVector newRetIndices; + SmallVector newYieldedValues = {loadOp.getTensorDesc()}; + SmallVector newYieldedTypes = {tensorDescTy}; + newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end()); + newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end()); gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, warpOp, - /* new yielded values = */ loadOp.getTensorDesc(), - /* new yielded types = */ tensorDescTy, newRetIndices); + rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices); // Create a new load op outside the warp op with the distributed vector // type. @@ -523,11 +547,15 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { loadOp.getTensorDescType().dropLayouts(); // Distributed tensor // descriptor type does not // contain layout info. + SmallVector newLoadOperands{ + resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]), + distributedTensorDescTy, rewriter)}; + // Collect offsets. + for (size_t i = 1; i < newRetIndices.size(); ++i) + newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i])); auto newLoadOp = xegpu::LoadNdOp::create( rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(), - resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]), - distributedTensorDescTy, rewriter), - loadOp->getAttrs()); + newLoadOperands, loadOp->getAttrs()); xegpu::removeLayoutAttrs(newLoadOp); // Set the packed attribute if the layout requires it. newLoadOp.setPacked(requirePacked(layout)); @@ -677,85 +705,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { } }; -/// Sink an update_nd_offset op feeding into yield op of an enclosing -/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the -/// original op that will not be used by the yield op (and should be cleaned -/// up later). The yield op will bypass the updateOp's arguments. The tensor -/// descriptor type is not distributed. Appropriate cast ops are inserted if -/// the distributed types does not match expected xegpu SIMT types. 
-/// Example: -/// ``` -/// #layout0 = #xegpu.layout -/// %r = gpu.warp_execute_on_lane_0(%laneid) -> -/// (!xegpu.tensor_desc<4x8xf32, #layout0>) { -/// ... -/// %update = xegpu.update_nd_offset %arg0, [%c32, %c16]: -/// !xegpu.tensor_desc<4x8xf32, #layout0> -/// gpu.yield %update -/// } -/// ... -/// ``` -/// To -/// ``` -/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> ( -/// !xegpu.tensor_desc<4x8xf32, #layout0>, -/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { -/// ... -/// %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]: -/// !xegpu.tensor_desc<4x8xf32, #layout0> gpu.yield %dead, %arg0 -/// gpu.yield %dead, %arg0, %c32, %c16 -/// } -/// %0 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, -/// #layout0> -> !xegpu.tensor_desc<4x8xf32> -/// %1 = xegpu.update_nd_offset %0, [%r#2, %r#3]: -/// !xegpu.tensor_desc<4x8xf32> -/// ... -/// ``` -struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { - using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, - PatternRewriter &rewriter) const override { - OpOperand *operand = - getWarpResult(warpOp, llvm::IsaPred); - if (!operand) - return rewriter.notifyMatchFailure( - warpOp, "warp result is not a xegpu::UpdateNdOffset op"); - auto updateOp = operand->get().getDefiningOp(); - unsigned operandIdx = operand->getOperandNumber(); - - SmallVector newRetIndices; - gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(), - newRetIndices); - rewriter.setInsertionPointAfter(newWarpOp); - // new update op does not have layout attribute. - xegpu::TensorDescType distributedTensorDescTy = - updateOp.getTensorDescType().dropLayouts(); - SmallVector newUpdateOperands = - llvm::map_to_vector(newRetIndices, [&](size_t i) { - // For the tensor descriptor operand, the layout attribute is - // dropped after distribution. Types needs to be resolved in this - // case. - if (isa(newWarpOp.getResult(i).getType())) { - return resolveDistributedTy(newWarpOp.getResult(i), - distributedTensorDescTy, rewriter); - } - return newWarpOp.getResult(i); - }); - // Create a new update op outside the warp op. - auto newUpdateOp = xegpu::UpdateNdOffsetOp::create( - rewriter, newWarpOp.getLoc(), distributedTensorDescTy, - newUpdateOperands, updateOp->getAttrs()); - xegpu::removeLayoutAttrs(newUpdateOp); - Value distributedVal = newWarpOp.getResult(operandIdx); - // Resolve the distributed type with the original type. - Value typeResolved = resolveDistributedTy( - newUpdateOp.getResult(), distributedVal.getType(), rewriter); - rewriter.replaceAllUsesWith(distributedVal, typeResolved); - return success(); - } -}; - /// Distribute a prefetch_nd op at the end of enclosing /// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed /// through the warp op interface they would be propagated as returned values. @@ -769,18 +718,19 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { /// #layout0 = #xegpu.layout /// gpu.warp_execute_on_lane_0(%laneid) -> () { /// ... 
-/// xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #layout0> +/// xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0> /// } /// ``` /// To /// ``` /// %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> ( -/// !xegpu.tensor_desc<4x8xf32, #layout0>) { -/// gpu.yield %arg0: !xegpu.tensor_desc<4x8xf32, #layout0> +/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { +/// gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index, +/// index /// } /// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32, /// #layout0> -> !xegpu.tensor_desc<4x8xf32> -/// xegpu.prefetch_nd %1 : !xegpu.tensor_desc<4x8xf32> +/// xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32> /// /// ``` struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { @@ -793,17 +743,25 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { if (!prefetchOp) return failure(); - int64_t offsetSize = static_cast(prefetchOp.getOffsets().size()); - if ((offsetSize != 0) || prefetchOp.getConstOffsetsAttr()) - return failure(); + SmallVector offsets = prefetchOp.getMixedOffsets(); + // PrefetchNdOp must have offsets. + if (offsets.empty()) + return rewriter.notifyMatchFailure(prefetchOp, + "the prefetch op must have offsets"); + SmallVector offsetsAsValues = + vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets); + SmallVector offsetTypes = llvm::to_vector( + llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); })); xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr(); if (!layout) return rewriter.notifyMatchFailure( prefetchOp, "the source tensor descriptor lacks layout attribute"); - SmallVector newYieldValues = {prefetchOp.getTensorDesc()}; - SmallVector newYieldTypes = {prefetchOp.getTensorDescType()}; + SmallVector newYieldValues = {prefetchOp.getTensorDesc()}; + SmallVector newYieldTypes = {prefetchOp.getTensorDescType()}; + newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end()); + newYieldTypes.append(offsetTypes.begin(), offsetTypes.end()); SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); @@ -814,6 +772,9 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { rewriter.setInsertionPointAfter(newWarpOp); SmallVector newPrefetchOperands = {resolveDistributedTy( newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)}; + // Collect offsets. 
+ for (size_t i = 1; i < newRetIndices.size(); ++i) + newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i])); xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands, prefetchOp->getAttrs()); xegpu::removeLayoutAttrs(prefetchOp); @@ -1456,15 +1417,14 @@ struct XeGPUSubgroupDistributePass final void xegpu::populateXeGPUSubgroupDistributePatterns( RewritePatternSet &patterns) { - patterns - .add( - patterns.getContext(), - /*pattern benefit=*/regularPatternBenefit); + patterns.add( + patterns.getContext(), + /*pattern benefit=*/regularPatternBenefit); patterns.add( patterns.getContext(), /*pattern benefit=*/highPatternBenefit); diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 13b0ed176eb0c..59fac26d18cf4 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -8,15 +8,15 @@ // CHECK-LABEL: gpu.func @store_nd_1d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> // CHECK: gpu.return gpu.module @xevm_module{ gpu.func @store_nd_1d(%arg0: memref<16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %cst, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %cst, %0 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> gpu.return } } @@ -25,14 +25,14 @@ gpu.module @xevm_module{ // CHECK-LABEL: gpu.func @store_nd_2d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf16> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %cst, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %cst, %0 [%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -42,17 +42,17 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_nd_1d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: 
memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> gpu.module @xevm_module{ gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> - %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> + %2 = xegpu.create_nd_tdesc %arg1 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %1, %2 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> gpu.return } } @@ -60,17 +60,17 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_nd_2d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, 
#xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -78,21 +78,21 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_nd_array_length // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> // CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16> // CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> -// CHECK: xegpu.store_nd %[[T5]], %[[T4]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<2x16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<2x16x16xf16> %2 = vector.extract %1[%c0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> from vector<2x16x16xf16> - %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %3 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %2, %3[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -100,23 +100,23 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_dpas_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc 
%[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } @@ -125,27 +125,27 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_dpas_postop_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> 
!xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> // CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> // CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> // CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> -// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T8]], %[[T7]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %5 = math.exp %4 {layout_result_0 = #xegpu.layout} : vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } @@ -155,17 +155,17 @@ gpu.module @xevm_module{ // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, // CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], 
%[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %2 = xegpu.create_nd_tdesc %arg1, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -178,21 +178,21 @@ gpu.module @xevm_module{ // CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y // CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index // CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%[[X_COORD]], %[[Y_COORD]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> // CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> // CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { -// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[K]], %[[Y_COORD]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> -// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> -// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[X_COORD]], %[[K]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> -// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> +// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> +// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : 
!xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> +// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> +// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> // CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> // CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> // CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> // CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> // CHECK-NEXT: } // CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ %c0 = arith.constant 0 : index @@ -203,91 +203,31 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar %block_id_y = gpu.block_id y %0 = arith.muli %block_id_x, %c8 : index %1 = arith.muli %block_id_y, %c16 : index - %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %3 = xegpu.load_nd %2[%0, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { - %5 = xegpu.create_nd_tdesc %arg0[%0, %arg3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> - %6 = xegpu.create_nd_tdesc %arg1[%arg3, %1] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> - %7 = xegpu.load_nd %5 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> - %8 = xegpu.load_nd %6 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> + %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> + %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> + %7 = xegpu.load_nd %5[%0, %arg3] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> + %8 = xegpu.load_nd %6[%arg3, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> scf.yield %9 : vector<8x16xf32> } {layout_result_0 = #xegpu.layout} - xegpu.store_nd %4, %2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } -// ----- -// CHECK-LABEL: gpu.func @update_nd_offset_1d( -// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) { -// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> -// CHECK: %[[T0:.*]] = 
xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -gpu.module @xevm_module{ - gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>) { - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - %1 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %cst, %1 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @update_nd_offset_2d -// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) { -// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32> -// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<16xf32>, !xegpu.tensor_desc<16x16xf32> -gpu.module @xevm_module{ - gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>) { - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - gpu.return - } -} - // ----- // CHECK-LABEL: gpu.func @prefetch_2d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - gpu.return - } -} - -// ----- -// Explicitly check that update_nd_offset op's source retain layout when yielded from the warp op (PR150545) -// CHECK-LABEL: gpu.func @check_update_nd_offset_distributed_tensor_desc -// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<16x16xf32, #xegpu.layout>) { -// CHECK: %[[T0:.*]] = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -// CHECK: gpu.yield %[[T0]] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -// CHECK: } -// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]] : -// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout> to !xegpu.tensor_desc<16x16xf32> 
{resolve_simt_type_mismatch} -// CHECK: xegpu.update_nd_offset %[[T1]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32> -gpu.module @xevm_module{ - gpu.func @check_update_nd_offset_distributed_tensor_desc() { - %c32 = arith.constant 32 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf32> - %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.prefetch_nd %0[%c0, %c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -295,13 +235,13 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @prefetch_1d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> +// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> gpu.module @xevm_module{ gpu.func @prefetch_1d(%arg0: memref<256xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0: memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + xegpu.prefetch_nd %0[%c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> gpu.return } } @@ -309,18 +249,18 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> +// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> // CHECK-NEXT: gpu.barrier // CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf16>, !xegpu.tensor_desc<16xf16> +// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16> gpu.module @xevm_module{ gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<16xf16> gpu.barrier - %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - xegpu.store_nd %1, %2 : vector<16xf16>, !xegpu.tensor_desc<16xf16, 
#xegpu.layout> + %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + xegpu.store_nd %1, %2[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> gpu.return } } @@ -341,6 +281,7 @@ gpu.module @xevm_module{ // CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { + %c0 = arith.constant 0 : index %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x32xf32>) %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<32xf32> @@ -348,7 +289,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { : vector<16x32xf32> to vector<32xf32> %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<1x32xf32> - xegpu.store_nd %3, %0 : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout> + xegpu.store_nd %3, %0[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout> gpu.return } } @@ -367,6 +308,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { + %c0 = arith.constant 0 : index %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<2x16xf32>) %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<2xf32> @@ -375,7 +317,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<2xf32> to vector<2x1xf32> %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<2x1xf32> to vector<2x16xf32> - xegpu.store_nd %4, %0 : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout> + xegpu.store_nd %4, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout> gpu.return } } @@ -394,6 +336,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { // CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { + %c0 = arith.constant 0 : index %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<32x16xf32>) %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<32xf32> @@ -401,7 +344,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { : vector<32x16xf32> to vector<32xf32> %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<32x1xf32> - xegpu.store_nd %3, %0 : vector<32x1xf32>, !xegpu.tensor_desc<32x1xf32, #xegpu.layout> + xegpu.store_nd %3, %0[%c0, %c0] : vector<32x1xf32>, !xegpu.tensor_desc<32x1xf32, #xegpu.layout> gpu.return } } @@ -422,6 +365,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { + %c0 = arith.constant 0 : index %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout> %src = "some_def"() 
{layout_result_0 = #xegpu.layout} : () -> (vector<16x2xf32>) %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<2xf32> @@ -430,7 +374,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<2xf32> to vector<1x2xf32> %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<1x2xf32> to vector<16x2xf32> - xegpu.store_nd %4, %0 : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout> + xegpu.store_nd %4, %0[%c0, %c0] : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout> gpu.return } } @@ -537,9 +481,9 @@ gpu.module @xevm_module{ %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16> %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index %ptr_i64 = arith.index_cast %ptr : index to i64 - %tdesc = xegpu.create_nd_tdesc %ptr_i64[%c0], shape: [16], strides: [16] : i64 + %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64 -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - xegpu.store_nd %cst, %tdesc : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> + xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> gpu.return } } @@ -549,7 +493,7 @@ gpu.module @xevm_module{ // CHECK-LABEL: gpu.func @vector_transpose( // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> // CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[DEST]] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> +// CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> gpu.module @xevm_module{ gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> @@ -557,9 +501,9 @@ gpu.module @xevm_module{ %c0 = arith.constant 0 : index %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x2xf32> to vector<2x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<2x16xf32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> - xegpu.store_nd %transpose, %0 : vector<2x16xf32>, + xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout> gpu.return } @@ -570,7 +514,7 @@ gpu.module @xevm_module{ // CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16> // CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> // CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> -// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> +// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> gpu.module @xevm_module{ gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { %cst = "some_op"() {layout_result_0 = #xegpu.layout} @@ -578,9 +522,9 @@ gpu.module @xevm_module{ %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout} : vector<4x32xi8> to vector<4x16xi16> %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x16xi16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout> - xegpu.store_nd %bitcast, %0 : vector<4x16xi16>, + xegpu.store_nd %bitcast, 
%0[%c0, %c0] : vector<4x16xi16>, !xegpu.tensor_desc<4x16xi16, #xegpu.layout> gpu.return } @@ -589,10 +533,10 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @mma_transpose_b( // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> -// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> +// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> +// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> // CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> @@ -600,13 +544,13 @@ gpu.module @xevm_module{ gpu.module @xevm_module{ gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32> + %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} + %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to vector<16x16xf16> @@ -614,9 +558,9 @@ gpu.module @xevm_module{ : vector<16x16xf16> to vector<16x16xf16> %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> + %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %6, %7 : vector<8x16xf32>, + xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return From 80ab644f3018c9357552176a9dfa75e5ac4c59c3 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 24 Sep 2025 21:38:39 +0000 Subject: [PATCH 3/9] fix --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 336df73e52eda..882691fd19f58 100644 --- 
a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -504,9 +504,6 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { loadOp, "xegpu::LoadNdOp require chip information to determine transpose " "requirement"); - // int64_t offsetSize = static_cast(loadOp.getOffsets().size()); - // if ((offsetSize != 0) || loadOp.getConstOffsetsAttr()) - // return failure(); // Expecting offsets to be present. SmallVector offsets = loadOp.getMixedOffsets(); if (offsets.empty()) From e0ffd7f9301f5a359d6102ca13af08808fbaa4c7 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Thu, 25 Sep 2025 16:43:35 +0000 Subject: [PATCH 4/9] save work --- .../Dialect/XeGPU/subgroup-distribute.mlir | 890 +++++++++--------- 1 file changed, 445 insertions(+), 445 deletions(-) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 59fac26d18cf4..a68317e79edd3 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -12,11 +12,14 @@ // CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> // CHECK: gpu.return gpu.module @xevm_module{ - gpu.func @store_nd_1d(%arg0: memref<16xf32>) { + gpu.func @store_nd_1d(%laneid: index) { %c0 = arith.constant 0 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %cst, %0 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + %cst = "some_op"() : () -> vector<16xf32> + xegpu.store_nd %cst, %0 [%c0] {layout_operand_0 = #xegpu.layout} + : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + } gpu.return } } @@ -28,11 +31,14 @@ gpu.module @xevm_module{ // CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ - gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) { + gpu.func @store_nd_2d(%laneid : index) { %c0 = arith.constant 0 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf16> - %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %cst, %0 [%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %cst = "some_op"() : () -> vector<16x16xf16> + xegpu.store_nd %cst, %0 [%c0, %c0] {layout_operand_0 = #xegpu.layout} + : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + } gpu.return } } @@ -47,12 +53,15 @@ gpu.module @xevm_module{ // CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> // CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> gpu.module @xevm_module{ - gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { + gpu.func @load_nd_1d(%laneid: index) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - %1 = 
xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> - %2 = xegpu.create_nd_tdesc %arg1 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %1, %2 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16xf32>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout} : + !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> + gpu.yield %1 : vector<16xf32> + } + "some_user_op"(%r) : (vector<16xf32>) -> () gpu.return } } @@ -65,12 +74,15 @@ gpu.module @xevm_module{ // CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ - gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { + gpu.func @load_nd_2d(%laneid: index) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + gpu.yield %1 : vector<16x16xf16> + } + "some_user_op"(%r) : (vector<16x1xf16>) -> () gpu.return } } @@ -86,13 +98,17 @@ gpu.module @xevm_module{ // CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> // CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ - gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { + gpu.func @load_nd_array_length(%laneid: index) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<2x16x16xf16> - %2 = vector.extract %1[%c0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> from vector<2x16x16xf16> - %3 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %2, %3[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, + #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, + #xegpu.layout> -> vector<2x16x16xf16> + gpu.yield %1 : vector<2x16x16xf16> + } + "some_user_op"(%r) : (vector<2x16x1xf16>) -> () gpu.return } } @@ -108,48 +124,27 @@ gpu.module @xevm_module{ // CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> 
!xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ - gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + gpu.func @dpas(%laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { + %0 = "some_op"() : () -> vector<8x16xf16> + %1 = "some_op"() : () -> vector<16x16xf16> + %2 = "some_op"() : () -> vector<8x16xf32> + %3 = xegpu.dpas %0, %1, %2 + { + layout_operand_0 = #xegpu.layout, + layout_operand_1 = #xegpu.layout, + layout_operand_2 = #xegpu.layout, + layout_result_0 = #xegpu.layout + } + : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + gpu.yield %3 : vector<8x16xf32> + } + "some_user_op"(%r) : (vector<8x1xf32>) -> () gpu.return } } -// ----- -// CHECK-LABEL: gpu.func @load_dpas_postop_store -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> -// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> -// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> -// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -gpu.module @xevm_module{ - gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 {layout_result_0 = 
#xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = math.exp %4 {layout_result_0 = #xegpu.layout} : vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return - } -} - // ----- // CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, @@ -160,409 +155,414 @@ gpu.module @xevm_module{ // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ - gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { + gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) { + %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 -> + !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] + {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + gpu.yield %1 : vector<16x16xf16> + } + "some_user_op"(%r) : (vector<16x1xf16>) -> () gpu.return } } -// ----- -// TODO: gemm does not use update_nd_offset because of an issue in scf-for distribution. 
-// CHECK-LABEL: gpu.func @gemm -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { -// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x -// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y -// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index -// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> -// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> -// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { -// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> -// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> -// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> -// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> -// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> -// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> -// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> -// CHECK-NEXT: } -// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -gpu.module @xevm_module{ -gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ - %c0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c8 = arith.constant 8 : index - %c1024 = arith.constant 1024 : index - %block_id_x = gpu.block_id x - %block_id_y = gpu.block_id y - %0 = arith.muli %block_id_x, %c8 : index - %1 = arith.muli %block_id_y, %c16 : index - %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %3 = xegpu.load_nd %2[%0, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> - %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { - %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> - %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> - %7 = xegpu.load_nd %5[%0, %arg3] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> - %8 = xegpu.load_nd %6[%arg3, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> - %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> - scf.yield %9 : vector<8x16xf32> - } {layout_result_0 = #xegpu.layout} - xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, 
#xegpu.layout> - gpu.return -} -} +// // ----- +// // TODO: gemm does not use update_nd_offset because of an issue in scf-for distribution. +// // CHECK-LABEL: gpu.func @gemm +// // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { +// // CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x +// // CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y +// // CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index +// // CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index +// // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> +// // CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> +// // CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> +// // CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { +// // CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> +// // CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> +// // CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> +// // CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> +// // CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> +// // CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> +// // CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> +// // CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> +// // CHECK-NEXT: } +// // CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> +// // CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// gpu.module @xevm_module{ +// gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ +// %c0 = arith.constant 0 : index +// %c16 = arith.constant 16 : index +// %c8 = arith.constant 8 : index +// %c1024 = arith.constant 1024 : index +// %block_id_x = gpu.block_id x +// %block_id_y = gpu.block_id y +// %0 = arith.muli %block_id_x, %c8 : index +// %1 = arith.muli %block_id_y, %c16 : index +// %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// %3 = xegpu.load_nd %2[%0, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> +// %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { +// %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> +// %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> +// %7 = xegpu.load_nd %5[%0, %arg3] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> +// %8 = xegpu.load_nd %6[%arg3, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> +// %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = 
#xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> +// scf.yield %9 : vector<8x16xf32> +// } {layout_result_0 = #xegpu.layout} +// xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @prefetch_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.prefetch_nd %0[%c0, %c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @prefetch_2d +// // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { +// // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> +// // CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> +// gpu.module @xevm_module{ +// gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) { +// %c0 = arith.constant 0 : index +// %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// xegpu.prefetch_nd %0[%c0, %c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @prefetch_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -gpu.module @xevm_module{ - gpu.func @prefetch_1d(%arg0: memref<256xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0: memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - xegpu.prefetch_nd %0[%c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @prefetch_1d +// // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> +// // CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> +// gpu.module @xevm_module{ +// gpu.func @prefetch_1d(%arg0: memref<256xf16>) { +// %c0 = arith.constant 0 : index +// %0 = xegpu.create_nd_tdesc %arg0: memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> +// xegpu.prefetch_nd %0[%c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> -// CHECK-NEXT: gpu.barrier -// CHECK-NEXT: 
%[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16> -gpu.module @xevm_module{ - gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<16xf16> - gpu.barrier - %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - xegpu.store_nd %1, %2[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { +// // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> +// // CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> +// // CHECK-NEXT: gpu.barrier +// // CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> +// // CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16> +// gpu.module @xevm_module{ +// gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) { +// %c0 = arith.constant 0 : index +// %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> +// %1 = xegpu.load_nd %0[%c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<16xf16> +// gpu.barrier +// %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> +// xegpu.store_nd %1, %2[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction -// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x2xf32>) { -// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<16x32xf32> -// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x32xf32> -// CHECK-NEXT: } -// CHECK: %[[COL0:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// CHECK-NEXT: %[[RED0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// CHECK: %[[COL1:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> -// CHECK-NEXT: %[[RED1:.*]] = vector.reduction , %[[CAST1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout> - %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x32xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<32xf32> - %1 = 
vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] - : vector<16x32xf32> to vector<32xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} - : vector<32xf32> to vector<1x32xf32> - xegpu.store_nd %3, %0[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout> - gpu.return -} -} +// // ----- +// // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction +// // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> +// // CHECK-SAME: (!xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x2xf32>) { +// // CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<16x32xf32> +// // CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x32xf32> +// // CHECK-NEXT: } +// // CHECK: %[[COL0:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// // CHECK-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> +// // CHECK-NEXT: %[[RED0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 +// // CHECK: %[[COL1:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// // CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> +// // CHECK-NEXT: %[[RED1:.*]] = vector.reduction , %[[CAST1]], %{{.*}} : vector<16xf32> into f32 +// // CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> +// gpu.module @xevm_module{ +// gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { +// %c0 = arith.constant 0 : index +// %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout> +// %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x32xf32>) +// %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<32xf32> +// %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] +// : vector<16x32xf32> to vector<32xf32> +// %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} +// : vector<32xf32> to vector<1x32xf32> +// xegpu.store_nd %3, %0[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction -// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<2x16xf32, -// CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { -// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[ROW0:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: %[[ROW1:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout>, f32, f32 -// CHECK-REDUCTION-NEXT: } -// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { - %c0 = 
arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> - %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<2x16xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<2xf32> - %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} - [1] : vector<2x16xf32> to vector<2xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} - : vector<2xf32> to vector<2x1xf32> - %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<2x1xf32> to vector<2x16xf32> - xegpu.store_nd %4, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout> - gpu.return -} -} +// // ----- +// // CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction +// // CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<2x16xf32, +// // CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { +// // CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<2x16xf32> +// // CHECK-REDUCTION-NEXT: %[[ROW0:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32> +// // CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 +// // CHECK-REDUCTION-NEXT: %[[ROW1:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> +// // CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 +// // CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout>, f32, f32 +// // CHECK-REDUCTION-NEXT: } +// // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> +// gpu.module @xevm_module{ +// gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { +// %c0 = arith.constant 0 : index +// %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> +// %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<2x16xf32>) +// %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<2xf32> +// %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} +// [1] : vector<2x16xf32> to vector<2xf32> +// %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} +// : vector<2xf32> to vector<2x1xf32> +// %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<2x1xf32> to vector<2x16xf32> +// xegpu.store_nd %4, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction -// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%0)[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<2x16xf32>) { -// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<32x16xf32> -// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<32x16xf32> -// CHECK-NEXT: } -// CHECK: %[[ROW0:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> -// CHECK-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> -// CHECK-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-NEXT: 
vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout> - %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<32x16xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<32xf32> - %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] - : vector<32x16xf32> to vector<32xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} - : vector<32xf32> to vector<32x1xf32> - xegpu.store_nd %3, %0[%c0, %c0] : vector<32x1xf32>, !xegpu.tensor_desc<32x1xf32, #xegpu.layout> - gpu.return -} -} +// // ----- +// // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction +// // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%0)[16] -> +// // CHECK-SAME: (!xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<2x16xf32>) { +// // CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<32x16xf32> +// // CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<32x16xf32> +// // CHECK-NEXT: } +// // CHECK: %[[ROW0:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> +// // CHECK-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 +// // CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> +// // CHECK-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 +// // CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> +// gpu.module @xevm_module{ +// gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { +// %c0 = arith.constant 0 : index +// %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout> +// %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<32x16xf32>) +// %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<32xf32> +// %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] +// : vector<32x16xf32> to vector<32xf32> +// %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} +// : vector<32xf32> to vector<32x1xf32> +// xegpu.store_nd %3, %0[%c0, %c0] : vector<32x1xf32>, !xegpu.tensor_desc<32x1xf32, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction -// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<16x2xf32, -// CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { -// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<16x2xf32> -// CHECK-REDUCTION-NEXT: %[[COL0:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-REDUCTION-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: %[[COL1:.*]] = vector.extract_strided_slice %5 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-REDUCTION-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : 
vector<16x1xf32> to vector<16xf32> -// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[CAST1]], %cst : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout>, f32, f32 -// CHECK-REDUCTION-NEXT: } -// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -gpu.module @xevm_module{ -gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { - %c0 = arith.constant 0 : index - %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout> - %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x2xf32>) - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<2xf32> - %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} - [0] : vector<16x2xf32> to vector<2xf32> - %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} - : vector<2xf32> to vector<1x2xf32> - %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<1x2xf32> to vector<16x2xf32> - xegpu.store_nd %4, %0[%c0, %c0] : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout> - gpu.return -} -} +// // ----- +// // CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction +// // CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<16x2xf32, +// // CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { +// // CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<16x2xf32> +// // CHECK-REDUCTION-NEXT: %[[COL0:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// // CHECK-REDUCTION-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> +// // CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 +// // CHECK-REDUCTION-NEXT: %[[COL1:.*]] = vector.extract_strided_slice %5 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// // CHECK-REDUCTION-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> +// // CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[CAST1]], %cst : vector<16xf32> into f32 +// // CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout>, f32, f32 +// // CHECK-REDUCTION-NEXT: } +// // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> +// gpu.module @xevm_module{ +// gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { +// %c0 = arith.constant 0 : index +// %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout> +// %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x2xf32>) +// %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<2xf32> +// %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} +// [0] : vector<16x2xf32> to vector<2xf32> +// %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} +// : vector<2xf32> to vector<1x2xf32> +// %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<1x2xf32> to vector<16x2xf32> +// xegpu.store_nd %4, %0[%c0, %c0] : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func 
@scatter_ops_chunksize({{.*}}) { -// CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -gpu.module @xevm_module{ - gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) { - %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> - %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { - layout_result_0 = #xegpu.layout - } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> - xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) { +// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// gpu.module @xevm_module{ +// gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) { +// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> +// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { +// layout_result_0 = #xegpu.layout +// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}}, -// CHECK-SAME: %[[PREDICATE:.*]]: i1) { -// CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16> -// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) { -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16> -// CHECK-NEXT: } else { -// CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16> -// CHECK-NEXT: } -// CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -gpu.module @xevm_module{ - gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) { - %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> - %loaded = scf.if %pred -> (vector<16x8xf16>) { - %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { - layout_result_0 = #xegpu.layout - } : memref<256xf16>, vector<16xindex>, 
vector<16xi1> -> vector<16x8xf16> - scf.yield %3 : vector<16x8xf16> - } else { - %3 = arith.constant { - layout_result_0 = #xegpu.layout - } dense<12.> : vector<16x8xf16> - scf.yield %3 : vector<16x8xf16> - } { layout_result_0 = #xegpu.layout } - xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}}, +// // CHECK-SAME: %[[PREDICATE:.*]]: i1) { +// // CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16> +// // CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// // CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) { +// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// // CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16> +// // CHECK-NEXT: } else { +// // CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16> +// // CHECK-NEXT: } +// // CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// gpu.module @xevm_module{ +// gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) { +// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> +// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// %loaded = scf.if %pred -> (vector<16x8xf16>) { +// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { +// layout_result_0 = #xegpu.layout +// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// scf.yield %3 : vector<16x8xf16> +// } else { +// %3 = arith.constant { +// layout_result_0 = #xegpu.layout +// } dense<12.> : vector<16x8xf16> +// scf.yield %3 : vector<16x8xf16> +// } { layout_result_0 = #xegpu.layout } +// xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) { -// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 -// CHECK: scf.if %[[PREDICATE]] { -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -// CHECK-NEXT: } -gpu.module @xevm_module{ - gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) { - %pred = llvm.mlir.poison : i1 - %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> - scf.if %pred { - %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { - layout_result_0 = #xegpu.layout - } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> - xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> - } - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) { +// // CHECK: 
%[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// // CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 +// // CHECK: scf.if %[[PREDICATE]] { +// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// // CHECK-NEXT: } +// gpu.module @xevm_module{ +// gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) { +// %pred = llvm.mlir.poison : i1 +// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> +// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// scf.if %pred { +// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { +// layout_result_0 = #xegpu.layout +// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// } +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { -// CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -gpu.module @xevm_module{ - gpu.func @scatter_ops(%src: memref<256xf16>) { - %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> - %3 = xegpu.load %src[%offset], %1 { - layout_result_0 = #xegpu.layout - } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> - xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { +// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> +// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// gpu.module @xevm_module{ +// gpu.func @scatter_ops(%src: memref<256xf16>) { +// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> +// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// %3 = xegpu.load %src[%offset], %1 { +// layout_result_0 = #xegpu.layout +// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( -// CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index -gpu.module @xevm_module{ - gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) { - %c0 
= arith.constant 0 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16> - %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index - %ptr_i64 = arith.index_cast %ptr : index to i64 - %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64 - -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( +// // CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index +// gpu.module @xevm_module{ +// gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) { +// %c0 = arith.constant 0 : index +// %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16> +// %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index +// %ptr_i64 = arith.index_cast %ptr : index to i64 +// %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64 +// -> !xegpu.tensor_desc<16xf16, #xegpu.layout> +// xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @vector_transpose( -// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> -// CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> -gpu.module @xevm_module{ - gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> - : vector<16x2xf32> - %c0 = arith.constant 0 : index - %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout} - : vector<16x2xf32> to vector<2x16xf32> - %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32> - -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> - xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>, - !xegpu.tensor_desc<2x16xf32, #xegpu.layout> - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @vector_transpose( +// // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> +// // CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> +// // CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> +// gpu.module @xevm_module{ +// gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { +// %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> +// : vector<16x2xf32> +// %c0 = arith.constant 0 : index +// %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout} +// : vector<16x2xf32> to vector<2x16xf32> +// %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32> +// -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> +// xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>, +// !xegpu.tensor_desc<2x16xf32, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @vector_bitcast( -// CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16> -// CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> -// CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> -// CHECK-NEXT: xegpu.store_nd %[[T0]], 
%[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> -gpu.module @xevm_module{ - gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { - %cst = "some_op"() {layout_result_0 = #xegpu.layout} - : () -> (vector<4x32xi8>) - %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout} - : vector<4x32xi8> to vector<4x16xi16> - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16> - -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout> - xegpu.store_nd %bitcast, %0[%c0, %c0] : vector<4x16xi16>, - !xegpu.tensor_desc<4x16xi16, #xegpu.layout> - gpu.return - } -} +// // ----- +// // CHECK-LABEL: gpu.func @vector_bitcast( +// // CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16> +// // CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> +// // CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> +// // CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> +// gpu.module @xevm_module{ +// gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { +// %cst = "some_op"() {layout_result_0 = #xegpu.layout} +// : () -> (vector<4x32xi8>) +// %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout} +// : vector<4x32xi8> to vector<4x16xi16> +// %c0 = arith.constant 0 : index +// %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16> +// -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout> +// xegpu.store_nd %bitcast, %0[%c0, %c0] : vector<4x16xi16>, +// !xegpu.tensor_desc<4x16xi16, #xegpu.layout> +// gpu.return +// } +// } -// ----- -// CHECK-LABEL: gpu.func @mma_transpose_b( -// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> -// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> -// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> -// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> -// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> -// CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -gpu.module @xevm_module{ - gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { - %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> - -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} - : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> - -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> - %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} - : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> - %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} - : vector<16x8xi32> to vector<16x16xf16> - %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} - : vector<16x16xf16> to vector<16x16xf16> - %6 = xegpu.dpas 
%1, %5 {layout_result_0 = #xegpu.layout} - : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> - -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>, - !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return +// // ----- +// // CHECK-LABEL: gpu.func @mma_transpose_b( +// // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// // CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// // CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> +// // CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// // CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> +// // CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> +// // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> +// // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> +// // CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> +// gpu.module @xevm_module{ +// gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { +// %c0 = arith.constant 0 : index +// %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> +// -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} +// : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> +// %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> +// -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> +// %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} +// : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> +// %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} +// : vector<16x8xi32> to vector<16x16xf16> +// %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} +// : vector<16x16xf16> to vector<16x16xf16> +// %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} +// : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> +// %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> +// -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>, +// !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// gpu.return - } -} +// } +// } From 88ed30b39e3e80b6dfc685b8024067808d7eb7a5 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 1 Oct 2025 00:19:48 +0000 Subject: [PATCH 5/9] save work and bug fixes --- .../Transforms/XeGPUSubgroupDistribute.cpp | 32 +- .../XeGPU/subgroup-distribute-unit.mlir | 608 ++++++++++++++++ .../Dialect/XeGPU/subgroup-distribute.mlir | 683 +++--------------- 3 files changed, 716 insertions(+), 607 deletions(-) create mode 100644 mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 882691fd19f58..a6ca25f30a008 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -875,14 +875,17 @@ struct StoreDistribution final : public 
gpu::WarpDistributionPattern { storeScatterOp, "Some vector operands have no layouts, using defaults instead."); } - VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value(); - VectorType expectedPayloadTy = VectorType::get( - {distPayloadTy.getNumElements()}, distPayloadTy.getElementType()); + // Distributed store payload type according to the lane layout. + VectorType distPayloadTyByWarpOp = distStoreVecByWarpOpOrFailure.value(); + // Expected distributed payload type is always 1D. + VectorType expectedPayloadTy = + VectorType::get({distPayloadTyByWarpOp.getNumElements()}, + distPayloadTyByWarpOp.getElementType()); SmallVector newRetIndices; SmallVector operands = storeScatterOp->getOperands(); SmallVector operandTypesToYield = { - expectedPayloadTy, operands[1].getType(), + distPayloadTyByWarpOp, operands[1].getType(), distOffsetsByWarpOpOrFailure.value(), distMaskByWarpOpOrFailure.value()}; @@ -890,8 +893,11 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { rewriter, warpOp, operands, operandTypesToYield, newRetIndices); SmallVector newStoreScatterOpOperands = llvm::map_to_vector( newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); - + // The payload operand may need type adjustment due to mismatch between warp + // distributed type and expected SIMT type. rewriter.setInsertionPointAfter(newWarpOp); + newStoreScatterOpOperands[0] = resolveDistributedTy( + newStoreScatterOpOperands[0], expectedPayloadTy, rewriter); xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create( rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands, storeScatterOp->getAttrs()); @@ -976,8 +982,11 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern { distMaskByWarpOpOrFailure.value()}; const unsigned operandIdx = producedByLastLoad->getOperandNumber(); - VectorType loadVecTy = + VectorType distResultTy = cast(warpOp.getResult(operandIdx).getType()); + // Distributed load op will always be 1D. + VectorType loadVecTy = VectorType::get({distResultTy.getNumElements()}, + distResultTy.getElementType()); gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( rewriter, warpOp, operands, operandTypesToYield, newRetIndices); @@ -991,7 +1000,10 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern { loadGatherOp->getAttrs()); xegpu::removeLayoutAttrs(newOp); Value distributedVal = newWarpOp.getResult(operandIdx); - rewriter.replaceAllUsesWith(distributedVal, newOp->getResult(0)); + // Resolve the output type and replace all uses. + rewriter.replaceAllUsesWith( + distributedVal, + resolveDistributedTy(newOp.getResult(), distResultTy, rewriter)); return success(); } }; @@ -1107,7 +1119,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern { return failure(); auto reductionOp = cast(yieldOperand->get().getDefiningOp()); - unsigned operandNumber = yieldOperand->getOperandNumber(); + unsigned operandIdx = yieldOperand->getOperandNumber(); VectorType sourceType = reductionOp.getSourceVectorType(); // Only 2D vectors are supported. 
if (sourceType.getRank() != 2) @@ -1121,7 +1133,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern { warpOp, "Only 1 reduction dimension is supported."); int64_t reductionDim = reductionDims[0]; VectorType distributedResultType = - cast(warpOp.getResult(operandNumber).getType()); + cast(warpOp.getResult(operandIdx).getType()); VectorType resultType = cast(reductionOp.getType()); xegpu::DistributeLayoutAttr sourceLayout = xegpu::getDistributeLayoutAttr(reductionOp.getSource()); @@ -1184,7 +1196,7 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern { cast>(newWarpOp->getResult(newRetIndices[1])), reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter); // Replace the warp op result with the final result. - rewriter.replaceAllUsesWith(reductionOp.getResult(), result); + rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result); return success(); } // For non-lane-local case, we simply rewrite the MultiReductionOp in terms diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir new file mode 100644 index 0000000000000..d3f88c2df1291 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -0,0 +1,608 @@ +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \ +// RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s + +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ +// RUN: -xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \ +// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION + +// CHECK-LABEL: gpu.func @store_nd_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) { +// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +// CHECK: gpu.return +gpu.module @xevm_module{ + gpu.func @store_nd_1d(%laneid: index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + %cst = "some_op"() : () -> vector<16xf32> + xegpu.store_nd %cst, %0 [%c0] {layout_operand_0 = #xegpu.layout} + : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @store_nd_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) { +// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.module @xevm_module{ + gpu.func @store_nd_2d(%laneid : index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () -> 
!xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %cst = "some_op"() : () -> vector<16x16xf16> + xegpu.store_nd %cst, %0 [%c0, %c0] {layout_operand_0 = #xegpu.layout} + : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + } + gpu.return + } +} + + + +// ----- +// CHECK-LABEL: gpu.func @load_nd_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +gpu.module @xevm_module{ + gpu.func @load_nd_1d(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16xf32>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout} : + !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> + gpu.yield %1 : vector<16xf32> + } + "some_user_op"(%r) : (vector<16xf32>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_nd_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.module @xevm_module{ + gpu.func @load_nd_2d(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + gpu.yield %1 : vector<16x16xf16> + } + "some_user_op"(%r) : (vector<16x1xf16>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_nd_array_length +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : 
!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> +// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16> +// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> +// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.module @xevm_module{ + gpu.func @load_nd_array_length(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, + #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, + #xegpu.layout> -> vector<2x16x16xf16> + gpu.yield %1 : vector<2x16x16xf16> + } + "some_user_op"(%r) : (vector<2x16x1xf16>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_dpas_store +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> +// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +gpu.module @xevm_module{ + gpu.func @dpas(%laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { + %0 = "some_op"() : () -> vector<8x16xf16> + %1 = "some_op"() : () -> vector<16x16xf16> + %2 = "some_op"() : () -> vector<8x16xf32> + %3 = xegpu.dpas %0, %1, %2 + { + layout_operand_0 = #xegpu.layout, + layout_operand_1 = #xegpu.layout, 
+ layout_operand_2 = #xegpu.layout, + layout_result_0 = #xegpu.layout + } + : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + gpu.yield %3 : vector<8x16xf32> + } + "some_user_op"(%r) : (vector<8x1xf32>) -> () + gpu.return + } +} + + +// ----- +// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, +// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, +// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +gpu.module @xevm_module{ + gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (!xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { + %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 -> + !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.yield %0 : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + } + "some_user_op"(%r) + : (!xegpu.tensor_desc<16x16xf16, #xegpu.layout>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @prefetch_2d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> +gpu.module @xevm_module{ + gpu.func @prefetch_2d(%laneid: index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.prefetch_nd %0[%c0, %c0] + <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @prefetch_1d +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> +// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> +gpu.module @xevm_module{ + gpu.func @prefetch_1d(%laneid: index) { + %c0 = arith.constant 0 : index + gpu.warp_execute_on_lane_0(%laneid)[16] { + %0 = "some_op"() : () + -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + xegpu.prefetch_nd %0[%c0] + <{l1_hint = 
#xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<16xf16, #xegpu.layout> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> +// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> +// CHECK-NEXT: gpu.barrier +// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> +// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16> +gpu.module @xevm_module{ + gpu.func @gpu_barrier(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf16>) { + %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0] + {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<16xf16> + gpu.barrier + gpu.yield %1 : vector<16xf16> + } + "some_user_op"(%r) : (vector<1xf16>) -> () + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> +// CHECK-SAME: (!xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x2xf32>) { +// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<16x32xf32> +// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x32xf32> +// CHECK-NEXT: } +// CHECK: %[[COL0:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> +// CHECK-NEXT: %[[RED0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 +// CHECK: %[[COL1:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> +// CHECK-NEXT: %[[RED1:.*]] = vector.reduction , %[[CAST1]], %{{.*}} : vector<16xf32> into f32 +// CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout} + : () -> (vector<16x32xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} + dense<0.0> : vector<32xf32> + %1 = vector.multi_reduction , %src, %acc + { + layout_operand_0 = #xegpu.layout, + layout_operand_1 = #xegpu.slice<#xegpu.layout, dims = [0]>, + layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> + } [0] + : vector<16x32xf32> to vector<32xf32> + gpu.yield %1 : vector<32xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction +// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<2x16xf32, +// CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { +// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<2x16xf32> +// CHECK-REDUCTION-NEXT: %[[ROW0:.*]] = vector.extract 
%[[SRC]][0] : vector<16xf32> from vector<2x16xf32> +// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 +// CHECK-REDUCTION-NEXT: %[[ROW1:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> +// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 +// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout>, f32, f32 +// CHECK-REDUCTION-NEXT: } +// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout} + : () -> (vector<2x16xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} + dense<0.0> : vector<2xf32> + %1 = vector.multi_reduction , %src, %acc + { + layout_operand_0 = #xegpu.layout, + layout_operand_1 = #xegpu.slice<#xegpu.layout, dims = [1]>, + layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]> + } + [1] : vector<2x16xf32> to vector<2xf32> + gpu.yield %1 : vector<2xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%0)[16] -> +// CHECK-SAME: (!xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<2x16xf32>) { +// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<32x16xf32> +// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<32x16xf32> +// CHECK-NEXT: } +// CHECK: %[[ROW0:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> +// CHECK-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 +// CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> +// CHECK-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 +// CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout} + : () -> (vector<32x16xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} + dense<0.0> : vector<32xf32> + %1 = vector.multi_reduction , %src, %acc + { + layout_operand_0 = #xegpu.layout, + layout_operand_1 = #xegpu.slice<#xegpu.layout, dims = [1]>, + layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]> + } + [1] : vector<32x16xf32> to vector<32xf32> + gpu.yield %1 : vector<32xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction +// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<16x2xf32, +// CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { +// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<16x2xf32> +// CHECK-REDUCTION-NEXT: %[[COL0:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to 
vector<16x1xf32> +// CHECK-REDUCTION-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> +// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 +// CHECK-REDUCTION-NEXT: %[[COL1:.*]] = vector.extract_strided_slice %5 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK-REDUCTION-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> +// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[CAST1]], %cst : vector<16xf32> into f32 +// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout>, f32, f32 +// CHECK-REDUCTION-NEXT: } +// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> +gpu.module @xevm_module{ +gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) { + %c0 = arith.constant 0 : index + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2xf32>) { + %src = "some_def"() + {layout_result_0 = #xegpu.layout} + : () -> (vector<16x2xf32>) + %acc = arith.constant + {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} + dense<0.0> : vector<2xf32> + %1 = vector.multi_reduction , %src, %acc + { + layout_operand_0 = #xegpu.layout, + layout_operand_1 = #xegpu.slice<#xegpu.layout, dims = [0]>, + layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> + } + [0] : vector<16x2xf32> to vector<2xf32> + gpu.yield %1 : vector<2xf32> + } + "some_user_op"(%r) : (vector<2xf32>) -> () + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) { +// CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +gpu.module @xevm_module{ + gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) { + gpu.warp_execute_on_lane_0(%laneid)[16] { + %1 = arith.constant + {layout_result_0 = #xegpu.layout} + dense<1>: vector<16xi1> + %offset = arith.constant + {layout_result_0 = #xegpu.layout} + dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> + { + layout_operand_1 = #xegpu.layout, + layout_operand_2 = #xegpu.layout, + layout_result_0 = #xegpu.layout + } + : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> + { + layout_operand_0 = #xegpu.layout, + layout_operand_2 = #xegpu.layout, + layout_operand_3 = #xegpu.layout + } + : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + } + gpu.return + } +} + +// // ----- +// // CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}}, +// // CHECK-SAME: %[[PREDICATE:.*]]: i1) { +// // CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16> +// // CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// // CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) { +// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> 
+// // CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16> +// // CHECK-NEXT: } else { +// // CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16> +// // CHECK-NEXT: } +// // CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// gpu.module @xevm_module{ +// gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) { +// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> +// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// %loaded = scf.if %pred -> (vector<16x8xf16>) { +// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { +// layout_result_0 = #xegpu.layout +// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// scf.yield %3 : vector<16x8xf16> +// } else { +// %3 = arith.constant { +// layout_result_0 = #xegpu.layout +// } dense<12.> : vector<16x8xf16> +// scf.yield %3 : vector<16x8xf16> +// } { layout_result_0 = #xegpu.layout } +// xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// gpu.return +// } +// } + +// // ----- +// // CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) { +// // CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// // CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 +// // CHECK: scf.if %[[PREDICATE]] { +// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// // CHECK-NEXT: } +// gpu.module @xevm_module{ +// gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) { +// %pred = llvm.mlir.poison : i1 +// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> +// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// scf.if %pred { +// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { +// layout_result_0 = #xegpu.layout +// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// } +// gpu.return +// } +// } + +// // ----- +// // CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { +// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> +// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// gpu.module @xevm_module{ +// gpu.func @scatter_ops(%src: memref<256xf16>) { +// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> +// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> +// %3 = xegpu.load %src[%offset], %1 { +// layout_result_0 = #xegpu.layout +// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// 
gpu.return +// } +// } + +// // ----- +// // CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( +// // CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index +// gpu.module @xevm_module{ +// gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) { +// %c0 = arith.constant 0 : index +// %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16> +// %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index +// %ptr_i64 = arith.index_cast %ptr : index to i64 +// %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64 +// -> !xegpu.tensor_desc<16xf16, #xegpu.layout> +// xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> +// gpu.return +// } +// } + + +// // ----- +// // CHECK-LABEL: gpu.func @vector_transpose( +// // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> +// // CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> +// // CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> +// gpu.module @xevm_module{ +// gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { +// %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> +// : vector<16x2xf32> +// %c0 = arith.constant 0 : index +// %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout} +// : vector<16x2xf32> to vector<2x16xf32> +// %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32> +// -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> +// xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>, +// !xegpu.tensor_desc<2x16xf32, #xegpu.layout> +// gpu.return +// } +// } + +// // ----- +// // CHECK-LABEL: gpu.func @vector_bitcast( +// // CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16> +// // CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> +// // CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> +// // CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> +// gpu.module @xevm_module{ +// gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { +// %cst = "some_op"() {layout_result_0 = #xegpu.layout} +// : () -> (vector<4x32xi8>) +// %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout} +// : vector<4x32xi8> to vector<4x16xi16> +// %c0 = arith.constant 0 : index +// %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16> +// -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout> +// xegpu.store_nd %bitcast, %0[%c0, %c0] : vector<4x16xi16>, +// !xegpu.tensor_desc<4x16xi16, #xegpu.layout> +// gpu.return +// } +// } + +// // ----- +// // CHECK-LABEL: gpu.func @mma_transpose_b( +// // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// // CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// // CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> +// // CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// // CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> +// // CHECK-NEXT: %[[BCAST0:.*]] = 
vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> +// // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> +// // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> +// // CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> +// gpu.module @xevm_module{ +// gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { +// %c0 = arith.constant 0 : index +// %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> +// -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} +// : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> +// %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> +// -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> +// %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} +// : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> +// %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} +// : vector<16x8xi32> to vector<16x16xf16> +// %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} +// : vector<16x16xf16> to vector<16x16xf16> +// %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} +// : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> +// %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> +// -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>, +// !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// gpu.return + +// } +// } diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 4109c400c16ff..4d44e63a47ac1 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -1,173 +1,6 @@ // RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \ // RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s -// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ -// RUN: -xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \ -// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION - -// CHECK-LABEL: gpu.func @store_nd_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -// CHECK: gpu.return -gpu.module @xevm_module{ - gpu.func @store_nd_1d(%laneid: index) { - %c0 = arith.constant 0 : index - gpu.warp_execute_on_lane_0(%laneid)[16] { - %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - %cst = "some_op"() : () -> vector<16xf32> - xegpu.store_nd %cst, %0 [%c0] {layout_operand_0 = #xegpu.layout} - : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> - } - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @store_nd_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> -// 
CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @store_nd_2d(%laneid : index) { - %c0 = arith.constant 0 : index - gpu.warp_execute_on_lane_0(%laneid)[16] { - %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %cst = "some_op"() : () -> vector<16x16xf16> - xegpu.store_nd %cst, %0 [%c0, %c0] {layout_operand_0 = #xegpu.layout} - : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - } - gpu.return - } -} - - - -// ----- -// CHECK-LABEL: gpu.func @load_nd_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -gpu.module @xevm_module{ - gpu.func @load_nd_1d(%laneid: index) { - %c0 = arith.constant 0 : index - %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16xf32>) { - %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout} : - !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> - gpu.yield %1 : vector<16xf32> - } - "some_user_op"(%r) : (vector<16xf32>) -> () - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @load_nd_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @load_nd_2d(%laneid: index) { - %c0 = arith.constant 0 : index - %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) { - %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = 
xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} - : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - gpu.yield %1 : vector<16x16xf16> - } - "some_user_op"(%r) : (vector<16x1xf16>) -> () - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @load_nd_array_length -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> -// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16> -// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> -// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -gpu.module @xevm_module{ - gpu.func @load_nd_array_length(%laneid: index) { - %c0 = arith.constant 0 : index - %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x16x1xf16>) { - %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, - #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} - : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, - #xegpu.layout> -> vector<2x16x16xf16> - gpu.yield %1 : vector<2x16x16xf16> - } - "some_user_op"(%r) : (vector<2x16x1xf16>) -> () - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @load_dpas_store -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, 
!xegpu.tensor_desc<8x16xf32> -// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -gpu.module @xevm_module{ - gpu.func @dpas(%laneid: index) { - %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { - %0 = "some_op"() : () -> vector<8x16xf16> - %1 = "some_op"() : () -> vector<16x16xf16> - %2 = "some_op"() : () -> vector<8x16xf32> - %3 = xegpu.dpas %0, %1, %2 - { - layout_operand_0 = #xegpu.layout, - layout_operand_1 = #xegpu.layout, - layout_operand_2 = #xegpu.layout, - layout_result_0 = #xegpu.layout - } - : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - gpu.yield %3 : vector<8x16xf32> - } - "some_user_op"(%r) : (vector<8x1xf32>) -> () - gpu.return - } -} - - -// ----- // CHECK-LABEL: gpu.func @load_dpas_postop_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -183,440 +16,96 @@ gpu.module @xevm_module{ gpu.module @xevm_module{ gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = math.exp %4 {layout_result_0 = #xegpu.layout} : vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> + -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] + {layout_result_0 = #xegpu.layout} : + !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + + %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %3 = xegpu.load_nd %2[%c0, %c0] + {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + -> vector<16x16xf16> + + %4 = xegpu.dpas %1, %3 + {layout_result_0 = #xegpu.layout} + : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + + %5 = math.exp %4 + {layout_result_0 = #xegpu.layout} + : vector<8x16xf32> + + %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> + !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, + !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } // ----- -// CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, -// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, -// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { -// CHECK: %[[T0:.*]] = 
xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK-LABEL: gpu.func @gemm +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { +// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x +// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y +// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index +// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> +// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { +// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> +// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> +// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> +// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> +// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> +// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ - gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) { - %c0 = arith.constant 0 : index - %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) { - %0 = xegpu.create_nd_tdesc %arg0, shape:[64, 128], strides:[128, 1] : ui64 -> - !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0[%c0, %c0] - {layout_result_0 = #xegpu.layout} - : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - gpu.yield %1 : vector<16x16xf16> - } - 
"some_user_op"(%r) : (vector<16x1xf16>) -> () - gpu.return - } +gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %0 = arith.muli %block_id_x, %c8 : index + %1 = arith.muli %block_id_y, %c16 : index + %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> + !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %3 = xegpu.load_nd %2[%0, %1] + {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> + + %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { + + %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> + -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> + %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> + -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> + + %7 = xegpu.load_nd %5[%0, %arg3] + {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> + %8 = xegpu.load_nd %6[%arg3, %1] + {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> + + %9 = xegpu.dpas %7, %8, %arg4 + {layout_result_0 = #xegpu.layout} + : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> + + scf.yield %9 : vector<8x16xf32> + } {layout_result_0 = #xegpu.layout} + + xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, + !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + gpu.return +} } - -// // ----- -// // TODO: gemm does not use update_nd_offset because of an issue in scf-for distribution. -// // CHECK-LABEL: gpu.func @gemm -// // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { -// // CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x -// // CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y -// // CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index -// // CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index -// // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> -// // CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> -// // CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> -// // CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { -// // CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> -// // CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> -// // CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> -// // CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> -// // CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> -// // CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> -// // CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> -// // CHECK-NEXT: 
scf.yield %[[T16]] : vector<8x1xf32> -// // CHECK-NEXT: } -// // CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> -// // CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -// gpu.module @xevm_module{ -// gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ -// %c0 = arith.constant 0 : index -// %c16 = arith.constant 16 : index -// %c8 = arith.constant 8 : index -// %c1024 = arith.constant 1024 : index -// %block_id_x = gpu.block_id x -// %block_id_y = gpu.block_id y -// %0 = arith.muli %block_id_x, %c8 : index -// %1 = arith.muli %block_id_y, %c16 : index -// %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// %3 = xegpu.load_nd %2[%0, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> -// %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { -// %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -// %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -// %7 = xegpu.load_nd %5[%0, %arg3] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> -// %8 = xegpu.load_nd %6[%arg3, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> -// %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> -// scf.yield %9 : vector<8x16xf32> -// } {layout_result_0 = #xegpu.layout} -// xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @prefetch_2d -// // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { -// // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> -// // CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -// gpu.module @xevm_module{ -// gpu.func @prefetch_2d(%arg0: memref<256x256xf16>) { -// %c0 = arith.constant 0 : index -// %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// xegpu.prefetch_nd %0[%c0, %c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @prefetch_1d -// // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// // CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -// gpu.module @xevm_module{ -// gpu.func @prefetch_1d(%arg0: memref<256xf16>) { -// %c0 = arith.constant 0 : index -// %0 = xegpu.create_nd_tdesc %arg0: memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> -// xegpu.prefetch_nd %0[%c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { -// // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : 
memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// // CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> -// // CHECK-NEXT: gpu.barrier -// // CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// // CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16> -// gpu.module @xevm_module{ -// gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) { -// %c0 = arith.constant 0 : index -// %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> -// %1 = xegpu.load_nd %0[%c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<16xf16> -// gpu.barrier -// %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> -// xegpu.store_nd %1, %2[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction -// // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> -// // CHECK-SAME: (!xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x2xf32>) { -// // CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<16x32xf32> -// // CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x32xf32> -// // CHECK-NEXT: } -// // CHECK: %[[COL0:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// // CHECK-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// // CHECK-NEXT: %[[RED0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// // CHECK: %[[COL1:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// // CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> -// // CHECK-NEXT: %[[RED1:.*]] = vector.reduction , %[[CAST1]], %{{.*}} : vector<16xf32> into f32 -// // CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> -// gpu.module @xevm_module{ -// gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { -// %c0 = arith.constant 0 : index -// %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout> -// %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x32xf32>) -// %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<32xf32> -// %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] -// : vector<16x32xf32> to vector<32xf32> -// %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} -// : vector<32xf32> to vector<1x32xf32> -// xegpu.store_nd %3, %0[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction -// // CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<2x16xf32, -// // CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { -// // CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<2x16xf32> -// // CHECK-REDUCTION-NEXT: %[[ROW0:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> 
from vector<2x16xf32> -// // CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// // CHECK-REDUCTION-NEXT: %[[ROW1:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> -// // CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// // CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout>, f32, f32 -// // CHECK-REDUCTION-NEXT: } -// // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -// gpu.module @xevm_module{ -// gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { -// %c0 = arith.constant 0 : index -// %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> -// %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<2x16xf32>) -// %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<2xf32> -// %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} -// [1] : vector<2x16xf32> to vector<2xf32> -// %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} -// : vector<2xf32> to vector<2x1xf32> -// %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<2x1xf32> to vector<2x16xf32> -// xegpu.store_nd %4, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction -// // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%0)[16] -> -// // CHECK-SAME: (!xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<2x16xf32>) { -// // CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<32x16xf32> -// // CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<32x16xf32> -// // CHECK-NEXT: } -// // CHECK: %[[ROW0:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> -// // CHECK-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// // CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> -// // CHECK-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// // CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> -// gpu.module @xevm_module{ -// gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { -// %c0 = arith.constant 0 : index -// %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout> -// %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<32x16xf32>) -// %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<32xf32> -// %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] -// : vector<32x16xf32> to vector<32xf32> -// %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} -// : vector<32xf32> to vector<32x1xf32> -// xegpu.store_nd %3, %0[%c0, %c0] : vector<32x1xf32>, !xegpu.tensor_desc<32x1xf32, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction -// // CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<16x2xf32, -// // CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { -// // CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = 
#xegpu.layout} : () -> vector<16x2xf32> -// // CHECK-REDUCTION-NEXT: %[[COL0:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// // CHECK-REDUCTION-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// // CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// // CHECK-REDUCTION-NEXT: %[[COL1:.*]] = vector.extract_strided_slice %5 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// // CHECK-REDUCTION-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> -// // CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[CAST1]], %cst : vector<16xf32> into f32 -// // CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout>, f32, f32 -// // CHECK-REDUCTION-NEXT: } -// // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> -// gpu.module @xevm_module{ -// gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { -// %c0 = arith.constant 0 : index -// %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout> -// %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x2xf32>) -// %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<2xf32> -// %1 = vector.multi_reduction , %src, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} -// [0] : vector<16x2xf32> to vector<2xf32> -// %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} -// : vector<2xf32> to vector<1x2xf32> -// %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<1x2xf32> to vector<16x2xf32> -// xegpu.store_nd %4, %0[%c0, %c0] : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) { -// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -// gpu.module @xevm_module{ -// gpu.func @scatter_ops_chunksize(%src: memref<256xf16>) { -// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> -// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { -// layout_result_0 = #xegpu.layout -// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}}, -// // CHECK-SAME: %[[PREDICATE:.*]]: i1) { -// // CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16> -// // CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// // CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) { -// // CHECK-NEXT: %[[LOADED:.*]] = 
xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// // CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16> -// // CHECK-NEXT: } else { -// // CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16> -// // CHECK-NEXT: } -// // CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -// gpu.module @xevm_module{ -// gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) { -// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> -// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// %loaded = scf.if %pred -> (vector<16x8xf16>) { -// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { -// layout_result_0 = #xegpu.layout -// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// scf.yield %3 : vector<16x8xf16> -// } else { -// %3 = arith.constant { -// layout_result_0 = #xegpu.layout -// } dense<12.> : vector<16x8xf16> -// scf.yield %3 : vector<16x8xf16> -// } { layout_result_0 = #xegpu.layout } -// xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) { -// // CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// // CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 -// // CHECK: scf.if %[[PREDICATE]] { -// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -// // CHECK-NEXT: } -// gpu.module @xevm_module{ -// gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) { -// %pred = llvm.mlir.poison : i1 -// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> -// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// scf.if %pred { -// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { -// layout_result_0 = #xegpu.layout -// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> -// } -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { -// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> -// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -// gpu.module @xevm_module{ -// gpu.func @scatter_ops(%src: memref<256xf16>) { -// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> -// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// %3 = xegpu.load %src[%offset], %1 { -// layout_result_0 = #xegpu.layout -// } : memref<256xf16>, vector<16xindex>, vector<16xi1> 
-> vector<16xf16> -// xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( -// // CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index -// gpu.module @xevm_module{ -// gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) { -// %c0 = arith.constant 0 : index -// %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16> -// %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index -// %ptr_i64 = arith.index_cast %ptr : index to i64 -// %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64 -// -> !xegpu.tensor_desc<16xf16, #xegpu.layout> -// xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> -// gpu.return -// } -// } - - -// // ----- -// // CHECK-LABEL: gpu.func @vector_transpose( -// // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> -// // CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> -// // CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> -// gpu.module @xevm_module{ -// gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { -// %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> -// : vector<16x2xf32> -// %c0 = arith.constant 0 : index -// %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout} -// : vector<16x2xf32> to vector<2x16xf32> -// %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32> -// -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> -// xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>, -// !xegpu.tensor_desc<2x16xf32, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @vector_bitcast( -// // CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16> -// // CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> -// // CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> -// // CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> -// gpu.module @xevm_module{ -// gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { -// %cst = "some_op"() {layout_result_0 = #xegpu.layout} -// : () -> (vector<4x32xi8>) -// %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout} -// : vector<4x32xi8> to vector<4x16xi16> -// %c0 = arith.constant 0 : index -// %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16> -// -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout> -// xegpu.store_nd %bitcast, %0[%c0, %c0] : vector<4x16xi16>, -// !xegpu.tensor_desc<4x16xi16, #xegpu.layout> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @mma_transpose_b( -// // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// // CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// // CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> -// // CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// // CHECK-DAG: %[[B:.*]] = xegpu.load_nd 
%[[BDESC]][%{{.*}}] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> -// // CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> -// // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> -// // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> -// // CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// gpu.module @xevm_module{ -// gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { -// %c0 = arith.constant 0 : index -// %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -// -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -// %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} -// : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> -// -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -// %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} -// : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> -// %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} -// : vector<16x8xi32> to vector<16x16xf16> -// %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} -// : vector<16x16xf16> to vector<16x16xf16> -// %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} -// : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -// -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>, -// !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// gpu.return - -// } -// } From d5aa520e4c9f7304de31c7be7947fdfe2a469c37 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 1 Oct 2025 00:22:32 +0000 Subject: [PATCH 6/9] save work and bug fixes --- ...tribute.mlir => propgate-layouts-and-subgroup-distribute.mlir} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename mlir/test/Dialect/XeGPU/{subgroup-distribute.mlir => propgate-layouts-and-subgroup-distribute.mlir} (100%) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/propgate-layouts-and-subgroup-distribute.mlir similarity index 100% rename from mlir/test/Dialect/XeGPU/subgroup-distribute.mlir rename to mlir/test/Dialect/XeGPU/propgate-layouts-and-subgroup-distribute.mlir From 01fc9290342a127cb0fe53609a38fbd6462e0c09 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Wed, 1 Oct 2025 22:16:47 +0000 Subject: [PATCH 7/9] save work and bug fixes --- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 4 - .../Transforms/XeGPUSubgroupDistribute.cpp | 12 +- ...pgate-layouts-and-subgroup-distribute.mlir | 167 ++++- .../XeGPU/subgroup-distribute-unit.mlir | 571 +++++++++--------- 4 files changed, 408 insertions(+), 346 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 83b128e2c7cbf..564d9c4d5422b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -27,10 +27,6 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> { }]; let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"]; - let options = [Option< - "enableSGReductions", "enable-sg-reductions", "bool", - /*default=*/"true", - "Enable subgroup reductions using subgroup shuffles.">]; } def 
XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index a6ca25f30a008..c25ec45efb648 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1415,11 +1415,6 @@ namespace { struct XeGPUSubgroupDistributePass final : public xegpu::impl::XeGPUSubgroupDistributeBase< XeGPUSubgroupDistributePass> { - XeGPUSubgroupDistributePass() = default; - XeGPUSubgroupDistributePass(const XeGPUSubgroupDistributePass &other) = - default; - XeGPUSubgroupDistributePass(xegpu::XeGPUSubgroupDistributeOptions options) - : XeGPUSubgroupDistributeBase(options) {} void runOnOperation() override; }; } // namespace @@ -1527,10 +1522,9 @@ void XeGPUSubgroupDistributePass::runOnOperation() { return laneVal; }; - if (enableSGReductions) - vector::populateDistributeReduction( - patterns, warpReduction, - /*pattern benefit=*/regularPatternBenefit); + vector::populateDistributeReduction( + patterns, warpReduction, + /*pattern benefit=*/regularPatternBenefit); vector::populatePropagateWarpVectorDistributionPatterns( patterns, distributionFn, shuffleFn, diff --git a/mlir/test/Dialect/XeGPU/propgate-layouts-and-subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/propgate-layouts-and-subgroup-distribute.mlir index 4d44e63a47ac1..0e1365aa64171 100644 --- a/mlir/test/Dialect/XeGPU/propgate-layouts-and-subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/propgate-layouts-and-subgroup-distribute.mlir @@ -2,17 +2,18 @@ // RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s // CHECK-LABEL: gpu.func @load_dpas_postop_store -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> -// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> -// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> -// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] 
: vector<8xf16>, vector<16xf16> -> vector<8xf32> +// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> +// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> +// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> +// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index @@ -47,26 +48,29 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @gemm -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { -// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x -// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y -// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index -// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> -// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> -// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { -// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> -// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> -// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> -// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> -// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> -// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> -// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> -// CHECK-NEXT: } -// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) { +// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x +// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y +// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index +// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> +// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} 
iter_args(%[[ARG4:.*]] = %[[T4]]) +// CHECK-SAME: -> (vector<8x1xf32>) { +// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> +// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> +// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> +// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> +// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] +// CHECK-SAME: : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> +// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> +// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ %c0 = arith.constant 0 : index @@ -109,3 +113,104 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar gpu.return } } + +// ----- +// CHECK-LABEL: gpu.func @scatter_ops_scf_yield +// CHECK: (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) { +// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16> +// CHECK-DAG: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> +// CHECK-DAG: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// CHECK: %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) { +// CHECK-NEXT: %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16> +// CHECK-NEXT: scf.yield %[[LD_CAST]] : vector<1x8xf16> +// CHECK-NEXT: } else { +// CHECK-NEXT: scf.yield %[[CST]] : vector<1x8xf16> +// CHECK-NEXT: } +// CHECK-NEXT: %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16> +// CHECK-NEXT: xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +gpu.module @xevm_module{ + gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) { + %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> + %loaded = scf.if %pred -> (vector<16x8xf16>) { + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { + layout_result_0 = #xegpu.layout + } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + scf.yield %3 : vector<16x8xf16> + } else { + %3 = arith.constant { + layout_result_0 = #xegpu.layout + } dense<12.> : vector<16x8xf16> + scf.yield %3 : vector<16x8xf16> + } { layout_result_0 = #xegpu.layout } + xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) { +// CHECK: %[[OFFSET:.*]] = arith.constant 
dense<12> : vector<1xindex> +// CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> +// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 +// CHECK: scf.if %[[PREDICATE]] { +// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// CHECK-NEXT: } +gpu.module @xevm_module{ + gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) { + %pred = llvm.mlir.poison : i1 + %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> + scf.if %pred { + %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { + layout_result_0 = #xegpu.layout + } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> + xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + } + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @mma_transpose_b( +// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { +// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> +// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array}> +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> +// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> +// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> +// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> +gpu.module @xevm_module{ + gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> + -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> + -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> + %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} + : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> + %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} + : vector<16x8xi32> to vector<16x16xf16> + %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} + : vector<16x16xf16> to vector<16x16xf16> + %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} + : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> + -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>, + !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + gpu.return + + } +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir 
b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir index d3f88c2df1291..5b56c8981dae8 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -1,18 +1,16 @@ -// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -xegpu-subgroup-distribute \ -// RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s - -// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ -// RUN: -xegpu-subgroup-distribute="enable-sg-reductions=false" -allow-unregistered-dialect \ -// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s --check-prefix=CHECK-REDUCTION +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' -test-xegpu-sg-distribute -allow-unregistered-dialect \ +// RUN: -canonicalize -cse -split-input-file %s | FileCheck %s // CHECK-LABEL: gpu.func @store_nd_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -// CHECK: gpu.return +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] +// CHECK-SAME: -> (vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout>, index) { +// CHECK: gpu.yield %{{.*}} : vector<16xf32>, +// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout>, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32, +// CHECK-SAME: #xegpu.layout> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.store_nd %[[W]]#0, %[[T1]][%[[W]]#2] : vector<1xf32>, !xegpu.tensor_desc<16xf32> gpu.module @xevm_module{ gpu.func @store_nd_1d(%laneid: index) { %c0 = arith.constant 0 : index @@ -28,12 +26,17 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @store_nd_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] +// CHECK-SAME: -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout>, index, index) { +// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout>, index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[CAST:.*]] = vector.shape_cast %[[W]]#0 : vector<16x1xf16> to vector<16xf16> +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.store_nd %[[CAST]], 
%[[T1]][%[[W]]#2, %[[W]]#3] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @store_nd_2d(%laneid : index) { %c0 = arith.constant 0 : index @@ -51,40 +54,41 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_nd_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<1xf32>, +// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.layout>, index) { +// CHECK: gpu.yield %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, +// CHECK-SAME: #xegpu.layout>, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16xf32, +// CHECK-SAME: #xegpu.layout> to !xegpu.tensor_desc<16xf32> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.load_nd %[[T1]][%[[W]]#2] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> gpu.module @xevm_module{ gpu.func @load_nd_1d(%laneid: index) { %c0 = arith.constant 0 : index - %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<1xf32>) { %0 = "some_op"() : () -> !xegpu.tensor_desc<16xf32, #xegpu.layout> %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> gpu.yield %1 : vector<16xf32> } - "some_user_op"(%r) : (vector<16xf32>) -> () + "some_user_op"(%r) : (vector<1xf32>) -> () gpu.return } } // ----- // CHECK-LABEL: gpu.func @load_nd_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: 
#xegpu.layout>, index, index) { +// CHECK: gpu.yield %{{.*}} : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout>, index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: vector.shape_cast %[[T2]] : vector<16xf16> to vector<16x1xf16> gpu.module @xevm_module{ gpu.func @load_nd_2d(%laneid: index) { %c0 = arith.constant 0 : index @@ -101,18 +105,19 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_nd_array_length -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> -// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16> -// CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> -// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<2x16x1xf16>, +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, +// CHECK-SAME: #xegpu.layout>, index, index) { +// CHECK: gpu.yield %{{.*}} : vector<2x16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr< +// CHECK-SAME: array_length = 2 : i64>, #xegpu.layout>, index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#1 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.block_tdesc_attr, #xegpu.layout> to !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[T1]][%[[W]]#2, %[[W]]#3] : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.block_tdesc_attr> -> vector<32xf16> +// CHECK-NEXT: vector.shape_cast %[[T2]] : vector<32xf16> to vector<2x16x1xf16> gpu.module @xevm_module{ gpu.func @load_nd_array_length(%laneid: index) { %c0 = arith.constant 0 : index @@ -130,21 +135,17 @@ gpu.module @xevm_module{ } // ----- -// CHECK-LABEL: gpu.func @load_dpas_store -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> 
vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> -// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-LABEL: gpu.func @dpas +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> +// CHECK-SAME: (vector<8x1xf32>, vector<8x1xf16>, vector<16x1xf16>, vector<8x1xf32>) { +// CHECK: gpu.yield %{{.*}} : vector<8x16xf32>, vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> +// CHECK-NEXT: } +// CHECK-DAG: %[[T1:.*]] = vector.shape_cast %[[W]]#1 : vector<8x1xf16> to vector<8xf16> +// CHECK-DAG: %[[T2:.*]] = vector.shape_cast %[[W]]#2 : vector<16x1xf16> to vector<16xf16> +// CHECK-DAG: %[[T3:.*]] = vector.shape_cast %[[W]]#3 : vector<8x1xf32> to vector<8xf32> +// CHECK-NEXT: %[[T4:.*]] = xegpu.dpas %[[T1]], %[[T2]], %[[T3]] : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32> +// CHECK-NEXT: vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> gpu.module @xevm_module{ gpu.func @dpas(%laneid: index) { %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<8x1xf32>) { @@ -169,17 +170,14 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @create_nd_tdesc_non_memref -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, -// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, -// CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], 
%[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG1]])[16] -> (!xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout>, ui64) { +// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, ui64 +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[W]]#1, shape : [64, 128], strides : [128, 1] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK-NEXT: builtin.unrealized_conversion_cast %[[T1]] : !xegpu.tensor_desc<16x16xf16> to !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout> {resolve_simt_type_mismatch} gpu.module @xevm_module{ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %laneid: index) { %c0 = arith.constant 0 : index @@ -196,9 +194,16 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @prefetch_2d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout>, index, index) { +// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK-SAME: , index, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16x16xf16, +// CHECK-SAME: #xegpu.layout> to !xegpu.tensor_desc<16x16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1, %[[W]]#2] +// CHECK-SAME: <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @prefetch_2d(%laneid: index) { %c0 = arith.constant 0 : index @@ -215,9 +220,15 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @prefetch_1d -// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> +// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: index) { +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (!xegpu.tensor_desc<16xf16, +// CHECK-SAME: #xegpu.layout>, index) { +// CHECK: gpu.yield %{{.*}} : !xegpu.tensor_desc<16xf16, #xegpu.layout>, index +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]]#0 : !xegpu.tensor_desc<16xf16, +// CHECK-SAME: #xegpu.layout> to !xegpu.tensor_desc<16xf16> {resolve_simt_type_mismatch} +// CHECK-NEXT: xegpu.prefetch_nd %[[T1]][%[[W]]#1] <{l1_hint = #xegpu.cache_hint, +// CHECK-SAME: l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> gpu.module @xevm_module{ gpu.func @prefetch_1d(%laneid: index) { %c0 = arith.constant 0 : index @@ -234,11 +245,11 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> -// CHECK-NEXT: gpu.barrier -// CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> 
!xegpu.tensor_desc<16xf16> -// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16> +// CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[16] -> ({{.*}}) { +// CHECK: gpu.yield %{{.*}} +// CHECK: } +// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16xf16> -> vector<1xf16> +// CHECK: gpu.barrier gpu.module @xevm_module{ gpu.func @gpu_barrier(%laneid: index) { %c0 = arith.constant 0 : index @@ -257,18 +268,23 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction -// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x2xf32>) { -// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<16x32xf32> -// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<1x32xf32, #xegpu.layout>, vector<16x32xf32> +// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32> +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: -> (vector<2xf32>, vector<16x2xf32>, vector<2xf32>) { +// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x32xf32> +// CHECK: gpu.yield %{{.*}}, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<16x32xf32>, vector<32xf32> // CHECK-NEXT: } -// CHECK: %[[COL0:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// CHECK-NEXT: %[[RED0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// CHECK: %[[COL1:.*]] = vector.extract_strided_slice %[[W]]#1 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> -// CHECK-NEXT: %[[RED1:.*]] = vector.reduction , %[[CAST1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> +// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[W]]#1 +// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T3:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32> +// CHECK: %[[T4:.*]] = vector.reduction , %[[T2]], %[[T3]] : vector<16xf32> into f32 +// CHECK: %[[T5:.*]] = vector.extract_strided_slice %[[W]]#1 +// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T6:.*]] = vector.shape_cast %[[T5]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T7:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32> +// CHECK: %[[T8:.*]] = vector.reduction , %[[T6]], %[[T7]] : vector<16xf32> into f32 +// CHECK: %[[T9:.*]] = vector.from_elements %[[T4]], %[[T8]] : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) { %c0 = arith.constant 0 : index @@ -294,17 +310,16 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction(%laneid: index) } // ----- -// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction -// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<2x16xf32, -// CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { -// 
CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[ROW0:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: %[[ROW1:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> -// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<2x16xf32, #xegpu.layout>, f32, f32 -// CHECK-REDUCTION-NEXT: } -// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction +// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) { +// CHECK-NEXT: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<2x16xf32> +// CHECK-NEXT: %[[T2:.*]] = vector.extract %[[SRC]][0] : vector<16xf32> from vector<2x16xf32> +// CHECK-NEXT: %[[T3:.*]] = vector.reduction , %[[T2]], %cst : vector<16xf32> into f32 +// CHECK-NEXT: %[[T4:.*]] = vector.extract %[[SRC]][1] : vector<16xf32> from vector<2x16xf32> +// CHECK-NEXT: %[[T5:.*]] = vector.reduction , %[[T4]], %cst : vector<16xf32> into f32 +// CHECK-NEXT: %[[T6:.*]] = vector.from_elements %[[T3]], %[[T5]] : vector<2xf32> +// CHECK-NEXT: gpu.yield %[[T6]] : vector<2xf32> +// CHECK-NEXT: } gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) { %c0 = arith.constant 0 : index @@ -331,16 +346,18 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction(%laneid: index) // ----- // CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction -// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%0)[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<2x16xf32>) { -// CHECK: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<32x16xf32> -// CHECK-NEXT: gpu.yield %{{.*}}, %[[SRC]] : !xegpu.tensor_desc<32x1xf32, #xegpu.layout>, vector<32x16xf32> -// CHECK-NEXT: } -// CHECK: %[[ROW0:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> -// CHECK-NEXT: %[[R0:.*]] = vector.reduction , %[[ROW0]], %{{.*}} : vector<16xf32> into f32 -// CHECK: %[[ROW1:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> -// CHECK-NEXT: %[[R1:.*]] = vector.reduction , %[[ROW1]], %{{.*}} : vector<16xf32> into f32 -// CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> +// CHECK: %[[ACC:.*]] = arith.constant {{.*}} dense<0.000000e+00> : vector<32xf32> +// CHECK: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>, vector<2x16xf32>, vector<2xf32>) { +// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<32x16xf32> +// CHECK: gpu.yield %9, %[[SRC]], %[[ACC]] : vector<32xf32>, vector<32x16xf32>, vector<32xf32> +// CHECK: } +// CHECK: %[[T1:.*]] = vector.extract %[[W]]#1[0] : vector<16xf32> from vector<2x16xf32> +// CHECK: %[[T2:.*]] = vector.extract %[[W]]#2[0] : f32 from vector<2xf32> +// CHECK: %[[T3:.*]] = vector.reduction , %[[T1]], %[[T2]] : vector<16xf32> into f32 +// CHECK: %[[T4:.*]] = vector.extract %[[W]]#1[1] : vector<16xf32> from vector<2x16xf32> +// CHECK: %[[T5:.*]] = vector.extract %[[W]]#2[1] : f32 from vector<2xf32> +// CHECK: %[[T6:.*]] = vector.reduction , %[[T4]], %[[T5]] : vector<16xf32> into f32 +// CHECK: %[[T7:.*]] 
= vector.from_elements %[[T3]], %[[T6]] : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) { %c0 = arith.constant 0 : index @@ -366,19 +383,20 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) } // ----- -// CHECK-REDUCTION-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction -// CHECK-REDUCTION: %[[W:.*]]:3 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (!xegpu.tensor_desc<16x2xf32, -// CHECK-REDUCTION-SAME: #xegpu.layout>, f32, f32) { -// CHECK-REDUCTION: %[[SRC:.*]] = "some_def"() {layout_result_0 = #xegpu.layout} : () -> vector<16x2xf32> -// CHECK-REDUCTION-NEXT: %[[COL0:.*]] = vector.extract_strided_slice %[[SRC]] {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-REDUCTION-NEXT: %[[CAST0:.*]] = vector.shape_cast %[[COL0]] : vector<16x1xf32> to vector<16xf32> -// CHECK-REDUCTION-NEXT: %[[R0:.*]] = vector.reduction , %[[CAST0]], %{{.*}} : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: %[[COL1:.*]] = vector.extract_strided_slice %5 {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK-REDUCTION-NEXT: %[[CAST1:.*]] = vector.shape_cast %[[COL1]] : vector<16x1xf32> to vector<16xf32> -// CHECK-REDUCTION-NEXT: %[[R1:.*]] = vector.reduction , %[[CAST1]], %cst : vector<16xf32> into f32 -// CHECK-REDUCTION-NEXT: gpu.yield %4, %[[R1]], %[[R0]] : !xegpu.tensor_desc<16x2xf32, #xegpu.layout>, f32, f32 -// CHECK-REDUCTION-NEXT: } -// CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> +// CHECK-LABEL: gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction +// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2xf32>) { +// CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x2xf32> +// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]] +// CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T3:.*]] = vector.reduction , %[[T2]], %{{.*}} : vector<16xf32> into f32 +// CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]] +// CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> +// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T6:.*]] = vector.reduction , %[[T5]], %{{.*}} : vector<16xf32> into f32 +// CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32> +// CHECK: gpu.yield %[[T7]] : vector<2xf32> +// CHECK: } gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) { %c0 = arith.constant 0 : index @@ -405,10 +423,17 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction(%laneid: index) // ----- // CHECK-LABEL: gpu.func @scatter_ops_chunksize({{.*}}) { -// CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} 
dense<12> : vector<16xindex> +// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense : vector<16xi1> +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: -> (vector<1x8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] : +// CHECK-SAME: vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}> +// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> +// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}> +// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> gpu.module @xevm_module{ gpu.func @scatter_ops_chunksize(%laneid: index, %src: memref<256xf16>) { gpu.warp_execute_on_lane_0(%laneid)[16] { @@ -437,172 +462,114 @@ gpu.module @xevm_module{ } } -// // ----- -// // CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}}, -// // CHECK-SAME: %[[PREDICATE:.*]]: i1) { -// // CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16> -// // CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// // CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) { -// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// // CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16> -// // CHECK-NEXT: } else { -// // CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16> -// // CHECK-NEXT: } -// // CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -// gpu.module @xevm_module{ -// gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) { -// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> -// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// %loaded = scf.if %pred -> (vector<16x8xf16>) { -// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { -// layout_result_0 = #xegpu.layout -// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// scf.yield %3 : vector<16x8xf16> -// } else { -// %3 = arith.constant { -// layout_result_0 = #xegpu.layout -// } dense<12.> : vector<16x8xf16> -// scf.yield %3 : vector<16x8xf16> -// } { layout_result_0 = #xegpu.layout } -// xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) { -// // CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// // CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1 -// // CHECK: scf.if %[[PREDICATE]] { -// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> -// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -// // CHECK-NEXT: } -// gpu.module @xevm_module{ -// gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) { -// %pred = llvm.mlir.poison : 
i1 -// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> -// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// scf.if %pred { -// %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> { -// layout_result_0 = #xegpu.layout -// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> -// } -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { -// // CHECK: %[[MASK:.*]] = arith.constant dense : vector<1xi1> -// // CHECK-NEXT: %[[LANE_OFFSET:.*]] = arith.constant dense<12> : vector<1xindex> -// // CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[LANE_OFFSET]]], %[[MASK]] : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> -// // CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[LANE_OFFSET]]], %[[MASK]] : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> -// gpu.module @xevm_module{ -// gpu.func @scatter_ops(%src: memref<256xf16>) { -// %1 = arith.constant {layout_result_0 = #xegpu.layout} dense<1>: vector<16xi1> -// %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> -// %3 = xegpu.load %src[%offset], %1 { -// layout_result_0 = #xegpu.layout -// } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// xegpu.store %3, %src[%offset], %1 : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> -// gpu.return -// } -// } - -// // ----- -// // CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( -// // CHECK: %{{.*}} = memref.extract_aligned_pointer_as_index %{{.*}} : memref<256x256xf16> -> index -// gpu.module @xevm_module{ -// gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>) { -// %c0 = arith.constant 0 : index -// %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16> -// %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index -// %ptr_i64 = arith.index_cast %ptr : index to i64 -// %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64 -// -> !xegpu.tensor_desc<16xf16, #xegpu.layout> -// xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> -// gpu.return -// } -// } - +// ----- +// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) { +// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex> +// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense : vector<16xi1> +// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16] +// CHECK-SAME: -> (vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] +// CHECK-SAME: : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 +// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> +// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 +// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> +gpu.module @xevm_module{ + gpu.func @scatter_ops(%src: memref<256xf16>, %laneid: index) { + gpu.warp_execute_on_lane_0(%laneid)[16] { + %1 = arith.constant + {layout_result_0 = #xegpu.layout} + dense<1> : vector<16xi1> + %offset = arith.constant + {layout_result_0 = #xegpu.layout} + dense<12> : vector<16xindex> + %3 = xegpu.load 
%src[%offset], %1 + { + layout_operand_1 = #xegpu.layout, + layout_operand_2 = #xegpu.layout, + layout_result_0 = #xegpu.layout + } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + xegpu.store %3, %src[%offset], %1 + { + layout_operand_0 = #xegpu.layout, + layout_operand_2 = #xegpu.layout, + layout_operand_3 = #xegpu.layout + } + : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + } + gpu.return + } +} -// // ----- -// // CHECK-LABEL: gpu.func @vector_transpose( -// // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> -// // CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> -// // CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> -// gpu.module @xevm_module{ -// gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { -// %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> -// : vector<16x2xf32> -// %c0 = arith.constant 0 : index -// %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout} -// : vector<16x2xf32> to vector<2x16xf32> -// %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32> -// -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> -// xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>, -// !xegpu.tensor_desc<2x16xf32, #xegpu.layout> -// gpu.return -// } -// } +// ----- +// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) { +// CHECK: gpu.yield %{{.*}}, %{{.*}} : index, memref<256x256xf16> +// CHECK-NEXT: } +// CHECK-NEXT: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[W]]#1 : memref<256x256xf16> -> index +// CHECK-NEXT: arith.index_cast %[[INTPTR]] : index to i64 +gpu.module @xevm_module{ + gpu.func @memref_extract_aligned_pointer_as_index(%arg0 : memref<256x256xf16>, %laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (index) { + %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index + gpu.yield %ptr : index + } + %ptr_i64 = arith.index_cast %r : index to i64 + "some_user_op"(%ptr_i64) : (i64) -> () + gpu.return + } +} -// // ----- -// // CHECK-LABEL: gpu.func @vector_bitcast( -// // CHECK: %[[CAST:.*]] = vector.bitcast %{{.*}} : vector<4x2xi8> to vector<4x1xi16> -// // CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> -// // CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> -// // CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> -// gpu.module @xevm_module{ -// gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { -// %cst = "some_op"() {layout_result_0 = #xegpu.layout} -// : () -> (vector<4x32xi8>) -// %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout} -// : vector<4x32xi8> to vector<4x16xi16> -// %c0 = arith.constant 0 : index -// %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16> -// -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout> -// xegpu.store_nd %bitcast, %0[%c0, %c0] : vector<4x16xi16>, -// !xegpu.tensor_desc<4x16xi16, #xegpu.layout> -// gpu.return -// } -// } -// // ----- -// // CHECK-LABEL: gpu.func @mma_transpose_b( -// // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// // CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> 
!xegpu.tensor_desc<8x16xf16> -// // CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> -// // CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// // CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> -// // CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> -// // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> -// // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> -// // CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// gpu.module @xevm_module{ -// gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { -// %c0 = arith.constant 0 : index -// %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -// -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -// %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} -// : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> -// -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -// %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} -// : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> -// %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} -// : vector<16x8xi32> to vector<16x16xf16> -// %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout} -// : vector<16x16xf16> to vector<16x16xf16> -// %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} -// : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -// -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>, -// !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// gpu.return +// ----- +// CHECK-LABEL: gpu.func @vector_transpose( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<2x1xf32>, vector<1x2xf32>) { +// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<16x2xf32> +// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<2x16xf32>, vector<16x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[T1:.*]] = vector.transpose %[[W]]#1, [1, 0] : vector<1x2xf32> to vector<2x1xf32> +gpu.module @xevm_module{ + gpu.func @vector_transpose(%arg0: memref<2x16xf32>, %laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<2x1xf32>) { + %cst = "some_op"() + {layout_result_0 = #xegpu.layout} + : () -> (vector<16x2xf32>) + %transpose = vector.transpose %cst, [1, 0] + { + layout_operand_0 = #xegpu.layout, + layout_result_0 = #xegpu.layout + } + : vector<16x2xf32> to vector<2x16xf32> + gpu.yield %transpose : vector<2x16xf32> + } + "some_user_op"(%r) : (vector<2x1xf32>) -> () + gpu.return + } +} -// } -// } +// ----- +// CHECK-LABEL: gpu.func @vector_bitcast( +// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (vector<4x1xi16>, vector<4x2xi8>) { +// CHECK: %[[SRC:.*]] = "some_op"() {{.*}} : () -> vector<4x32xi8> +// CHECK: gpu.yield %{{.*}}, %[[SRC]] : vector<4x16xi16>, vector<4x32xi8> +// CHECK: } +// CHECK: vector.bitcast %[[W]]#1 : vector<4x2xi8> to vector<4x1xi16> +gpu.module @xevm_module{ + gpu.func @vector_bitcast(%arg0: memref<4x16xi16>, %laneid: index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<4x1xi16>) { 
+ %cst = "some_op"() + {layout_result_0 = #xegpu.layout} + : () -> (vector<4x32xi8>) + %bitcast = vector.bitcast %cst + { + layout_operand_0 = #xegpu.layout, + layout_result_0 = #xegpu.layout + } + : vector<4x32xi8> to vector<4x16xi16> + gpu.yield %bitcast : vector<4x16xi16> + } + "some_user_op"(%r) : (vector<4x1xi16>) -> () + gpu.return + } +} From 015b8a3c9789f8e24a056e96970b28d7a06ca89a Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Oct 2025 00:20:53 +0000 Subject: [PATCH 8/9] bug fix in shape cast --- .../Transforms/XeGPUSubgroupDistribute.cpp | 23 +++++++++++++++---- .../XeGPU/subgroup-distribute-unit.mlir | 4 ++-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index c25ec45efb648..f1dbc5ddb2022 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -1009,7 +1009,7 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern { }; /// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D -/// VectorReductionOps. +/// VectorReductionOps. We also insert layouts for the newly created ops. static Value lowerToVectorReductions(TypedValue src, TypedValue acc, vector::CombiningKind kind, @@ -1026,6 +1026,9 @@ static Value lowerToVectorReductions(TypedValue src, Value reductionResult = arith::ConstantOp::create( rewriter, loc, acc.getType(), DenseElementsAttr::get(acc.getType(), zeroAttr)); + // Reduction result should have the same layout as the accumulator. + xegpu::setDistributeLayoutAttr(cast(reductionResult), + xegpu::getDistributeLayoutAttr(acc)); // For each slice of the source, extract the slice vector, do a reduction // and, insert the reduced value back to the result vector. for (int i = 0; i < nSlices; ++i) { @@ -1041,13 +1044,23 @@ static Value lowerToVectorReductions(TypedValue src, vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets, sliceSizes, {1, 1}); int64_t nSliceElements = extractOp.getResult().getType().getNumElements(); - Value slice = vector::ShapeCastOp::create( + vector::ShapeCastOp slice = vector::ShapeCastOp::create( rewriter, loc, VectorType::get({nSliceElements}, sourceType.getElementType()), extractOp.getResult()); + // Shape cast is currently handled in xegpu side. So layouts must be + // retained during lowering. Shape cast output has the same layout as the + // accumulator. Shape cast source has the same layout as the original + // reduction source. + // TODO: other ops generated here may also need layout attributes. + xegpu::setDistributeLayoutAttr(slice->getOpOperand(0), + xegpu::getDistributeLayoutAttr(src)); + xegpu::setDistributeLayoutAttr(slice->getOpResult(0), + xegpu::getDistributeLayoutAttr(acc)); + // Extract and reduction results in scalars, so no result layout is needed. 
Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i); - Value reduction = - vector::ReductionOp::create(rewriter, loc, kind, slice, accExtract); + Value reduction = vector::ReductionOp::create( + rewriter, loc, kind, slice.getResult(), accExtract); reductionResult = vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i); } @@ -1229,7 +1242,7 @@ struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { auto resultDistTy = cast(warpOp.getResult(operandNumber).getType()); xegpu::DistributeLayoutAttr sourceLayout = - xegpu::getDistributeLayoutAttr(shapeCastOp.getSource()); + xegpu::getDistributeLayoutAttr(shapeCastOp->getOpOperand(0)); xegpu::DistributeLayoutAttr resultLayout = xegpu::getDistributeLayoutAttr(shapeCastOp.getResult()); if (!sourceLayout || !resultLayout) diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir index 5b56c8981dae8..40b66d18cc47f 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -388,11 +388,11 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction(%laneid: index) // CHECK: %[[SRC:.*]] = "some_def"() {{.*}} : () -> vector<16x2xf32> // CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[SRC]] // CHECK-SAME: {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] {{.*}} : vector<16x1xf32> to vector<16xf32> // CHECK: %[[T3:.*]] = vector.reduction , %[[T2]], %{{.*}} : vector<16xf32> into f32 // CHECK: %[[T4:.*]] = vector.extract_strided_slice %[[SRC]] // CHECK-SAME: {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]} : vector<16x2xf32> to vector<16x1xf32> -// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<16x1xf32> to vector<16xf32> +// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] {{.*}} : vector<16x1xf32> to vector<16xf32> // CHECK: %[[T6:.*]] = vector.reduction , %[[T5]], %{{.*}} : vector<16xf32> into f32 // CHECK: %[[T7:.*]] = vector.from_elements %[[T3]], %[[T6]] : vector<2xf32> // CHECK: gpu.yield %[[T7]] : vector<2xf32> From a703518d7da237ded3450a3b6e8224322a50f494 Mon Sep 17 00:00:00 2001 From: Charitha Saumya Date: Fri, 3 Oct 2025 18:46:10 +0000 Subject: [PATCH 9/9] fix --- ...outs-and-subgroup-distribute.mlir => subgroup-distribute.mlir} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename mlir/test/Dialect/XeGPU/{propgate-layouts-and-subgroup-distribute.mlir => subgroup-distribute.mlir} (100%) diff --git a/mlir/test/Dialect/XeGPU/propgate-layouts-and-subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir similarity index 100% rename from mlir/test/Dialect/XeGPU/propgate-layouts-and-subgroup-distribute.mlir rename to mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
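
Note (illustrative sketch, not part of the patch): the `lowerToVectorReductions` helper touched above rewrites a 2D `vector.multi_reduction` into a sequence of per-slice 1D `vector.reduction` ops. The plain vector-dialect sketch below shows that rewrite, assuming an `add` reduction over dimension 0 of a `vector<16x2xf32>` source with a `vector<2xf32>` accumulator; the XeGPU layout attributes that the helper now attaches to the shape casts and the result constant are omitted, and all SSA names and shapes are hypothetical.

  // Conceptual input:
  //   %red = vector.multi_reduction <add>, %src, %acc [0]
  //            : vector<16x2xf32> to vector<2xf32>
  // After the rewrite: one column slice per result element, each reduced
  // into the matching accumulator element and inserted into a
  // zero-initialized result vector, mirroring the loop in the C++ helper.
  %init = arith.constant dense<0.0> : vector<2xf32>
  %s0 = vector.extract_strided_slice %src
          {offsets = [0, 0], sizes = [16, 1], strides = [1, 1]}
          : vector<16x2xf32> to vector<16x1xf32>
  %v0 = vector.shape_cast %s0 : vector<16x1xf32> to vector<16xf32>
  %a0 = vector.extract %acc[0] : f32 from vector<2xf32>
  %r0 = vector.reduction <add>, %v0, %a0 : vector<16xf32> into f32
  %p0 = vector.insert %r0, %init [0] : f32 into vector<2xf32>
  %s1 = vector.extract_strided_slice %src
          {offsets = [0, 1], sizes = [16, 1], strides = [1, 1]}
          : vector<16x2xf32> to vector<16x1xf32>
  %v1 = vector.shape_cast %s1 : vector<16x1xf32> to vector<16xf32>
  %a1 = vector.extract %acc[1] : f32 from vector<2xf32>
  %r1 = vector.reduction <add>, %v1, %a1 : vector<16xf32> into f32
  %red = vector.insert %r1, %p0 [1] : f32 into vector<2xf32>
  // In the distributed test checks above, the final result shows up as a
  // single vector.from_elements rather than this insert chain.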