diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 8957ea5399ea2..2088c3c7fc5ec 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -277,22 +277,13 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
          descOp, "the tensor descriptor lacks layout attribute");
 
     SmallVector<size_t> newRetIndices;
-    SmallVector<Value> newYieldValues;
-    SmallVector<Type> newYieldTypes;
-
-    for (Value operand : descOp->getOperands()) {
-      newYieldValues.push_back(operand);
-      newYieldTypes.push_back(operand.getType());
-    }
     rewriter.setInsertionPoint(warpOp);
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, /* new yieled values = */ newYieldValues,
-        /* new yielded types = */ newYieldTypes, newRetIndices);
+        rewriter, warpOp, /* new yielded values = */ descOp->getOperands(),
+        /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
 
-    SmallVector<Value> newDescOperands;
-    for (size_t i : newRetIndices) {
-      newDescOperands.push_back(newWarpOp.getResult(i));
-    }
+    SmallVector<Value> newDescOperands = llvm::map_to_vector(
+        newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
     rewriter.setInsertionPointAfter(newWarpOp);
     xegpu::TensorDescType distributedTensorDescTy =
         descOp.getType().dropLayouts(); // Distributed tensor descriptor type
@@ -696,39 +687,30 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
          warpOp, "warp result is not a xegpu::UpdateNdOffset op");
     auto updateOp = operand->get().getDefiningOp<xegpu::UpdateNdOffsetOp>();
     unsigned operandIdx = operand->getOperandNumber();
 
-    // new update op does not have layout attribute.
-    xegpu::TensorDescType newTensorDescTy =
-        updateOp.getTensorDescType().dropLayouts();
-    SmallVector<Value> newYieldValues;
-    SmallVector<Type> newYieldTypes;
-    for (Value operand : updateOp->getOperands()) {
-      newYieldValues.push_back(operand);
-      if (isa<xegpu::TensorDescType>(operand.getType())) {
-        newYieldTypes.push_back(newTensorDescTy);
-      } else {
-        newYieldTypes.push_back(operand.getType());
-      }
-    }
     SmallVector<size_t> newRetIndices;
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
-        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
+        rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(),
+        newRetIndices);
     rewriter.setInsertionPointAfter(newWarpOp);
-    SmallVector<Value> newUpdateOperands;
-    for (size_t i : newRetIndices) {
-      // For the tensor descriptor operand, the layout attribute is dropped
-      // after distribution. Types needs to be resolved in this case.
-      if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
-        newUpdateOperands.push_back(resolveDistributedTy(
-            newWarpOp.getResult(i), newTensorDescTy, rewriter));
-      } else {
-        newUpdateOperands.push_back(newWarpOp.getResult(i));
-      }
-    }
+    // The new update op does not have a layout attribute.
+    xegpu::TensorDescType distributedTensorDescTy =
+        updateOp.getTensorDescType().dropLayouts();
+    SmallVector<Value> newUpdateOperands =
+        llvm::map_to_vector(newRetIndices, [&](size_t i) {
+          // For the tensor descriptor operand, the layout attribute is
+          // dropped after distribution. Types need to be resolved in this
+          // case.
+          if (isa<xegpu::TensorDescType>(newWarpOp.getResult(i).getType())) {
+            return resolveDistributedTy(newWarpOp.getResult(i),
+                                        distributedTensorDescTy, rewriter);
+          }
+          return newWarpOp.getResult(i);
+        });
     // Create a new update op outside the warp op.
     auto newUpdateOp = xegpu::UpdateNdOffsetOp::create(
-        rewriter, newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
-        updateOp->getAttrs());
+        rewriter, newWarpOp.getLoc(), distributedTensorDescTy,
+        newUpdateOperands, updateOp->getAttrs());
     xegpu::removeLayoutAttrs(newUpdateOp);
     Value distributedVal = newWarpOp.getResult(operandIdx);
     // Resolve the distributed type with the original type.
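
A note on the idiom introduced above: `llvm::map_to_vector` (llvm/ADT/STLExtras.h) replaces the declare-then-push_back loop with a single expression that maps a range through a callable and collects the results into a SmallVector. A minimal self-contained sketch of the idiom, using illustrative names that are not part of this patch:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallVector.h"
    #include <cstddef>
    #include <cstdio>

    int main() {
      llvm::SmallVector<size_t> retIndices = {0, 1, 2};
      // One expression instead of an empty vector plus a push_back loop;
      // the element type of the result is deduced from the lambda's return.
      auto doubled =
          llvm::map_to_vector(retIndices, [](size_t i) { return i * 2; });
      for (size_t v : doubled)
        std::printf("%zu\n", v); // prints 0, 2, 4
      return 0;
    }
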
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index e78ae4a17710b..54ef56e013abb 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -xegpu-subgroup-distribute -canonicalize -cse -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xegpu-subgroup-distribute -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: gpu.func @store_nd_1d
 // CHECK:       (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) {
@@ -265,6 +265,28 @@ gpu.module @test {
   }
 }
 
+// -----
+// Explicitly check that the update_nd_offset op's source retains its layout when yielded from the warp op (PR150545).
+// CHECK-LABEL: gpu.func @check_update_nd_offset_distributed_tensor_desc
+// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] ->
+// CHECK-SAME: (!xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK: %[[T0:.*]] = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: gpu.yield %[[T0]] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: }
+// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]] :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> to !xegpu.tensor_desc<16x16xf32> {resolve_simt_type_mismatch}
+// CHECK: xegpu.update_nd_offset %[[T1]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32>
+gpu.module @test {
+  gpu.func @check_update_nd_offset_distributed_tensor_desc() {
+    %c32 = arith.constant 32 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<1.000000e+00> : vector<16x16xf32>
+    %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
 // -----
 // CHECK-LABEL: gpu.func @prefetch_1d
 // CHECK:       (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
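
On the `{resolve_simt_type_mismatch}` cast the new test checks for: the pass's `resolveDistributedTy` helper reconciles the layout-carrying tensor descriptor type yielded by the warp op with the layout-free distributed type expected outside it. Below is a rough sketch of such a helper, inferred from the CHECK lines above rather than copied from the upstream implementation; the name and body are assumptions:

    #include "mlir/IR/BuiltinOps.h"
    #include "mlir/IR/PatternMatch.h"

    using namespace mlir;

    // Assumed behavior: when the producer's type and the consumer's expected
    // type disagree (layout attribute present vs. dropped), bridge them with
    // an unrealized_conversion_cast tagged for later cleanup by the pass.
    static Value resolveDistributedTySketch(Value orig, Type expectedTy,
                                            PatternRewriter &rewriter) {
      if (orig.getType() == expectedTy)
        return orig; // Types already agree; no cast needed.
      auto castOp = UnrealizedConversionCastOp::create(
          rewriter, orig.getLoc(), TypeRange{expectedTy}, ValueRange{orig});
      castOp->setAttr("resolve_simt_type_mismatch", rewriter.getUnitAttr());
      return castOp.getResult(0);
    }
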