diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 426377fcf598f..4c67856b559b1 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -843,7 +843,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
                        AnyTypeOf<[XeGPU_MaskType, I1]>:$mask,
                        OptionalAttr<I64Attr>:$chunk_size,
                        OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
-                       OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint);
+                       OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
+                       OptionalAttr<DistributeLayoutAttr>:$layout);
   let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
 
   let extraClassDeclaration = extraBaseClassDeclaration # [{
@@ -895,7 +896,14 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
                    "IntegerAttr": $chunk_size,
                    "xegpu::CachePolicyAttr": $l1_hint,
                    "xegpu::CachePolicyAttr": $l2_hint,
-                   "xegpu::CachePolicyAttr": $l3_hint)>
+                   "xegpu::CachePolicyAttr": $l3_hint)>,
+    OpBuilder<(ins "Type": $value, "Value": $source,
+                   "ArrayRef<OpFoldResult>": $offsets, "Value": $mask,
+                   "IntegerAttr": $chunk_size,
+                   "xegpu::CachePolicyAttr": $l1_hint,
+                   "xegpu::CachePolicyAttr": $l2_hint,
+                   "xegpu::CachePolicyAttr": $l3_hint,
+                   "xegpu::DistributeLayoutAttr": $layout)>
   ];
 
   let hasVerifier = 1;
@@ -979,7 +987,8 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
                        AnyTypeOf<[XeGPU_MaskType, I1]>:$mask,
                        OptionalAttr<I64Attr>:$chunk_size,
                        OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
-                       OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint);
+                       OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
+                       OptionalAttr<DistributeLayoutAttr>:$layout);
 
   let extraClassDeclaration = extraBaseClassDeclaration#[{
     Type getDestType() {
@@ -1030,7 +1039,14 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
                    "IntegerAttr": $chunk_size,
                    "xegpu::CachePolicyAttr": $l1_hint,
                    "xegpu::CachePolicyAttr": $l2_hint,
-                   "xegpu::CachePolicyAttr": $l3_hint)>
+                   "xegpu::CachePolicyAttr": $l3_hint)>,
+    OpBuilder<(ins "Value": $value, "Value": $dest,
+                   "ArrayRef<OpFoldResult>": $offsets, "Value": $mask,
+                   "IntegerAttr": $chunk_size,
+                   "xegpu::CachePolicyAttr": $l1_hint,
+                   "xegpu::CachePolicyAttr": $l2_hint,
+                   "xegpu::CachePolicyAttr": $l3_hint,
+                   "xegpu::DistributeLayoutAttr": $layout)>
   ];
 
   let hasVerifier = 1;
diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
index e2c7d803e5a5e..b5d9323de47a6 100644
--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
+++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -435,7 +435,8 @@ static LogicalResult lowerToScatteredLoadOp(vector::TransferReadOp readOp,
       /*chunk_size=*/IntegerAttr{},
       /*l1_hint=*/xegpu::CachePolicyAttr{},
       /*l2_hint=*/xegpu::CachePolicyAttr{},
-      /*l3_hint=*/xegpu::CachePolicyAttr{});
+      /*l3_hint=*/xegpu::CachePolicyAttr{},
+      /*layout=*/nullptr);
 
   rewriter.replaceOp(readOp, gatherOp.getResult());
   return success();
@@ -469,7 +470,8 @@ static LogicalResult lowerToScatteredStoreOp(vector::TransferWriteOp writeOp,
      /*chunk_size=*/IntegerAttr{},
      /*l1_hint=*/xegpu::CachePolicyAttr{},
      /*l2_hint=*/xegpu::CachePolicyAttr{},
-     /*l3_hint=*/xegpu::CachePolicyAttr{});
+     /*l3_hint=*/xegpu::CachePolicyAttr{},
+     /*layout=*/nullptr);
   rewriter.eraseOp(writeOp);
   return success();
 }
@@ -621,7 +623,8 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> {
         /*chunk_size=*/IntegerAttr{},
         /*l1_hint=*/xegpu::CachePolicyAttr{},
         /*l2_hint=*/xegpu::CachePolicyAttr{},
-        /*l3_hint=*/xegpu::CachePolicyAttr{});
+        /*l3_hint=*/xegpu::CachePolicyAttr{},
+        /*layout=*/nullptr);
 
     auto selectOp =
         arith::SelectOp::create(rewriter, loc, gatherOp.getMask(),
@@ -655,7 +658,8 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> {
         /*chunk_size=*/IntegerAttr{},
         /*l1_hint=*/xegpu::CachePolicyAttr{},
         /*l2_hint=*/xegpu::CachePolicyAttr{},
-        /*l3_hint=*/xegpu::CachePolicyAttr{});
+        /*l3_hint=*/xegpu::CachePolicyAttr{},
+        /*layout=*/nullptr);
     rewriter.eraseOp(scatterOp);
     return success();
   }
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index abd12e2e69ac0..2a7c7ac7e8cde 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -859,7 +859,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
                          xegpu::CachePolicyAttr l2_hint,
                          xegpu::CachePolicyAttr l3_hint) {
   build(builder, state, valueType, source, Value(), mask, IntegerAttr(),
-        l1_hint, l2_hint, l3_hint);
+        l1_hint, l2_hint, l3_hint, /*layout=*/nullptr);
 }
 
 void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
@@ -875,7 +875,24 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
   auto offset = vector::FromElementsOp::create(builder, loc, type, values);
 
   build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint,
-        l2_hint, l3_hint);
+        l2_hint, l3_hint, /*layout=*/nullptr);
+}
+
+void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
+                         Type valueType, Value source,
+                         ArrayRef<OpFoldResult> offsets, Value mask,
+                         IntegerAttr chunk_size, xegpu::CachePolicyAttr l1_hint,
+                         xegpu::CachePolicyAttr l2_hint,
+                         xegpu::CachePolicyAttr l3_hint,
+                         DistributeLayoutAttr layout) {
+  auto loc = source.getLoc();
+  int64_t size = static_cast<int64_t>(offsets.size());
+  auto type = VectorType::get(size, builder.getIndexType());
+  auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
+  auto offset = vector::FromElementsOp::create(builder, loc, type, values);
+
+  build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint,
+        l2_hint, l3_hint, layout);
 }
 
 //===----------------------------------------------------------------------===//
@@ -926,7 +943,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
                            xegpu::CachePolicyAttr l2_hint,
                            xegpu::CachePolicyAttr l3_hint) {
   build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint,
-        l2_hint, l3_hint);
+        l2_hint, l3_hint, /*layout=*/nullptr);
 }
 
 void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
@@ -944,7 +961,23 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state,
 
   // Call the correct builder overload that does not expect result types.
   build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint,
-        l3_hint);
+        l3_hint, /*layout=*/nullptr);
+}
+
+void StoreScatterOp::build(
+    OpBuilder &builder, OperationState &state, Value value, Value dest,
+    ArrayRef<OpFoldResult> offsets, Value mask, IntegerAttr chunk_size,
+    xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint,
+    xegpu::CachePolicyAttr l3_hint, DistributeLayoutAttr layout) {
+  auto loc = dest.getLoc();
+  int64_t size = static_cast<int64_t>(offsets.size());
+  auto type = VectorType::get(size, builder.getIndexType());
+  auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
+  auto offset = vector::FromElementsOp::create(builder, loc, type, values);
+
+  // Call the correct builder overload that does not expect result types.
+  build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint,
+        l3_hint, layout);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index e6e71cc29a80a..c3bf9606693a8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -678,12 +678,16 @@ struct UnrollLoadGatherOpWithOffset
           pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter);
     }
 
+    auto layout = dyn_cast_if_present<xegpu::DistributeLayoutAttr>(op.getLayoutAttr());
+    if (layout)
+      layout = layout.dropInstData();
+
     SmallVector<Value> newOps;
     for (auto [o, m] : llvm::zip(convertedOffsets, convertedMasks)) {
       auto newOp = xegpu::LoadGatherOp::create(
           rewriter, loc, newValueTy, op.getSource(), o, m,
          rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(),
-          op.getL2HintAttr(), op.getL3HintAttr());
+          op.getL2HintAttr(), op.getL3HintAttr(), layout);
       newOps.push_back(newOp);
     }
 
@@ -774,12 +778,16 @@ struct UnrollStoreScatterOpWithOffsets
     SmallVector<Value> convertedValues =
         pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
 
+    auto layout = dyn_cast_if_present<xegpu::DistributeLayoutAttr>(op.getLayoutAttr());
+    if (layout)
+      layout = layout.dropInstData();
+
     for (auto [v, o, m] :
          llvm::zip(convertedValues, convertedOffsets, convertedMasks)) {
       xegpu::StoreScatterOp::create(rewriter, loc, v, op.getDest(), o, m,
                                     rewriter.getI64IntegerAttr(chunkSize),
                                     op.getL1HintAttr(), op.getL2HintAttr(),
-                                    op.getL3HintAttr());
+                                    op.getL3HintAttr(), layout);
     }
 
     rewriter.eraseOp(op);
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 9fc5ad9af5c7b..ceeafbd7bd5e5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -914,9 +914,8 @@ struct WgToSgLoadGatherOpWithOffset
          llvm::zip(adaptor.getOffsets(), adaptor.getMask())) {
       auto newLoadOp = xegpu::LoadGatherOp::create(
           rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr,
-          op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr());
-      xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0),
-                                     layout.dropSgLayoutAndData());
+          op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
+          layout.dropSgLayoutAndData());
       newLoadOps.push_back(newLoadOp);
     }
     rewriter.replaceOpWithMultiple(op, {newLoadOps});
@@ -964,7 +963,8 @@ struct WgToSgStoreScatterOpWithOffset
          adaptor.getValue(), adaptor.getOffsets(), adaptor.getMask())) {
       auto store = xegpu::StoreScatterOp::create(
           rewriter, loc, val, op.getDest(), offs, mask, chunkSizeAttr,
-          op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr());
+          op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
+          layout.dropSgLayoutAndData());
       // Update the layout attribute to drop sg_layout and sg_data.
       if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
           !layout.getEffectiveInstDataAsInt().empty()) {
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index b4605cd7e94d6..6e918f162a5ea 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -105,12 +105,22 @@ mlir::xegpu::getDistributedVectorType(VectorType originalType,
 std::string xegpu::getLayoutName(const OpOperand &operand) {
   const StringRef prefix("layout_operand_");
   unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
-  return llvm::formatv("{0}{1}", prefix, idx).str();
+  auto owner = operand.getOwner();
+  auto tempLayout = llvm::formatv("{0}{1}", prefix, idx).str();
+  if (isa<xegpu::StoreScatterOp>(operand.getOwner()) && idx == 0 &&
+      !owner->hasAttr(tempLayout))
+    return "layout";
+  return tempLayout;
 }
 
 std::string xegpu::getLayoutName(const OpResult result) {
   const StringRef prefix = "layout_result_";
-  return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
+  auto owner = result.getOwner();
+  auto tempLayout =
+      llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
+  if (isa<xegpu::LoadGatherOp>(owner) && !owner->hasAttr(tempLayout))
+    return "layout";
+  return tempLayout;
 }
 
 xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
@@ -144,6 +154,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
     std::string layoutName = getLayoutName(result);
     if (defOp->hasAttr(layoutName))
       return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+
+    // Check for the "permanent" layout only after the "temporary" layout name
+    // lookup, for backward compatibility.
+    if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
+      return loadGatherOp.getLayoutAttr();
   }
 
   if (auto arg = dyn_cast<BlockArgument>(value)) {
@@ -171,6 +186,13 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
   std::string layoutName = xegpu::getLayoutName(opr);
   if (op->hasAttr(layoutName))
     return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+
+  // Check for the "permanent" layout only after the "temporary" layout name
+  // lookup, for backward compatibility.
+  if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
+    if (auto layout = storeScatterOp.getLayoutAttr())
+      return layout;
+
   return getDistributeLayoutAttr(opr.get());
 }
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 30f785ded975a..3e2644beffc35 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -97,7 +97,7 @@ func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor
 // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
 // CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> ->
 // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}>
 // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16>
 func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
@@ -122,7 +122,7 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256
 // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
 // CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> ->
 // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} :
+// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> :
 // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32>
 func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
@@ -167,8 +167,8 @@ func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) {
 // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
 // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
-// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}>
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64,
+// CHECK-SAME: layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
 // CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
 func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
   %1 = arith.constant dense<1>: vector<16xi1>
@@ -186,7 +186,7 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
 // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
 // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
 // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]]
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK-SAME: <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
 // CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
 func.func @scatter_ops(%src: memref<256xf16>) {
   %1 = arith.constant dense<1>: vector<16xi1>
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index f233dff609f2b..e602163ff5aaa 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -462,6 +462,47 @@ gpu.module @xevm_module{
   }
 }
 
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops_chunksize_perm_layout({{.*}}) {
+// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
+// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1>
+// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
+// CHECK-SAME: -> (vector<1x8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
+// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] :
+// CHECK-SAME: vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
+// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+gpu.module @xevm_module{
+  gpu.func @scatter_ops_chunksize_perm_layout(%laneid: index, %src: memref<256xf16>) {
+    gpu.warp_execute_on_lane_0(%laneid)[16] {
+      %1 = arith.constant
+        {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+        dense<1>: vector<16xi1>
+      %offset = arith.constant
+        {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+        dense<12> : vector<16xindex>
+      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
+        {
+          layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+          layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+          layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+        }
+        : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+      xegpu.store %3, %src[%offset], %1 <{chunk_size=8}>
+        {
+          layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>,
+          layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+          layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+        }
+        : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+    }
+    gpu.return
+  }
+}
+
 // -----
 // CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
 // CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
@@ -502,6 +543,46 @@ gpu.module @xevm_module{
   }
 }
 
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops_perm_layout({{.*}}) {
+// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
+// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1>
+// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
+// CHECK-SAME: -> (vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
+// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]]
+// CHECK-SAME: : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3
+// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+gpu.module @xevm_module{
+  gpu.func @scatter_ops_perm_layout(%src: memref<256xf16>, %laneid: index) {
+    gpu.warp_execute_on_lane_0(%laneid)[16] {
+      %1 = arith.constant
+        {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+        dense<1> : vector<16xi1>
+      %offset = arith.constant
+        {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
+        dense<12> : vector<16xindex>
+      %3 = xegpu.load %src[%offset], %1
+        {
+          layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+          layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+          layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+        } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+      xegpu.store %3, %src[%offset], %1
+        {
+          layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+          layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
+          layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
+        }
+        : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+    }
+    gpu.return
+  }
+}
+
 // -----
 // CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
 // CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) {
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 27a3dc373c739..50db5d0c5189d 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -151,6 +151,43 @@ gpu.module @xevm_module{
   }
 }
 
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops_scf_yield_perm_layout
+// CHECK: (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16>
+// CHECK-DAG: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) {
+// CHECK-NEXT: %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16>
+// CHECK-NEXT: scf.yield %[[LD_CAST]] : vector<1x8xf16>
+// CHECK-NEXT: } else {
+// CHECK-NEXT: scf.yield %[[CST]] : vector<1x8xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+gpu.module @xevm_module{
+  gpu.func @scatter_ops_scf_yield_perm_layout(%src: memref<256xf16>, %pred : i1) {
+    %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
+    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+    %loaded = scf.if %pred -> (vector<16x8xf16>) {
+      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8,
+        layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+      }> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+      scf.yield %3 : vector<16x8xf16>
+    } else {
+      %3 = arith.constant {
+        layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+      } dense<12.> : vector<16x8xf16>
+      scf.yield %3 : vector<16x8xf16>
+    } { layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> }
+    xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+    gpu.return
+  }
+}
+
 // -----
 // CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) {
 // CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
@@ -177,6 +214,32 @@ gpu.module @xevm_module{
     }
     gpu.return
   }
 }
 
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield_perm_layout({{.*}}) {
+// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
+// CHECK: scf.if %[[PREDICATE]] {
+// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+// CHECK-NEXT: }
+gpu.module @xevm_module{
+  gpu.func @scatter_ops_scf_non_yield_perm_layout(%src: memref<256xf16>) {
+    %pred = llvm.mlir.poison : i1
+    %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
+    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+    scf.if %pred {
+      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8,
+        layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+      }> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+      xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+    }
+    gpu.return
+  }
+}
+
 // -----
 // CHECK-LABEL: gpu.func @mma_transpose_b(
 // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 7e742af754fbe..a87ac4945ffa6 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -605,6 +605,26 @@ gpu.module @test_kernel {
   }
 }
 
+// -----
+gpu.module @test_kernel {
+  // CHECK-LABEL: load_with_offsets_perm_layout
+  // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+  gpu.func @load_with_offsets_perm_layout(%src: ui64) -> vector<32xf32> {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248
+    ]> : vector<32xindex>
+
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+    %ld = xegpu.load %src[%cst], %mask <{chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
+
+    gpu.return %ld : vector<32xf32>
+  }
+}
+
 // -----
 gpu.module @test_kernel {
   // CHECK-LABEL: store_with_offsets
@@ -630,6 +650,31 @@ gpu.module @test_kernel {
   }
 }
 
+// -----
+gpu.module @test_kernel {
+  // CHECK-LABEL: store_with_offsets_perm_layout
+  // CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1>
+  gpu.func @store_with_offsets_perm_layout(%src: ui64) {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248
+    ]> : vector<32xindex>
+
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+
+    %st_vec = arith.constant dense<1023.0>: vector<32xf32>
+    xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>,
+                layout_operand_2 = #xegpu.layout<inst_data = [16]>,
+                layout_operand_3 = #xegpu.layout<inst_data = [16]>,
+                l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>
+
+    gpu.return
+  }
+}
+
 // -----
 gpu.module @test_kernel {
   // CHECK-LABEL: load_with_offsets_chunk
@@ -654,6 +699,30 @@ gpu.module @test_kernel {
   }
 }
 
+// -----
+gpu.module @test_kernel {
+  // CHECK-LABEL: load_with_offsets_chunk_perm_layout
+  // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32x4xf32>
+  // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
+  // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
+  // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
+  // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
+  // CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>
+  gpu.func @load_with_offsets_chunk_perm_layout(%src: ui64) -> vector<32x4xf32> {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248
+    ]> : vector<32xindex>
+
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+    %ld = xegpu.load %src[%cst], %mask <{chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>,
+           l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
+    gpu.return %ld : vector<32x4xf32>
+  }
+}
+
 // -----
 gpu.module @test_kernel {
   // CHECK-LABEL: store_with_offsets_chunk
@@ -683,6 +752,35 @@ gpu.module @test_kernel {
   }
 }
 
+// -----
+gpu.module @test_kernel {
+  // CHECK-LABEL: store_with_offsets_chunk_perm_layout
+  // CHECK: [[cst:%.+]] = arith.constant dense<1.023000e+03> : vector<16x2xf32>
+  // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
+  // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
+  // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
+  // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
+  // CHECK-COUNT-4: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1>
+  gpu.func @store_with_offsets_chunk_perm_layout(%src: ui64) {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248
+    ]> : vector<32xindex>
+
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+
+    %st_vec = arith.constant dense<1023.>: vector<32x4xf32>
+    xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>,
+      layout_operand_2 = #xegpu.layout<inst_data = [16]>,
+      layout_operand_3 = #xegpu.layout<inst_data = [16]>,
+      l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
+    gpu.return
+  }
+}
+
 // -----
 gpu.module @test_kernel {
   // CHECK-LABEL: remove_unit_dim_inst_data
@@ -710,6 +808,33 @@ gpu.module @test_kernel {
   }
 }
 
+// -----
+gpu.module @test_kernel {
+  // CHECK-LABEL: remove_unit_dim_inst_data_perm_layout
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x1x32xf32>
+  // CHECK: [[cst_0:%.+]] = arith.constant dense<true> : vector<16xi1>
+  // CHECK: [[cst_1:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
+  // CHECK: [[cst_2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
+  // CHECK: [[ld_0:%.+]] = xegpu.load [[arg0]][[[cst_1]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+  // CHECK: [[ld_1:%.+]] = xegpu.load [[arg0]][[[cst_2]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+  // CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
+  // CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
+  gpu.func @remove_unit_dim_inst_data_perm_layout(%src: ui64) -> vector<1x1x32xf32> {
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<[[
+      [0,   8,  16,  24,  32,  40,  48,  56,
+       64,  72,  80,  88,  96, 104, 112, 120,
+       128, 136, 144, 152, 160, 168, 176, 184,
+       192, 200, 208, 216, 224, 232, 240, 248]
+    ]]> : vector<1x1x32xindex>
+
+    %mask = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<true> : vector<1x1x32xi1>
+    %ld = xegpu.load %src[%cst], %mask <{chunk_size = 1, layout = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+
+    gpu.return %ld : vector<1x1x32xf32>
+  }
+}
+
 // -----
 #l = #xegpu.layout
 gpu.module @test_kernel {
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index dbc52b8a98894..1a924e36cb2e6 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -231,6 +231,28 @@ gpu.module @test {
     gpu.return %ld : vector<32xf32>
   }
 
+//-----
+
+
+  // CHECK-LABEL: load_with_offsets_perm_layout
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+  gpu.func @load_with_offsets_perm_layout(%src: ui64) -> vector<32xf32> {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248
+    ]> : vector<32xindex>
+
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+    %ld = xegpu.load %src[%cst], %mask <{chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>
+
+    gpu.return %ld : vector<32xf32>
+  }
+
 //-----
 
   // CHECK-LABEL: prefetch
@@ -385,6 +407,29 @@ gpu.module @test {
     gpu.return %ld : vector<32x4xf32>
   }
 
+//-----
+  // CHECK-LABEL: load_with_offsets_chunk_perm_layout
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32x4xf32>
+  // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
+  // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
+  // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
+  // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
+  // CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>
+  gpu.func @load_with_offsets_chunk_perm_layout(%src: ui64) -> vector<32x4xf32> {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248
+    ]> : vector<32xindex>
+
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+    %ld = xegpu.load %src[%cst], %mask <{chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
+    gpu.return %ld : vector<32x4xf32>
+  }
+
 //-----
 
   // CHECK-LABEL: store_chunk
   // CHECK-SAME: [[arg0:%.+]]: ui64
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 742d11f8052ec..689b2dbe313a2 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -279,14 +279,28 @@ gpu.module @test_distribution {
     gpu.return
   }
 
+  // CHECK-LABEL: @load_gather_perm_layout
+  // CHECK-SAME: %[[ARG0:.*]]: memref<?xf16>
+  gpu.func @load_gather_perm_layout(%src : memref<?xf16>) {
+    // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x4xindex>
+    // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<32x4xi1>
+    // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
+    // CHECK-SAME: : memref<?xf16>, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16>
+    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<0> : vector<256x16xindex>
+    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>} dense<1> : vector<256x16xi1>
+    %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 4]>, l1_hint = #xegpu.cache_hint<cached>}
+      : memref<?xf16>, vector<256x16xindex>, vector<256x16xi1> -> vector<256x16xf16>
+    gpu.return
+  }
+
   // CHECK-LABEL: @store_scatter
   // CHECK-SAME: %[[ARG0:.*]]: memref<256xf16>
   gpu.func @store_scatter(%dest : memref<256xf16>) {
     // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16>
     // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex>
     // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1>
-    // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}>
-    // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>,
+    // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
+    // CHECK-SAME: {layout_operand_2 = #xegpu.layout<inst_data = [8]>,
     // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>}
     // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
     %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16>
@@ -300,6 +314,27 @@ gpu.module @test_distribution {
     gpu.return
   }
 
+  // CHECK-LABEL: @store_scatter_perm_layout
+  // CHECK-SAME: %[[ARG0:.*]]: memref<256xf16>
+  gpu.func @store_scatter_perm_layout(%dest : memref<256xf16>) {
+    // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16>
+    // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex>
+    // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1>
+    // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}>
+    // CHECK-SAME: {layout_operand_2 = #xegpu.layout<inst_data = [8]>,
+    // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>}
+    // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1>
+    %val = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<25.5> : vector<256xf16>
+    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<0> : vector<256xindex>
+    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>} dense<1> : vector<256xi1>
+    xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
+      layout_operand_2 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
+      layout_operand_3 = #xegpu.layout<sg_layout = [32], sg_data = [8], inst_data = [8]>,
+      l1_hint = #xegpu.cache_hint<cached>}
+      : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1>
+    gpu.return
+  }
+
   // CHECK-LABEL: @load_with_non_unit_chunk_size
   // CHECK-SAME: %[[ARG0:.*]]: memref<?xf16>
   gpu.func @load_with_non_unit_chunk_size(%src : memref<?xf16>) {
@@ -314,6 +349,20 @@ gpu.module @test_distribution {
     gpu.return
   }
 
+  // CHECK-LABEL: @load_with_non_unit_chunk_size_perm_layout
+  // CHECK-SAME: %[[ARG0:.*]]: memref<?xf16>
+  gpu.func @load_with_non_unit_chunk_size_perm_layout(%src : memref<?xf16>) {
+    // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex>
+    // CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<8xi1>
+    // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint<cached>}>
+    // CHECK-SAME: : memref<?xf16>, vector<8xindex>, vector<8xi1> -> vector<8x4xf16>
+    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<0> : vector<256xindex>
+    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [8]>} dense<1> : vector<256xi1>
+    %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout = #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 4]>, l1_hint = #xegpu.cache_hint<cached>}
+      : memref<?xf16>, vector<256xindex>, vector<256xi1> -> vector<256x4xf16>
+    gpu.return
+  }
+
   // CHECK-LABEL: distribute_load_matrix
   // CHECK-SAME: [[arg0:%.+]]: memref<32768xi8, 3>
   gpu.func @distribute_load_matrix(%arg0: memref<32768xi8, 3>) {
@@ -407,6 +456,18 @@ gpu.module @test_distribution {
       : vector<4x2x6x32xf16> to vector<4x2x6xf16>
     gpu.return
   }
+
+  // CHECK-LABEL: @vector_reduce_4D_perm_layout
+  gpu.func @vector_reduce_4D_perm_layout(%src: ui64) {
+    %cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} dense<0.0> : vector<4x2x6xf16>
+    %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<0> : vector<4x2x6x32xindex>
+    %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} dense<true> : vector<4x2x6x32xi1>
+    %load = xegpu.load %src[%offset], %mask <{layout = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>}> : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16>
+    // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16>
+    %reduce = vector.multi_reduction <add>, %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>, dims = [3]>} [3]
+      : vector<4x2x6x32xf16> to vector<4x2x6xf16>
+    gpu.return
+  }
 
   // CHECK-LABEL: vector_step_op
   gpu.func @vector_step_op_slice_attr() {