diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 31fe93d209a6d..85d0d5d8e7c22 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -77,11 +77,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface For the case of dynamic memrefs or pointer, the shape and layout information of the memory region should be explicitly passed via `shape` and `strides` parameters. - - `offsets`: [optional] index values represents offsets from the "source" at the each dimension - at which the subview of the target memory will be created. It is encoded via - "offsets" and "const_offsets", such that it can accept various forms, such as, - operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]). Offsets is optional and may be set at load_nd, store_nd, and prefetch_nd. - - `shape`: the shape information of the memory region pointed by the "source". It is typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. But if "source" is simply a pointer represented as uint64_t type, or a memref @@ -100,41 +95,34 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface Example 1 (suppose the tensor shape inferred by the compiler is 8x16): ```mlir %0 = memref.alloc() : memref<1024x1024xf32> - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32> + %1 = xegpu.create_nd_tdesc %0 : memref<1024x1024xf32> -> TensorDesc<8x16xf32> ``` Example 2 (suppose the tensor shape inferred by the compiler is 8x16): ```mlir %0 = memref.alloc(%h, %w) : memref - %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref -> TensorDesc<8x16xf32> + %1 = xegpu.create_nd_tdesc %0, shape:[%h, %w], strides:[%w, %c1]: memref -> TensorDesc<8x16xf32> ``` Example 3 (suppose the tensor shape inferred by the compiler is 8x16): ```mlir %0 = ... : ui64 - %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32> + %1 = xegpu.create_nd_tdesc %0, shape:[%h, %w], strides:[%w, %c1]: ui64 -> TensorDesc<8x16xf32> ``` }]; let arguments = (ins XeGPU_BaseAddrType: $source, - Variadic: $offsets, Variadic: $shape, Variadic: $strides, - OptionalAttr: $const_offsets, OptionalAttr: $const_shape, OptionalAttr: $const_strides ); let assemblyFormat = [{ $source `` - custom($offsets, $const_offsets) (`,` `shape` `:` custom($shape, $const_shape)^ `,` `strides``:` custom($strides, $const_strides))? attr-dict `:` type($source) `->` qualified(type($TensorDesc)) @@ -148,14 +136,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface OpBuilder<(ins "Type": $tdesc, "TypedValue": $source)>, OpBuilder<(ins "Type": $tdesc, "Value ": $source, - "llvm::ArrayRef": $shape, - "llvm::ArrayRef": $strides)>, - - OpBuilder<(ins "Type": $tdesc, "TypedValue": $source, - "llvm::ArrayRef": $offsets)>, - - OpBuilder<(ins "Type": $tdesc, "Value": $source, - "llvm::ArrayRef": $offsets, "llvm::ArrayRef": $shape, "llvm::ArrayRef": $strides)> ]; @@ -181,14 +161,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface return getType().getShape(); } - SmallVector getMixedOffsets() { - auto statics = getConstOffsets().value_or(SmallVector()); - auto dynamics = getOffsets(); - if (statics.size() == 0 && dynamics.size() == 0) - return {}; - return getMixedValues(statics, dynamics, getContext()); - } - SmallVector getMixedSizes() { SmallVector statics; @@ -261,7 +233,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> { - `TensorDesc`: A tensor descriptor specifying the base nd-region of memory and tensor tile to be prefetched. - - `offsets`: [optional] index values representing per-dimension offsets from the + - `offsets`: index values representing per-dimension offsets from the base position encoded in `TensorDesc`. It is encoded via "offsets" and "const_offsets". @@ -286,7 +258,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> { let arguments = (ins XeGPU_TensorDesc: $TensorDesc, Variadic: $offsets, - OptionalAttr: $const_offsets, + DenseI64ArrayAttr: $const_offsets, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint, @@ -306,11 +278,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> { } SmallVector getMixedOffsets() { - auto statics = getConstOffsets().value_or(SmallVector()); - auto dynamics = getOffsets(); - if (statics.size() == 0 && dynamics.size() == 0) - return {}; - return getMixedValues(statics, dynamics, getContext()); + return getMixedValues(getConstOffsets(), getOffsets(), getContext()); } xegpu::DistributeLayoutAttr getDescLayoutAttr() { @@ -325,15 +293,11 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", [AnchorLayoutInterface]> { let assemblyFormat = [{ $TensorDesc `` - custom($offsets, $const_offsets) + custom($offsets, $const_offsets) prop-dict attr-dict `:` qualified(type($TensorDesc)) }]; let builders = [ - OpBuilder<(ins "Value": $TensorDesc, - "xegpu::CachePolicyAttr": $l1_hint, - "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)>, OpBuilder<(ins "Value": $TensorDesc, "ArrayRef": $offsets, "xegpu::CachePolicyAttr": $l1_hint, @@ -409,7 +373,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ let arguments = (ins XeGPU_TensorDesc: $TensorDesc, Variadic: $offsets, - OptionalAttr: $const_offsets, + DenseI64ArrayAttr: $const_offsets, OptionalAttr: $packed, OptionalAttr: $transpose, OptionalAttr: $l1_hint, @@ -437,11 +401,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ } SmallVector getMixedOffsets() { - auto statics = getConstOffsets().value_or(SmallVector()); - auto dynamics = getOffsets(); - if (statics.size() == 0 && dynamics.size() == 0) - return {}; - return getMixedValues(statics, dynamics, getContext()); + return getMixedValues(getConstOffsets(), getOffsets(), getContext()); } xegpu::DistributeLayoutAttr getDescLayoutAttr() { @@ -456,16 +416,11 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [ let assemblyFormat = [{ $TensorDesc `` - custom($offsets, $const_offsets) + custom($offsets, $const_offsets) prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value) }]; let builders = [ - OpBuilder<(ins "Type": $value, "Value": $TensorDesc, - "UnitAttr": $packed, "DenseI64ArrayAttr": $transpose, - "xegpu::CachePolicyAttr": $l1_hint, - "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)>, OpBuilder<(ins "Type": $value, "Value": $TensorDesc, "ArrayRef": $offsets, "UnitAttr": $packed, "DenseI64ArrayAttr": $transpose, @@ -534,7 +489,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ let arguments = (ins XeGPU_ValueType: $value, XeGPU_TensorDesc: $TensorDesc, Variadic: $offsets, - OptionalAttr: $const_offsets, + DenseI64ArrayAttr: $const_offsets, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint, @@ -558,11 +513,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ } SmallVector getMixedOffsets() { - auto statics = getConstOffsets().value_or(SmallVector()); - auto dynamics = getOffsets(); - if (statics.size() == 0 && dynamics.size() == 0) - return {}; - return getMixedValues(statics, dynamics, getContext()); + return getMixedValues(getConstOffsets(), getOffsets(), getContext()); } xegpu::DistributeLayoutAttr getDescLayoutAttr() { @@ -578,15 +529,11 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ let assemblyFormat = [{ $value `,` $TensorDesc `` - custom($offsets, $const_offsets) + custom($offsets, $const_offsets) prop-dict attr-dict `:` type($value) `,` qualified(type($TensorDesc)) }]; let builders = [ - OpBuilder<(ins "Value": $value, "Value": $TensorDesc, - "xegpu::CachePolicyAttr": $l1_hint, - "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)>, OpBuilder<(ins "Value": $value, "Value": $TensorDesc, "ArrayRef": $offsets, "xegpu::CachePolicyAttr": $l1_hint, @@ -599,55 +546,6 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [ let hasVerifier = 1; } -def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset", - [Pure, AllTypesMatch<["TensorDesc", "result"]>]> { - let summary = "It updates the offsets for the TensorDesc."; - let description = [{The op updates the offset of the given TensorDesc. - The offsets are relative offset to the current position in the number - of elements. It will result in a same type TensorDesc as the input. - - Example: - ``` - %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32> - ``` - }]; - - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - Variadic: $offsets, - DenseI64ArrayAttr: $const_offsets); - - let results = (outs XeGPU_TensorDesc: $result); - - let extraClassDeclaration = extraBaseClassDeclaration # [{ - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } - - SmallVector getMixedOffsets() { - Builder b(getContext()); - return getMixedValues(getConstOffsets(), getOffsets(), b); - } - - size_t getNumOffsets() { - return getMixedOffsets().size(); - } - - OpFoldResult getOffset(unsigned idx) { - assert(idx < getNumOffsets() && "Invalid out of bound access."); - return getMixedOffsets()[idx]; - } - }]; - - let assemblyFormat = [{ - $TensorDesc `,` - custom($offsets, $const_offsets) - attr-dict `:` qualified(type($result)) - }]; - - let hasVerifier = 1; -} - def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { let summary = "prefetches a set of scattered data points to cache"; @@ -679,12 +577,14 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { Example 1 (Workgroup level): ```mlir - xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint, + %a = memref.alloc() : memref<256xf16> + %offsets = arith.constant dense<[0, 1, ..., 255]> : vector<256xindex> + xegpu.prefetch %a[%offsets] {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, layout = #xegpu.layout } - : !xegpu.tensor_desc<256xf16> + : memref<256xf16>, vector<256xindex> ``` Example 2 (lane level): @@ -720,8 +620,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { }]; - let arguments = (ins XeGPU_GatherScatterSourceType:$source, - Optional>:$offsets, + let arguments = (ins XeGPU_GatherScatterBaseAddrType:$source, + AnyTypeOf<[XeGPU_OffsetType, Index]>:$offsets, OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, @@ -741,33 +641,14 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { setLayoutAttr(anchorLayout); } - TypedValue getTensorDesc() { - if (auto tdescType = getTensorDescType()) { - return llvm::cast>(getSource()); - } - return TypedValue(); - } - - xegpu::TensorDescType getTensorDescType() { - return dyn_cast(getSourceType()); - } - }]; let assemblyFormat = [{ - $source - (`[` $offsets^ `]`)? + $source `[` $offsets `]` prop-dict attr-dict `:` type(operands) }]; - let builders = [ - OpBuilder<(ins "Value": $source, - "xegpu::CachePolicyAttr": $l1_hint, - "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> - ]; - let hasVerifier = 1; } @@ -837,8 +718,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou }]; - let arguments = (ins XeGPU_GatherScatterSourceType:$source, - Optional>:$offsets, + let arguments = (ins XeGPU_GatherScatterBaseAddrType:$source, + AnyTypeOf<[XeGPU_OffsetType, Index]>:$offsets, AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr:$chunk_size, OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, @@ -860,17 +741,6 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou setLayoutAttr(anchorLayout); } - TypedValue getTensorDesc() { - if (auto tdescType = getTensorDescType()) { - return llvm::cast>(getSource()); - } - return TypedValue(); - } - - xegpu::TensorDescType getTensorDescType() { - return dyn_cast(getSourceType()); - } - mlir::Type getElementType() { auto type = getValue().getType(); return getElementTypeOrSelf(type); @@ -887,17 +757,12 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou }]; let assemblyFormat = [{ - $source - (`[` $offsets^ `]`)? `,` + $source `[` $offsets `]` `,` $mask prop-dict attr-dict `:` type(operands) `->` type($value) }]; let builders = [ - OpBuilder<(ins "Type": $value, "Value": $source, "Value": $mask, - "xegpu::CachePolicyAttr": $l1_hint, - "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)>, OpBuilder<(ins "Type": $value, "Value": $source, "ArrayRef": $offsets, "Value": $mask, "IntegerAttr": $chunk_size, @@ -983,8 +848,8 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL }]; let arguments = (ins XeGPU_ValueOrScalarType:$value, - XeGPU_GatherScatterSourceType:$dest, - Optional>:$offsets, + XeGPU_GatherScatterBaseAddrType:$dest, + AnyTypeOf<[XeGPU_OffsetType, Index]>:$offsets, AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr:$chunk_size, OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, @@ -1004,17 +869,6 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL setLayoutAttr(anchorLayout); } - TypedValue getTensorDesc() { - if (auto tdescType = getTensorDescType()) { - return llvm::cast>(getDest()); - } - return TypedValue(); - } - - xegpu::TensorDescType getTensorDescType() { - return dyn_cast(getDestType()); - } - mlir::Type getElementType() { auto type = getValue().getType(); return getElementTypeOrSelf(type); @@ -1031,18 +885,13 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL let assemblyFormat = [{ $value `,` - $dest - (`[` $offsets^ `]`)? `,` + $dest `[` $offsets `]` `,` $mask prop-dict attr-dict `:` type(operands) }]; let builders = [ - OpBuilder<(ins "Value": $value, "Value": $dest, "Value": $mask, - "xegpu::CachePolicyAttr": $l1_hint, - "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)>, OpBuilder<(ins "Value": $value, "Value": $dest, "ArrayRef": $offsets, "Value": $mask, "IntegerAttr": $chunk_size, diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index f529ba73f942f..95a3e22cf803b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -174,9 +174,6 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", let genVerifyDecl = 1; } -def XeGPU_GatherScatterSourceType - : AnyTypeOf<[XeGPU_TensorDesc, XeGPU_GatherScatterBaseAddrType]>; - def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> { let summary = "!xegpu.nbarrier a custom XeGPU type representing a barrier."; diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp index 50eba56a16080..d9ff452fb4db3 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp +++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp @@ -186,9 +186,6 @@ class CreateNdDescToXeVMPattern matchAndRewrite(xegpu::CreateNdDescOp op, xegpu::CreateNdDescOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - SmallVector mixedOffsets = op.getMixedOffsets(); - if (mixedOffsets.size() != 0) - return rewriter.notifyMatchFailure(op, "Offsets not supported."); auto loc = op.getLoc(); auto source = op.getSource(); // Op is lowered to a code sequence that populates payload. @@ -545,7 +542,6 @@ class LoadStoreToXeVMPattern : public OpConversionPattern { return rewriter.notifyMatchFailure(op, "Expected offset to be provided."); auto loc = op.getLoc(); auto ctxt = rewriter.getContext(); - auto tdescTy = op.getTensorDescType(); Value basePtrI64; // Load result or Store valye Type can be vector or scalar. Type valOrResTy; @@ -567,10 +563,6 @@ class LoadStoreToXeVMPattern : public OpConversionPattern { // Default memory space is global. LLVM::LLVMPointerType ptrTypeLLVM = LLVM::LLVMPointerType::get( ctxt, getNumericXeVMAddrSpace(xegpu::MemorySpace::Global)); - // If tensor descriptor is available, we use its memory space. - if (tdescTy) - ptrTypeLLVM = LLVM::LLVMPointerType::get( - ctxt, getNumericXeVMAddrSpace(tdescTy.getMemorySpace())); // Base pointer can come from source (load) or dest (store). // If they are memrefs, we use their memory space. if constexpr (std::is_same_v) { @@ -805,7 +797,6 @@ class PrefetchToXeVMPattern : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto ctxt = rewriter.getContext(); - auto tdescTy = op.getTensorDescType(); Value basePtrI64 = adaptor.getSource(); // Base pointer is passed as i32 or i64 by adaptor, cast to i64 if needed. if (basePtrI64.getType() != rewriter.getI64Type()) @@ -821,12 +812,8 @@ class PrefetchToXeVMPattern : public OpConversionPattern { } else { int64_t elemBitWidth{0}; int64_t elemByteSize; - // Element byte size can come from three sources: - if (tdescTy) { - // If tensor descriptor is available, we use its element type to - // determine element byte size. - elemBitWidth = tdescTy.getElementType().getIntOrFloatBitWidth(); - } else if (auto memRefTy = dyn_cast(op.getSourceType())) { + // Element byte size can come from two sources: + if (auto memRefTy = dyn_cast(op.getSourceType())) { // If memref is available, we use its element type to // determine element byte size. elemBitWidth = memRefTy.getElementType().getIntOrFloatBitWidth(); @@ -847,10 +834,6 @@ class PrefetchToXeVMPattern : public OpConversionPattern { // Default memory space is global. LLVM::LLVMPointerType ptrTypeLLVM = LLVM::LLVMPointerType::get( ctxt, getNumericXeVMAddrSpace(xegpu::MemorySpace::Global)); - // If tensor descriptor is available, we use its memory space. - if (tdescTy) - ptrTypeLLVM = LLVM::LLVMPointerType::get( - ctxt, getNumericXeVMAddrSpace(tdescTy.getMemorySpace())); // If source is a memref, we use its memory space. if (auto memRefTy = dyn_cast(op.getSource().getType())) { auto addrSpace = memRefTy.getMemorySpaceAsInt(); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 7f9d0f10ece8a..4fe15c625ea49 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -192,10 +192,8 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, [[maybe_unused]] auto ty = source.getType(); assert(ty.hasStaticShape() && "expecting a memref with static shape"); - build(builder, state, tdesc, source, ValueRange({}) /* dynamic offsets */, - ValueRange({}) /* empty dynamic shape */, + build(builder, state, tdesc, source, ValueRange({}) /* empty dynamic shape */, ValueRange({}) /* empty dynamic strides */, - DenseI64ArrayAttr({}) /* const offsets */, DenseI64ArrayAttr({}) /* empty const shape*/, DenseI64ArrayAttr({}) /* empty const strides*/); } @@ -234,72 +232,8 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, } } - build(builder, state, tdesc, source, ValueRange({}), dynamicShape, - dynamicStrides, builder.getDenseI64ArrayAttr({}), staticShapeAttr, - staticStridesAttr); -} - -void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, - Type tdesc, TypedValue source, - llvm::ArrayRef offsets) { - [[maybe_unused]] auto ty = source.getType(); - assert(ty.hasStaticShape() && offsets.size() == (size_t)ty.getRank()); - - llvm::SmallVector staticOffsets; - llvm::SmallVector dynamicOffsets; - dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - - build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */, - ValueRange({}) /* empty dynamic shape */, - ValueRange({}) /* empty dynamic strides */, - builder.getDenseI64ArrayAttr(staticOffsets) /* const offsets */, - {} /* empty const shape*/, {} /* empty const strides*/); -} - -void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, - Type tdesc, Value source, - llvm::ArrayRef offsets, - llvm::ArrayRef shape, - llvm::ArrayRef strides) { - assert(!shape.empty() && !offsets.empty() && !strides.empty() && - shape.size() == strides.size() && shape.size() == offsets.size()); - - Type srcTy = source.getType(); - assert((isa(srcTy)) && - "Source has to be either int or memref."); - - llvm::SmallVector dynamicOffsets; - llvm::SmallVector dynamicShape; - llvm::SmallVector dynamicStrides; - - llvm::SmallVector staticOffsets; - llvm::SmallVector staticShape; - llvm::SmallVector staticStrides; - - dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - dispatchIndexOpFoldResults(shape, dynamicShape, staticShape); - dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides); - - auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); - auto staticShapeAttr = builder.getDenseI64ArrayAttr(staticShape); - auto staticStridesAttr = builder.getDenseI64ArrayAttr(staticStrides); - - if (auto memrefTy = dyn_cast(srcTy)) { - auto memrefShape = memrefTy.getShape(); - auto [memrefStrides, _] = memrefTy.getStridesAndOffset(); - - // if shape and strides are from Memref, we don't need attributes for them - // to keep the IR print clean (only do so for full-static case, otherwise - // printer would fail trying to print empty array-attr). - if (staticShape == memrefShape && staticStrides == memrefStrides && - dynamicShape.empty() && dynamicStrides.empty()) { - staticShapeAttr = DenseI64ArrayAttr(); - staticStridesAttr = DenseI64ArrayAttr(); - } - } - - build(builder, state, tdesc, source, dynamicOffsets, dynamicShape, - dynamicStrides, staticOffsetsAttr, staticShapeAttr, staticStridesAttr); + build(builder, state, tdesc, source, dynamicShape, dynamicStrides, + staticShapeAttr, staticStridesAttr); } LogicalResult CreateNdDescOp::verify() { @@ -318,9 +252,6 @@ LogicalResult CreateNdDescOp::verify() { << " Source: " << srcMemorySpace << ", TensorDesc: " << tdescMemorySpace; - if (size_t offsetRank = getMixedOffsets().size()) - invalidRank |= (offsetRank != rank); - // check source type matches the rank if it is a memref. // It also should have the same ElementType as TensorDesc. if (auto memrefTy = dyn_cast(getSourceType())) @@ -335,14 +266,13 @@ LogicalResult CreateNdDescOp::verify() { if (invalidRank) return emitOpError( - "Expecting the rank of shape, strides, offsets, and source (if source " + "Expecting the rank of shape, strides, and source (if source " "is a memref) should match with each other."); // check result TensorDesc rank if (getType().getRank() > (int64_t)rank) - return emitOpError( - "Expecting the TensorDesc rank is not greater than the " - "ranks of shape, strides, offsets or the memref source."); + return emitOpError("Expecting the TensorDesc rank is not greater than the " + "ranks of shape, strides or the memref source."); if (invalidElemTy) return emitOpError("TensorDesc should have the same element " @@ -351,65 +281,10 @@ LogicalResult CreateNdDescOp::verify() { return success(); } -static ParseResult parseOptionalDynamicIndexList( - OpAsmParser &parser, - SmallVectorImpl &values, - DenseI64ArrayAttr &integers, SmallVectorImpl *valueTypes = nullptr, - AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) { - - SmallVector integerVals; - auto parseIntegerOrValue = [&]() { - OpAsmParser::UnresolvedOperand operand; - auto res = parser.parseOptionalOperand(operand); - - if (res.has_value() && succeeded(res.value())) { - values.push_back(operand); - integerVals.push_back(ShapedType::kDynamic); - if (valueTypes && parser.parseColonType(valueTypes->emplace_back())) - return failure(); - } else { - int64_t integer; - if (failed(parser.parseInteger(integer))) - return failure(); - integerVals.push_back(integer); - } - return success(); - }; - - // If the optional values are given there must be left bracket - if (parser.parseOptionalLSquare().succeeded()) { - if (parser.parseCommaSeparatedList(parseIntegerOrValue) || - parser.parseRSquare()) - return parser.emitError(parser.getNameLoc()) - << "expected a list of SSA values or integers"; - integers = parser.getBuilder().getDenseI64ArrayAttr(integerVals); - return success(); - } - - return success(); -} - -static void printOptionalDynamicIndexList(OpAsmPrinter &printer, Operation *op, - OperandRange values, - DenseI64ArrayAttr integers) { - if (!integers || integers.empty()) - return; - printDynamicIndexList(printer, op, values, integers, - /*scalableFlags=*/{}, {}, AsmParser::Delimiter::Square); -} //===----------------------------------------------------------------------===// // XeGPU_PrefetchNdOp //===----------------------------------------------------------------------===// -void PrefetchNdOp::build(OpBuilder &builder, OperationState &state, - Value tensorDesc, xegpu::CachePolicyAttr l1_hint, - xegpu::CachePolicyAttr l2_hint, - xegpu::CachePolicyAttr l3_hint) { - - return build(builder, state, tensorDesc, ValueRange(), DenseI64ArrayAttr(), - l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr); -} - void PrefetchNdOp::build(OpBuilder &builder, OperationState &state, Value tensorDesc, ArrayRef offsets, xegpu::CachePolicyAttr l1_hint, @@ -440,7 +315,7 @@ LogicalResult PrefetchNdOp::verify() { int64_t tDescRank = tdescTy.getRank(); int64_t offsetSize = getMixedOffsets().size(); - if (offsetSize != 0 && offsetSize != tDescRank) + if (offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -457,18 +332,6 @@ LogicalResult PrefetchNdOp::verify() { // XeGPU_LoadNdOp //===----------------------------------------------------------------------===// -void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType, - Value tensorDesc, UnitAttr packed, - DenseI64ArrayAttr transpose, - xegpu::CachePolicyAttr l1_hint, - xegpu::CachePolicyAttr l2_hint, - xegpu::CachePolicyAttr l3_hint) { - - return build(builder, state, retType, tensorDesc, ValueRange(), - DenseI64ArrayAttr(), packed, transpose, l1_hint, l2_hint, - l3_hint, /*anchor_layout=*/nullptr); -} - void LoadNdOp::build(OpBuilder &builder, OperationState &state, Type retType, Value tensorDesc, ArrayRef offsets, UnitAttr packed, DenseI64ArrayAttr transpose, @@ -567,7 +430,7 @@ LogicalResult LoadNdOp::verify() { int64_t tDescRank = tdescTy.getRank(); int64_t offsetSize = getMixedOffsets().size(); - if (offsetSize != 0 && offsetSize != tDescRank) + if (offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -584,16 +447,6 @@ LogicalResult LoadNdOp::verify() { // XeGPU_StoreNdOp //===----------------------------------------------------------------------===// -void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value, - Value tensorDesc, xegpu::CachePolicyAttr l1_hint, - xegpu::CachePolicyAttr l2_hint, - xegpu::CachePolicyAttr l3_hint) { - - return build(builder, state, value, tensorDesc, ValueRange(), - DenseI64ArrayAttr(), l1_hint, l2_hint, l3_hint, - /*anchor_layout=*/nullptr); -} - void StoreNdOp::build(OpBuilder &builder, OperationState &state, Value value, Value tensorDesc, ArrayRef offsets, xegpu::CachePolicyAttr l1_hint, @@ -663,7 +516,7 @@ LogicalResult StoreNdOp::verify() { int64_t tDescRank = dstTy.getRank(); int64_t offsetSize = getMixedOffsets().size(); - if (offsetSize != 0 && offsetSize != tDescRank) + if (offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -676,31 +529,10 @@ LogicalResult StoreNdOp::verify() { return success(); } -//===----------------------------------------------------------------------===// -// XeGPU_UpdateNDOffsetOp -//===----------------------------------------------------------------------===// -LogicalResult UpdateNdOffsetOp::verify() { - auto ty = getTensorDescType(); - - // number of offsets specified must match the rank of the tensor descriptor - if (ty.getRank() != (int64_t)getNumOffsets()) { - return emitOpError("Invalid number of offsets."); - } - return success(); -} - //===----------------------------------------------------------------------===// // XeGPU_PrefetchOp //===----------------------------------------------------------------------===// LogicalResult PrefetchOp::verify() { - auto tdescTy = getTensorDescType(); - - if (!tdescTy && !getOffsets()) - return emitOpError("Expects offsets."); - - if (tdescTy && getOffsets()) - return emitOpError("offsets not allowed."); - if (!isReadHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -719,39 +551,22 @@ LogicalResult PrefetchOp::verify() { if (auto layout = getAnchorLayout()) { // get the offset operand and its shape - if (auto offsets = getOffsets()) { - auto offsetsTy = offsets.getType(); - if (llvm::isa(offsetsTy) && - !layout.isDistributable(getShapeOf(offsetsTy))) - return emitOpError("offset shape is not distributable with the layout"); - } + auto offsetsTy = getOffsets().getType(); + if (llvm::isa(offsetsTy) && + !layout.isDistributable(getShapeOf(offsetsTy))) + return emitOpError("offset shape is not distributable with the layout"); } return success(); } -void PrefetchOp::build(OpBuilder &builder, OperationState &state, Value source, - xegpu::CachePolicyAttr l1_hint, - xegpu::CachePolicyAttr l2_hint, - xegpu::CachePolicyAttr l3_hint) { - build(builder, state, source, Value(), l1_hint, l2_hint, l3_hint, - IntegerAttr{}, /*anchor_layout=*/nullptr); -} - //===----------------------------------------------------------------------===// // XeGPU_LoadGatherOp //===----------------------------------------------------------------------===// LogicalResult LoadGatherOp::verify() { - auto tdescTy = getTensorDescType(); auto maskTy = getMaskType(); auto valueTy = getValueType(); - if (!tdescTy && !getOffsets()) - return emitOpError("Expects offsets."); - - if (tdescTy && getOffsets()) - return emitOpError("offsets not allowed."); - if (!isReadHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -778,15 +593,6 @@ LogicalResult LoadGatherOp::verify() { [&]() { return emitOpError(); }); } -void LoadGatherOp::build(OpBuilder &builder, OperationState &state, - Type valueType, Value source, Value mask, - xegpu::CachePolicyAttr l1_hint, - xegpu::CachePolicyAttr l2_hint, - xegpu::CachePolicyAttr l3_hint) { - build(builder, state, valueType, source, Value(), mask, IntegerAttr(), - l1_hint, l2_hint, l3_hint, /*anchor_layout=*/nullptr); -} - void LoadGatherOp::build(OpBuilder &builder, OperationState &state, Type valueType, Value source, ArrayRef offsets, Value mask, @@ -824,16 +630,9 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, // XeGPU_StoreScatterOp //===----------------------------------------------------------------------===// LogicalResult StoreScatterOp::verify() { - auto tdescTy = getTensorDescType(); auto maskTy = getMaskType(); auto valueTy = getValueType(); - if (!tdescTy && !getOffsets()) - return emitOpError("Expects offsets."); - - if (tdescTy && getOffsets()) - return emitOpError("offsets not allowed."); - if (!isWriteHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -860,15 +659,6 @@ LogicalResult StoreScatterOp::verify() { [&]() { return emitOpError(); }); } -void StoreScatterOp::build(OpBuilder &builder, OperationState &state, - Value value, Value dest, Value mask, - xegpu::CachePolicyAttr l1_hint, - xegpu::CachePolicyAttr l2_hint, - xegpu::CachePolicyAttr l3_hint) { - build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint, - l2_hint, l3_hint, /*anchor_layout=*/nullptr); -} - void StoreScatterOp::build(OpBuilder &builder, OperationState &state, Value value, Value dest, ArrayRef offsets, Value mask, diff --git a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp index 153ef5b500a1b..d10265772cacb 100644 --- a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp +++ b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp @@ -405,11 +405,6 @@ transform::InsertPrefetchOp::apply(transform::TransformRewriter &rewriter, if (!maybeDescOp) return emitSilenceableFailure(getLoc()) << "Could not find descriptor op."; auto descOp = *maybeDescOp; - if (descOp.getMixedOffsets().size() > 0) { - auto diag = emitSilenceableFailure(getLoc()) - << "desc op with offsets is not supported."; - diag.attachNote(descOp.getLoc()) << "desc op"; - } // Clone desc op outside the loop. rewriter.setInsertionPoint(forOp); @@ -442,7 +437,7 @@ transform::InsertPrefetchOp::apply(transform::TransformRewriter &rewriter, llvm::map_to_vector(loadOp.getOffsets(), [&](Value v) { return mapping.lookupOrDefault(v); }); - auto constOffsets = loadOp.getConstOffsets().value(); + auto constOffsets = loadOp.getConstOffsets(); return getMixedValues(constOffsets, dynamicOffsets, ctx); }; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 7fc5d2fffae51..98c9dc3f5e53a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -136,8 +136,7 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { std::optional> XeGPUBlockingPass::getTileShape(Operation *op) const { - if (isa( - op)) + if (isa(op)) return getTileShape(op->getOpResult(0)); if (isa(op)) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 88341e120267b..0eb32c29a8b3f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -343,10 +343,6 @@ class LayoutInfoPropagation ArrayRef operands, ArrayRef results); - void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset, - ArrayRef operands, - ArrayRef results); - void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch, ArrayRef operands, ArrayRef results); @@ -441,9 +437,6 @@ LogicalResult LayoutInfoPropagation::visitOperation( .Case([&](xegpu::LoadGatherOp loadGatherOp) { visitLoadGatherOp(loadGatherOp, operands, results); }) - .Case([&](xegpu::UpdateNdOffsetOp updateNdOffsetOp) { - visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results); - }) .Case([&](xegpu::PrefetchNdOp prefetchNdOp) { visitPrefetchNdOp(prefetchNdOp, operands, results); }) @@ -736,20 +729,6 @@ void LayoutInfoPropagation::visitShapeCastOp( propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr))); } -/// Propagate the layout of the result tensor to the source tensor descriptor -/// in UpdateNdOffsetOp. -void LayoutInfoPropagation::visitUpdateNdOffsetOp( - xegpu::UpdateNdOffsetOp updateNdOffset, - ArrayRef operands, - ArrayRef results) { - // The layout of the result must be present. - LayoutInfo resultLayout = results[0]->getValue(); - if (!resultLayout.isAssigned()) - return; - // Propagate the layout to the source operand. - propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); -} - /// Set the layouts for DPAS A, B, and C operands. void LayoutInfoPropagation::visitDpasOp( xegpu::DpasOp dpas, ArrayRef operands, diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 001f8f561eb76..1b4dddcb4ae55 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -260,11 +260,6 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { if (!layout) return rewriter.notifyMatchFailure( descOp, "the tensor descriptor lacks layout attribute"); - // CreateNdOp must not have offsets. - if (descOp.getMixedOffsets().size()) - return rewriter.notifyMatchFailure( - descOp, "xegpu::CreateNdDescOp must not have offsets"); - SmallVector newRetIndices; rewriter.setInsertionPoint(warpOp); gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index 51693da389a49..d11ce207cc064 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -177,22 +177,10 @@ struct UnrollCreateNdOp : public UnrollPattern { SmallVector newOps; auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0]; - bool hasOffsets = op.getMixedOffsets().size() != 0; - if (!hasOffsets) { - auto newOp = xegpu::CreateNdDescOp::create( - rewriter, loc, newTdescTy, op.getSource(), op.getMixedSizes(), - op.getMixedStrides()); - newOps.push_back(newOp); - } else { - auto createOp = [&](SmallVector offsets) -> Value { - return xegpu::CreateNdDescOp::create( - rewriter, loc, newTdescTy, op.getSource(), offsets, - op.getMixedSizes(), op.getMixedStrides()); - }; - - newOps = computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, - *targetShape, createOp, loc, rewriter); - } + auto newOp = + xegpu::CreateNdDescOp::create(rewriter, loc, newTdescTy, op.getSource(), + op.getMixedSizes(), op.getMixedStrides()); + newOps.push_back(newOp); Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter); rewriter.replaceOp(op, castOp); @@ -200,34 +188,6 @@ struct UnrollCreateNdOp : public UnrollPattern { } }; -struct UnrollUpdateNdOffsetOp : public UnrollPattern { - using UnrollPattern::UnrollPattern; - LogicalResult matchAndRewrite(xegpu::UpdateNdOffsetOp op, - PatternRewriter &rewriter) const override { - Location loc = op.getLoc(); - xegpu::TensorDescType tdescTy = op.getTensorDescType(); - - std::optional> targetShape = getTargetShape(op); - if (!targetShape) - return failure(); - - SmallVector convertedTdescTypes = - getUnrolledTypes(tdescTy, *targetShape); - SmallVector convertedTdesc = pack( - op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); - - SmallVector newOps; - for (auto t : convertedTdesc) { - auto newOp = xegpu::UpdateNdOffsetOp::create( - rewriter, loc, t.getType(), t, op.getOffsets(), op.getConstOffsets()); - newOps.push_back(newOp); - } - Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); - rewriter.replaceOp(op, castOp); - return success(); - } -}; - struct UnrollPrefetchNdOp : public UnrollPattern { using UnrollPattern::UnrollPattern; LogicalResult matchAndRewrite(xegpu::PrefetchNdOp op, @@ -242,31 +202,23 @@ struct UnrollPrefetchNdOp : public UnrollPattern { xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); if (layout) layout = layout.dropInstData(); - int64_t offsetSize = static_cast(op.getOffsets().size()); - bool hasOffsets = (offsetSize != 0) || op.getConstOffsetsAttr(); - SmallVector convertedTdescTypes = getUnrolledTypes( - tdescTy, *targetShape, /*returnSingleType*/ hasOffsets); + SmallVector convertedTdescTypes = + getUnrolledTypes(tdescTy, *targetShape, /*returnSingleType*/ true); SmallVector convertedTdesc = pack( op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); - if (!hasOffsets) { - for (auto t : convertedTdesc) - xegpu::PrefetchNdOp::create(rewriter, loc, TypeRange(), t, - xegpu::dropInstDataOnAttrs(op->getAttrs())); - } else { - auto createPrefetch = [&](SmallVector offsets) -> Value { - xegpu::PrefetchNdOp::create(rewriter, loc, convertedTdesc[0], offsets, - op.getL1HintAttr(), op.getL2HintAttr(), - op.getL3HintAttr(), layout); - // return dummy Value to satisfy function's signature - return nullptr; - }; + auto createPrefetch = [&](SmallVector offsets) -> Value { + xegpu::PrefetchNdOp::create(rewriter, loc, convertedTdesc[0], offsets, + op.getL1HintAttr(), op.getL2HintAttr(), + op.getL3HintAttr(), layout); + // return dummy Value to satisfy function's signature + return nullptr; + }; - computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, *targetShape, - createPrefetch, loc, rewriter); - } + computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, *targetShape, + createPrefetch, loc, rewriter); rewriter.eraseOp(op); return success(); @@ -289,36 +241,25 @@ struct UnrollLoadNdOp : public UnrollPattern { xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); if (layout) layout = layout.dropInstData(); - int64_t offsetSize = static_cast(op.getOffsets().size()); - bool hasOffsets = (offsetSize != 0) || op.getConstOffsetsAttr(); Type elemTy = tdescTy.getElementType(); VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy); - SmallVector convertedTdescTypes = getUnrolledTypes( - tdescTy, *targetShape, /*returnSingleType*/ hasOffsets); + SmallVector convertedTdescTypes = + getUnrolledTypes(tdescTy, *targetShape, /*returnSingleType*/ true); SmallVector convertedTdescs = pack( op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); SmallVector newOps; - if (!hasOffsets) { - for (auto t : convertedTdescs) { - auto newOp = - xegpu::LoadNdOp::create(rewriter, loc, newValueTy, t, - xegpu::dropInstDataOnAttrs(op->getAttrs())); - newOps.push_back(newOp); - } - } else { - auto createLoad = [&](SmallVector offsets) { - return xegpu::LoadNdOp::create( - rewriter, loc, newValueTy, convertedTdescs[0], offsets, - op.getPackedAttr(), op.getTransposeAttr(), op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr(), layout); - }; - newOps = computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, - *targetShape, createLoad, loc, rewriter); - } + auto createLoad = [&](SmallVector offsets) { + return xegpu::LoadNdOp::create( + rewriter, loc, newValueTy, convertedTdescs[0], offsets, + op.getPackedAttr(), op.getTransposeAttr(), op.getL1HintAttr(), + op.getL2HintAttr(), op.getL3HintAttr(), layout); + }; + newOps = computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, *targetShape, + createLoad, loc, rewriter); Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); @@ -342,37 +283,29 @@ struct UnrollStoreNdOp : public UnrollPattern { xegpu::DistributeLayoutAttr layout = op.getLayoutAttr(); if (layout) layout = layout.dropInstData(); - int64_t offsetSize = static_cast(op.getOffsets().size()); - bool hasOffsets = (offsetSize != 0) || op.getConstOffsetsAttr(); SmallVector convertedValTypes = getUnrolledTypes(valueTy, *targetShape); - SmallVector convertedTdescTypes = getUnrolledTypes( - tdescTy, *targetShape, /*returnSingleType*/ hasOffsets); + SmallVector convertedTdescTypes = + getUnrolledTypes(tdescTy, *targetShape, /*returnSingleType*/ true); SmallVector convertedTdescs = pack( op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); SmallVector convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); - if (!hasOffsets) { - for (auto [v, t] : llvm::zip(convertedValues, convertedTdescs)) - xegpu::StoreNdOp::create(rewriter, loc, v, t, op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); - } else { - size_t valueIndex = 0; - auto createStore = [&](SmallVector offsets) { - xegpu::StoreNdOp::create(rewriter, loc, convertedValues[valueIndex++], - convertedTdescs[0], offsets, - op.getL1HintAttr(), op.getL2HintAttr(), - op.getL3HintAttr(), layout); - // return dummy Value to satisfy function's signature - return nullptr; - }; - - computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, *targetShape, - createStore, loc, rewriter); - } + + size_t valueIndex = 0; + auto createStore = [&](SmallVector offsets) { + xegpu::StoreNdOp::create(rewriter, loc, convertedValues[valueIndex++], + convertedTdescs[0], offsets, op.getL1HintAttr(), + op.getL2HintAttr(), op.getL3HintAttr(), layout); + // return dummy Value to satisfy function's signature + return nullptr; + }; + + computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, *targetShape, + createStore, loc, rewriter); rewriter.eraseOp(op); return success(); @@ -477,77 +410,11 @@ struct UnrollDpasOp : public UnrollPattern { } }; -struct UnrollLoadGatherOp : public UnrollPattern { - using UnrollPattern::UnrollPattern; - LogicalResult matchAndRewrite(xegpu::LoadGatherOp op, - PatternRewriter &rewriter) const override { - - Location loc = op.getLoc(); - VectorType valueTy = llvm::dyn_cast(op.getValue().getType()); - xegpu::TensorDescType tdescTy = op.getTensorDescType(); - - // TODO: handle the unstructure source case (!tdesTy) - if (!tdescTy || op.getOffsets()) - return failure(); - - std::optional> targetShape = getTargetShape(op); - if (!targetShape) - return failure(); - - SmallVector targetMaskShape(*targetShape); - int originalChunkSize = op.getChunkSize().value_or(1); - - VectorType maskTy = llvm::dyn_cast(op.getMask().getType()); - - Type elemTy = tdescTy.getElementType(); - VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy); - - SmallVector convertedTdescTypes = - getUnrolledTypes(tdescTy, *targetShape); - SmallVector convertedTdescs = pack( - op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); - - SmallVector convertedMaskTypes; - SmallVector convertedMasks; - - if (originalChunkSize > 1) { - targetMaskShape.pop_back(); - convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); - int64_t blockedChunkSize = targetShape->back(); - int64_t numNewChunks = originalChunkSize / blockedChunkSize; - - // the mask is reused across the chunk_size dimension - for (auto mask : pack(op.getMask(), convertedMaskTypes, targetMaskShape, - loc, rewriter)) - convertedMasks.append(numNewChunks, mask); - - newValueTy = valueTy.cloneWith(*targetShape, elemTy); - } else { - convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); - convertedMasks = pack(op.getMask(), convertedMaskTypes, targetMaskShape, - loc, rewriter); - } - - SmallVector newOps; - for (auto [t, m] : llvm::zip(convertedTdescs, convertedMasks)) { - auto newOp = xegpu::LoadGatherOp::create( - rewriter, loc, newValueTy, t, m, op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); - newOps.push_back(newOp); - } - - Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); - rewriter.replaceOp(op, castOp); - return success(); - } -}; - /// This pattern handles the unrolling of LoadGatherOp with offsets (gathered /// load). /// It unrolls the offsets and mask operands accordingly, and creates multiple /// LoadGatherOp with the unrolled operands. -struct UnrollLoadGatherOpWithOffset - : public UnrollPattern { +struct UnrollLoadGatherOp : public UnrollPattern { using UnrollPattern::UnrollPattern; LogicalResult matchAndRewrite(xegpu::LoadGatherOp op, PatternRewriter &rewriter) const override { @@ -556,10 +423,6 @@ struct UnrollLoadGatherOpWithOffset Value offsets = op.getOffsets(); Value mask = op.getMask(); - // Only handle the case where offsets are present (scattered load) - if (!offsets) - return failure(); - std::optional> targetShape = getTargetShape(op); if (!targetShape) return failure(); @@ -645,8 +508,7 @@ struct UnrollLoadGatherOpWithOffset /// store). /// It unrolls the offsets and mask operands accordingly, and creates multiple /// StoreScatterOp with the unrolled operands. -struct UnrollStoreScatterOpWithOffsets - : public UnrollPattern { +struct UnrollStoreScatterOp : public UnrollPattern { using UnrollPattern::UnrollPattern; LogicalResult matchAndRewrite(xegpu::StoreScatterOp op, PatternRewriter &rewriter) const override { @@ -655,10 +517,6 @@ struct UnrollStoreScatterOpWithOffsets Value offsets = op.getOffsets(); Value mask = op.getMask(); - // Only handle the case where offsets are present (scattered store) - if (!offsets) - return failure(); - std::optional> targetShape = getTargetShape(op); if (!targetShape) return failure(); @@ -739,99 +597,6 @@ struct UnrollStoreScatterOpWithOffsets } }; -struct UnrollPrefetchOp : public UnrollPattern { - using UnrollPattern::UnrollPattern; - LogicalResult matchAndRewrite(xegpu::PrefetchOp op, - PatternRewriter &rewriter) const override { - Location loc = op.getLoc(); - xegpu::TensorDescType tdescTy = op.getTensorDescType(); - - // TODO: handle the unstructure source case (!tdesTy) - if (!tdescTy || op.getOffsets()) - return failure(); - - std::optional> targetShape = getTargetShape(op); - if (!targetShape) - return failure(); - - SmallVector convertedTdescTypes = - getUnrolledTypes(tdescTy, *targetShape); - SmallVector convertedTdesc = pack( - op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); - - for (auto t : convertedTdesc) - xegpu::PrefetchOp::create(rewriter, loc, TypeRange(), t, - xegpu::dropInstDataOnAttrs(op->getAttrs())); - - rewriter.eraseOp(op); - return success(); - } -}; - -struct UnrollStoreScatterOp : public UnrollPattern { - using UnrollPattern::UnrollPattern; - LogicalResult matchAndRewrite(xegpu::StoreScatterOp op, - PatternRewriter &rewriter) const override { - - Location loc = op.getLoc(); - VectorType valueTy = llvm::dyn_cast(op.getValue().getType()); - xegpu::TensorDescType tdescTy = op.getTensorDescType(); - - // TODO: handle the unstructure source case (!tdesTy) - if (!tdescTy || op.getOffsets()) - return failure(); - - std::optional> targetShape = getTargetShape(op); - if (!targetShape) - return failure(); - - SmallVector targetMaskShape(*targetShape); - int originalChunkSize = op.getChunkSize().value_or(1); - - VectorType maskTy = llvm::dyn_cast(op.getMask().getType()); - - SmallVector convertedTdescTypes = - getUnrolledTypes(tdescTy, *targetShape); - SmallVector convertedTdescs = pack( - op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); - - SmallVector convertedMaskTypes; - SmallVector convertedMasks; - - if (originalChunkSize > 1) { - targetMaskShape.pop_back(); - int64_t blockedChunkSize = targetShape->back(); - int64_t numNewChunks = originalChunkSize / blockedChunkSize; - convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); - - // the mask is reused across the chunk_size dimension - for (auto mask : pack(op.getMask(), convertedMaskTypes, targetMaskShape, - loc, rewriter)) - convertedMasks.append(numNewChunks, mask); - } else { - convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); - convertedMasks = pack(op.getMask(), convertedMaskTypes, targetMaskShape, - loc, rewriter); - } - - SmallVector convertedValTypes = - getUnrolledTypes(valueTy, *targetShape); - SmallVector convertedValues = - pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); - - for (size_t i = 0; i < convertedValues.size(); ++i) { - Value v = convertedValues[i]; - Value t = convertedTdescs[i]; - Value m = op.getMask() ? convertedMasks[i] : nullptr; - xegpu::StoreScatterOp::create(rewriter, loc, v, t, m, op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); - } - - rewriter.eraseOp(op); - return success(); - } -}; - struct UnrollLoadMatrixOp : public UnrollPattern { using UnrollPattern::UnrollPattern; LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op, @@ -973,11 +738,8 @@ struct UnrollConvertLayoutOp : public UnrollPattern { void mlir::xegpu::populateXeGPUUnrollPatterns( RewritePatternSet &patterns, const xegpu::UnrollOptions &options) { - patterns - .add( - patterns.getContext(), options); + patterns.add(patterns.getContext(), options); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 9114e37b0e42b..8aa0758943cd1 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -65,11 +65,10 @@ getSgShapeAndCount(ArrayRef shape, /// or sub-MemDescs to be accessed by current subgroup (sgId) based on the /// associated distribute layout attribute, the shape, subgroup id and the /// original offsets of the op -template < - typename OpType, - typename = std::enable_if_t::value>> +template ::value>> static LogicalResult genOffsetsList(ConversionPatternRewriter &rewriter, OpType op, SmallVector> &offsetsList) { @@ -136,13 +135,13 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op, /// from a workgroup descriptor. It replaces the offsets and sizes with /// appropriate values for the subgroup. /// It uses round-robin assignment to distribute the work to the subgroups. -/// Following create_nd_desc operation:, -/// %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x24xf32> +/// Following create_nd_desc operation: +/// %tdesc = xegpu.create_nd_tdesc %src : memref<24x24xf32> /// -> !xegpu.tensor_desc<24x24xf32, #xegpu.layout> /// is converted to 9 subgroup level operations based on the sg_layout & /// sg_data: -/// %tdesc = xegpu.create_nd_tdesc %src[off1, off2] : memref<24x24xf32> -> +/// %tdesc = xegpu.create_nd_tdesc %src : memref<24x24xf32> -> /// !xegpu.tensor_desc<2x2xf32, #xegpu.layout> /// @@ -177,53 +176,14 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op, /// pattern and all the other ops just follow. /// TODO: Decouple the distribution logic from WgToSgCreateNdOp for all the /// ops in the pass. +// This pattern transforms the CreateNdDescOp to create a +// subgroup descriptor from a workgroup descriptor. struct WgToSgCreateNdOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - SmallVector> offsetsList; - if (failed(genOffsetsList(rewriter, op, offsetsList))) - return failure(); - - MLIRContext *ctx = op.getContext(); - xegpu::TensorDescType tdescTy = op.getType(); - ArrayRef wgShape = tdescTy.getShape(); - Type elemTy = tdescTy.getElementType(); - xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr(); - SmallVector sgShape = getSgShapeAndCount(wgShape, layout).first; - auto newTdescTy = - xegpu::TensorDescType::get(ctx, sgShape, elemTy, tdescTy.getEncoding(), - layout.dropSgLayoutAndData()); - - SmallVector newOps; - for (auto offsets : offsetsList) { - auto newOp = xegpu::CreateNdDescOp::create( - rewriter, op.getLoc(), newTdescTy, op.getSource(), offsets, - op.getMixedSizes(), op.getMixedStrides()); - - newOps.push_back(newOp); - } - rewriter.replaceOpWithMultiple(op, {newOps}); - - return success(); - } -}; - -// This pattern transforms the CreateNdDescOp without offsets to create a -// subgroup descriptor from a workgroup descriptor -struct WgToSgCreateNdOpNoOffset - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - - // Check no offsets are specified. - if (!op.getMixedOffsets().empty()) - return failure(); Location loc = op.getLoc(); MLIRContext *ctx = op.getContext(); @@ -256,52 +216,6 @@ struct WgToSgCreateNdOpNoOffset /// This pattern transforms the LoadNdOp to load subgroup data. struct WgToSgLoadNdOp : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::LoadNdOp op, OneToNOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!op.getMixedOffsets().empty()) - return failure(); - - SmallVector newLoadOps; - for (auto src : adaptor.getTensorDesc()) { - xegpu::TensorDescType tdescTy = - dyn_cast(src.getType()); - ArrayRef srcShape = tdescTy.getShape(); - VectorType newResTy = VectorType::get(srcShape, tdescTy.getElementType()); - auto newLoadOp = xegpu::LoadNdOp::create( - rewriter, op.getLoc(), newResTy, src, - xegpu::dropSgLayoutAndDataOnAttrs(op->getAttrs())); - newLoadOps.push_back(newLoadOp); - } - rewriter.replaceOpWithMultiple(op, {newLoadOps}); - return mlir::success(); - } -}; - -/// This pattern transforms the StoreNdOp to store to a subgroup descriptor -/// It creates a StoreNdOp op to store the updated values to the new subgroup -/// src tensor descriptors. -struct WgToSgStoreNdOp : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::StoreNdOp op, OneToNOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (!op.getMixedOffsets().empty()) - return failure(); - - for (auto [v, t] : llvm::zip(adaptor.getValue(), adaptor.getTensorDesc())) - xegpu::StoreNdOp::create(rewriter, op.getLoc(), v, t, op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); - - rewriter.eraseOp(op); - return success(); - } -}; - -// This pattern transforms the LoadNdOp with explicit offsets to load -// subgroup data. -struct WgToSgLoadNdOpWithOffset : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(xegpu::LoadNdOp op, OneToNOpAdaptor adaptor, @@ -332,10 +246,8 @@ struct WgToSgLoadNdOpWithOffset : public OpConversionPattern { } }; -// This pattern transforms the StoreNdOp with explicit offsets to store -// subgroup data. -struct WgToSgStoreNdOpWithOffset - : public OpConversionPattern { +/// This pattern transforms the StoreNdOp to store subgroup data. +struct WgToSgStoreNdOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(xegpu::StoreNdOp op, OneToNOpAdaptor adaptor, @@ -359,10 +271,8 @@ struct WgToSgStoreNdOpWithOffset } }; -// This pattern transforms the PrefetchNdOp with explicit offsets to prefetch -// subgroup data. -struct WgToSgPrefetchNdOpWithOffset - : public OpConversionPattern { +/// This pattern transforms the PrefetchNdOp to prefetch subgroup data. +struct WgToSgPrefetchNdOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(xegpu::PrefetchNdOp op, OneToNOpAdaptor adaptor, @@ -386,28 +296,6 @@ struct WgToSgPrefetchNdOpWithOffset } }; -/// This pattern transforms the UpdateNdOffsetOp to update the offsets of a -/// subgroup descriptor. It creates an UpdateNdOffsetOp op to update the -/// offsets of the new subgroup src tensor descriptors. -struct WgToSgUpdateNdOffsetOp - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::UpdateNdOffsetOp op, OneToNOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - llvm::SmallVector newUpdateTileOffsetOps; - for (auto tDesc : adaptor.getTensorDesc()) { - auto newUpdateTileOffsetOp = xegpu::UpdateNdOffsetOp::create( - rewriter, op.getLoc(), tDesc.getType(), tDesc, op.getOffsets(), - op.getConstOffsets()); - newUpdateTileOffsetOps.push_back(newUpdateTileOffsetOp); - } - - rewriter.replaceOpWithMultiple(op, {newUpdateTileOffsetOps}); - return success(); - } -}; - /// This pattern transforms the DpasOp to work at subgroup level. struct WgToSgDpasOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -455,26 +343,6 @@ struct WgToSgDpasOp : public OpConversionPattern { } }; -/// This pattern transforms the PrefetchNdOp to prefetch the subgroup data. -struct WgToSgPrefetchNdOp : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::PrefetchNdOp op, OneToNOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - - int64_t offsetSize = static_cast(op.getOffsets().size()); - if ((offsetSize != 0) || op.getConstOffsetsAttr()) - return failure(); - - for (auto src : adaptor.getTensorDesc()) - xegpu::PrefetchNdOp::create( - rewriter, op.getLoc(), TypeRange(), src, - xegpu::dropSgLayoutAndDataOnAttrs(op->getAttrs())); - rewriter.eraseOp(op); - return success(); - } -}; - /// This pattern transforms vector.broadcast ops to work at subgroup level. struct WgToSgVectorBroadcastOp : public OpConversionPattern { @@ -941,8 +809,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern { // This pattern transforms the LoadGatherOp with explicit offsets to load // subgroup data -struct WgToSgLoadGatherOpWithOffset - : public OpConversionPattern { +struct WgToSgLoadGatherOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(xegpu::LoadGatherOp op, OneToNOpAdaptor adaptor, @@ -992,7 +859,7 @@ struct WgToSgLoadGatherOpWithOffset // This pattern transforms the StoreScatterOp with explicit offsets to store // subgroup data -struct WgToSgStoreScatterOpWithOffset +struct WgToSgStoreScatterOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult @@ -1541,18 +1408,15 @@ using WgToSgVectorCreateMaskOp = WgToSgVectorMaskOp; namespace mlir { namespace xegpu { void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { - patterns - .add( - patterns.getContext()); + patterns.add( + patterns.getContext()); } } // namespace xegpu } // namespace mlir @@ -1652,8 +1516,6 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return loadOp.getTensorDescType(); if (auto storeOp = dyn_cast(op)) return storeOp.getTensorDescType(); - if (auto updateOp = dyn_cast(op)) - return updateOp.getType(); if (auto prefetchOp = dyn_cast(op)) return prefetchOp.getTensorDescType(); return xegpu::TensorDescType(); @@ -1664,12 +1526,13 @@ void XeGPUWgToSgDistributePass::runOnOperation() { }; target.addDynamicallyLegalOp([=](Operation *op) -> bool { - auto tdescTy = getTensorDescType(op); - auto layout = dyn_cast_if_present(tdescTy.getLayout()); - return isLegal(layout); - }); + xegpu::StoreNdOp, xegpu::PrefetchNdOp>( + [=](Operation *op) -> bool { + auto tdescTy = getTensorDescType(op); + auto layout = + dyn_cast_if_present(tdescTy.getLayout()); + return isLegal(layout); + }); target.addDynamicallyLegalOp([=](xegpu::DpasOp op) -> bool { auto layout = op.getLayoutCdAttr(); diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 42b38c09e0765..7e6fb35cc6974 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -2,8 +2,8 @@ // ----- func.func @create_nd_tdesc_1(%src: memref<24xf32>) { - // expected-error@+1 {{Expecting the TensorDesc rank is not greater than the ranks of shape, strides, offsets or the memref source}} - %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32> + // expected-error@+1 {{Expecting the TensorDesc rank is not greater than the ranks of shape, strides or the memref source}} + %1 = xegpu.create_nd_tdesc %src : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32> return } @@ -11,42 +11,42 @@ func.func @create_nd_tdesc_1(%src: memref<24xf32>) { func.func @create_nd_tdesc_2(%src: memref<24x32xf32>) { // expected-error@+1 {{TensorDesc should have the same element type with the source if it is a memref}} - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16> return } // ----- func.func @create_nd_tdesc_3(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{SLM is only supported for 1D block tensor}} - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> return } // ----- func.func @create_nd_tdesc_4(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{Memory space mismatch}} - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32> + %1 = xegpu.create_nd_tdesc %src : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32> return } // ----- func.func @create_nd_tdesc_5(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- func.func @create_nd_tdesc_6(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- func.func @create_nd_tdesc_7(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } @@ -60,49 +60,49 @@ func.func @create_nd_tdesc_8(%src: ui64) { // ----- func.func @create_nd_tdesc_9(%src: ui64) { // expected-error@+1 {{expecting strides and shape to be present for integer source}} - %1 = xegpu.create_nd_tdesc %src[0, 0] : ui64-> !xegpu.tensor_desc<128x128xf32> + %1 = xegpu.create_nd_tdesc %src : ui64-> !xegpu.tensor_desc<128x128xf32> return } // ----- func.func @create_nd_tdesc_10(%src: memref<24xindex>) { // expected-error @+1 {{unsupported element type 'index': expected integer or float}} - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24xindex> -> !xegpu.tensor_desc<24xindex> + %1 = xegpu.create_nd_tdesc %src : memref<24xindex> -> !xegpu.tensor_desc<24xindex> return } // ----- func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} - xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> + xegpu.prefetch_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> return } // ----- func.func @load_nd_vc_1(%src: memref<8x16xf16>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}> + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> return } // ----- func.func @load_nd_vc_3(%src: memref<8x16xf16>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> // expected-warning@+1 {{Invalid Packed Attr.}} - %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + %2 = xegpu.load_nd %1[0] <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16> return } // ----- func.func @load_nd_vc_4(%src: memref<24x32xf32>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // expected-error@+1 {{Result shape [8, 1] is not consistent with tensor descriptor}} - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x1xf32> return @@ -110,9 +110,9 @@ func.func @load_nd_vc_4(%src: memref<24x32xf32>) { // ----- func.func @subgroup_load_nd_9(%src: memref<4x8x16xf16>) { - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<4x8x16xf16> -> !xegpu.tensor_desc<4x8x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<4x8x16xf16> -> !xegpu.tensor_desc<4x8x16xf16> // expected-error@+1 {{Expects a 1D or 2D TensorDesc}} - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x8x16xf16> -> vector<4x8x16xf16> + %2 = xegpu.load_nd %1[0, 0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x8x16xf16> -> vector<4x8x16xf16> return } @@ -143,70 +143,70 @@ func.func @subgroup_load_nd_offset_3(%src: memref<4x8x16xf16>, %x : index) { // ----- func.func @load_nd_layout(%src: memref<24x32xf32>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> // expected-error@+1 {{Result shape [3] is not a valid distribution for tensor descriptor}} - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, + %2 = xegpu.load_nd %1[0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf32> -> vector<3xf32> return } // ----- func.func @load_nd_simt(%src: memref<24x32xf32>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}} - %2 = xegpu.load_nd %1 : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8xf32> + %2 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8xf32> return } // ----- func.func @store_nd_vc_1(%dst: memref<24x32xf16>) { %1 = arith.constant dense<1.0>: vector<24x32xf16> - %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + %2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} - xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> + xegpu.store_nd %1, %2[0, 0] <{l1_hint = #xegpu.cache_hint}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> return } // ----- func.func @store_nd_vc_3(%dst: memref<24x32xf16>) { %1 = arith.constant dense<1.0>: vector<2x24x32xf16> - %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr> + %2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr> // expected-error@+1 {{array length is not supported by store_nd}} - xegpu.store_nd %1, %2: vector<2x24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr> + xegpu.store_nd %1, %2[0, 0]: vector<2x24x32xf16>, !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr> return } // ----- func.func @store_nd_vc_4(%dst: memref<8x24x32xf16>) { %1 = arith.constant dense<1.0>: vector<8x24x32xf16> - %2 = xegpu.create_nd_tdesc %dst[0, 0, 0] : memref<8x24x32xf16> -> !xegpu.tensor_desc<8x24x32xf16> + %2 = xegpu.create_nd_tdesc %dst : memref<8x24x32xf16> -> !xegpu.tensor_desc<8x24x32xf16> // expected-error@+1 {{Expects a 1D or 2D TensorDesc}} - xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<8x24x32xf16>, !xegpu.tensor_desc<8x24x32xf16> + xegpu.store_nd %1, %2[0, 0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<8x24x32xf16>, !xegpu.tensor_desc<8x24x32xf16> return } // ----- func.func @store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) { - %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> + %1 = xegpu.create_nd_tdesc %dst : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> // expected-error@+1 {{Value shape [3] is not a valid distribution for tensor descriptor}} - xegpu.store_nd %data, %1 : vector<3xf32>, !xegpu.tensor_desc<16xf32> + xegpu.store_nd %data, %1[0] : vector<3xf32>, !xegpu.tensor_desc<16xf32> return } // ----- func.func @store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}} - xegpu.store_nd %data, %1 : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %data, %1[0, 0] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> return } // ----- func.func @store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) { - %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> + %1 = xegpu.create_nd_tdesc %dst : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // expected-error@+1 {{Value shape [8, 1] is not consistent with tensor descriptor}} - xegpu.store_nd %data, %1 : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %data, %1[0, 0] : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32> return } @@ -284,18 +284,11 @@ func.func @store_scatter_simt_1(%dst: memref) { // ----- func.func @prefetch_offset_wi_1(%src: memref<4x4xf32>) { %offsets = arith.constant dense<[0]> : vector<1xindex> - // expected-error@+1 {{op operand #0 must be TensorDesc describing regions of interested data}} + // expected-error@+1 {{op operand #0 must be 1D memref}} xegpu.prefetch %src[%offsets]: memref<4x4xf32>, vector<1xindex> return } -// ----- -func.func @prefetch_offset_wi_3(%src: memref<16xf32>) { - // expected-error@+1 {{Expects offsets}} - xegpu.prefetch %src: memref<16xf32> - return -} - // ----- func.func @prefetch_offset_wi_4(%src: memref<16xf32>) { %offsets = arith.constant dense<[0]> : vector<1xindex> @@ -348,28 +341,18 @@ func.func @store_scatter_offset_wi_2(%src: memref<4x4xf16>) { %val = arith.constant dense<2.9>: vector<4xf16> %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> - // expected-error@+1 {{op operand #1 must be TensorDesc describing regions of interested data}} + // expected-error@+1 {{op operand #1 must be 1D memref}} xegpu.store %val, %src[%offsets], %mask : vector<4xf16>, memref<4x4xf16>, vector<1xindex>, vector<1xi1> return } -// ----- -func.func @store_scatter_offset_wi_3(%src: memref<16xf16>) { - %val = arith.constant dense<2.9>: vector<1xf16> - %mask = arith.constant dense<1>: vector<1xi1> - // expected-error@+1 {{Expects offsets}} - xegpu.store %val, %src, %mask - : vector<1xf16>, memref<16xf16>, vector<1xi1> - return -} - // ----- func.func @store_scatter_offset_wi_4(%src: !xegpu.tensor_desc<1x1xf32>) { %val = arith.constant dense<2.9>: vector<1xf16> %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> - // expected-error@+1 {{offsets not allowed}} + // expected-error@+1 {{op operand #1 must be 1D memref}} xegpu.store %val, %src[%offsets], %mask : vector<1xf16>, !xegpu.tensor_desc<1x1xf32>, vector<1xindex>, vector<1xi1> return @@ -379,19 +362,11 @@ func.func @store_scatter_offset_wi_4(%src: !xegpu.tensor_desc<1x1xf32>) { func.func @load_gather_offset_wi_4(%src: !xegpu.tensor_desc<1x2xf16>) { %mask = arith.constant dense<1>: vector<1xi1> %offsets = arith.constant dense<[0]> : vector<1xindex> - // expected-error@+1 {{offsets not allowed}} + // expected-error@+1 {{op operand #0 must be 1D memref}} %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : !xegpu.tensor_desc<1x2xf16>, vector<1xindex>, vector<1xi1> -> vector<2xf16> return } -// ----- -func.func @load_gather_offset_wi_3(%src: ui64) { - %mask = arith.constant dense<1>: vector<1xi1> - // expected-error@+1 {{Expects offsets}} - %2 = xegpu.load %src, %mask <{chunk_size = 2}> : ui64, vector<1xi1> -> vector<2xf16> - return -} - // ----- func.func @load_gather_offset_wi_2(%src: ui64) { %mask = arith.constant dense<1>: vector<1xi1> @@ -405,7 +380,7 @@ func.func @load_gather_offset_wi_2(%src: ui64) { func.func @load_gather_offset_wi_1(%src: memref<4x4xf32>) { %mask = arith.constant dense<1>: vector<1xi1> %offsets = arith.constant dense<[0]> : vector<1xindex> - // expected-error@+1 {{op operand #0 must be TensorDesc describing regions of interested data}} + // expected-error@+1 {{op operand #0 must be 1D memref}} %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : memref<4x4xf32>, vector<1xindex>, vector<1xi1> -> vector<2xf32> return } @@ -454,7 +429,7 @@ func.func @dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) { // ----- func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) { - %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> // expected-error@+1 {{expected non-zero rank tensor}} !xegpu.tensor_desc return @@ -462,7 +437,7 @@ func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) { // ----- func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) { - %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> // expected-error@+1 {{expected layout rank to match tensor rank}} !xegpu.tensor_desc<16xf32, #xegpu.layout> return @@ -470,7 +445,7 @@ func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) { // ----- func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) { - %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> // expected-error@+1 {{expected layout rank to match tensor rank}} !xegpu.tensor_desc<16xf32, #xegpu.layout> return @@ -478,7 +453,7 @@ func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) { // ----- func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) { - %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> // expected-error@+1 {{cannot distribute [4, 8] using #xegpu.layout}} !xegpu.tensor_desc<4x8xf32, #xegpu.layout> return @@ -486,7 +461,7 @@ func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) { // ----- func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) { - %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> // expected-error@+1 {{cannot distribute [4, 8] using #xegpu.layout}} !xegpu.tensor_desc<4x8xf32, #xegpu.layout> return @@ -494,7 +469,7 @@ func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) { // ----- func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) { - %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> // expected-error@+1 {{cannot distribute [4, 8] using #xegpu.layout}} !xegpu.tensor_desc<4x8xf32, #xegpu.layout> return @@ -565,7 +540,7 @@ func.func @layout_rank_mismatch_sg_data(%src: memref) { // ----- func.func @layout_rank_mismatch_tensor(%src: memref<16x32xf32>) { - %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<16x32xf32> -> + %0 = xegpu.create_nd_tdesc %src : memref<16x32xf32> -> // expected-error@+1 {{expected layout rank to match tensor rank}} !xegpu.tensor_desc<16x2xf32, #xegpu.layout> diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir index bd332ddf4480a..2cd5c70f59064 100644 --- a/mlir/test/Dialect/XeGPU/layout.mlir +++ b/mlir/test/Dialect/XeGPU/layout.mlir @@ -8,22 +8,22 @@ gpu.module @test { // CHECK: gpu.func @create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) { gpu.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> gpu.return } // CHECK: gpu.func @create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) { gpu.func @create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> gpu.return } // CHECK: gpu.func @create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) { gpu.func @create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> gpu.return } @@ -31,16 +31,16 @@ gpu.func @create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) { // ----- // CHECK: func.func @create_nd_tdesc_wrap_around_layout(%[[arg0:.*]]: memref<24x32xf32>) { func.func @create_nd_tdesc_wrap_around_layout(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x8xf32, #xegpu.layout> - %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<4x8xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<4x8xf32, #xegpu.layout> return } // CHECK: gpu.func @create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index b32e297b60fc8..857ec099b9f9c 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -8,8 +8,8 @@ gpu.module @test { // CHECK: gpu.func @create_nd_tdesc_1(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> gpu.return } @@ -17,43 +17,43 @@ gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) { gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { //CHECK: %[[C:.*]] = arith.constant 1 : index %c1 = arith.constant 1 : index - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], shape : [%[[arg2]], %[[arg1]]], strides : [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[%x, %y], shape:[%h, %w], strides: [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]], shape : [%[[arg2]], %[[arg1]]], strides : [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src , shape:[%h, %w], strides: [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> gpu.return } // CHECK: gpu.func @create_nd_tdesc_3(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> gpu.return } // CHECK: gpu.func @create_nd_tdesc_4(%[[arg0:.*]]: memref<2x24x32xf32>) { gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32> gpu.return } // CHECK: gpu.func @create_nd_tdesc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0 : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> gpu.return } // CHECK: gpu.func @create_nd_tdesc_6(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @create_nd_tdesc_6(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> gpu.return } // CHECK: gpu.func @create_nd_tdesc_7(%[[arg0:.*]]: memref<8x24x32x48x64xf32>) { gpu.func @create_nd_tdesc_7(%src: memref<8x24x32x48x64xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0, 0, 0, 0] : memref<8x24x32x48x64xf32> -> !xegpu.tensor_desc<8x8x8x24x32xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0, 0, 0, 0] : memref<8x24x32x48x64xf32> -> !xegpu.tensor_desc<8x8x8x24x32xf32> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<8x24x32x48x64xf32> -> !xegpu.tensor_desc<8x8x8x24x32xf32> + %1 = xegpu.create_nd_tdesc %src : memref<8x24x32x48x64xf32> -> !xegpu.tensor_desc<8x8x8x24x32xf32> gpu.return } @@ -83,8 +83,8 @@ gpu.func @test_create_nd_tdesc_8(%src: ui64, %w : index, %h : index, %x : index, gpu.func @test_create_nd_tdesc_9(%src: memref, %w : index, %h : index, %x : index, %y : index) { %c1 = arith.constant 1 : index - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[%arg3, %arg4], shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %src[%x, %y], shape:[%h, %w], strides:[%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0, shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src , shape:[%h, %w], strides:[%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> gpu.return } @@ -100,19 +100,19 @@ gpu.func @test_create_nd_tdesc_10(%src: memref, %w : index, %h : index, // CHECK: gpu.func @prefetch_nd(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @prefetch_nd(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> - xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: xegpu.prefetch_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> + xegpu.prefetch_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> gpu.return } // CHECK: gpu.func @prefetch_nd_2(%[[arg0:.*]]: memref<48x64xf16>) { gpu.func @prefetch_nd_2(%src: memref<48x64xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> - xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<48x64xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: xegpu.prefetch_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> + xegpu.prefetch_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> gpu.return } @@ -127,140 +127,140 @@ gpu.func @prefetch_nd_offset_1(%src: memref<48x64xf16>, %x : index, %y : index) // CHECK: func @subgroup_load_nd(%[[arg0:.*]]: memref<8x16xf16>) { gpu.func @subgroup_load_nd(%src: memref<8x16xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> - %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> + %2 = xegpu.load_nd %1[0, 0] <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> gpu.return } // CHECK: func @simt_load_nd(%[[arg0:.*]]: memref<8x16xf16>) { gpu.func @simt_load_nd(%src: memref<8x16xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> gpu.return } // CHECK: func @subgroup_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) { gpu.func @subgroup_load_nd_2(%src: memref<8x16xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16> + %2 = xegpu.load_nd %1[0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16> gpu.return } // CHECK: func @simt_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) { gpu.func @simt_load_nd_2(%src: memref<8x16xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16> + %2 = xegpu.load_nd %1[0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16> gpu.return } // CHECK: func @subgroup_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @subgroup_load_nd_3(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> gpu.return } // CHECK: func @simt_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @simt_load_nd_3(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> gpu.return } // CHECK: func @subgroup_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @subgroup_load_nd_4(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + %2 = xegpu.load_nd %1[0, 0] <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> gpu.return } // CHECK: func @simt_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @simt_load_nd_4(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> gpu.return } // CHECK: func @subgroup_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @subgroup_load_nd_5(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32> + %2 = xegpu.load_nd %1[0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32> gpu.return } // CHECK: func @simt_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @simt_load_nd_5(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32> + %2 = xegpu.load_nd %1[0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32> gpu.return } // CHECK: func @subgroup_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @subgroup_load_nd_6(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> gpu.return } // CHECK: func @simt_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @simt_load_nd_6(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> gpu.return } // CHECK: func @subgroup_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @subgroup_load_nd_7(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16x2xf16> - %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16x2xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16x2xf16> + %2 = xegpu.load_nd %1[0, 0] <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16x2xf16> gpu.return } // CHECK: func @simt_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @simt_load_nd_7(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> gpu.return } // CHECK: func @subgroup_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @subgroup_load_nd_8(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + %2 = xegpu.load_nd %1[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> gpu.return } @@ -284,15 +284,6 @@ gpu.func @subgroup_load_nd_offset_2(%src: memref<24x32xf32>, %x : index) { // CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> - // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32> - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32> - gpu.return -} - -// CHECK: func @simt_load_nd_offset_1(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @simt_load_nd_offset_1(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32> @@ -304,10 +295,10 @@ gpu.func @simt_load_nd_offset_1(%src: memref<24x32xf32>) { gpu.func @subgroup_store_nd(%dst: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16> %1 = arith.constant dense<1.0>: vector<24x32xf16> - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> - %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> - // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> - xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + %2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + // CHECK: xegpu.store_nd %[[C]], %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> + xegpu.store_nd %1, %2[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> gpu.return } @@ -315,10 +306,10 @@ gpu.func @subgroup_store_nd(%dst: memref<24x32xf16>) { gpu.func @simt_store_nd(%src: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48xf16> %1 = arith.constant dense<1.0>: vector<48xf16> - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> - %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> - // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<48xf16>, !xegpu.tensor_desc<24x32xf16> - xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<48xf16>, !xegpu.tensor_desc<24x32xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + %2 = xegpu.create_nd_tdesc %src : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + // CHECK: xegpu.store_nd %[[C]], %[[R0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<48xf16>, !xegpu.tensor_desc<24x32xf16> + xegpu.store_nd %1, %2[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<48xf16>, !xegpu.tensor_desc<24x32xf16> gpu.return } @@ -337,26 +328,15 @@ gpu.func @subgroup_store_nd_2(%dst: memref<24x32xf16>, %x : index) { gpu.func @subgroup_store_nd_offset_1(%dst: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16> %1 = arith.constant dense<1.0>: vector<32xf16> - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> - %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> - // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<32xf16>, !xegpu.tensor_desc<32xf16> - xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<32xf16>, !xegpu.tensor_desc<32xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> + %2 = xegpu.create_nd_tdesc %dst : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> + // CHECK: xegpu.store_nd %[[C]], %[[R0]][0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<32xf16>, !xegpu.tensor_desc<32xf16> + xegpu.store_nd %1, %2[0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<32xf16>, !xegpu.tensor_desc<32xf16> gpu.return } // CHECK: func @simt_store_nd_2(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @simt_store_nd_2(%src: memref<24x32xf16>) { - // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16> - %1 = arith.constant dense<1.0>: vector<2xf16> - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> - %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> - // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf16>, !xegpu.tensor_desc<32xf16> - xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<2xf16>, !xegpu.tensor_desc<32xf16> - gpu.return -} - -// CHECK: func @simt_store_nd_offset_1(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @simt_store_nd_offset_1(%src: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16> %1 = arith.constant dense<1.0>: vector<2xf16> // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> @@ -366,24 +346,6 @@ gpu.func @simt_store_nd_offset_1(%src: memref<24x32xf16>) { gpu.return } -// CHECK: gpu.func @update_nd_tdesc(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32> - %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32> - gpu.return -} - -// CHECK: gpu.func @update_nd_tdesc_2(%[[arg0:.*]]: memref<8x24x32xf32>) { -gpu.func @update_nd_tdesc_2(%src: memref<8x24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<8x24x32xf32> -> !xegpu.tensor_desc<2x8x16xf32> - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<8x24x32xf32> -> !xegpu.tensor_desc<2x8x16xf32> - // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 0, 16] : !xegpu.tensor_desc<2x8x16xf32> - %2 = xegpu.update_nd_offset %1, [0, 0, 16]: !xegpu.tensor_desc<2x8x16xf32> - gpu.return -} - // CHECK: gpu.func @simt_load_4(%[[arg0:.*]]: memref<256xf16>, %[[arg1:.*]]: vector<1xindex>, %[[arg2:.*]]: vector<1xi1>) { gpu.func @simt_load_4(%arg0: memref<256xf16>, %arg1: vector<1xindex>, %arg2: vector<1xi1>) { // CHECK: %0 = xegpu.load %[[arg0]][%[[arg1]]], %[[arg2]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir index 5a95185c8de48..666fcac58a0f3 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -6,11 +6,11 @@ // CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[TDESC_SRC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout> // CHECK: %[[TDESC_DST:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout> -// CHECK: xegpu.prefetch_nd %[[TDESC_SRC]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : +// CHECK: xegpu.prefetch_nd %[[TDESC_SRC]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout> -// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0 <{layout = #xegpu.layout}> +// CHECK: %[[LOADED:.*]] = xegpu.load_nd %0[0, 0] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<8x32xf32, #xegpu.layout> -> vector<8x32xf32> -// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]] <{layout = #xegpu.layout}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[LOADED]], %[[TDESC_DST]][0, 0] <{layout = #xegpu.layout}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout> gpu.module @test { // Although the uArch allows 8x32 inst data using block count (or array_len), // it is up to optimization passes to decide on the block count usage. @@ -18,9 +18,9 @@ func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> %0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32> %1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x32xf32> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32> - xegpu.store_nd %2, %1 : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32> + xegpu.prefetch_nd %0[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x32xf32> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x32xf32> -> vector<8x32xf32> + xegpu.store_nd %2, %1[0, 0] : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32> return } } @@ -30,27 +30,27 @@ func.func @load_store_no_array_len(%arg0: memref<8x32xf32>, %arg1: memref<8x32xf // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout -// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> : +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout +// CHECK: xegpu.store_nd %[[T4]], %[[T5]][0, 0] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.module @test { func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -65,30 +65,21 @@ gpu.module @test_kernel { %block_id_y = gpu.block_id y %m = arith.muli %block_id_x, %c32 : index - %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> - %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> - %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> + %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16> - %out:3 = scf.for %k = %c0 to %c1024 step %c32 - iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) - -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { + scf.for %k = %c0 to %c1024 step %c32 { //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> : //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout> -> vector<16x32xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + %a = xegpu.load_nd %a_tdesc[0, %k] : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> + %b = xegpu.load_nd %b_tdesc[0, %k] : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout} : vector<16x32xf16> %c = arith.addf %a, %b : vector<16x32xf16> //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout> - xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16> - - //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout> - %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> - %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> - %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> - scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc - : !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16> + xegpu.store_nd %c, %c_tdesc[0, %k] : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16> } gpu.return } @@ -104,30 +95,21 @@ gpu.module @test_kernel { %block_id_y = gpu.block_id y %m = arith.muli %block_id_x, %c32 : index - %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> - %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> - %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> + %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<12x32xf16> - %out:3 = scf.for %k = %c0 to %c1024 step %c32 - iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) - -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { + scf.for %k = %c0 to %c1024 step %c32 { //CHECK: xegpu.load_nd {{.*}} <{layout = #xegpu.layout}> : //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout> -> vector<12x32xf16> - %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> - %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + %a = xegpu.load_nd %a_tdesc[0, %k] : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> + %b = xegpu.load_nd %b_tdesc[0, %k] : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x32xf16> %c = arith.addf %a, %b : vector<12x32xf16> //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout> - xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16> - - //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout> - %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> - %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> - %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> - scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc - : !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16> + xegpu.store_nd %c, %c_tdesc[0, %k] : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16> } gpu.return } @@ -241,15 +223,15 @@ gpu.module @test { // CHECK: %[[CST_SMALL:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<4x16xf32> // CHECK: %[[CST_LARGE:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x32xf32> // CHECK: %[[INSERT:.*]] = vector.insert_strided_slice %[[CST_SMALL]], %[[CST_LARGE]] {layout_result_0 = #xegpu.layout, offsets = [0, 0], strides = [1, 1]} : vector<4x16xf32> into vector<8x32xf32> -// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout> -// CHECK: xegpu.store_nd %[[INSERT]], %[[TDESC]] <{layout = #xegpu.layout}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout> +// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[INSERT]], %[[TDESC]][0, 0] <{layout = #xegpu.layout}> : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #xegpu.layout> func.func @insert_strided_slice_inst_data_no_packing(%arg0: memref<8x32xf32>) { %c0 = arith.constant 0 : index %cst_small = arith.constant dense<1.0> : vector<4x16xf32> %cst_large = arith.constant dense<0.0> : vector<8x32xf32> %insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<4x16xf32> into vector<8x32xf32> - %tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32> - xegpu.store_nd %insert, %tdesc : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32> + %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x32xf32> -> !xegpu.tensor_desc<8x32xf32> + xegpu.store_nd %insert, %tdesc[0, 0] : vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32> return } } @@ -266,8 +248,8 @@ func.func @insert_strided_slice_inst_data_with_packing(%arg0: memref<8x64xi8>) { %cst_small = arith.constant dense<1> : vector<4x64xi8> %cst_large = arith.constant dense<0> : vector<8x64xi8> %insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<4x64xi8> into vector<8x64xi8> - %tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x64xi8> -> !xegpu.tensor_desc<8x64xi8, #xegpu.layout> - xegpu.store_nd %insert, %tdesc <{layout = #xegpu.layout}>: vector<8x64xi8>, !xegpu.tensor_desc<8x64xi8, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x64xi8> -> !xegpu.tensor_desc<8x64xi8, #xegpu.layout> + xegpu.store_nd %insert, %tdesc[0, 0] <{layout = #xegpu.layout}>: vector<8x64xi8>, !xegpu.tensor_desc<8x64xi8, #xegpu.layout> return } } diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir index d8a07d7c85a6c..d61a509bb6bc1 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir @@ -6,14 +6,14 @@ gpu.module @test { func.func @store_nd(%src: memref<256x128xf32>) { // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> // CHECK-SAME: -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] <{layout = #xegpu.layout}> + // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][0, 0] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> // CHECK-SAME: -> vector<256x128xf32> - // CHECK: xegpu.store_nd %[[LOAD]], %[[TDESC]] <{layout = #xegpu.layout}> + // CHECK: xegpu.store_nd %[[LOAD]], %[[TDESC]][0, 0] <{layout = #xegpu.layout}> // CHECK-SAME: : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32> - %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32> -> vector<256x128xf32> - xegpu.store_nd %load, %tdesc <{layout = #xegpu.layout}> + %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32> -> vector<256x128xf32> + xegpu.store_nd %load, %tdesc[0, 0] <{layout = #xegpu.layout}> : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32> return } @@ -90,14 +90,14 @@ gpu.module @test { // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[A_MEMREF]] : memref<128x128xf16> -> // CHECK-SAME: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - // CHECK: %[[A_LOADED:.*]] = xegpu.load_nd %[[TDESC_A]] + // CHECK: %[[A_LOADED:.*]] = xegpu.load_nd %[[TDESC_A]][0, 0] // CHECK-SAME: <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> // CHECK: %[[TDESC_B:.*]] = xegpu.create_nd_tdesc %[[B_MEMREF]] : memref<128x128xf16> -> // CHECK-SAME: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - // CHECK: %[[B_LOADED:.*]] = xegpu.load_nd %[[TDESC_B]] <{layout = #xegpu.layout}> + // CHECK: %[[B_LOADED:.*]] = xegpu.load_nd %[[TDESC_B]][0, 0] <{layout = #xegpu.layout}> // CHECK-SAME: : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> // CHECK: %[[DPAS_RES:.*]] = xegpu.dpas %[[A_LOADED]], %[[B_LOADED]] @@ -114,9 +114,9 @@ gpu.module @test { // CHECK-SAME: vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16> - %load_a = xegpu.load_nd %tdesc_a : !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16> + %load_a = xegpu.load_nd %tdesc_a[0, 0] : !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16> %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16> - %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16> + %load_b = xegpu.load_nd %tdesc_b[0, 0] : !xegpu.tensor_desc<128x128xf16> -> vector<128x128xf16> %dpas = xegpu.dpas %load_a, %load_b : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> %tdesc_cd = xegpu.create_nd_tdesc %d : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32> xegpu.store_nd %dpas, %tdesc_cd[0, 0] : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32> @@ -131,10 +131,10 @@ gpu.module @test { gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array} { %cst = arith.constant dense<0.000000e+00> : vector<32xf32> %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32> - %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32> + %load = xegpu.load_nd %tdesc_src[0, 0] : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32> %reduce = vector.multi_reduction , %load, %cst [1] : vector<32x64xf32> to vector<32xf32> %tdesc_dst = xegpu.create_nd_tdesc %dst : memref<32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout> - xegpu.store_nd %reduce, %tdesc_dst <{layout = #xegpu.layout}> + xegpu.store_nd %reduce, %tdesc_dst[0] <{layout = #xegpu.layout}> : vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.layout> gpu.return } @@ -147,7 +147,7 @@ gpu.module @test { gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array} { %cst = arith.constant 0.000000e+00 : f32 %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32> - %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32> + %load = xegpu.load_nd %tdesc_src[0, 0] : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32> %reduce = vector.multi_reduction , %load, %cst [0, 1] : vector<32x64xf32> to f32 gpu.return } @@ -160,7 +160,7 @@ gpu.module @test { %cst = arith.constant dense<0.000000e+00> : vector<32xf32> %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32> %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32> - %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x128xf32> -> vector<32x128xf32> + %load = xegpu.load_nd %tdesc_src[0, 0] : !xegpu.tensor_desc<32x128xf32> -> vector<32x128xf32> %bcast1 = vector.broadcast %load: vector<32x128xf32> to vector<4x32x128xf32> // CHECK: %[[BCAST1:.*]] = vector.broadcast %{{.*}} {layout_result_0 = #xegpu.layout} : vector<32x128xf32> to vector<4x32x128xf32> @@ -182,7 +182,7 @@ gpu.module @test { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims = [0]>, dims = [1]>} dense<0.000000e+00> : vector<32xf32> // CHECK: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.000000e+00> : vector<32x128xf32> // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32, #xegpu.slice<#xegpu.layout, dims = [0]>> -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] <{layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][0, 0] <{layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> // CHECK-SAME: -> vector<32x128xf32> // CHECK: %[[BCAST1:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout} : vector<32x128xf32> to vector<4x32x128xf32> // CHECK: %[[REDUCE1:.*]] = vector.multi_reduction , %[[BCAST1]], %[[CST0]] @@ -198,7 +198,7 @@ gpu.module @test { %cst = arith.constant dense<0.000000e+00> : vector<32xf32> %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32> %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32> - %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x128xf32> -> vector<32x128xf32> + %load = xegpu.load_nd %tdesc_src[0, 0] : !xegpu.tensor_desc<32x128xf32> -> vector<32x128xf32> %bcast1 = vector.broadcast %load: vector<32x128xf32> to vector<4x32x128xf32> %bcast = vector.multi_reduction , %bcast1, %cst1 [0]: vector<4x32x128xf32> to vector<32x128xf32> %reduce = vector.multi_reduction , %bcast, %cst [1] : vector<32x128xf32> to vector<32xf32> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index c87dbf3ec2108..44a243fbc7d25 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -4,26 +4,26 @@ gpu.module @test { // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> : +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]][0, 0] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -34,26 +34,26 @@ gpu.module @test { // CHECK-LABEL: func.func @dpas_f16_result_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf16>) { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{layout = #xegpu.layout}> : +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{layout = #xegpu.layout}> : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf16> -> vector<8x16xf16> -// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{layout = #xegpu.layout}> : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]][0, 0] <{layout = #xegpu.layout}> : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16, #xegpu.layout> func.func @dpas_f16_result_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf16> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf16> -> vector<8x16xf16> - %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - xegpu.store_nd %4, %5 : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> + %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + xegpu.store_nd %4, %5[0, 0] : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> return } } @@ -68,8 +68,8 @@ gpu.module @test { func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> - %1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> - xegpu.store_nd %0, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> + %1 = xegpu.create_nd_tdesc %arg2 : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> + xegpu.store_nd %0, %1[0, 0] : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } } @@ -83,13 +83,13 @@ gpu.module @test { func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = xegpu.load_nd %1 <{transpose = array}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1[0, 0] <{transpose = array}> : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = xegpu.dpas %2, %3, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %5 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %5[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -102,14 +102,14 @@ gpu.module @test { func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = vector.transpose %3, [1, 0] : vector<16x16xf16> to vector<16x16xf16> %5 = xegpu.dpas %2, %4, %cst : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %5, %6[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -122,8 +122,8 @@ gpu.module @test { // CHECK: %[[T2:.*]] = arith.extf %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> to vector<16x16xf32> // CHECK-NEXT: %{{.*}} = arith.truncf %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf32> to vector<16x16xf16> func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %1 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32> %3 = arith.truncf %2 : vector<16x16xf32> to vector<16x16xf16> %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> @@ -141,16 +141,16 @@ gpu.module @test { // CHECK-NEXT: %{{.*}} = xegpu.load %arg1[%[[OFFSET]]], %[[MASK]] <{chunk_size = 16 : i64, layout = #xegpu.layout}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x16xf16> func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %offset = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %mask = arith.constant dense : vector<16xi1> %3 = xegpu.load %arg1[%offset], %mask <{chunk_size=16}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x16xf16> %4 = vector.transpose %3, [1, 0] : vector<16x16xf16> to vector<16x16xf16> %5 = xegpu.dpas %1, %4 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %5, %6[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -262,15 +262,15 @@ gpu.module @test { // CHECK-SAME: vector<16x16xi16> to vector<16x16xf16> func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x16xi16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xi16> -> !xegpu.tensor_desc<16x16xi16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x16xi16> -> vector<16x16xi16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xi16> -> !xegpu.tensor_desc<16x16xi16> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xi16> -> vector<8x16xi16> + %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x16xi16> -> vector<16x16xi16> %4 = vector.bitcast %2 : vector<8x16xi16> to vector<8x16xf16> %5 = vector.bitcast %3 : vector<16x16xi16> to vector<16x16xf16> %6 = xegpu.dpas %4, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %6, %7[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -283,15 +283,15 @@ gpu.module @test { // CHECK-SAME: vector<16x8xi32> to vector<16x16xf16> func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<16x8xi32> -> vector<16x8xi32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %3 = xegpu.load_nd %1[0, 0] : !xegpu.tensor_desc<16x8xi32> -> vector<16x8xi32> %4 = vector.bitcast %3 : vector<16x8xi32> to vector<16x16xf16> %5 = vector.transpose %4, [1, 0] : vector<16x16xf16> to vector<16x16xf16> %6 = xegpu.dpas %2, %5 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %6, %7 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %6, %7[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -304,11 +304,11 @@ gpu.module @test { // CHECK-SAME: vector<8x32xi16> to vector<8x16xi32> func.func @vector_bitcast_i16_to_i32(%arg0: memref<8x32xi16>, %arg1: memref<8x16xi32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x32xi16> -> vector<8x32xi16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x32xi16> -> vector<8x32xi16> %3 = vector.bitcast %2 : vector<8x32xi16> to vector<8x16xi32> - xegpu.store_nd %3, %1 : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> + xegpu.store_nd %3, %1[0, 0] : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32> return } } @@ -321,11 +321,11 @@ gpu.module @test { // CHECK-SAME: vector<8x16xi32> to vector<8x32xi16> func.func @vector_bitcast_require_cross_lane_shuffle(%arg0: memref<8x16xi32>, %arg1: memref<8x32xi16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16> - %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> + %1 = xegpu.create_nd_tdesc %arg1 : memref<8x32xi16> -> !xegpu.tensor_desc<8x32xi16> + %2 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<8x16xi32> -> vector<8x16xi32> %3 = vector.bitcast %2 : vector<8x16xi32> to vector<8x32xi16> - xegpu.store_nd %3, %1 : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16> + xegpu.store_nd %3, %1[0, 0] : vector<8x32xi16>, !xegpu.tensor_desc<8x32xi16> return } } @@ -336,18 +336,18 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %2 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %1 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %2 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %3 = arith.addf %1, %2 : vector<16x16xf16> %4 = xegpu.dpas %0, %3 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - xegpu.store_nd %4, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %4, %arg2[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -360,16 +360,16 @@ gpu.module @test { // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout} : vector<16x16xf16> // CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]][0, 0] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]][0, 0] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %1 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %cst = arith.constant dense<1.000000e+00> : vector<16x16xf16> %2 = arith.addf %1, %cst : vector<16x16xf16> %3 = xegpu.dpas %0, %2 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - xegpu.store_nd %3, %arg2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %2, %arg3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %3, %arg2[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %2, %arg3[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } } @@ -377,41 +377,36 @@ func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: ! gpu.module @test { // CHECK-LABEL: func.func @for_op( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x128xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<128x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> +// CHECK-NEXT: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> // CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> -// CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> -// CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32>) { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{layout = #xegpu.layout}> : +// CHECK-NEXT: %[[T2:.*]] = scf.for %[[ARG3:.*]] = {{.*}} iter_args(%[[ARG6:.*]] = %[[CST]]) -> +// CHECK-SAME: (vector<8x16xf32>) { +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[T0]][0, %[[ARG3]]] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{layout = #xegpu.layout}> : +// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[T1]][%[[ARG3]], 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -// CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK-NEXT: scf.yield %[[T7]], %[[T8]], %[[T6]] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout>, -// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, vector<8x16xf32> -// CHECK-NEXT: } {layout_result_2 = #xegpu.layout} -// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: scf.yield %[[T6]] : vector<8x16xf32> +// CHECK-NEXT: } {layout_result_0 = #xegpu.layout} +// CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[T3]][0, 0] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c16 = arith.constant 16 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16> %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> - %2:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %0, %arg5 = %1, %arg6 = %cst) -> (!xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32>) { - %4 = xegpu.load_nd %arg4 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %5 = xegpu.load_nd %arg5 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %2 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg6 = %cst) -> (vector<8x16xf32>) { + %4 = xegpu.load_nd %0[0, %arg3] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %5 = xegpu.load_nd %1[%arg3, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %6 = xegpu.dpas %4, %5, %arg6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - %7 = xegpu.update_nd_offset %arg4, [%c0, %c16] : !xegpu.tensor_desc<8x16xf16> - %8 = xegpu.update_nd_offset %arg5, [%c16, %c0] : !xegpu.tensor_desc<16x16xf16> - scf.yield %7, %8, %6 : !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>, vector<8x16xf32> + scf.yield %6 : vector<8x16xf32> } - %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %2#2, %3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %2, %3[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -422,25 +417,25 @@ gpu.module @test { // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>) { // CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = scf.if %arg2 -> (vector<16x16xf16>) { - %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %3 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> scf.yield %3 : vector<16x16xf16> } else { - %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %3 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> scf.yield %3 : vector<16x16xf16> } %2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %2, %arg3[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> return } } @@ -452,26 +447,26 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout>, // CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { // CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{layout = #xegpu.layout}> : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]][0, 0] <{layout = #xegpu.layout}> : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout} func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) { - %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %0 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = scf.if %arg2 -> (vector<16x16xf16>) { - %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %3 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> scf.yield %3 : vector<16x16xf16> } else { - %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %3 = xegpu.load_nd %arg1[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> scf.yield %3 : vector<16x16xf16> } %2 = xegpu.dpas %0, %1 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - xegpu.store_nd %2, %arg3 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %1, %arg4 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %2, %arg3[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %1, %arg4[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } } @@ -483,7 +478,7 @@ gpu.module @test { func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<0.000000e+00> : vector<16xf32> %0 = vector.multi_reduction , %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32> - xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> + xegpu.store_nd %0, %arg1[0] : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } } @@ -495,39 +490,33 @@ gpu.module @test { func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<0.000000e+00> : vector<16xf32> %0 = vector.multi_reduction , %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32> - xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> + xegpu.store_nd %0, %arg1[0] : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } } // ----- gpu.module @test { -// CHECK-LABEL: func.func @update_nd_offset_1d( +// CHECK-LABEL: func.func @store_nd_with_offset( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> -// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}] : !xegpu.tensor_desc<16xf32, #xegpu.layout> -func.func @update_nd_offset_1d(%arg0: memref<256xf32>){ - %c0 = arith.constant 0 : index +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> +func.func @store_nd_with_offset(%arg0: memref<256xf32>){ %c32 = arith.constant 32 : index %1 = arith.constant dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> - %2 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32> - xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf32> -> !xegpu.tensor_desc<16xf32> + xegpu.store_nd %1, %0[%c32] : vector<16xf32>, !xegpu.tensor_desc<16xf32> return } } // ----- gpu.module @test { -// CHECK-LABEL: func.func @update_nd_offset_2d( +// CHECK-LABEL: func.func @store_nd_with_offset_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -// CHECK-NEXT: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ - %c0 = arith.constant 0 : index +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> +func.func @store_nd_with_offset_2d(%arg0: memref<256x256xf32>){ %c32 = arith.constant 32 : index %1 = arith.constant dense<1.000000e+00> : vector<16x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32> - %2 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32> - xegpu.store_nd %1, %2 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32> + xegpu.store_nd %1, %0[%c32, %c32] : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32> return } } @@ -535,12 +524,12 @@ func.func @update_nd_offset_2d(%arg0: memref<256x256xf32>){ gpu.module @test { // CHECK-LABEL: func.func @prefetch_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]][0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> func.func @prefetch_2d(%arg0: memref<256x256xf16>){ %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> + xegpu.prefetch_nd %0[0, 0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16x16xf16> return } } @@ -548,12 +537,12 @@ func.func @prefetch_2d(%arg0: memref<256x256xf16>){ gpu.module @test { // CHECK-LABEL: func.func @prefetch_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> -// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]][0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> func.func @prefetch_1d(%arg0: memref<256xf16>){ %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16> + xegpu.prefetch_nd %0[0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<16xf16> return } } @@ -561,32 +550,33 @@ func.func @prefetch_1d(%arg0: memref<256xf16>){ gpu.module @test { // CHECK-LABEL: func.func @scf_while_and_condition( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { -// CHECK: %{{.*}}:3 = scf.while ({{.*}}) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) -// CHECK-SAME: -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout>) { -// CHECK: scf.condition(%{{.*}}) {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout> +// CHECK: %{{.*}}:2 = scf.while ({{.*}}) : (vector<16xf32>, i32) +// CHECK-SAME: -> (vector<16xf32>, i32) { +// CHECK: scf.condition(%{{.*}}) {{.*}} : vector<16xf32>, i32 // CHECK-NEXT: } do { -// CHECK-NEXT: ^bb0(%{{.*}}: vector<16xf32>, %{{.*}}: i32, %{{.*}}: !xegpu.tensor_desc<16xf32, #xegpu.layout>): -// CHECK: scf.yield {{.*}} : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32, #xegpu.layout> +// CHECK-NEXT: ^bb0(%{{.*}}: vector<16xf32>, %{{.*}}: i32): +// CHECK: scf.yield {{.*}} : vector<16xf32>, i32 // CHECK-NEXT: } attributes {layout_result_0 = #xegpu.layout} func.func @scf_while_and_condition(%arg0: memref<256xf32>, %arg1: memref<256xf32>) { %c0 = arith.constant 0 : i32 %c16 = arith.constant 16 : i32 + %c16_idx = arith.constant 16 : index %c256 = arith.constant 256 : i32 - %0 = xegpu.create_nd_tdesc %arg0[0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> - %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<16xf32> -> vector<16xf32> - %2 = xegpu.create_nd_tdesc %arg1[0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf32> -> !xegpu.tensor_desc<16xf32> + %1 = xegpu.load_nd %0[0] : !xegpu.tensor_desc<16xf32> -> vector<16xf32> + %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf32> -> !xegpu.tensor_desc<16xf32> - %3:3 = scf.while (%arg2 = %1, %arg3 = %c0, %arg4 = %0) : (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>) - -> (vector<16xf32>, i32, !xegpu.tensor_desc<16xf32>) { + %3:2 = scf.while (%arg2 = %1, %arg3 = %c0) : (vector<16xf32>, i32) + -> (vector<16xf32>, i32) { %4 = arith.cmpi slt, %arg3, %c256 : i32 - scf.condition(%4) %arg2, %arg3, %arg4 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32> + scf.condition(%4) %arg2, %arg3 : vector<16xf32>, i32 } do { - ^bb0(%arg2: vector<16xf32>, %arg3: i32, %arg4: !xegpu.tensor_desc<16xf32>): - xegpu.store_nd %arg2, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32> + ^bb0(%arg2: vector<16xf32>, %arg3: i32): + xegpu.store_nd %arg2, %2[0] : vector<16xf32>, !xegpu.tensor_desc<16xf32> %4 = arith.addi %arg3, %c16 : i32 - %5 = xegpu.update_nd_offset %arg4, [16] : !xegpu.tensor_desc<16xf32> - %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<16xf32> -> vector<16xf32> - scf.yield %6, %4, %5 : vector<16xf32>, i32, !xegpu.tensor_desc<16xf32> + %offset = arith.index_cast %4 : i32 to index + %6 = xegpu.load_nd %0[%offset] : !xegpu.tensor_desc<16xf32> -> vector<16xf32> + scf.yield %6, %4 : vector<16xf32>, i32 } return } @@ -596,7 +586,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]][0, 0] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> @@ -605,11 +595,11 @@ gpu.module @test { func.func @vector_shape_cast_1d_to_2d_dim1_distributed(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.0000> : vector<16xf16> - %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %3 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = vector.multi_reduction , %3, %cst [0] : vector<16x16xf16> to vector<16xf16> %2 = vector.shape_cast %4 : vector<16xf16> to vector<1x16xf16> %5 = vector.broadcast %2 : vector<1x16xf16> to vector<16x16xf16> - xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %5, %arg1[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } } @@ -618,7 +608,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{layout = #xegpu.layout}> +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0[0, 0] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] @@ -628,11 +618,11 @@ gpu.module @test { func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.0000> : vector<16xf16> - %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %3 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = vector.multi_reduction , %3, %cst [1] : vector<16x16xf16> to vector<16xf16> %2 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16> %5 = vector.broadcast %2 : vector<16x1xf16> to vector<16x16xf16> - xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %5, %arg1[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } } @@ -845,7 +835,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]][0, 0] <{layout = #xegpu.layout}> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> @@ -854,10 +844,10 @@ gpu.module @test { func.func @vector_broadcast_1d_to_2d_broadcast_along_row(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.0000> : vector<16xf16> - %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %3 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = vector.multi_reduction , %3, %cst [0] : vector<16x16xf16> to vector<16xf16> %5 = vector.broadcast %4 : vector<16xf16> to vector<16x16xf16> - xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %5, %arg1[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } } @@ -877,12 +867,12 @@ gpu.module @test { func.func @vector_broadcast_2d_to_2d_along_column(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.0000> : vector<16xf16> - %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %3 = xegpu.load_nd %arg0[0, 0] : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %4 = vector.multi_reduction , %3, %cst [1] : vector<16x16xf16> to vector<16xf16> %5 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16> %6 = math.exp %5: vector<16x1xf16> %7 = vector.broadcast %6 : vector<16x1xf16> to vector<16x16xf16> - xegpu.store_nd %7, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %7, %arg1[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } } @@ -897,7 +887,7 @@ gpu.module @test { func.func @vector_broadcast_scalar_to_vector(%arg0: !xegpu.tensor_desc<16x16xf16>) { %cst = arith.constant 0.0000 : f16 %6 = vector.broadcast %cst : f16 to vector<16x16xf16> - xegpu.store_nd %6, %arg0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + xegpu.store_nd %6, %arg0[0, 0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> return } } @@ -921,15 +911,15 @@ gpu.module @test { // CHECK: %[[CST_SMALL:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<2x32xf32> // CHECK: %[[CST_LARGE:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<4x64xf32> // CHECK: %[[INSERT:.*]] = vector.insert_strided_slice %[[CST_SMALL]], %[[CST_LARGE]] {layout_result_0 = #xegpu.layout, offsets = [0, 0], strides = [1, 1]} : vector<2x32xf32> into vector<4x64xf32> -// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<4x64xf32> -> !xegpu.tensor_desc<4x64xf32, #xegpu.layout> -// CHECK: xegpu.store_nd %[[INSERT]], %[[TDESC]] <{layout = #xegpu.layout}> : vector<4x64xf32>, !xegpu.tensor_desc<4x64xf32, #xegpu.layout> +// CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<4x64xf32> -> !xegpu.tensor_desc<4x64xf32, #xegpu.layout> +// CHECK: xegpu.store_nd %[[INSERT]], %[[TDESC]][0, 0] <{layout = #xegpu.layout}> : vector<4x64xf32>, !xegpu.tensor_desc<4x64xf32, #xegpu.layout> func.func @insert_strided_slice_lane_layout_no_packing(%arg0: memref<4x64xf32>) { %c0 = arith.constant 0 : index %cst_small = arith.constant dense<1.0> : vector<2x32xf32> %cst_large = arith.constant dense<0.0> : vector<4x64xf32> %insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<2x32xf32> into vector<4x64xf32> - %tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x64xf32> -> !xegpu.tensor_desc<4x64xf32> - xegpu.store_nd %insert, %tdesc : vector<4x64xf32>, !xegpu.tensor_desc<4x64xf32> + %tdesc = xegpu.create_nd_tdesc %arg0 : memref<4x64xf32> -> !xegpu.tensor_desc<4x64xf32> + xegpu.store_nd %insert, %tdesc[0, 0] : vector<4x64xf32>, !xegpu.tensor_desc<4x64xf32> return } } @@ -946,8 +936,8 @@ func.func @insert_strided_slice_lane_layout_with_packing(%arg0: memref<4x64xf16> %cst_small = arith.constant dense<1.0> : vector<2x32xf16> %cst_large = arith.constant dense<0.0> : vector<4x64xf16> %insert = vector.insert_strided_slice %cst_small, %cst_large {offsets = [0, 0], strides = [1, 1]} : vector<2x32xf16> into vector<4x64xf16> - %tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x64xf16> -> !xegpu.tensor_desc<4x64xf16, #xegpu.layout> - xegpu.store_nd %insert, %tdesc <{layout = #xegpu.layout}>: vector<4x64xf16>, !xegpu.tensor_desc<4x64xf16, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %arg0 : memref<4x64xf16> -> !xegpu.tensor_desc<4x64xf16, #xegpu.layout> + xegpu.store_nd %insert, %tdesc[0, 0] <{layout = #xegpu.layout}>: vector<4x64xf16>, !xegpu.tensor_desc<4x64xf16, #xegpu.layout> return } } @@ -970,8 +960,8 @@ func.func @insert_strided_slice_with_slice_layout(%arg0: memref<8x16xf32>) { %cst_small8 = vector.extract_strided_slice %cst_large_new {offsets = [0], sizes = [8], strides = [1]} : vector<16xf32> to vector<8xf32> %cst_small16x8 = vector.broadcast %cst_small8 : vector<8xf32> to vector<16x8xf32> %cst_small8x16 = vector.transpose %cst_small16x8, [1, 0] : vector<16x8xf32> to vector<8x16xf32> - %tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %cst_small8x16, %tdesc <{layout = #xegpu.layout}>: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %arg0 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %cst_small8x16, %tdesc[0, 0] <{layout = #xegpu.layout}>: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> return } } diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index 8b57b262ebddf..c2aac8fa6cf0b 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -14,31 +14,25 @@ gpu.module @test_kernel { %m = arith.muli %block_id_x, %c16 : index %n = arith.muli %block_id_y, %c32 : index - %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c> - %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> - - %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a> - %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b> - %out:3 = scf.for %k = %c0 to %c1024 step %c32 - iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) - -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) { + %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c> + %c_init = xegpu.load_nd %c_tdesc[0, 0] {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> + + %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b> + %out = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg2 = %c_init) + -> (vector<16x32xf32>) { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> + %a = xegpu.load_nd %a_tdesc[%c0, %k] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> - //CHECK-COUNT-8: xegpu.dpas {{.*}} + %b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + //CHECK-COUNT-8: xegpu.dpas {{.*}} %c = xegpu.dpas %a, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> - //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a> - //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b> - scf.yield %a_next_tdesc, %b_next_tdesc, %c - : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32> - } {layout_result_0 = #a, - layout_result_1 = #b, - layout_result_2 = #c} + scf.yield %c + : vector<16x32xf32> + } {layout_result_0 = #c} //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> + xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> gpu.return } } @@ -57,31 +51,25 @@ gpu.module @test_kernel { %m = arith.muli %block_id_x, %c16 : index %n = arith.muli %block_id_y, %c32 : index - %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1> - %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32> + %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #l1> + %c_init = xegpu.load_nd %c_tdesc[0, 0] {layout = #l1}: !xegpu.tensor_desc<16x32xf32, #l1> -> vector<16x32xf32> - %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1> - %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2> - %out:3 = scf.for %k = %c0 to %c1024 step %c32 - iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) - -> (!xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32>) { + %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l1> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #l2> + %out = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg2 = %c_init) + -> (vector<16x32xf32>) { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16> + %a = xegpu.load_nd %a_tdesc[%c0, %k] {layout = #l1}: !xegpu.tensor_desc<16x32xf16, #l1> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16> - //CHECK-COUNT-8: xegpu.dpas {{.*}} + %b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #l2}: !xegpu.tensor_desc<32x32xf16, #l2> -> vector<32x32xf16> + //CHECK-COUNT-8: xegpu.dpas {{.*}} %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> - //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16> - %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l1> - //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16> - %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #l2> - scf.yield %a_next_tdesc, %b_next_tdesc, %c - : !xegpu.tensor_desc<16x32xf16, #l1>, !xegpu.tensor_desc<32x32xf16, #l2>, vector<16x32xf32> - } {layout_result_0 = #l1, - layout_result_1 = #l2, - layout_result_2 = #l1} + scf.yield %c + : vector<16x32xf32> + } {layout_result_0 = #l1} //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1> + xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #l1}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #l1> gpu.return } } @@ -101,32 +89,28 @@ gpu.module @test_kernel { %m = arith.muli %block_id_x, %c8 : index %n = arith.muli %block_id_y, %c32 : index - %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x32xf32, #l1> + %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x32xf32, #l1> //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - %c_init = xegpu.load_nd %c_tdesc {layout = #l1}: !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32> - - %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1> - %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2> - %out:3 = scf.for %k = %c0 to %c1024 step %c16 - iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) - -> (!xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32>) { + %c_init = xegpu.load_nd %c_tdesc[0, 0] {layout = #l1}: !xegpu.tensor_desc<8x32xf32, #l1> -> vector<8x32xf32> + + %c2 = arith.constant 2 : index + %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16, #l1> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l2> + %out = scf.for %k = %c0 to %c1024 step %c16 + iter_args(%arg2 = %c_init) + -> (vector<8x32xf32>) { + %a_off = arith.muli %k, %c2 : index //CHECK: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16> + %a = xegpu.load_nd %a_tdesc[%c0, %a_off] {layout = #l1}: !xegpu.tensor_desc<8x16xf16, #l1> -> vector<8x16xf16> //CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16> + %b = xegpu.load_nd %b_tdesc[%a_off, %c0] {layout = #l2}: !xegpu.tensor_desc<16x32xf16, #l2> -> vector<16x32xf16> %c = xegpu.dpas %a, %b, %arg2 {layout_a=#l1, layout_b = #l2, layout_cd = #l1,layout_result_0 = #l1}: vector<8x16xf16>, vector<16x32xf16>, vector<8x32xf32> -> vector<8x32xf32> - //CHECK: xegpu.update_nd_offset {{.*}} [%c0, %c32] : !xegpu.tensor_desc<8x16xf16> - %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<8x16xf16, #l1> - //CHECK-COUNT-2: xegpu.update_nd_offset {{.*}} [%c32, %c0] : !xegpu.tensor_desc<16x16xf16> - %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<16x32xf16, #l2> - scf.yield %a_next_tdesc, %b_next_tdesc, %c - : !xegpu.tensor_desc<8x16xf16, #l1>, !xegpu.tensor_desc<16x32xf16, #l2>, vector<8x32xf32> - } {layout_result_0 = #l1, - layout_result_1 = #l2, - layout_result_2 = #l1} + scf.yield %c + : vector<8x32xf32> + } {layout_result_0 = #l1} //CHECK-COUNT-2: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %out#2, %c_tdesc {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1> + xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #l1}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #l1> gpu.return } } @@ -146,33 +130,27 @@ gpu.module @test_kernel { %m = arith.muli %block_id_x, %c16 : index %n = arith.muli %block_id_y, %c32 : index - %c_tdesc = xegpu.create_nd_tdesc %C[%m, %n] : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c> - %c_init = xegpu.load_nd %c_tdesc {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> + %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<16x32xf32, #c> + %c_init = xegpu.load_nd %c_tdesc[0, 0] {layout = #c}: !xegpu.tensor_desc<16x32xf32, #c> -> vector<16x32xf32> - %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a> - %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %n] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b> - %out:3 = scf.for %k = %c0 to %c1024 step %c32 - iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_init) - -> (!xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32>) { + %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #a> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #b> + %out = scf.for %k = %c0 to %c1024 step %c32 + iter_args(%arg2 = %c_init) + -> (vector<16x32xf32>) { //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> + %a = xegpu.load_nd %a_tdesc[%c0, %k] {layout = #a}: !xegpu.tensor_desc<16x32xf16, #a> -> vector<16x32xf16> //CHECK-COUNT-4: xegpu.load_nd {{.*}} -> vector<16x16xf16> - %b = xegpu.load_nd %arg1 {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + %b = xegpu.load_nd %b_tdesc[%k, %c0] {layout = #b}: !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> //CHECK-COUNT-4: math.exp {{.*}} : vector<8x16xf16> %e = math.exp %a {layout_result_0 = #a} : vector<16x32xf16> - //CHECK-COUNT-8: xegpu.dpas {{.*}} + //CHECK-COUNT-8: xegpu.dpas {{.*}} %c = xegpu.dpas %e, %b, %arg2 {layout_a=#a, layout_b = #b, layout_cd = #c,layout_result_0 = #c}: vector<16x32xf16>, vector<32x32xf16>, vector<16x32xf32> -> vector<16x32xf32> - //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #a> - //CHECK-COUNT-4: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32, %c0] : !xegpu.tensor_desc<32x32xf16, #b> - scf.yield %a_next_tdesc, %b_next_tdesc, %c - : !xegpu.tensor_desc<16x32xf16, #a>, !xegpu.tensor_desc<32x32xf16, #b>, vector<16x32xf32> - } {layout_result_0 = #a, - layout_result_1 = #b, - layout_result_2 = #c} + scf.yield %c + : vector<16x32xf32> + } {layout_result_0 = #c} //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %out#2, %c_tdesc {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> + xegpu.store_nd %out, %c_tdesc[0, 0] {layout = #c}: vector<16x32xf32>, !xegpu.tensor_desc<16x32xf32, #c> gpu.return } } @@ -188,29 +166,20 @@ gpu.module @test_kernel { %block_id_y = gpu.block_id y %m = arith.muli %block_id_x, %c32 : index - %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l> - %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l> - %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l> + %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l> + %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x32xf16, #l> - %out:3 = scf.for %k = %c0 to %c1024 step %c32 - iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) - -> (!xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>) { + scf.for %k = %c0 to %c1024 step %c32 { //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> - %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> + %a = xegpu.load_nd %a_tdesc[%c0, %k] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> + %b = xegpu.load_nd %b_tdesc[%c0, %k] {layout = #l}: !xegpu.tensor_desc<16x32xf16, #l> -> vector<16x32xf16> //CHECK-COUNT-4: arith.addf {{.*}} : vector<8x16xf16> %c = arith.addf %a, %b {layout_result_0 = #l} : vector<16x32xf16> //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> - xegpu.store_nd %c, %arg2 {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l> - - //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf16> - %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l> - %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l> - %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16, #l> - scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc - : !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l>, !xegpu.tensor_desc<16x32xf16, #l> + xegpu.store_nd %c, %c_tdesc[%c0, %k] {layout = #l}: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #l> } gpu.return } @@ -227,29 +196,20 @@ gpu.module @test_kernel { %block_id_y = gpu.block_id y %m = arith.muli %block_id_x, %c32 : index - %a_tdesc = xegpu.create_nd_tdesc %A[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l> - %b_tdesc = xegpu.create_nd_tdesc %B[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l> - %c_tdesc = xegpu.create_nd_tdesc %C[%m, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l> + %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l> + %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf16> -> !xegpu.tensor_desc<32xf16, #l> - %out:3 = scf.for %k = %c0 to %c1024 step %c32 - iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) - -> (!xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>) { + scf.for %k = %c0 to %c1024 step %c32 { //CHECK-COUNT-8: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8xf16> -> vector<8xf16> - %a = xegpu.load_nd %arg0 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> - %b = xegpu.load_nd %arg1 {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> + %a = xegpu.load_nd %a_tdesc[%k] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> + %b = xegpu.load_nd %b_tdesc[%k] {layout = #l}: !xegpu.tensor_desc<32xf16, #l> -> vector<32xf16> //CHECK-COUNT-4: arith.addf {{.*}} : vector<8xf16> %c = arith.addf %a, %b {layout_result_0 = #l} : vector<32xf16> //CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<8xf16>, !xegpu.tensor_desc<8xf16> - xegpu.store_nd %c, %arg2 {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l> - - //CHECK-COUNT-12: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8xf16> - %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c32] : !xegpu.tensor_desc<32xf16, #l> - %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c32] : !xegpu.tensor_desc<32xf16, #l> - %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c32] : !xegpu.tensor_desc<32xf16, #l> - scf.yield %a_next_tdesc, %b_next_tdesc, %c_next_tdesc - : !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l>, !xegpu.tensor_desc<32xf16, #l> + xegpu.store_nd %c, %c_tdesc[%k] {layout = #l}: vector<32xf16>, !xegpu.tensor_desc<32xf16, #l> } gpu.return } @@ -264,13 +224,13 @@ gpu.module @test_kernel { %c64 = arith.constant 64 : index %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c64 : index - %0 = xegpu.create_nd_tdesc %a[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l> - %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32> + %0 = xegpu.create_nd_tdesc %a : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l> + %1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<16x64xf32, #l> -> vector<16x64xf32> // CHECK: vector.multi_reduction , {{.*}}, [[ACC:%[0-9A-Za-z]+]] [0] : vector<16x16xf32> to vector<16xf32> // CHECK-COUNT-3: vector.multi_reduction , {{.*}}, [[ACC]] [0] : vector<16x16xf32> to vector<16xf32> %2 = vector.multi_reduction , %1, %acc {layout_result_0 = #r} [0]: vector<16x64xf32> to vector<64xf32> - %3 = xegpu.create_nd_tdesc %b[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r> - xegpu.store_nd %2, %3 {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r> + %3 = xegpu.create_nd_tdesc %b : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r> + xegpu.store_nd %2, %3[0] {layout = #r}: vector<64xf32>, !xegpu.tensor_desc<64xf32, #r> gpu.return } } @@ -289,15 +249,15 @@ gpu.module @test_kernel { %m = arith.muli %block_id_x, %c32 : index %n = arith.muli %block_id_y, %c32 : index - %0 = xegpu.create_nd_tdesc %a[%m, %n] : memref<512x32xf32> -> !xegpu.tensor_desc<32x128xf32, #l> - %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32> + %0 = xegpu.create_nd_tdesc %a : memref<512x32xf32> -> !xegpu.tensor_desc<32x128xf32, #l> + %1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<32x128xf32, #l> -> vector<32x128xf32> // CHECK: vector.multi_reduction , {{.*}}, [[INIT:%[0-9A-Za-z]+]] [1] : vector<16x16xf32> to vector<16xf32> // CHECK-COUNT-1: vector.multi_reduction , {{.*}}, [[INIT]] [1] : vector<16x16xf32> to vector<16xf32> %2 = vector.multi_reduction , %1, %acc {layout_result_0 = #r} [1]: vector<32x128xf32> to vector<32xf32> - %3 = xegpu.create_nd_tdesc %b[%n] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r> - xegpu.store_nd %2, %3 {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r> + %3 = xegpu.create_nd_tdesc %b : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r> + xegpu.store_nd %2, %3[0] {layout = #r}: vector<32xf32>, !xegpu.tensor_desc<32xf32, #r> gpu.return } } @@ -311,12 +271,12 @@ gpu.module @test_kernel { %c64 = arith.constant 64 : index %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c64 : index - %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r> - %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32> + %0 = xegpu.create_nd_tdesc %a : memref<512xf32> -> !xegpu.tensor_desc<64xf32, #r> + %1 = xegpu.load_nd %0[0] {layout = #r}: !xegpu.tensor_desc<64xf32, #r> -> vector<64xf32> // CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16xf32> to vector<16x16xf32> %2 = vector.broadcast %1 {layout_result_0 = #l} : vector<64xf32> to vector<16x64xf32> - %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l> - xegpu.store_nd %2, %3 {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l> + %3 = xegpu.create_nd_tdesc %b : memref<16x512xf32> -> !xegpu.tensor_desc<16x64xf32, #l> + xegpu.store_nd %2, %3[0, 0] {layout = #l}: vector<16x64xf32>, !xegpu.tensor_desc<16x64xf32, #l> gpu.return } } @@ -330,13 +290,13 @@ gpu.module @test_kernel { %c32 = arith.constant 32 : index %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c32 : index - %0 = xegpu.create_nd_tdesc %a[%m] : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r> - %1 = xegpu.load_nd %0 {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32> + %0 = xegpu.create_nd_tdesc %a : memref<512xf32> -> !xegpu.tensor_desc<32xf32, #r> + %1 = xegpu.load_nd %0[0] {layout = #r}: !xegpu.tensor_desc<32xf32, #r> -> vector<32xf32> %11 = vector.shape_cast %1 {layout_result_0 = #l} : vector<32xf32> to vector<32x1xf32> // CHECK-COUNT-8: vector.broadcast {{.*}}: vector<16x1xf32> to vector<16x16xf32> %2 = vector.broadcast %11 {layout_result_0 = #l} : vector<32x1xf32> to vector<32x64xf32> - %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<16x512xf32> -> !xegpu.tensor_desc<32x64xf32, #l> - xegpu.store_nd %2, %3: vector<32x64xf32>, !xegpu.tensor_desc<32x64xf32, #l> + %3 = xegpu.create_nd_tdesc %b : memref<16x512xf32> -> !xegpu.tensor_desc<32x64xf32, #l> + xegpu.store_nd %2, %3[0, 0] : vector<32x64xf32>, !xegpu.tensor_desc<32x64xf32, #l> gpu.return } } @@ -350,12 +310,12 @@ gpu.module @test_kernel { %c32 = arith.constant 32 : index %block_id_x = gpu.block_id x %m = arith.muli %block_id_x, %c32 : index - %0 = xegpu.create_nd_tdesc %a[%m, 0] : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l> - %1 = xegpu.load_nd %0 {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32> + %0 = xegpu.create_nd_tdesc %a : memref<512x8xf32> -> !xegpu.tensor_desc<32x8xf32, #l> + %1 = xegpu.load_nd %0[0, 0] {layout = #l}: !xegpu.tensor_desc<32x8xf32, #l> -> vector<32x8xf32> // CHECK-COUNT-2: vector.transpose {{.*}} [1, 0] : vector<16x8xf32> to vector<8x16xf32> %2 = vector.transpose %1, [1, 0] {layout_result_0 = #t} : vector<32x8xf32> to vector<8x32xf32> - %3 = xegpu.create_nd_tdesc %b[0, %m] : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t> - xegpu.store_nd %2, %3 {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t> + %3 = xegpu.create_nd_tdesc %b : memref<8x512xf32> -> !xegpu.tensor_desc<8x32xf32, #t> + xegpu.store_nd %2, %3[0, 0] {layout = #t}: vector<8x32xf32>, !xegpu.tensor_desc<8x32xf32, #t> gpu.return } } @@ -423,30 +383,29 @@ gpu.module @test_kernel { //CHECK-SAME: [[arg0:%.+]]: memref<16x16xf16>, [[arg1:%.+]]: memref<16x16xf16>, [[arg2:%.+]]: memref<16x16xf32> //CHECK: [[c8:%.+]] = arith.constant 8 : index //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]][[[c0]], [[c0]]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + //CHECK: [[a:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + //CHECK: [[b:%.+]] = xegpu.create_nd_tdesc [[arg1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + //CHECK: [[load_a:%.+]] = xegpu.load_nd [[a]][[[c0]], [[c0]]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + //CHECK: [[load_b:%.+]] = xegpu.load_nd [[b]][[[c0]], [[c0]]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> //CHECK: [[cvt:%.+]] = xegpu.convert_layout [[load_a]] <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf16> //CHECK: [[a0:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> //CHECK: [[a1:%.+]] = vector.extract_strided_slice [[cvt]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> //CHECK: [[dpas0:%.+]] = xegpu.dpas [[a0]], [[load_b]] //CHECK: [[dpas1:%.+]] = xegpu.dpas [[a1]], [[load_b]] - //CHECK: [[c_tdesc_0:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c0]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - //CHECK: [[c_tdesc_1:%.+]] = xegpu.create_nd_tdesc [[arg2]][[[c8]], [[c0]]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc_0]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - //CHECK: xegpu.store_nd [[dpas1]], [[c_tdesc_1]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + //CHECK: [[c_tdesc:%.+]] = xegpu.create_nd_tdesc [[arg2]] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + //CHECK: xegpu.store_nd [[dpas0]], [[c_tdesc]][[[c0]], [[c0]]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + //CHECK: xegpu.store_nd [[dpas1]], [[c_tdesc]][[[c8]], [[c0]]] <{layout = #xegpu.layout}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.func @convert_layout(%A: memref<16x16xf16>, %B: memref<16x16xf16>, %C: memref<16x16xf32>) { %c0 = arith.constant 0 : index - %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> - %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> - %a = xegpu.load_nd %a_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> - %b = xegpu.load_nd %b_tdesc {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %a_tdesc = xegpu.create_nd_tdesc %A : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %a = xegpu.load_nd %a_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %b = xegpu.load_nd %b_tdesc[0, 0] {layout = #b}: !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> %a1 = xegpu.convert_layout %a <{input_layout = #b, target_layout = #a}> : vector<16x16xf16> %c = xegpu.dpas %a1, %b {layout_a=#a, layout_b = #b, layout_cd = #c, layout_result_0 = #c}: vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> - %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c> - xegpu.store_nd %c, %c_tdesc {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c> + %c_tdesc = xegpu.create_nd_tdesc %C : memref<16x16xf32> -> !xegpu.tensor_desc<16x16xf32, #c> + xegpu.store_nd %c, %c_tdesc[0, 0] {layout = #c}: vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #c> gpu.return } } @@ -460,8 +419,8 @@ gpu.module @test_kernel { gpu.func @convert_layout_scalar(%arg0: memref<16x16xf16>, %arg1: memref<4xf16>) { %acc = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %a_tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #a> - %a = xegpu.load_nd %a_tdesc {layout = #a}: !xegpu.tensor_desc<16x16xf16, #a> -> vector<16x16xf16> + %a_tdesc = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #a> + %a = xegpu.load_nd %a_tdesc[0, 0] {layout = #a}: !xegpu.tensor_desc<16x16xf16, #a> -> vector<16x16xf16> %a_reduce = vector.multi_reduction , %a, %acc {layout_operand_0 = #a, layout_result_0 = #xegpu.slice<#a, dims = [0, 1]>} [0, 1] : vector<16x16xf16> to f16 %13 = xegpu.convert_layout %a_reduce <{input_layout = #xegpu.slice<#a, dims = [0, 1]>, target_layout = #xegpu.slice<#a, dims = [0, 1]>}> : f16 memref.store %13, %arg1[%c0] : memref<4xf16> diff --git a/mlir/test/Dialect/XeGPU/xegpu-recover-layout.mlir b/mlir/test/Dialect/XeGPU/xegpu-recover-layout.mlir index a00d6d7bb3b14..e2a4897fac519 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-recover-layout.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-recover-layout.mlir @@ -14,11 +14,11 @@ gpu.func @for_basic(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: %c16 = arith.constant 16 : index // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x128xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x128xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<128x16xf16> + %1 = xegpu.create_nd_tdesc %arg1 : memref<128x16xf16> -> !xegpu.tensor_desc<16x16xf16> // Recovery propagates layout from dpas (via store_nd) back to arith.constant. // CHECK: arith.constant {layout_result_0 = #xegpu.layout} @@ -27,9 +27,9 @@ gpu.func @for_basic(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: // CHECK: scf.for %2 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg6 = %cst) -> (vector<8x16xf32>) { - %4 = xegpu.load_nd %0 {layout = #xegpu.layout} + %4 = xegpu.load_nd %0[%c0, %c0] {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %5 = xegpu.load_nd %1 {layout = #xegpu.layout} + %5 = xegpu.load_nd %1[%c0, %c0] {layout = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %6 = xegpu.dpas %4, %5, %arg6 {layout_a = #xegpu.layout, @@ -45,9 +45,9 @@ gpu.func @for_basic(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: } // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %3 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %2, %3 {layout = #xegpu.layout} + xegpu.store_nd %2, %3[%c0, %c0] {layout = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } @@ -67,13 +67,13 @@ gpu.func @while_basic(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { %c0_i32 = arith.constant 0 : i32 // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32> - %1 = xegpu.load_nd %0 {layout = #xegpu.layout} + %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32> -> vector<256xf32> // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> + %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32> // CHECK: scf.while @@ -86,12 +86,10 @@ gpu.func @while_basic(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32 } do { ^bb0(%arg2: vector<256xf32>, %arg3: i32): - xegpu.store_nd %arg2, %2 {layout = #xegpu.layout} + xegpu.store_nd %arg2, %2[0] {layout = #xegpu.layout} : vector<256xf32>, !xegpu.tensor_desc<256xf32> %4 = arith.addi %arg3, %c1_i32 : i32 - %5 = xegpu.update_nd_offset %0, [256] - : !xegpu.tensor_desc<256xf32> - %6 = xegpu.load_nd %5 {layout = #xegpu.layout} + %6 = xegpu.load_nd %0[256] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32> -> vector<256xf32> // Recovery propagates layout to scf.yield in the "do" region via // sibling region propagation (from "before" region arg back to "do" yield). @@ -101,7 +99,7 @@ gpu.func @while_basic(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { // CHECK: } attributes {layout_operand_0 = #xegpu.layout, // CHECK-SAME: layout_result_0 = #xegpu.layout} } - xegpu.store_nd %3#0, %2 {layout = #xegpu.layout} + xegpu.store_nd %3#0, %2[0] {layout = #xegpu.layout} : vector<256xf32>, !xegpu.tensor_desc<256xf32> gpu.return } @@ -118,18 +116,18 @@ gpu.func @if_basic( %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) { - %0 = xegpu.load_nd %arg0 {layout = #xegpu.layout} + %0 = xegpu.load_nd %arg0[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> // CHECK: scf.if %1 = scf.if %arg2 -> (vector<16x16xf16>) { - %3 = xegpu.load_nd %arg1 {layout = #xegpu.layout} + %3 = xegpu.load_nd %arg1[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> // Recovery propagates layout to scf.yield operand in "then" region. // CHECK: scf.yield {layout_operand_0 = #xegpu.layout} scf.yield %3 : vector<16x16xf16> } else { - %3 = xegpu.load_nd %arg1 {layout = #xegpu.layout} + %3 = xegpu.load_nd %arg1[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> // Recovery propagates layout to scf.yield operand in "else" region. @@ -143,7 +141,7 @@ gpu.func @if_basic( layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - xegpu.store_nd %2, %arg3 {layout = #xegpu.layout} + xegpu.store_nd %2, %arg3[0, 0] {layout = #xegpu.layout} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir deleted file mode 100644 index 6eee5a544e3f8..0000000000000 --- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir +++ /dev/null @@ -1,61 +0,0 @@ -// RUN: mlir-opt --test-xegpu-unrolling-patterns -split-input-file %s | FileCheck %s - -gpu.module @xevm_test { - - // CHECK-LABEL: create_nd_tdesc - // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> - // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> - // CHECK-SAME: to !xegpu.tensor_desc<24x32xf32, #xegpu.layout> {__xegpu_blocking_tile_shape__ = array, __xegpu_blocking_unpack__} - gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> { - %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - gpu.return %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - } - -//----- - // CHECK-LABEL: load_nd - // CHECK-SAME: [[arg0:%.+]]: memref<256x318xf32> - // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<256x318xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK-COUNT-6: [[ld:%.+]] = xegpu.load_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - // CHECK-COUNT-6: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<24x32xf32> - gpu.func @load_nd(%src: memref<256x318xf32>) -> vector<24x32xf32> { - %tdesc = xegpu.create_nd_tdesc %src : memref<256x318xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %ld = xegpu.load_nd %tdesc[8, 16]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - gpu.return %ld : vector<24x32xf32> - } - -//----- - // CHECK-LABEL: load_nd_store_nd - // CHECK-SAME: [[arg0:%.+]]: memref<256x318xf32> - // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<256x318xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK-COUNT-6: [[data:%.+]] = xegpu.load_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - // CHECK-COUNT-6: xegpu.store_nd {{.*}}[{{.*}}] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - gpu.func @load_nd_store_nd(%src: memref<256x318xf32>) { - %tdesc = xegpu.create_nd_tdesc %src : memref<256x318xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %ld = xegpu.load_nd %tdesc[8, 16]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - xegpu.store_nd %ld, %tdesc[0, 0] : vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - gpu.return - } - -//----- - // CHECK-LABEL: prefetch_nd_tdesc - // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> - // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK-COUNT-6: xegpu.prefetch_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<8x16xf32> - gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) { - %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - xegpu.prefetch_nd %tdesc[8, 16] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - gpu.return - } - -//----- - - // CHECK-LABEL: load_nd_offsets_at_both_places - // CHECK-COUNT-2: builtin.unrealized_conversion_cast - gpu.func @load_nd_offsets_at_both_places(%src: memref<256x318xf32>) -> vector<24x32xf32> { - %tdesc = xegpu.create_nd_tdesc %src[16, 8] : memref<256x318xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %ld = xegpu.load_nd %tdesc[8, 16]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - gpu.return %ld : vector<24x32xf32> - } -} diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir index 750007077164f..b60700c81f518 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir @@ -4,150 +4,116 @@ gpu.module @test { // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> - // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>, - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>, - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> // CHECK-SAME: to !xegpu.tensor_desc<24x32xf32, #xegpu.layout> {__xegpu_blocking_tile_shape__ = array, __xegpu_blocking_unpack__} gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> gpu.return %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> } - //----- - +//----- // CHECK-LABEL: create_nd_tdesc_1d // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> - // CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32> // CHECK-SAME: to !xegpu.tensor_desc<32xf32, #xegpu.layout> {__xegpu_blocking_tile_shape__ = array, __xegpu_blocking_unpack__} gpu.func @create_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout> { - %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout> gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.layout> } - //----- - - // CHECK-LABEL: update_nd_tdesc - // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> - // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK-COUNT-6: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf32> - gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - gpu.return %update : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - } - - //----- - - // CHECK-LABEL: update_nd_tdesc_1d - // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> - // CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> - // CHECK-COUNT-2: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16xf32> - gpu.func @update_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout> { - %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout> - %update = xegpu.update_nd_offset %tdesc, [32] : !xegpu.tensor_desc<32xf32, #xegpu.layout> - gpu.return %update : !xegpu.tensor_desc<32xf32, #xegpu.layout> - } - - //----- - +//----- // CHECK-LABEL: prefetch_nd_tdesc // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> - // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK-COUNT-6: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-COUNT-6: xegpu.prefetch_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<8x16xf32> gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + xegpu.prefetch_nd %tdesc[0, 0] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> gpu.return } - //----- - +//----- // CHECK-LABEL: prefetch_nd_tdesc_1d // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> - // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> - // CHECK-COUNT-4: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<16xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> + // CHECK-COUNT-4: xegpu.prefetch_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<16xf32> gpu.func @prefetch_nd_tdesc_1d(%src: memref<64xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout> - xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<64xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout> + xegpu.prefetch_nd %tdesc[0] : !xegpu.tensor_desc<64xf32, #xegpu.layout> gpu.return } - //----- +//----- // CHECK-LABEL: load_nd // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> - // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK-COUNT-6: [[ld:%.+]] = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-COUNT-6: [[ld:%.+]] = xegpu.load_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> // CHECK-COUNT-6: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<24x32xf32> gpu.func @load_nd(%src: memref<24x32xf32>) -> vector<24x32xf32> { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %ld = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> gpu.return %ld : vector<24x32xf32> } - //----- - +//----- // CHECK-LABEL: load_nd_1d // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> - // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> - // CHECK-COUNT-4: [[ld:%.+]] = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16xf32> -> vector<16xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> + // CHECK-COUNT-4: [[ld:%.+]] = xegpu.load_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<16xf32> // CHECK-COUNT-4: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<16xf32> into vector<64xf32> gpu.func @load_nd_1d(%src: memref<64xf32>) -> vector<64xf32> { - %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout> - %data = xegpu.load_nd %tdesc: !xegpu.tensor_desc<64xf32, #xegpu.layout> -> vector<64xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout> + %data = xegpu.load_nd %tdesc[0] : !xegpu.tensor_desc<64xf32, #xegpu.layout> -> vector<64xf32> gpu.return %data : vector<64xf32> } - //----- - +//----- // CHECK-LABEL: store_nd // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> - // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - // CHECK-COUNT-6: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-COUNT-6: xegpu.store_nd {{.*}}[{{.*}}] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.func @store_nd(%src: memref<24x32xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %data = arith.constant dense<9.0> : vector<24x32xf32> - xegpu.store_nd %data, %tdesc: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + xegpu.store_nd %data, %tdesc[0, 0] : vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout> gpu.return } - //----- - +//----- // CHECK-LABEL: store_nd_1d // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> - // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> - // CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> + // CHECK-COUNT-4: xegpu.store_nd {{.*}}[{{.*}}] : vector<16xf32>, !xegpu.tensor_desc<16xf32> gpu.func @store_nd_1d(%src: memref<64xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout> %data = arith.constant dense<9.0> : vector<64xf32> - xegpu.store_nd %data, %tdesc: vector<64xf32>, !xegpu.tensor_desc<64xf32, #xegpu.layout> + xegpu.store_nd %data, %tdesc[0] : vector<64xf32>, !xegpu.tensor_desc<64xf32, #xegpu.layout> gpu.return } - //----- - +//----- // CHECK-LABEL: createNd_loadNd_storeNd // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> - //CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> - //CHECK-COUNT-6: [[data:%.+]] = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + //CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-COUNT-6: [[data:%.+]] = xegpu.load_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> //CHECK-COUNT-6: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<24x32xf32> //CHECK: [[add:%.+]] = arith.addf {{.*}} : vector<24x32xf32> //CHECK-COUNT-6: [[extract:%.+]] = vector.extract_strided_slice {{.*}} : vector<24x32xf32> to vector<8x16xf32> - //CHECK-COUNT-6: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + //CHECK-COUNT-6: xegpu.store_nd {{.*}}[{{.*}}] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.func @createNd_loadNd_storeNd(%src: memref<24x32xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %data = arith.constant dense<9.0> : vector<24x32xf32> - %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> + %ld = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> %add = arith.addf %data, %ld : vector<24x32xf32> - xegpu.store_nd %add, %tdesc: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + xegpu.store_nd %add, %tdesc[0, 0] : vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout> gpu.return } - //----- - +//----- // CHECK-LABEL: dpas // CHECK-SAME: [[arg0:%.+]]: vector<32x32xf16>, [[arg1:%.+]]: vector<32x32xf16> //CHECK-COUNT-8: [[extract1:%.+]] = vector.extract_strided_slice [[arg0]] {{.*}} : vector<32x32xf16> to vector<8x16xf16> @@ -160,13 +126,6 @@ gpu.module @test { } //----- - -//----- - -//----- - -//----- - // CHECK-LABEL: load_with_offsets // CHECK-SAME: [[arg0:%.+]]: ui64 // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32> @@ -186,11 +145,6 @@ gpu.module @test { } //----- - -//----- - - //----- - // CHECK-LABEL: store_with_offsets // CHECK-SAME: [[arg0:%.+]]: ui64 // CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1> @@ -211,12 +165,6 @@ gpu.module @test { gpu.return } -//----- - -//----- - -//----- - //----- // CHECK-LABEL: load_with_offsets_chunk // CHECK-SAME: [[arg0:%.+]]: ui64 @@ -240,8 +188,6 @@ gpu.module @test { gpu.return %ld : vector<32x4xf32> } -//----- - //----- // CHECK-LABEL: store_with_offsets_chunk // CHECK-SAME: [[arg0:%.+]]: ui64 @@ -268,7 +214,17 @@ gpu.module @test { } //----- + // CHECK-LABEL: load_nd_store_nd + // CHECK-SAME: [[arg0:%.+]]: memref<256x318xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<256x318xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-COUNT-6: [[data:%.+]] = xegpu.load_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + // CHECK-COUNT-6: xegpu.store_nd {{.*}}[{{.*}}] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.func @load_nd_store_nd(%src: memref<256x318xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x318xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %ld = xegpu.load_nd %tdesc[8, 16]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> + xegpu.store_nd %ld, %tdesc[0, 0] : vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + gpu.return + } -//----- } diff --git a/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir b/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir index 1f74b20819b11..e4244b8071860 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-vector-linearize.mlir @@ -217,18 +217,17 @@ func.func @gather_memref_2d(%base: memref, %v: vector<2x3xindex>, %mask // CHECK: gpu.func @test_kernel(%[[A:.*]]: memref<8x16xf16>, %[[B:.*]]: memref<16x16xf16>, %[[C:.*]]: memref<8x16xf32>) kernel { // CHECK: %[[POISON_F32:.*]] = ub.poison : vector<128xf32> -// CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[CST_A:.*]] = arith.constant dense<0.000000e+00> : vector<64xf16> // CHECK: %[[CST_C:.*]] = arith.constant dense<5.000000e+00> : vector<64xf32> -// CHECK: %[[A_TDESC:.*]] = xegpu.create_nd_tdesc %[[A]][%[[C0]], %[[C0]]] -// CHECK: %[[A_VAL:.*]] = xegpu.load_nd %[[A_TDESC]] +// CHECK: %[[A_TDESC:.*]] = xegpu.create_nd_tdesc %[[A]] +// CHECK: %[[A_VAL:.*]] = xegpu.load_nd %[[A_TDESC]][0, 0] // CHECK: %[[A_CAST:.*]] = vector.shape_cast %[[A_VAL]] : vector<8x16xf16> to vector<128xf16> // CHECK: %[[A_SHUFFLE:.*]] = vector.shuffle %[[A_CAST]], %[[CST_A]] {{.*}} : vector<128xf16>, vector<64xf16> // CHECK: %[[A_RESULT:.*]] = vector.shape_cast %[[A_SHUFFLE]] : vector<128xf16> to vector<8x16xf16> -// CHECK: %[[B_TDESC:.*]] = xegpu.create_nd_tdesc %[[B]][%[[C0]], %[[C0]]] -// CHECK: %[[B_VAL:.*]] = xegpu.load_nd %[[B_TDESC]] +// CHECK: %[[B_TDESC:.*]] = xegpu.create_nd_tdesc %[[B]] +// CHECK: %[[B_VAL:.*]] = xegpu.load_nd %[[B_TDESC]][0, 0] // CHECK: %[[B_CAST:.*]] = vector.shape_cast %[[B_VAL]] : vector<16x16xf16> to vector<256xf16> // CHECK: %[[B_SHUFFLE:.*]] = vector.shuffle %[[B_CAST]], %[[CST_A]] {{.*}} : vector<256xf16>, vector<64xf16> // CHECK: %[[B_RESULT:.*]] = vector.shape_cast %[[B_SHUFFLE]] : vector<256xf16> to vector<16x16xf16> @@ -240,8 +239,8 @@ func.func @gather_memref_2d(%base: memref, %v: vector<2x3xindex>, %mask // CHECK: %[[INSERT_SHUFFLE:.*]] = vector.shuffle %[[DPAS_CAST]], %[[ADDF]] {{.*}} : vector<128xf32>, vector<64xf32> // CHECK: %[[C_RESULT:.*]] = vector.shape_cast %[[INSERT_SHUFFLE]] : vector<128xf32> to vector<8x16xf32> -// CHECK: %[[C_TDESC:.*]] = xegpu.create_nd_tdesc %[[C]][%[[C0]], %[[C0]]] -// CHECK: xegpu.store_nd %[[C_RESULT]], %[[C_TDESC]] +// CHECK: %[[C_TDESC:.*]] = xegpu.create_nd_tdesc %[[C]] +// CHECK: xegpu.store_nd %[[C_RESULT]], %[[C_TDESC]][0, 0] // CHECK: gpu.return gpu.module @test_kernel { @@ -250,19 +249,19 @@ gpu.module @test_kernel { %cst_vec_0 = arith.constant dense<0.000000e+00> : vector<8x8xf16> %cst_vec_1 = arith.constant dense<0.000000e+00> : vector<8x8xf16> %cst_vec_2 = arith.constant dense<5.000000e+00> : vector<8x8xf32> - %a_tdesc = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> - %a_val = xegpu.load_nd %a_tdesc : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16xf16> + %a_tdesc = xegpu.create_nd_tdesc %A : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> + %a_val = xegpu.load_nd %a_tdesc[0, 0] : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> -> vector<8x16xf16> %a_val_0 = vector.insert_strided_slice %cst_vec_0, %a_val{offsets = [0, 0], sizes = [8, 8], strides = [1, 1]}: vector<8x8xf16> into vector<8x16xf16> - %b_tdesc = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %b_tdesc = xegpu.create_nd_tdesc %B : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %b_val = xegpu.load_nd %b_tdesc : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<16x16xf16> + %b_val = xegpu.load_nd %b_tdesc[0, 0] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<16x16xf16> %b_val_0 = vector.insert_strided_slice %cst_vec_1, %b_val{offsets = [0, 0], sizes = [8, 8], strides = [1, 1]}: vector<8x8xf16> into vector<16x16xf16> %c_val = xegpu.dpas %a_val_0, %b_val_0 : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %c_val_0 = vector.extract_strided_slice %c_val {offsets = [0, 0], sizes = [8, 8], strides = [1, 1]} : vector<8x16xf32> to vector<8x8xf32> %c_addf = arith.addf %c_val_0, %cst_vec_2 : vector<8x8xf32> %c_result = vector.insert_strided_slice %c_addf, %c_val {offsets = [0, 0], sizes = [8, 8], strides = [1, 1]} : vector<8x8xf32> into vector<8x16xf32> - %c_tdesc = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %c_result, %c_tdesc : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %c_tdesc = xegpu.create_nd_tdesc %C : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %c_result, %c_tdesc[0, 0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir index 762530e5d189f..53ce8d0e38949 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir @@ -4,9 +4,9 @@ gpu.module @test_elementwise_ops { // CHECK-LABEL: unary_ops_sg_layout_only gpu.func @unary_ops_sg_layout_only(%a: memref<24x32xf32>) { - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK: math.exp {{.*}} : vector<12x8xf32> @@ -22,9 +22,9 @@ gpu.module @test_elementwise_ops { // CHECK-LABEL: unary_ops gpu.func @unary_ops(%a: memref<24x32xf32>) { - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK: math.exp {{.*}} : vector<12x8xf32> @@ -40,14 +40,14 @@ gpu.module @test_elementwise_ops { // CHECK-LABEL: binary_ops gpu.func @binary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>) { - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK: arith.addf {{.*}}, {{.*}} : vector<12x8xf32> @@ -63,19 +63,19 @@ gpu.module @test_elementwise_ops { // CHECK-LABEL: ternary_ops gpu.func @ternary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi1>) { - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi1> + %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi1> -> !xegpu.tensor_desc<24x32xi1, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout} + %load_c = xegpu.load_nd %tdesc_c[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi1, #xegpu.layout> -> vector<24x32xi1> // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} : vector<12x8xi1>, vector<12x8xf32> @@ -91,14 +91,14 @@ gpu.module @test_elementwise_ops { // CHECK-LABEL: type_conversion_ops gpu.func @type_conversion_ops(%a: memref<24x32xf32>, %b: memref<24x32xi32>) { - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xi32> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> // CHECK: arith.truncf {{.*}} : vector<12x8xf32> to vector<12x8xf16> @@ -114,24 +114,24 @@ gpu.module @test_elementwise_ops { // CHECK-LABEL: comparison_ops gpu.func @comparison_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi32>, %d: memref<24x32xi32>) { - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi32> + %tdesc_c = xegpu.create_nd_tdesc %c : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> - %tdesc_d = xegpu.create_nd_tdesc %d[0, 0] : memref<24x32xi32> + %tdesc_d = xegpu.create_nd_tdesc %d : memref<24x32xi32> -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_c = xegpu.load_nd %tdesc_c {layout = #xegpu.layout} + %load_c = xegpu.load_nd %tdesc_c[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> - %load_d = xegpu.load_nd %tdesc_d {layout = #xegpu.layout} + %load_d = xegpu.load_nd %tdesc_d[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> -> vector<24x32xi32> // CHECK: arith.cmpf ult, {{.*}}, {{.*}} : vector<12x8xf32> @@ -148,14 +148,14 @@ gpu.module @test_elementwise_ops { // 1 to N decomposition of elementwise operations // CHECK-LABEL: elementwise_ops_rr_assignment gpu.func @elementwise_ops_rr_assignment(%a: memref<24x32xf32>, %b: memref<24x32xf32>) { - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> - %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> // CHECK-COUNT-12: arith.negf {{.*}} : vector<2x2xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index e4bf3b6c3bf1d..17a5db6b8401d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -1,74 +1,50 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -gpu.module @test_round_robin_assignment { +gpu.module @test_distribution { // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> + // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> // CHECK-NOT: xegpu.create_nd_tdesc - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> gpu.return - } + } - // CHECK-LABEL: create_nd_tdesc_with_shared_data - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK: %[[C4:.*]] = arith.constant 4 : index - // CHECK: %[[IDX:.*]] = arith.remui %[[SGID]], %[[C4]] - // CHECK: %[[IDY_DIV:.*]] = arith.divui %[[SGID]], %[[C4]] - // CHECK: %[[C8:.*]] = arith.constant 8 : index - // CHECK: %[[IDY:.*]] = arith.remui %[[IDY_DIV]], %[[C8]] - // CHECK: %[[C16:.*]] = arith.constant 16 : index - // CHECK: %[[LY:.*]] = arith.muli %[[IDY]], %[[C16]] - // CHECK: %[[C64:.*]] = arith.constant 64 : index - // CHECK: %[[LX:.*]] = arith.muli %[[IDX]], %[[C64]] - // CHECK: %[[C128:.*]] = arith.constant 128 : index - // CHECK: %[[OFFY:.*]] = arith.remui %[[LY]], %[[C128]] - // CHECK: %[[C64_1:.*]] = arith.constant 64 : index - // CHECK: %[[OFFX:.*]] = arith.remui %[[LX]], %[[C64_1]] - // CHECK: xegpu.create_nd_tdesc %[[ARG_0]][%[[OFFY]], %[[OFFX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> + // CHECK-LABEL: load_nd + gpu.func @load_nd(%src: memref<256x128xf32>) { + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> + // CHECK-NOT: xegpu.load_nd + %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> gpu.return } - // CHECK-LABEL: load_nd_tdesc - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> - // CHECK-NOT: xegpu.load_nd - %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - gpu.return - } - - // CHECK-LABEL: store_nd - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @store_nd(%src: memref<256x128xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - // CHECK-NOT: xegpu.store_nd - %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - xegpu.store_nd %load, %tdesc - : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return + // CHECK-LABEL: store_nd_with_offset + gpu.func @store_nd_with_offset(%src: memref<256x128xf32>) { + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + // CHECK-NOT: xegpu.store_nd + %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout} + : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.return } - // CHECK-LABEL: update_nd - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @update_nd(%src: memref<256x128xf32>){ - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - // CHECK-NOT: xegpu.update_nd_offset - %update = xegpu.update_nd_offset %tdesc, [0, 16] + // CHECK-LABEL: prefetch_nd + gpu.func @prefetch_nd(%src: memref<256x128xf32>) { + // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + // CHECK-NOT: xegpu.prefetch_nd + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + xegpu.prefetch_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> gpu.return } @@ -76,44 +52,212 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> -> vector<256x128xf16> - %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x256xf16> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x256xf16> -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> -> vector<128x256xf16> - %dpas = xegpu.dpas %load_a, %load_b - {layout_a = #xegpu.layout, - layout_b = #xegpu.layout, - layout_cd = #xegpu.layout} + %dpas = xegpu.dpas %load_a, %load_b + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> gpu.return } - // CHECK-LABEL: prefetch_nd_tdesc + // CHECK-LABEL: vector_reduce_dim_1 + gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) { + // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32> + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32> + -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> + -> vector<256x64xf32> + // CHECK-COUNT-2: vector.multi_reduction , {{.*}}, %[[C0:.*]] [1] : vector<16x64xf32> to vector<16xf32> + // CHECK-NOT: vector.multi_reduction + // CHECK-COUNT-2: arith.addf {{.*}}, {{.*}} : vector<16xf32> + // CHECK-NOT: arith.addf + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] + : vector<256x64xf32> to vector<256xf32> + gpu.return + } + + // CHECK-LABEL: non_splat_constant + gpu.func @non_splat_constant() { + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<{{.*}}0{{.*}}, {{.*}}16{{.*}}> : vector<2x1xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[T1:.*]] = arith.remui %[[SGID]], %[[C8:.*]] : index + // CHECK-DAG: %[[T2:.*]] = arith.muli %[[T1]], %[[C2:.*]] : index + // CHECK-DAG: %[[T3:.*]] = arith.remui %[[T2]], %[[C32:.*]] : index + // CHECK-DAG: %[[T4:.*]] = arith.addi %[[T2]], %[[C16:.*]] : index + // CHECK-DAG: %[[T5:.*]] = arith.remui %[[T4]], %[[C32_6:.*]] : index + // CHECK-DAG: %[[T6:.*]] = arith.muli %[[T3]], %[[C16_10:.*]] : index + // CHECK-DAG: %[[T7:.*]] = arith.addi %[[C0_11:.*]], %[[T6]] : index + // CHECK-DAG: %[[T8:.*]] = arith.muli %[[C0_4:.*]], %[[C0_9:.*]] : index + // CHECK-DAG: %[[T9:.*]] = arith.addi %[[T7]], %[[T8]] : index + // CHECK-DAG: %[[T10:.*]] = vector.broadcast %[[T9]] : index to vector<2x1xindex> + // CHECK-DAG: %[[T11:.*]] = arith.addi %[[CST]], %[[T10]] : vector<2x1xindex> + // CHECK-DAG: %[[T12:.*]] = arith.muli %[[T5]], %[[C16_10:.*]] : index + // CHECK-DAG: %[[T13:.*]] = arith.addi %[[C0_12:.*]], %[[T12]] : index + // CHECK-DAG: %[[T14:.*]] = arith.muli %[[C0_8:.*]], %[[C0_9:.*]] : index + // CHECK-DAG: %[[T15:.*]] = arith.addi %[[T13]], %[[T14]] : index + // CHECK-DAG: %[[T16:.*]] = vector.broadcast %[[T15]] : index to vector<2x1xindex> + // CHECK-DAG: %[[T17:.*]] = arith.addi %[[CST]], %[[T16]] : vector<2x1xindex> + %cst_2 = arith.constant {layout_result_0 = #xegpu.layout} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> + gpu.return + } + + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x128xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] : vector<32x16xf32> to vector<16x32xf32> + // CHECK-NOT: vector.transpose + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout} + : vector<256x128xf32> to vector<128x256xf32> + gpu.return + } + + // CHECK-LABEL: vector_mask_2D + gpu.func @vector_mask_2D() { + // CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1> + // CHECK-NOT: vector.create_mask + %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout} : vector<256x128xi1> + gpu.return + } + + gpu.func @vector_create_mask_2D() { + // CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1> + // CHECK-NOT: vector.create_mask + %cst16 = arith.constant 16 : index + %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout} : vector<256x128xi1> + gpu.return + } + + // CHECK-LABEL: distribute_shapecast_expandunitdims_broadcast + // CHECK: %[[CAST:.*]] = vector.shape_cast %[[REDUCE:.*]] : vector<8xf32> to vector<8x1xf32> + // CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<8x1xf32> to vector<8x128xf32> + gpu.func @distribute_shapecast_expandunitdims_broadcast(%arg0: memref<4096x128xf32>, %arg1: memref<4096x128xf32>) { + %cst_0 = arith.constant {layout_result_0=#xegpu.slice<#xegpu.layout, dims = [1]>} dense<0xFF800000> : vector<256xf32> + %block_id_x = gpu.block_id x + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> + %1 = xegpu.load_nd %0[%block_id_x, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<256x128xf32> + %2 = vector.multi_reduction , %1, %cst_0 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] : vector<256x128xf32> to vector<256xf32> + %3 = vector.shape_cast %2 {layout_result_0 = #xegpu.layout, layout_operand_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} : vector<256xf32> to vector<256x1xf32> + %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<256x1xf32>to vector<256x128xf32> + %9 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> + xegpu.store_nd %4, %9[%block_id_x, 0] : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> + gpu.return + } + + // CHECK-LABEL: gpu.func @reduction_cross_sg_rr + gpu.func @reduction_cross_sg_rr(%arg0: memref<2048xf32, 1>) kernel { + // CHECK: %[[CST_OFFSETS0:.*]] = arith.constant dense<0> : vector<4x16xindex> + // CHECK: %[[CST_OFFSETS1:.*]] = arith.constant dense<0> : vector<4x16xindex> + // CHECK: %[[CST_ACC0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> + // CHECK: %[[CST_ACC1:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> + // CHECK: %[[CST_MASK0:.*]] = arith.constant dense : vector<4x16xi1> + // CHECK: %[[CST_MASK1:.*]] = arith.constant dense : vector<4x16xi1> + // + // CHECK: %[[LOAD0:.*]] = xegpu.load %arg0[%[[CST_OFFSETS0]]], %[[CST_MASK0]] + // CHECK-SAME: -> vector<4x16xf32> + // CHECK: %[[LOAD1:.*]] = xegpu.load %arg0[%[[CST_OFFSETS1]]], %[[CST_MASK1]] + // CHECK-SAME: -> vector<4x16xf32> + // + // Local reductions + // CHECK: %[[NEUTRAL0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> + // CHECK: %[[LOCAL_RED0:.*]] = vector.multi_reduction , %[[LOAD0]], %[[NEUTRAL0]] [1] : vector<4x16xf32> to vector<4xf32> + // CHECK: %[[NEUTRAL1:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> + // CHECK: %[[LOCAL_RED1:.*]] = vector.multi_reduction , %[[LOAD1]], %[[NEUTRAL1]] [1] : vector<4x16xf32> to vector<4xf32> + // + // Shape cast for SLM store + // CHECK: %[[SC0:.*]] = vector.shape_cast %[[LOCAL_RED0]] : vector<4xf32> to vector<4x1xf32> + // CHECK: %[[SC1:.*]] = vector.shape_cast %[[LOCAL_RED1]] : vector<4xf32> to vector<4x1xf32> + // + // SLM allocation and mem_desc + // CHECK: %[[SLM:.*]] = memref.alloca() : memref<512xi8, 3> + // CHECK: %[[MEMDESC:.*]] = xegpu.create_mem_desc %[[SLM]] : memref<512xi8, 3> -> !xegpu.mem_desc<8x16xf32> + // + // Store to SLM + // CHECK: xegpu.store_matrix %[[SC0]], %[[MEMDESC]]{{.*}} : vector<4x1xf32>, !xegpu.mem_desc<8x16xf32> + // CHECK: xegpu.store_matrix %[[SC1]], %[[MEMDESC]]{{.*}} : vector<4x1xf32>, !xegpu.mem_desc<8x16xf32> + // CHECK: gpu.barrier + // + // Load from SLM + // CHECK: %[[SLM_LOAD0:.*]] = xegpu.load_matrix %[[MEMDESC]]{{.*}} -> vector<4x16xf32> + // CHECK: %[[SLM_LOAD1:.*]] = xegpu.load_matrix %[[MEMDESC]]{{.*}} -> vector<4x16xf32> + // + // Final reduction + // CHECK: %[[FINAL_NEUTRAL:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> + // CHECK: %[[FINAL_RED0:.*]] = vector.multi_reduction , %[[SLM_LOAD0]], %[[FINAL_NEUTRAL]] [1] : vector<4x16xf32> to vector<4xf32> + // CHECK: %[[RES0:.*]] = arith.addf %[[FINAL_RED0]], %[[CST_ACC0]] : vector<4xf32> + // CHECK: %[[FINAL_RED1:.*]] = vector.multi_reduction , %[[SLM_LOAD1]], %[[FINAL_NEUTRAL]] [1] : vector<4x16xf32> to vector<4xf32> + // CHECK: %[[RES1:.*]] = arith.addf %[[FINAL_RED1]], %[[CST_ACC1]] : vector<4xf32> + + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<8x256xindex> + %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.000000e+00> : vector<8xf32> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<8x256xi1> + %val = xegpu.load %arg0[%offset], %mask <{layout = #xegpu.layout}> : memref<2048xf32, 1>, vector<8x256xindex>, vector<8x256xi1> -> vector<8x256xf32> + %reduce = vector.multi_reduction , %val, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] : vector<8x256xf32> to vector<8xf32> + gpu.return + } + + // CHECK-LABEL: splat_constant + gpu.func @splat_constant() { + // CHECK-COUNT-2: %[[CST:.*]] = arith.constant dense<0> : vector<4xindex> + %cst_2 = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0> : vector<8xindex> + gpu.return + } + + // CHECK-LABEL: gpu.func @step_broadcast + gpu.func @step_broadcast() { + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index + // CHECK: %[[REM:.*]] = arith.remui %[[SGID]], %[[C16]] : index + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index + // CHECK: %[[STEP:.*]] = vector.step : vector<4xindex> + // CHECK: %[[BCST0:.*]] = vector.broadcast %[[C0:.*]] : index to vector<4xindex> + // CHECK: %[[ADD0:.*]] = arith.addi %[[STEP]], %[[BCST0]] : vector<4xindex> + // CHECK: %[[BCST4:.*]] = vector.broadcast %[[C4:.*]] : index to vector<4xindex> + // CHECK: %[[ADD4:.*]] = arith.addi %[[STEP]], %[[BCST4]] : vector<4xindex> + // CHECK: %[[RES0:.*]] = vector.broadcast %[[ADD0]] : vector<4xindex> to vector<16x4xindex> + // CHECK: %[[RES1:.*]] = vector.broadcast %[[ADD4]] : vector<4xindex> to vector<16x4xindex> + %2 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : vector<8xindex> + %bcast = vector.broadcast %2 {layout_result_0 = #xegpu.layout} : vector<8xindex> to vector<256x8xindex> + gpu.return + } + + // CHECK-LABEL: create_nd_tdesc_with_shared_data // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - // CHECK-NOT: xegpu.prefetch_nd - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - xegpu.prefetch_nd %tdesc - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) { + // CHECK: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> gpu.return } // CHECK-LABEL: broadcast // CHECK-SAME: %[[ARG_0:.*]]: memref<128x1xf32> gpu.func @broadcast(%src: memref<128x1xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<128x1xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<128x1xf32> -> !xegpu.tensor_desc<128x1xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x1xf32, #xegpu.layout> -> vector<128x1xf32> // CHECK-COUNT-4: vector.broadcast {{.*}} : vector<16x1xf32> to vector<16x32xf32> @@ -130,19 +274,12 @@ gpu.module @test_round_robin_assignment { %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index %c1024 = arith.constant 1024 : index - %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK-LABEL: scf.for - // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) - %2:2 = scf.for %arg2 = %c0 to %c1024 step %c256 iter_args(%arg3 = %0, %arg4 = %1) - -> (!xegpu.tensor_desc<256xf32, #xegpu.layout>, !xegpu.tensor_desc<256xf32, #xegpu.layout>) { - %3 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> - xegpu.store_nd %3, %arg3 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> - %4 = xegpu.update_nd_offset %arg3, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> - %5 = xegpu.update_nd_offset %arg4, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> - // CHECK-LABEL: scf.yield - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> - scf.yield %4, %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout>, !xegpu.tensor_desc<256xf32, #xegpu.layout> + scf.for %arg2 = %c0 to %c1024 step %c256 { + %3 = xegpu.load_nd %0[%arg2] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + xegpu.store_nd %3, %1[%arg2] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> } gpu.return } @@ -151,9 +288,10 @@ gpu.module @test_round_robin_assignment { %c1_i32 = arith.constant 1 : i32 %c10_i32 = arith.constant 10 : i32 %c0_i32 = arith.constant 0 : i32 - %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> - %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) { %4 = arith.cmpi slt, %arg3, %c10_i32 : i32 @@ -162,10 +300,9 @@ gpu.module @test_round_robin_assignment { } do { // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32) ^bb0(%arg2: vector<256xf32>, %arg3: i32): - xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> + xegpu.store_nd %arg2, %2[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> %4 = arith.addi %arg3, %c1_i32 : i32 - %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> - %6 = xegpu.load_nd %5 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %6 = xegpu.load_nd %0[%c256] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> scf.yield %6, %4 : vector<256xf32>, i32 } gpu.return @@ -174,23 +311,23 @@ gpu.module @test_round_robin_assignment { gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { %c10 = arith.constant 10 : index %0 = gpu.subgroup_id : index - %1 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> %3 = arith.cmpi eq, %0, %c10 : index // CHECK-LABEL: scf.if // CHECK-SAME: (vector<16xf32>, vector<16xf32>) %4 = scf.if %3 -> (vector<256xf32>) { - %5 = xegpu.load_nd %1 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %5 = xegpu.load_nd %1[0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } else { - %5 = xegpu.load_nd %2 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %5 = xegpu.load_nd %2[0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } {layout_result_0 = #xegpu.layout} - xegpu.store_nd %4, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> + xegpu.store_nd %4, %1[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> gpu.return } @@ -198,34 +335,35 @@ gpu.module @test_round_robin_assignment { %c10 = arith.constant 10 : index %id = gpu.subgroup_id : index - %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %d = xegpu.load_nd %t {layout = #xegpu.layout}: !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %t = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %d = xegpu.load_nd %t[0] {layout = #xegpu.layout}: !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> %0 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout>) { - %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %2 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK-LABEL: scf.yield // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout> } else { - %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %3 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK-LABEL: scf.yield // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout> } - xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> + xegpu.store_nd %d, %1[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> gpu.return } gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) { - %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout> // CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> // CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf32> - %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<32x64xf32, #xegpu.layout> -> vector<32x64xf32> + %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<32x64xf32, #xegpu.layout> -> vector<32x64xf32> %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x64xf32> gpu.return } + } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir deleted file mode 100644 index 897eab12329e2..0000000000000 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ /dev/null @@ -1,246 +0,0 @@ -// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s - -gpu.module @test_distribution { - // CHECK-LABEL: create_nd_tdesc_no_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @create_nd_tdesc_no_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - // CHECK-NOT: xegpu.create_nd_tdesc - %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: load_nd_tdesc_with_offset - gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> - // CHECK-NOT: xegpu.load_nd - %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - gpu.return - } - - // CHECK-LABEL: store_nd_with_offset - gpu.func @store_nd_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - // CHECK-NOT: xegpu.store_nd - %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout} - : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: prefetch_nd_tdesc_with_offset - gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - // CHECK-NOT: xegpu.prefetch_nd - %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - xegpu.prefetch_nd %tdesc[0, 0] - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: dpas - // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) - gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> - // CHECK-NOT: xegpu.dpas - %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> - -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf16, #xegpu.layout> - -> vector<256x128xf16> - %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x256xf16> - -> !xegpu.tensor_desc<128x256xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<128x256xf16, #xegpu.layout> - -> vector<128x256xf16> - %dpas = xegpu.dpas %load_a, %load_b - {layout_a = #xegpu.layout, - layout_b = #xegpu.layout, - layout_cd = #xegpu.layout} - : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> - gpu.return - } - - // CHECK-LABEL: vector_reduce_dim_1 - gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) { - // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32> - %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> - %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32> - -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> - -> vector<256x64xf32> - // CHECK-COUNT-2: vector.multi_reduction , {{.*}}, %[[C0:.*]] [1] : vector<16x64xf32> to vector<16xf32> - // CHECK-NOT: vector.multi_reduction - // CHECK-COUNT-2: arith.addf {{.*}}, {{.*}} : vector<16xf32> - // CHECK-NOT: arith.addf - %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] - : vector<256x64xf32> to vector<256xf32> - gpu.return - } - - // CHECK-LABEL: non_splat_constant - gpu.func @non_splat_constant() { - // CHECK-DAG: %[[CST:.*]] = arith.constant dense<{{.*}}0{{.*}}, {{.*}}16{{.*}}> : vector<2x1xindex> - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[T1:.*]] = arith.remui %[[SGID]], %[[C8:.*]] : index - // CHECK-DAG: %[[T2:.*]] = arith.muli %[[T1]], %[[C2:.*]] : index - // CHECK-DAG: %[[T3:.*]] = arith.remui %[[T2]], %[[C32:.*]] : index - // CHECK-DAG: %[[T4:.*]] = arith.addi %[[T2]], %[[C16:.*]] : index - // CHECK-DAG: %[[T5:.*]] = arith.remui %[[T4]], %[[C32_6:.*]] : index - // CHECK-DAG: %[[T6:.*]] = arith.muli %[[T3]], %[[C16_10:.*]] : index - // CHECK-DAG: %[[T7:.*]] = arith.addi %[[C0_11:.*]], %[[T6]] : index - // CHECK-DAG: %[[T8:.*]] = arith.muli %[[C0_4:.*]], %[[C0_9:.*]] : index - // CHECK-DAG: %[[T9:.*]] = arith.addi %[[T7]], %[[T8]] : index - // CHECK-DAG: %[[T10:.*]] = vector.broadcast %[[T9]] : index to vector<2x1xindex> - // CHECK-DAG: %[[T11:.*]] = arith.addi %[[CST]], %[[T10]] : vector<2x1xindex> - // CHECK-DAG: %[[T12:.*]] = arith.muli %[[T5]], %[[C16_10:.*]] : index - // CHECK-DAG: %[[T13:.*]] = arith.addi %[[C0_12:.*]], %[[T12]] : index - // CHECK-DAG: %[[T14:.*]] = arith.muli %[[C0_8:.*]], %[[C0_9:.*]] : index - // CHECK-DAG: %[[T15:.*]] = arith.addi %[[T13]], %[[T14]] : index - // CHECK-DAG: %[[T16:.*]] = vector.broadcast %[[T15]] : index to vector<2x1xindex> - // CHECK-DAG: %[[T17:.*]] = arith.addi %[[CST]], %[[T16]] : vector<2x1xindex> - %cst_2 = arith.constant {layout_result_0 = #xegpu.layout} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> - gpu.return - } - - // CHECK-LABEL: vector_transpose - gpu.func @vector_transpose(%src: memref<256x128xf32>) { - %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] : vector<32x16xf32> to vector<16x32xf32> - // CHECK-NOT: vector.transpose - %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout} - : vector<256x128xf32> to vector<128x256xf32> - gpu.return - } - - // CHECK-LABEL: vector_mask_2D - gpu.func @vector_mask_2D() { - // CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1> - // CHECK-NOT: vector.create_mask - %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout} : vector<256x128xi1> - gpu.return - } - - gpu.func @vector_create_mask_2D() { - // CHECK-COUNT-4: vector.create_mask {{.*}}, {{.*}} : vector<16x16xi1> - // CHECK-NOT: vector.create_mask - %cst16 = arith.constant 16 : index - %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout} : vector<256x128xi1> - gpu.return - } - - // CHECK-LABEL: distribute_shapecast_expandunitdims_broadcast - // CHECK: %[[CAST:.*]] = vector.shape_cast %[[REDUCE:.*]] : vector<8xf32> to vector<8x1xf32> - // CHECK: %[[BCAST:.*]] = vector.broadcast %[[CAST]] : vector<8x1xf32> to vector<8x128xf32> - gpu.func @distribute_shapecast_expandunitdims_broadcast(%arg0: memref<4096x128xf32>, %arg1: memref<4096x128xf32>) { - %cst_0 = arith.constant {layout_result_0=#xegpu.slice<#xegpu.layout, dims = [1]>} dense<0xFF800000> : vector<256xf32> - %block_id_x = gpu.block_id x - %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.load_nd %0[%block_id_x, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<256x128xf32> - %2 = vector.multi_reduction , %1, %cst_0 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] : vector<256x128xf32> to vector<256xf32> - %3 = vector.shape_cast %2 {layout_result_0 = #xegpu.layout, layout_operand_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} : vector<256xf32> to vector<256x1xf32> - %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<256x1xf32>to vector<256x128xf32> - %9 = xegpu.create_nd_tdesc %arg0 : memref<4096x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - xegpu.store_nd %4, %9[%block_id_x, 0] : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: gpu.func @reduction_cross_sg_rr - gpu.func @reduction_cross_sg_rr(%arg0: memref<2048xf32, 1>) kernel { - // CHECK: %[[CST_OFFSETS0:.*]] = arith.constant dense<0> : vector<4x16xindex> - // CHECK: %[[CST_OFFSETS1:.*]] = arith.constant dense<0> : vector<4x16xindex> - // CHECK: %[[CST_ACC0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> - // CHECK: %[[CST_ACC1:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> - // CHECK: %[[CST_MASK0:.*]] = arith.constant dense : vector<4x16xi1> - // CHECK: %[[CST_MASK1:.*]] = arith.constant dense : vector<4x16xi1> - // - // CHECK: %[[LOAD0:.*]] = xegpu.load %arg0[%[[CST_OFFSETS0]]], %[[CST_MASK0]] - // CHECK-SAME: -> vector<4x16xf32> - // CHECK: %[[LOAD1:.*]] = xegpu.load %arg0[%[[CST_OFFSETS1]]], %[[CST_MASK1]] - // CHECK-SAME: -> vector<4x16xf32> - // - // Local reductions - // CHECK: %[[NEUTRAL0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> - // CHECK: %[[LOCAL_RED0:.*]] = vector.multi_reduction , %[[LOAD0]], %[[NEUTRAL0]] [1] : vector<4x16xf32> to vector<4xf32> - // CHECK: %[[NEUTRAL1:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> - // CHECK: %[[LOCAL_RED1:.*]] = vector.multi_reduction , %[[LOAD1]], %[[NEUTRAL1]] [1] : vector<4x16xf32> to vector<4xf32> - // - // Shape cast for SLM store - // CHECK: %[[SC0:.*]] = vector.shape_cast %[[LOCAL_RED0]] : vector<4xf32> to vector<4x1xf32> - // CHECK: %[[SC1:.*]] = vector.shape_cast %[[LOCAL_RED1]] : vector<4xf32> to vector<4x1xf32> - // - // SLM allocation and mem_desc - // CHECK: %[[SLM:.*]] = memref.alloca() : memref<512xi8, 3> - // CHECK: %[[MEMDESC:.*]] = xegpu.create_mem_desc %[[SLM]] : memref<512xi8, 3> -> !xegpu.mem_desc<8x16xf32> - // - // Store to SLM - // CHECK: xegpu.store_matrix %[[SC0]], %[[MEMDESC]]{{.*}} : vector<4x1xf32>, !xegpu.mem_desc<8x16xf32> - // CHECK: xegpu.store_matrix %[[SC1]], %[[MEMDESC]]{{.*}} : vector<4x1xf32>, !xegpu.mem_desc<8x16xf32> - // CHECK: gpu.barrier - // - // Load from SLM - // CHECK: %[[SLM_LOAD0:.*]] = xegpu.load_matrix %[[MEMDESC]]{{.*}} -> vector<4x16xf32> - // CHECK: %[[SLM_LOAD1:.*]] = xegpu.load_matrix %[[MEMDESC]]{{.*}} -> vector<4x16xf32> - // - // Final reduction - // CHECK: %[[FINAL_NEUTRAL:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> - // CHECK: %[[FINAL_RED0:.*]] = vector.multi_reduction , %[[SLM_LOAD0]], %[[FINAL_NEUTRAL]] [1] : vector<4x16xf32> to vector<4xf32> - // CHECK: %[[RES0:.*]] = arith.addf %[[FINAL_RED0]], %[[CST_ACC0]] : vector<4xf32> - // CHECK: %[[FINAL_RED1:.*]] = vector.multi_reduction , %[[SLM_LOAD1]], %[[FINAL_NEUTRAL]] [1] : vector<4x16xf32> to vector<4xf32> - // CHECK: %[[RES1:.*]] = arith.addf %[[FINAL_RED1]], %[[CST_ACC1]] : vector<4xf32> - - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<8x256xindex> - %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.000000e+00> : vector<8xf32> - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<8x256xi1> - %val = xegpu.load %arg0[%offset], %mask <{layout = #xegpu.layout}> : memref<2048xf32, 1>, vector<8x256xindex>, vector<8x256xi1> -> vector<8x256xf32> - %reduce = vector.multi_reduction , %val, %acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] : vector<8x256xf32> to vector<8xf32> - gpu.return - } - - // CHECK-LABEL: splat_constant - gpu.func @splat_constant() { - // CHECK-COUNT-2: %[[CST:.*]] = arith.constant dense<0> : vector<4xindex> - %cst_2 = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0> : vector<8xindex> - gpu.return - } - - // CHECK-LABEL: gpu.func @step_broadcast - gpu.func @step_broadcast() { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index - // CHECK: %[[REM:.*]] = arith.remui %[[SGID]], %[[C16]] : index - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index - // CHECK: %[[STEP:.*]] = vector.step : vector<4xindex> - // CHECK: %[[BCST0:.*]] = vector.broadcast %[[C0:.*]] : index to vector<4xindex> - // CHECK: %[[ADD0:.*]] = arith.addi %[[STEP]], %[[BCST0]] : vector<4xindex> - // CHECK: %[[BCST4:.*]] = vector.broadcast %[[C4:.*]] : index to vector<4xindex> - // CHECK: %[[ADD4:.*]] = arith.addi %[[STEP]], %[[BCST4]] : vector<4xindex> - // CHECK: %[[RES0:.*]] = vector.broadcast %[[ADD0]] : vector<4xindex> to vector<16x4xindex> - // CHECK: %[[RES1:.*]] = vector.broadcast %[[ADD4]] : vector<4xindex> to vector<16x4xindex> - %2 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : vector<8xindex> - %bcast = vector.broadcast %2 {layout_result_0 = #xegpu.layout} : vector<8xindex> to vector<256x8xindex> - gpu.return - } - -} diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir deleted file mode 100644 index c3eb59adee2a6..0000000000000 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ /dev/null @@ -1,987 +0,0 @@ -// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -gpu.module @test_distribution { - // CHECK-LABEL: create_nd_tdesc_no_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @create_nd_tdesc_no_offset(%src: memref<256x128xf32>) { - // CHECK: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: create_nd_tdesc_with_ptr - // CHECK-SAME: %[[ARG_0:.*]]: ui64 - gpu.func @create_nd_tdesc_with_ptr(%src: ui64, %w : index, %h : index, %x : index, %y : index) { - // CHECK: xegpu.create_nd_tdesc %[[ARG_0]], shape : [{{.*}}, {{.*}}], strides : [{{.*}}, {{.*}}] : ui64 - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %c1 = arith.constant 1 : index - %tdesc = xegpu.create_nd_tdesc %src, shape:[%h, %w], strides: [%w, %c1] : ui64 - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: load_nd_tdesc_with_offset - gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - //CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - //CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index - //CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C4]] - //CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C4]] - //CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index - //CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C8]] - //CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index - //CHECK-DAG: %[[L_OFF_Y:.*]] = arith.muli %[[SGIDY]], %[[C32]] : index - //CHECK-DAG: %[[L_OFF_X:.*]] = arith.muli %[[SGIDX]], %[[C32_1:.*]] : index - //CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index - //CHECK-DAG: %[[OFF_Y:.*]] = arith.remui %[[L_OFF_Y]], %[[C256]] : index - //CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index - //CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[L_OFF_X]], %[[C128]] : index - //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> - %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - gpu.return - } - - // CHECK-LABEL: store_nd_with_offsets - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @store_nd_with_offsets(%src: memref<256x128xf32>) { - //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout} - : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return -} - - // CHECK-LABEL: prefetch_nd_tdesc_with_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %cst0 = arith.constant 0 : index - %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - xegpu.prefetch_nd %tdesc[%cst0, %cst0] - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: dpas - gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> - %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> - -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - -> vector<128x128xf16> - %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> - -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - -> vector<128x128xf16> - %dpas = xegpu.dpas %load_a, %load_b - {layout_a = #xegpu.layout, - layout_b = #xegpu.layout, - layout_cd = #xegpu.layout} - : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> - gpu.return - } - - // CHECK-LABEL: dpas_no_sg_data - gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> - %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> - -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - -> vector<128x128xf16> - %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> - -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout } - : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - -> vector<128x128xf16> - %dpas = xegpu.dpas %load_a, %load_b - {layout_a = #xegpu.layout, - layout_b = #xegpu.layout, - layout_cd = #xegpu.layout} - : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> - gpu.return - } - - // CHECK-LABEL: broadcast_dim1 - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32> - gpu.func @broadcast_dim1(%src: memref<256x1xf32>) { - %tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32> - -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x1xf32, #xegpu.layout> - -> vector<256x1xf32> - // CHECK: vector.broadcast {{.*}} : vector<32x1xf32> to vector<32x32xf32> - %broadcast = vector.broadcast %load - {layout_result_0 = #xegpu.layout} - : vector<256x1xf32> to vector<256x32xf32> - gpu.return - } - - // CHECK-LABEL: broadcast_dim0 - // CHECK-SAME: %[[ARG_0:.*]]: memref<1x128xf32> - gpu.func @broadcast_dim0(%src: memref<1x128xf32>) { - %tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32> - -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<1x128xf32, #xegpu.layout> - -> vector<1x128xf32> - // CHECK: vector.broadcast {{.*}} : vector<1x32xf32> to vector<32x32xf32> - %broadcast = vector.broadcast %load - {layout_result_0 = #xegpu.layout} - : vector<1x128xf32> to vector<32x128xf32> - gpu.return - } - - // CHECK-LABEL: gemm_with_load_store_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<1024x1024xf16>, %[[ARG_1:.*]]: memref<1024x1024xf16>, %[[ARG_2:.*]]: memref<1024x1024xf32> - gpu.func @gemm_with_load_store_offset(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) { - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[c1024:%.+]] = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %block_id_x = gpu.block_id x - %block_id_y = gpu.block_id y - %0 = arith.muli %block_id_x, %c128 : index - %1 = arith.muli %block_id_y, %c128 : index - %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - // CHECK: [[DESC_A:%.+]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x128xf16> - // CHECK: [[DESC_B:%.+]] = xegpu.create_nd_tdesc %[[ARG_1]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x16xf16> - %3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - // load_nd with offset - %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> - %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - // scf.for loop - // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] - // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> - // CHECK-SAME: (vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32>) - // CHECK: [[c:%.+]] = xegpu.dpas [[arg4]], [[arg5]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> - // CHECK: [[a:%.+]] = xegpu.load_nd [[DESC_A]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> - // CHECK: [[b:%.+]] = xegpu.load_nd [[DESC_B]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> - // CHECK: scf.yield [[a]], [[b]], [[c]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> - %8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5) - -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) { - // load_nd with offset inside loop - %9 = xegpu.dpas %arg4, %arg5, %arg6 - {layout_a = #xegpu.layout, - layout_b = #xegpu.layout, - layout_cd = #xegpu.layout} - : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> - %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> - } {layout_result_0 = #xegpu.layout, - layout_result_1 = #xegpu.layout, - layout_result_2 = #xegpu.layout} - // store_nd with offset - xegpu.store_nd %8#2, %2[%0, %1] {layout = #xegpu.layout} : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: @subgroup_id_range - gpu.func @subgroup_id_range(%src: memref<256x128xf32>, %src1: memref<128x256xf32>, %src2: memref<128x64xf32>) { - %sg_id = gpu.subgroup_id : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c31 = arith.constant 31 : index - %c3 = arith.constant 3 : index - %cond1 = arith.cmpi sge, %sg_id, %c0 : index - %cond2 = arith.cmpi slt, %sg_id, %c1 : index - %cond = arith.andi %cond1, %cond2 : i1 - scf.if %cond { - // CHECK-NOT: index.sub - %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - } {sg_id_range = #xegpu.range<[0, 32]>} - %cond3 = arith.cmpi sge, %sg_id, %c2 : index - %cond4 = arith.cmpi slt, %sg_id, %c31 : index - %cond5 = arith.andi %cond3, %cond4 : i1 - scf.if %cond5 { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK: %[[C2:.*]] = arith.constant 2 : index - // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] - %tdesc = xegpu.create_nd_tdesc %src2 : memref<128x64xf32> - -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - -> vector<128x64xf32> - %exp = math.exp %load {layout_result_0 = #xegpu.layout} : vector<128x64xf32> - }{sg_id_range = #xegpu.range<[2, 18]>} - gpu.return - } - - // CHECK-LABEL: @subgroup_id_range_nested_if - gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) { - %sg_id = gpu.subgroup_id : index - %c1 = arith.constant 1 : i1 - %c3 = arith.constant 3 : index - %c32 = arith.constant 32 : index - %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - %cond1 = arith.cmpi sge, %sg_id, %c3 : index - %cond2 = arith.cmpi slt, %sg_id, %c32 : index - %cond = arith.andi %cond1, %cond2 : i1 - scf.if %c1 { - scf.if %cond { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK: %[[C3:.*]] = arith.constant 3 : index - // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]] - %td = xegpu.create_nd_tdesc %src1 : memref<128x64xf32> - -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %ld = xegpu.load_nd %td[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - -> vector<128x64xf32> - %exp = math.exp %ld {layout_result_0 = #xegpu.layout} : vector<128x64xf32> - } - } {sg_id_range = #xegpu.range<[3, 19]>} - gpu.return - } - - // CHECK-LABEL: @load_gather - // CHECK-SAME: %[[ARG0:.*]]: memref - gpu.func @load_gather(%src : memref) { - // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x4xindex> - // CHECK: %[[MASK:.*]] = arith.constant dense : vector<32x4xi1> - // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> - // CHECK-SAME: : memref, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256x16xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256x16xi1> - %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} - : memref, vector<256x16xindex>, vector<256x16xi1> -> vector<256x16xf16> - gpu.return - } - - // CHECK-LABEL: @store_scatter - // CHECK-SAME: %[[ARG0:.*]]: memref<256xf16> - gpu.func @store_scatter(%dest : memref<256xf16>) { - // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16> - // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex> - // CHECK: %[[MASK:.*]] = arith.constant dense : vector<8xi1> - // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint, layout = #xegpu.layout}> - // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> - %val = arith.constant {layout_result_0 = #xegpu.layout} dense<25.5> : vector<256xf16> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256xi1> - xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout, - layout_operand_2 = #xegpu.layout, - layout_operand_3 = #xegpu.layout, - l1_hint = #xegpu.cache_hint} - : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1> - gpu.return - } - - // CHECK-LABEL: @load_with_non_unit_chunk_size - // CHECK-SAME: %[[ARG0:.*]]: memref - gpu.func @load_with_non_unit_chunk_size(%src : memref) { - // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex> - // CHECK: %[[MASK:.*]] = arith.constant dense : vector<8xi1> - // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint}> - // CHECK-SAME: : memref, vector<8xindex>, vector<8xi1> -> vector<8x4xf16> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256xi1> - %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} - : memref, vector<256xindex>, vector<256xi1> -> vector<256x4xf16> - gpu.return - } - - // CHECK-LABEL: distribute_load_matrix - // CHECK-SAME: [[arg0:%.+]]: memref<32768xi8, 3> - gpu.func @distribute_load_matrix(%arg0: memref<32768xi8, 3>) { - //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> - //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[sgidx:%.+]] = arith.remui [[sgid]], [[c4]] : index - //CHECK: [[sgidy_tmp:%.+]] = arith.divui [[sgid]], [[c4]] : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index - //CHECK: [[sgidy:%.+]] = arith.remui [[sgidy_tmp]], [[c2]] : index - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = arith.muli [[sgidy]], [[c32]] : index - //CHECK: [[c32_0:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = arith.muli [[sgidx]], [[c32_0]] : index - //CHECK: [[c64:%.+]] = arith.constant 64 : index - //CHECK: [[off_y:%.+]] = arith.remui [[l_off_y]], [[c64]] : index - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[off_x:%.+]] = arith.remui [[l_off_x]], [[c128]] : index - //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32> - %0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> - %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32> - gpu.return - } - - //CHECK-LABEL: distribute_store_matrix - //CHECK-SAME: [[arg0:%.+]]: memref<32768xi8, 3> - gpu.func @distribute_store_matrix(%arg0 : memref<32768xi8, 3>) { - //CHECK: [[cst:%.+]] = arith.constant dense<1.000000e+00> : vector<32x32xf32> - //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> - //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[sgidx:%.+]] = arith.remui [[sgid]], [[c4]] : index - //CHECK: [[sgidy_tmp:%.+]] = arith.divui [[sgid]], [[c4]] : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index - //CHECK: [[sgidy:%.+]] = arith.remui [[sgidy_tmp]], [[c2]] : index - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = arith.muli [[sgidy]], [[c32]] : index - //CHECK: [[c32_0:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = arith.muli [[sgidx]], [[c32_0]] : index - //CHECK: [[c64:%.+]] = arith.constant 64 : index - //CHECK: [[off_y:%.+]] = arith.remui [[l_off_y]], [[c64]] : index - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[off_x:%.+]] = arith.remui [[l_off_x]], [[c128]] : index - //CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.0> : vector<64x128xf32> - %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> - xegpu.store_matrix %cst, %mdesc[0, 0] {layout = #xegpu.layout} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32> - gpu.return - } - - // CHECK-LABEL: @vector_reduce_dim_0 - gpu.func @vector_reduce_dim_0(%src: memref<4x128xf32>) { - %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<1.0> : vector<128xf32> - %tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32> - -> !xegpu.tensor_desc<4x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<4x128xf32, #xegpu.layout> - -> vector<4x128xf32> - // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32> - %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] - : vector<4x128xf32> to vector<128xf32> - gpu.return - } - - // CHECK-LABEL: @vector_reduce_dim_1 - gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) { - %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> - %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32> - -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> - -> vector<256x64xf32> - // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32> - %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] - : vector<256x64xf32> to vector<256xf32> - gpu.return - } - - // CHECK-LABEL: @vector_reduce_4D - gpu.func @vector_reduce_4D(%src: ui64) { - %cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} dense<0.0> : vector<4x2x6xf16> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<4x2x6x32xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<4x2x6x32xi1> - %load = xegpu.load %src[%offset], %mask {layout = #xegpu.layout} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16> - // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16> - %reduce = vector.multi_reduction , %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} [3] - : vector<4x2x6x32xf16> to vector<4x2x6xf16> - gpu.return - } - - // CHECK-LABEL: gpu.func @vector_reduce_scalar_cross_sg - // CHECK-SAME: (%[[ARG0:.*]]: memref<32x32xf32>) - // CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 - // CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<8x8xf32> -> vector<8x8xf32> - // CHECK-DAG: %[[CST_ACC:.*]] = arith.constant 0.000000e+00 : f32 - // CHECK-DAG: %[[LOCAL:.*]] = vector.multi_reduction , %[[LOAD]], %[[CST_ACC]] [0, 1] : vector<8x8xf32> to f32 - // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[LOCAL]] : f32 to vector<1x1xf32> - // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<64xi8, 3> - // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<64xi8, 3> -> !xegpu.mem_desc<4x4xf32> - // CHECK-DAG: xegpu.store_matrix %[[BCAST]], %[[MEM_DESC]]{{.*}} : vector<1x1xf32>, !xegpu.mem_desc<4x4xf32> - // CHECK-DAG: gpu.barrier - // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} -> vector<4x4xf32> - // CHECK-DAG: %[[CST_FINAL:.*]] = arith.constant 0.000000e+00 : f32 - // CHECK-DAG: %[[FINAL:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_FINAL]] [0, 1] : vector<4x4xf32> to f32 - // CHECK-DAG: arith.addf %[[FINAL]], %[[CST]] : f32 - gpu.func @vector_reduce_scalar_cross_sg(%src: memref<32x32xf32>) { - %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} 0.0 : f32 - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<32x32xf32> - -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - -> vector<32x32xf32> - %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} [0, 1] - : vector<32x32xf32> to f32 - gpu.return - } - - // CHECK-LABEL: vector_step_op - gpu.func @vector_step_op_slice_attr() { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[c8:%.+]] = arith.constant 8 : index - //CHECK: [[sgidx:%.+]] = arith.remui [[sgId]], [[c8]] : index - //CHECK: [[sgidy_tmp:%.+]] = arith.divui [[sgId]], [[c8]] : index - //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[sgidy:%.+]] = arith.remui [[sgidy_tmp]], [[c4]] : index - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LY:%.+]] = arith.muli [[sgidy]], [[c32]] : index - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = arith.remui [[LY]], [[c128]] : index - //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> - //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> - %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}: vector<128xindex> - gpu.return - } - - gpu.func @vector_step_op_layout_attr() { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[c16:%.+]] = arith.constant 16 : index - //CHECK: [[sgidx:%.+]] = arith.remui [[sgId]], [[c16]] : index - //CHECK: [[c8:%.+]] = arith.constant 8 : index - //CHECK: [[LOCALY:%.+]] = arith.muli [[sgidx]], [[c8]] : index - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = arith.remui [[LOCALY]], [[c128]] : index - //CHECK: [[BASE:%.+]] = vector.step : vector<8xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> - //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex> - %step = vector.step {layout_result_0 = #xegpu.layout}: vector<128xindex> - gpu.return - } - - // CHECK-LABEL: constant_with_slice_attr - gpu.func @constant_with_slice_attr() { - //CHECK: [[cst:%.+]] = arith.constant dense<10> : vector<1xindex> - %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1, 2, 3]>} dense<10> : vector<4xindex> - gpu.return - } - - // CHECK-LABEL: vector_shape_cast - gpu.func @vector_shape_cast() { - %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1, 2]>} dense<10> : vector<128xindex> - %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1, 2]>} : vector<128xindex> - %muli = arith.muli %cst, %step {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1, 2]>} : vector<128xindex> - //CHECK: vector.shape_cast {{.*}} : vector<32xindex> to vector<1x1x1x32xindex> - %shape_cast = vector.shape_cast %muli {layout_result_0 = #xegpu.layout, layout_operand_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1, 2]>} : vector<128xindex> to vector<1x1x1x128xindex> - gpu.return - } - - // CHECK-LABEL: vector_broadcast - gpu.func @vector_broadcast(%arg0: index, %arg1: index) { - %muli = arith.muli %arg0, %arg1 : index - // CHECK: vector.broadcast {{.*}} : index to vector<1x1x1x32xindex> - %broadcast = vector.broadcast %muli {layout_result_0 = #xegpu.layout} : index to vector<4x2x6x32xindex> - gpu.return - } - - // CHECK-LABEL: vector_transpose - gpu.func @vector_transpose(%src: memref<256x32xf32>) { - %tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32> - -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x32xf32, #xegpu.layout> - -> vector<256x32xf32> - //CHECK: vector.transpose {{.*}}, [1, 0] : vector<64x32xf32> to vector<32x64xf32> - %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout} - : vector<256x32xf32> to vector<32x256xf32> - gpu.return - } - - // CHECK-LABEL: non_splat_constant_2D - gpu.func @non_splat_constant_2D() { - // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x1xindex> - // CHECK-DAG: %[[T0:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[T1:.*]] = arith.remui %[[T0]], %[[C32:.*]] : index - // CHECK-DAG: %[[T2:.*]] = arith.remui %[[T1]], %[[C32_4:.*]] : index - // CHECK-DAG: %[[T3:.*]] = arith.muli %[[T2]], %[[C16:.*]] : index - // CHECK-DAG: %[[T4:.*]] = arith.addi %[[C0_8:.*]], %[[T3]] : index - // CHECK-DAG: %[[T5:.*]] = arith.muli %[[C0_6:.*]], %[[C0_7:.*]] : index - // CHECK-DAG: %[[T6:.*]] = arith.addi %[[T4]], %[[T5]] : index - // CHECK-DAG: %[[T7:.*]] = vector.broadcast %[[T6]] : index to vector<1x1xindex> - // CHECK-DAG: %[[T8:.*]] = arith.addi %[[CST]], %[[T7]] : vector<1x1xindex> - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> - gpu.return - } - - // CHECK-LABEL: non_splat_constant_2D_non_unit_dim - gpu.func @non_splat_constant_2D_non_unit_dim() { - // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{\[}}{{\[}}0, 16{{\]}}, {{\[}}8, 24{{\]}}{{\]}}> : vector<2x2xindex> - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %{{.*}} - // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %{{.*}} - // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %{{.*}} - // CHECK-DAG: %[[MULY:.*]] = arith.muli %[[SGIDY]], %[[C2:.*]] : index - // CHECK-DAG: %[[MULX:.*]] = arith.muli %[[SGIDX]], %{{.*}} : index - // CHECK-DAG: %[[REMU_Y:.*]] = arith.remui %[[MULY]], %[[C8:.*]] : index - // CHECK-DAG: %[[REMU_X:.*]] = arith.remui %[[MULX]], %{{.*}} : index - // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %{{.*}} : index - // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[MUL5]] : index - // CHECK-DAG: %[[MUL6:.*]] = arith.muli %[[REMU_X]], %[[C16:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index - // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<2x2xindex> - // CHECK-DAG: %[[ADDCST:.*]] = arith.addi %[[BASECST]], %[[BCAST]] : vector<2x2xindex> - %cst_8x8 = arith.constant {layout_result_0 = #xegpu.layout} dense<[ - [0, 16, 32, 48, 64, 80, 96, 112], - [8, 24, 40, 56, 72, 88, 104, 120], - [16, 32, 48, 64, 80, 96, 112, 128], - [24, 40, 56, 72, 88, 104, 120, 136], - [32, 48, 64, 80, 96, 112, 128, 144], - [40, 56, 72, 88, 104, 120, 136, 152], - [48, 64, 80, 96, 112, 128, 144, 160], - [56, 72, 88, 104, 120, 136, 152, 168] - ]> : vector<8x8xindex> - gpu.return - } - - // CHECK-LABEL: non_splat_constant - gpu.func @non_splat_constant() { - // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex> - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[SGID]], %{{.*}} - // CHECK-DAG: %[[REMU2:.*]] = arith.remui %[[REMU]], %{{.*}} - // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C16:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index - // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex> - // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[CST]], %[[BCAST]] : vector<1xindex> - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex> - // CHECK: arith.constant dense<{{\[}}{{\[}}0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15{{\]}}{{\]}}> : vector<1x16xindex> - %cst_1 = arith.constant {layout_result_0 = #xegpu.layout} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex> - gpu.return - } - - // CHECK-LABEL: scalar_broadcast - gpu.func @scalar_broadcast(%arg0: index) { - // CHECK: vector.broadcast {{.*}} : index to vector<1x1x1xindex> - %broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout} : index to vector<4x1x1xindex> - gpu.return - } - - // CHECK-LABEL: vector_mask_1D - gpu.func @vector_mask_1D() { - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[SGID]], %[[C2:.*]] - // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] : index - // CHECK-DAG: %[[REMU2:.*]] = arith.remui %[[MUL]], %[[C32:.*]] : index - // CHECK-DAG: %[[SUB:.*]] = arith.subi %[[C8:.*]], %[[REMU2]] : index - // CHECK-DAG: %[[MAX:.*]] = arith.maxsi %[[SUB]], %[[C0:.*]] : index - // CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index - // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1> - %constant_mask = vector.constant_mask [8] {layout_result_0 = #xegpu.layout} : vector<32xi1> - gpu.return - } - - // CHECK-LABEL: vector_mask_2D - gpu.func @vector_mask_2D() { - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C8:.*]] - // CHECK-DAG: %[[ROW:.*]] = arith.muli %[[SGIDY]], %[[C32:.*]] : index - // CHECK-DAG: %[[COL:.*]] = arith.muli %[[SGIDX]], %[[C32:.*]] : index - // CHECK-DAG: %[[MODROW:.*]] = arith.remui %[[ROW]], %[[C256:.*]] : index - // CHECK-DAG: %[[MODCOL:.*]] = arith.remui %[[COL]], %[[C128:.*]] : index - // CHECK-DAG: %[[SUBROW:.*]] = arith.subi %[[C16:.*]], %[[MODROW]] : index - // CHECK-DAG: %[[MAXROW:.*]] = arith.maxsi %[[SUBROW]], %[[C4:.*]] : index - // CHECK-DAG: %[[MINROW:.*]] = arith.minsi %[[MAXROW]], %[[C32:.*]] : index - // CHECK-DAG: %[[SUBCOL:.*]] = arith.subi %[[C16:.*]], %[[MODCOL]] : index - // CHECK-DAG: %[[MAXCOL:.*]] = arith.maxsi %[[SUBCOL]], %[[C7:.*]] : index - // CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index - // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1> - %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout} : vector<256x128xi1> - gpu.return - } - - // CHECK-LABEL: vector_create_mask_1D - gpu.func @vector_create_mask_1D() { - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[SGID]], %[[C2:.*]] - // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] - // CHECK-DAG: %[[REMU2:.*]] = arith.remui %[[MUL]], %[[C32:.*]] - // CHECK-DAG: %[[SUB:.*]] = arith.subi %[[C8:.*]], %[[REMU2]] : index - // CHECK-DAG: %[[MAX:.*]] = arith.maxsi %[[SUB]], %[[C0:.*]] : index - // CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index - // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1> - %cst8 = arith.constant 8 : index - %constant_mask = vector.create_mask %cst8 {layout_result_0 = #xegpu.layout} : vector<32xi1> - gpu.return - } - - // CHECK-LABEL: vector_create_mask_2D - gpu.func @vector_create_mask_2D() { - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C8:.*]] - // CHECK-DAG: %[[ROW:.*]] = arith.muli %[[SGIDY]], %[[C32:.*]] - // CHECK-DAG: %[[COL:.*]] = arith.muli %[[SGIDX]], %[[C32:.*]] - // CHECK-DAG: %[[MODROW:.*]] = arith.remui %[[ROW]], %[[C256:.*]] - // CHECK-DAG: %[[MODCOL:.*]] = arith.remui %[[COL]], %[[C128:.*]] - // CHECK-DAG: %[[SUBROW:.*]] = arith.subi %[[C16:.*]], %[[MODROW]] : index - // CHECK-DAG: %[[MAXROW:.*]] = arith.maxsi %[[SUBROW]], %[[C0:.*]] : index - // CHECK-DAG: %[[MINROW:.*]] = arith.minsi %[[MAXROW]], %[[C32:.*]] : index - // CHECK-DAG: %[[SUBCOL:.*]] = arith.subi %[[C16:.*]], %[[MODCOL]] : index - // CHECK-DAG: %[[MAXCOL:.*]] = arith.maxsi %[[SUBCOL]], %[[C0:.*]] : index - // CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index - // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1> - %cst16 = arith.constant 16 : index - %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout} : vector<256x128xi1> - gpu.return - } - - // CHECK-LABEL: distribute_load_slice_attr - gpu.func @distribute_load_slice_attr() { - %2 = memref.alloca() {alignment = 1024} : memref<4096xf32> - %offset = arith.constant {layout_result_0 = #xegpu.layout } dense<0> : vector<256xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> - - // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> - // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> - %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> - - // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] : vector<32xf32> to vector<32x32xf32> - %4 = vector.broadcast %3 {layout_result_0 = - #xegpu.layout} : vector<256xf32> to vector<256x256xf32> - gpu.return - } - - // CHECK-LABEL: gpu.func @vector_reduce_cross_sg_dim_1 - // CHECK-SAME: (%[[ARG0:.*]]: memref) - gpu.func @vector_reduce_cross_sg_dim_1(%src: memref) { - // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1x32xf32> - // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : vector<1x1x32xindex> - // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense : vector<1x1x32xi1> - // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %[[ARG0:.*]][%[[CST_0]]], %[[CST_1]] <{chunk_size = 1 : i64}> : memref, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> - // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : vector<1x32xf32> - // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %[[CST_2]] [1] : vector<1x1x32xf32> to vector<1x32xf32> - // CHECK-DAG: %[[CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<1x32xf32> to vector<1x1x32xf32> - // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<4096xi8, 3> - // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<4096xi8, 3> -> !xegpu.mem_desc<1x32x32xf32> - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: xegpu.store_matrix %[[CAST]], %[[MEM_DESC]]{{.*}} : vector<1x1x32xf32>, !xegpu.mem_desc<1x32x32xf32>, index, index, index - // CHECK-DAG: gpu.barrier - // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} : !xegpu.mem_desc<1x32x32xf32>, index, index, index -> vector<1x32x32xf32> - // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<1x32xf32> - // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_3]] [1] : vector<1x32x32xf32> to vector<1x32xf32> - // CHECK-DAG: %[[ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<1x32xf32> - // CHECK-DAG: gpu.return - %cst_3 = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<1x32xf32> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<1x32x32xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<1x32x32xi1> - %14 = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout} : memref, vector<1x32x32xindex>, vector<1x32x32xi1> -> vector<1x32x32xf32> - %15 = vector.multi_reduction , %14, %cst_3 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] : vector<1x32x32xf32> to vector<1x32xf32> - gpu.return - } - - // CHECK-LABEL: gpu.func @vector_reduce_cross_sg_dim_0 - // CHECK-SAME: (%[[ARG0:.*]]: memref<256x128xf32>) - gpu.func @vector_reduce_cross_sg_dim_0(%src: memref<256x128xf32>) { - // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32> - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REM1:.*]] = arith.remui %[[SGID]], %[[C4:.*]] : index - // CHECK-DAG: %[[DIV1:.*]] = arith.divui %[[SGID]], %[[C4:.*]] : index - // CHECK-DAG: %[[REM2:.*]] = arith.remui %[[DIV1]], %[[C8:.*]] : index - // CHECK-DAG: %[[MUL1:.*]] = arith.muli %[[REM2]], %[[C32:.*]] : index - // CHECK-DAG: %[[MUL2:.*]] = arith.muli %[[REM1]], %[[C32_0:.*]] : index - // CHECK-DAG: %[[REM3:.*]] = arith.remui %[[MUL1]], %[[C256:.*]] : index - // CHECK-DAG: %[[REM4:.*]] = arith.remui %[[MUL2]], %[[C128:.*]] : index - // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[REM3]], %[[REM4]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32> - // CHECK-DAG: %[[LOAD_ND:.*]] = xegpu.load_nd %[[TDESC]] : !xegpu.tensor_desc<32x32xf32> -> vector<32x32xf32> - // CHECK-DAG: %[[CST_LOCAL:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32> - // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_ND]], %[[CST_LOCAL]] [0] : vector<32x32xf32> to vector<32xf32> - // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<32xf32> to vector<1x32xf32> - // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<4096xi8, 3> - // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<4096xi8, 3> -> !xegpu.mem_desc<8x128xf32> - // CHECK-DAG: %[[SGID2:.*]] = gpu.subgroup_id : index - // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]]{{.*}} : vector<1x32xf32>, !xegpu.mem_desc<8x128xf32>, index, index - // CHECK-DAG: gpu.barrier - // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} : !xegpu.mem_desc<8x128xf32>, index, index -> vector<8x32xf32> - // CHECK-DAG: %[[CST_CROSS_SG_1:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32> - // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_CROSS_SG_1]] [0] : vector<8x32xf32> to vector<32xf32> - // CHECK-DAG: arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<32xf32> - %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<128xf32> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] - : vector<256x128xf32> to vector<128xf32> - gpu.return - } - - // CHECK-LABEL: gpu.func @vector_reduce_multi_dim - // CHECK-SAME: (%[[ARG0:.*]]: memref) - gpu.func @vector_reduce_multi_dim(%src: memref) { - // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x1xf32> - // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : vector<1x1x32x32xindex> - // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense : vector<1x1x32x32xi1> - // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %{{.*}}[%[[CST_0]]], %[[CST_1]] <{chunk_size = 1 : i64}> : memref, vector<1x1x32x32xindex>, vector<1x1x32x32xi1> -> vector<1x1x32x32xf32> - // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : vector<1x1xf32> - // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %[[CST_2]] [2, 3] : vector<1x1x32x32xf32> to vector<1x1xf32> - // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<1x1xf32> to vector<1x1x1x1xf32> - // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<256xi8, 3> - // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<256xi8, 3> -> !xegpu.mem_desc<2x2x4x4xf32> - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]]{{.*}} : vector<1x1x1x1xf32>, !xegpu.mem_desc<2x2x4x4xf32>, index, index, index, index - // CHECK-DAG: gpu.barrier - // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} : !xegpu.mem_desc<2x2x4x4xf32>, index, index, index, index -> vector<1x1x4x4xf32> - // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<1x1xf32> - // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_3]] [2, 3] : vector<1x1x4x4xf32> to vector<1x1xf32> - // CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<1x1xf32> - // CHECK-DAG: gpu.return - %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [2, 3]>} dense<0.0> : vector<2x2xf32> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<2x2x128x128xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<2x2x128x128xi1> - %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout} : memref, vector<2x2x128x128xindex>, vector<2x2x128x128xi1> -> vector<2x2x128x128xf32> - %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [2, 3]>} [2, 3] : vector<2x2x128x128xf32> to vector<2x2xf32> - gpu.return - } - - // CHECK-LABEL: gpu.func @vector_reduce_multi_dim_nou_unit_local_reduction - // CHECK-SAME: (%[[ARG0:.*]]: memref) - gpu.func @vector_reduce_multi_dim_nou_unit_local_reduction(%src: memref) { - // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16x16xf32> - // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : vector<16x16x32x32xindex> - // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense : vector<16x16x32x32xi1> - // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST_0]]], %[[CST_1]] <{chunk_size = 1 : i64}> : memref, vector<16x16x32x32xindex>, vector<16x16x32x32xi1> -> vector<16x16x32x32xf32> - // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : vector<16x16xf32> - // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %[[CST_2]] [2, 3] : vector<16x16x32x32xf32> to vector<16x16xf32> - // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<16x16xf32> to vector<16x16x1x1xf32> - // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<65536xi8, 3> - // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<65536xi8, 3> -> !xegpu.mem_desc<32x32x4x4xf32> - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]]{{.*}} : vector<16x16x1x1xf32>, !xegpu.mem_desc<32x32x4x4xf32>, index, index, index, index - // CHECK-DAG: gpu.barrier - // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} : !xegpu.mem_desc<32x32x4x4xf32>, index, index, index, index -> vector<16x16x4x4xf32> - // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<16x16xf32> - // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_3]] [2, 3] : vector<16x16x4x4xf32> to vector<16x16xf32> - // CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<16x16xf32> - // CHECK-DAG: gpu.return - %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [2, 3]>} dense<0.0> : vector<32x32xf32> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<32x32x128x128xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<32x32x128x128xi1> - %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout} : memref, vector<32x32x128x128xindex>, vector<32x32x128x128xi1> -> vector<32x32x128x128xf32> - %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [2, 3]>} [2, 3] : vector<32x32x128x128xf32> to vector<32x32xf32> - gpu.return - } - - // CHECK-LABEL: load_nd_tdesc_with_anchor_layout - gpu.func @load_nd_tdesc_with_anchor_layout(%src: memref<256x128xf32>) { - //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - // CHECK: xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> - // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> - %load = xegpu.load_nd %tdesc[0, 0] <{layout = #xegpu.layout}> - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - gpu.return - } - - // CHECK-LABEL: convert_layout_no_slm - gpu.func @convert_layout_no_slm(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) { - %c32 = arith.constant 32 : index - %c4096 = arith.constant 4096 : index - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %block_id_x = gpu.block_id x - %block_id_y = gpu.block_id y - %0 = arith.muli %block_id_x, %c256 overflow : index - %1 = arith.muli %block_id_y, %c256 overflow : index - %2 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf32> -> !xegpu.tensor_desc<256x256xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - %3 = xegpu.load_nd %2[%0, %1] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<256x256xf32, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<256x256xf32> - %4 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16, #xegpu.block_tdesc_attr, #xegpu.layout> - %5 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x256xf16, #xegpu.block_tdesc_attr, #xegpu.layout> - %6 = scf.for %arg3 = %c0 to %c4096 step %c32 iter_args(%arg4 = %3) -> (vector<256x256xf32>) { - %7 = xegpu.load_nd %4[%0, %arg3] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<256x32xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<256x32xf16> - %8 = xegpu.load_nd %5[%arg3, %1] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<32x256xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<32x256xf16> - // CHECK: %[[CONVERT_A:.*]] = xegpu.convert_layout %{{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x32xf16> - // CHECK: %[[CONVERT_B:.*]] = xegpu.convert_layout %{{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x32xf16> - %9 = xegpu.convert_layout %7 <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<256x32xf16> - %10 = xegpu.convert_layout %8 <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x256xf16> - %11 = xegpu.dpas %9, %10, %arg4 {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32> - scf.yield %11 : vector<256x256xf32> - } {layout_result_0 = #xegpu.layout} - xegpu.store_nd %6, %2[%0, %1] <{layout = #xegpu.layout}> : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: convert_layout_slm - // CHECK-SAME: %[[ARG0:.*]]: memref<128x256xf32> - gpu.func @convert_layout_slm(%arg0: memref<128x256xf32>) { - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C16:.*]] : index - // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C16:.*]] : index - // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C4:.*]] : index - // CHECK-DAG: %[[MUL_Y:.*]] = arith.muli %[[SGIDY]], %[[C32:.*]] : index - // CHECK-DAG: %[[MUL_X:.*]] = arith.muli %[[SGIDX]], %[[C16:.*]] : index - // CHECK-DAG: %[[OFF_Y:.*]] = arith.remui %[[MUL_Y]], %[[C128:.*]] : index - // CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[MUL_X]], %[[C256:.*]] : index - // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[OFF_Y]], %[[OFF_X]]] : memref<128x256xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.layout> - // CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<32x16xf32, #xegpu.layout> -> vector<32x16xf32> - // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<131072xi8, 3> - // CHECK-DAG: %[[MDESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<131072xi8, 3> -> !xegpu.mem_desc<128x256xf32> - // CHECK-DAG: %[[SGID_STORE:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[STORE_X:.*]] = arith.remui %[[SGID_STORE]], %[[C16:.*]] : index - // CHECK-DAG: %[[STORE_Y_TMP:.*]] = arith.divui %[[SGID_STORE]], %[[C16:.*]] : index - // CHECK-DAG: %[[STORE_Y:.*]] = arith.remui %[[STORE_Y_TMP]], %[[C4:.*]] : index - // CHECK-DAG: %[[STORE_MUL_Y:.*]] = arith.muli %[[STORE_Y]], %[[C32:.*]] : index - // CHECK-DAG: %[[STORE_MUL_X:.*]] = arith.muli %[[STORE_X]], %[[C16:.*]] : index - // CHECK-DAG: %[[STORE_OFF_Y:.*]] = arith.remui %[[STORE_MUL_Y]], %[[C128:.*]] : index - // CHECK-DAG: %[[STORE_OFF_X:.*]] = arith.remui %[[STORE_MUL_X]], %[[C256:.*]] : index - // CHECK-DAG: xegpu.store_matrix %[[LOAD]], %[[MDESC]][%[[STORE_OFF_Y]], %[[STORE_OFF_X]]] : vector<32x16xf32>, !xegpu.mem_desc<128x256xf32>, index, index - // CHECK-DAG: gpu.barrier - // CHECK-DAG: %[[LOAD_X:.*]] = arith.remui %[[SGID_STORE]], %[[C8:.*]] : index - // CHECK-DAG: %[[LOAD_Y_TMP:.*]] = arith.divui %[[SGID_STORE]], %[[C8:.*]] : index - // CHECK-DAG: %[[LOAD_Y:.*]] = arith.remui %[[LOAD_Y_TMP]], %[[C8:.*]] : index - // CHECK-DAG: %[[LOAD_MUL_Y:.*]] = arith.muli %[[LOAD_Y]], %[[C16:.*]] : index - // CHECK-DAG: %[[LOAD_MUL_X:.*]] = arith.muli %[[LOAD_X]], %[[C32:.*]] : index - // CHECK-DAG: %[[LOAD_OFF_Y:.*]] = arith.remui %[[LOAD_MUL_Y]], %[[C128:.*]] : index - // CHECK-DAG: %[[LOAD_OFF_X:.*]] = arith.remui %[[LOAD_MUL_X]], %[[C256:.*]] : index - // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MDESC]][%[[LOAD_OFF_Y]], %[[LOAD_OFF_X]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<128x256xf32>, index, index -> vector<16x32xf32> - %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<128x256xf32> -> !xegpu.tensor_desc<128x256xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x256xf32, #xegpu.layout> -> vector<128x256xf32> - %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout, - target_layout = #xegpu.layout}> : vector<128x256xf32> - gpu.return - } - - gpu.func @convert_layout_3D(%arg0: memref) { - // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x32x16xindex> - // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense : vector<1x32x16xi1> - // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %{{.*}}[%[[CST]]], %[[CST_0]] <{chunk_size = 1 : i64, layout = #xegpu.layout}> : memref, vector<1x32x16xindex>, vector<1x32x16xi1> -> vector<1x32x16xf32> - // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<1048576xi8, 3> - // CHECK-DAG: %[[MDESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<1048576xi8, 3> -> !xegpu.mem_desc<8x128x256xf32> - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[STORE_X:.*]] = arith.remui %[[SGID]], %[[C16:.*]] : index - // CHECK-DAG: %[[STORE_YZ_TMP:.*]] = arith.divui %[[SGID]], %[[C16:.*]] : index - // CHECK-DAG: %[[STORE_Y:.*]] = arith.remui %[[STORE_YZ_TMP]], %[[C4:.*]] : index - // CHECK-DAG: %[[STORE_Z_TMP:.*]] = arith.divui %[[STORE_YZ_TMP]], %[[C4:.*]] : index - // CHECK-DAG: %[[STORE_Z:.*]] = arith.remui %[[STORE_Z_TMP]], %[[C8:.*]] : index - // CHECK-DAG: %[[STORE_MUL_Y:.*]] = arith.muli %[[STORE_Y]], %[[C32:.*]] : index - // CHECK-DAG: %[[STORE_MUL_X:.*]] = arith.muli %[[STORE_X]], %[[C16:.*]] : index - // CHECK-DAG: %[[STORE_OFF_Z:.*]] = arith.remui %[[STORE_Z]], %[[C8:.*]] : index - // CHECK-DAG: %[[STORE_OFF_Y:.*]] = arith.remui %[[STORE_MUL_Y]], %[[C128:.*]] : index - // CHECK-DAG: %[[STORE_OFF_X:.*]] = arith.remui %[[STORE_MUL_X]], %[[C256:.*]] : index - // CHECK-DAG: xegpu.store_matrix %[[LOAD]], %[[MDESC]][%[[STORE_OFF_Z]], %[[STORE_OFF_Y]], %[[STORE_OFF_X]]] : vector<1x32x16xf32>, !xegpu.mem_desc<8x128x256xf32>, index, index, index - // CHECK-DAG: gpu.barrier - // CHECK-DAG: %[[LOAD_X:.*]] = arith.remui %[[SGID]], %[[C8:.*]] : index - // CHECK-DAG: %[[LOAD_YZ_TMP:.*]] = arith.divui %[[SGID]], %[[C8:.*]] : index - // CHECK-DAG: %[[LOAD_Y:.*]] = arith.remui %[[LOAD_YZ_TMP]], %[[C8:.*]] : index - // CHECK-DAG: %[[LOAD_Z_TMP:.*]] = arith.divui %[[LOAD_YZ_TMP]], %[[C8:.*]] : index - // CHECK-DAG: %[[LOAD_Z:.*]] = arith.remui %[[LOAD_Z_TMP]], %[[C8:.*]] : index - // CHECK-DAG: %[[LOAD_MUL_Y:.*]] = arith.muli %[[LOAD_Y]], %[[C16:.*]] : index - // CHECK-DAG: %[[LOAD_MUL_X:.*]] = arith.muli %[[LOAD_X]], %[[C32:.*]] : index - // CHECK-DAG: %[[LOAD_OFF_Z:.*]] = arith.remui %[[LOAD_Z]], %[[C8:.*]] : index - // CHECK-DAG: %[[LOAD_OFF_Y:.*]] = arith.remui %[[LOAD_MUL_Y]], %[[C128:.*]] : index - // CHECK-DAG: %[[LOAD_OFF_X:.*]] = arith.remui %[[LOAD_MUL_X]], %[[C256:.*]] : index - // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MDESC]][%[[LOAD_OFF_Z]], %[[LOAD_OFF_Y]], %[[LOAD_OFF_X]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<8x128x256xf32>, index, index, index -> vector<1x16x32xf32> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<8x128x256xindex> - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<8x128x256xi1> - %1 = xegpu.load %arg0[%offset], %mask {chunk_size = 1, layout = #xegpu.layout} : memref, vector<8x128x256xindex>, vector<8x128x256xi1> -> vector<8x128x256xf32> - %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout, - target_layout = #xegpu.layout}> : vector<8x128x256xf32> - gpu.return - } - - // CHECK-LABEL: convert_layout_reduce_to_scalar - gpu.func @convert_layout_reduce_to_scalar(%arg0: memref<32x32xf32>) { - %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<32x32xi1> - %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<32x32xindex> - %cst_0 = arith.constant 0.000000e+00 : f32 - %intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<32x32xf32> -> index - %10 = arith.index_cast %intptr : index to i64 - %11 = xegpu.load %10[%offset], %mask <{layout = #xegpu.layout}> {layout_operand_1 = #xegpu.layout, layout_operand_2 = #xegpu.layout} : i64, vector<32x32xindex>, vector<32x32xi1> -> vector<32x32xf32> - %12 = vector.multi_reduction , %11, %cst_0 {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} [0, 1] : vector<32x32xf32> to f32 - // CHECK-NOT: xegpu.convert_layout - %13 = xegpu.convert_layout %12 <{input_layout = #xegpu.slice<#xegpu.layout, dims = [0, 1]>, target_layout = #xegpu.slice<#xegpu.layout, dims = [0, 1]>}> : f32 - gpu.return - } - - // CHECK-LABEL: distribute_nested_slice - // CHECK: %[[V0:.*]] = vector.shape_cast %{{.*}} : vector<32x32xf32> to vector<32x1x32x1xf32> - // CHECK: %[[V1:.*]] = vector.broadcast %[[V0]] : vector<32x1x32x1xf32> to vector<32x16x32x16xf32> - // CHECK: %[[V2:.*]] = vector.shape_cast %[[V1]] : vector<32x16x32x16xf32> to vector<32x16x32x16x1xf32> - // CHECK: %[[V3:.*]] = vector.broadcast %[[V2]] : vector<32x16x32x16x1xf32> to vector<32x16x32x16x16xf32> - // CHECK: %[[V4:.*]] = vector.shape_cast %[[V3]] : vector<32x16x32x16x16xf32> to vector<32x16x1x32x16x16xf32> - // CHECK: %[[V5:.*]] = vector.broadcast %[[V4]] : vector<32x16x1x32x16x16xf32> to vector<32x16x16x32x16x16xf32> - gpu.func @distribute_nested_slice(%src: memref<256x256xf32>) { - - %tdesc = xegpu.create_nd_tdesc %src : memref<256x256xf32> - -> !xegpu.tensor_desc<256x256xf32, #xegpu.layout> - - %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} - : !xegpu.tensor_desc<256x256xf32, #xegpu.layout> - -> vector<256x256xf32> - - %load2 = xegpu.convert_layout %load <{input_layout = #xegpu.layout, target_layout = #xegpu.slice<#xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>, dims=[1, 3]>}> : vector<256x256xf32> - - %scast = vector.shape_cast %load2 {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>, dims=[1, 3]>} : vector<256x256xf32> to vector<256x1x256x1xf32> - - %bcast = vector.broadcast %scast {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>} : vector<256x1x256x1xf32> to vector<256x16x256x16xf32> - - %scast1 = vector.shape_cast %bcast {layout_result_0 = #xegpu.slice<#xegpu.layout, dims=[2]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>} : vector<256x16x256x16xf32> to vector<256x16x256x16x1xf32> - - %bcast1 = vector.broadcast %scast1 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims=[2]>, layout_operand_0 = #xegpu.slice<#xegpu.layout, dims=[2]>} : vector<256x16x256x16x1xf32> to vector<256x16x256x16x16xf32> - - %scast2 = vector.shape_cast %bcast1 {layout_result_0 = - #xegpu.layout, layout_operand_0 = #xegpu.slice<#xegpu.layout, dims=[2]>} : vector<256x16x256x16x16xf32> to vector<256x16x1x256x16x16xf32> - - %bcast2 = vector.broadcast %scast2 {layout_result_0 = - #xegpu.layout, layout_operand_0 = - #xegpu.layout} : vector<256x16x1x256x16x16xf32> to vector<256x16x16x256x16x16xf32> - gpu.return - } - - // CHECK-LABEL: @preserve_anchor_layout - // CHECK: arith.constant dense<1.000000e+00> : vector<16x128xf32> - // CHECK: xegpu.store_nd %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] <{layout = #xegpu.layout}> - gpu.func @preserve_anchor_layout(%dst: memref<256x128xf32>) { - %val = arith.constant {layout_result_0 = #xegpu.layout} dense<1.0> : vector<256x128xf32> - %tdesc = xegpu.create_nd_tdesc %dst : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - xegpu.store_nd %val, %tdesc[0, 0] <{layout = #xegpu.layout}> - : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: @shape_cast_used_by_elementwise - gpu.func @shape_cast_used_by_elementwise(%dst: memref<1x1x16xf32>) { - // Regression test: shape_cast expanding unit dimensions can be used by elementwise ops - // This previously failed with "ShapeCast ops that expand unit dimensions and are used by - // non-broadcast operations are not supported." - - // CHECK: vector.step : vector<16xindex> - // CHECK: vector.shape_cast {{.*}} : vector<16xindex> to vector<1x1x16xindex> - // CHECK: arith.addi {{.*}} : vector<1x1x16xindex> - // CHECK: xegpu.store {{.*}} : vector<1x1x16xf32>, i64, vector<1x1x16xindex>, vector<1x1x16xi1> - %step = vector.step : vector<16xindex> - %shape_cast = vector.shape_cast %step : vector<16xindex> to vector<1x1x16xindex> - %cst = arith.constant dense<10> : vector<1x1x16xindex> - %add = arith.addi %shape_cast, %cst : vector<1x1x16xindex> - - %cst_val = arith.constant dense<1.0> : vector<1x1x16xf32> - %intptr = memref.extract_aligned_pointer_as_index %dst : memref<1x1x16xf32> -> index - %ptr = arith.index_cast %intptr : index to i64 - %mask = arith.constant dense : vector<1x1x16xi1> - - xegpu.store %cst_val, %ptr[%add], %mask {layout = #xegpu.layout} : vector<1x1x16xf32>, i64, vector<1x1x16xindex>, vector<1x1x16xi1> - gpu.return - } - -} diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index df3fa880c9d6d..f2cc05808ed12 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -1,51 +1,46 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s - -gpu.module @test_1_1_assignment { +gpu.module @test_distribution { // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMUX:.*]] = arith.remui %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[DIVU:.*]] = arith.divui %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[REMUY:.*]] = arith.remui %[[DIVU]], %[[C8:.*]] - // CHECK-DAG: %[[MULY:.*]] = arith.muli %[[REMUY]], %[[C32:.*]] - // CHECK-DAG: %[[MULX:.*]] = arith.muli %[[REMUX]], %[[C32:.*]] - // CHECK-DAG: %[[MODY:.*]] = arith.remui %[[MULY]], %[[C256:.*]] - // CHECK-DAG: %[[MODX:.*]] = arith.remui %[[MULX]], %[[C128:.*]] - // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[MODY]], %[[MODX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return + // CHECK: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf32> + // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.return } - // CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref - // CHECK-SAME: %[[ARG_0:.*]]: memref<3x256x128xf32> - gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) { - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMUX:.*]] = arith.remui %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[DIVU:.*]] = arith.divui %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[REMUY:.*]] = arith.remui %[[DIVU]], %[[C8:.*]] - // CHECK-DAG: %[[MULY:.*]] = arith.muli %[[REMUY]], %[[C32:.*]] - // CHECK-DAG: %[[MULX:.*]] = arith.muli %[[REMUX]], %[[C32:.*]] - // CHECK-DAG: %[[MODY:.*]] = arith.remui %[[MULY]], %[[C256:.*]] - // CHECK-DAG: %[[MODX:.*]] = arith.remui %[[MULX]], %[[C128:.*]] - // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][1, %[[MODY]], %[[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return + // CHECK-LABEL: create_nd_tdesc_with_ptr + // CHECK-SAME: %[[ARG_0:.*]]: ui64 + gpu.func @create_nd_tdesc_with_ptr(%src: ui64, %w : index, %h : index, %x : index, %y : index) { + // CHECK: xegpu.create_nd_tdesc %[[ARG_0]], shape : [{{.*}}, {{.*}}], strides : [{{.*}}, {{.*}}] : ui64 + // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %c1 = arith.constant 1 : index + %tdesc = xegpu.create_nd_tdesc %src, shape:[%h, %w], strides: [%w, %c1] : ui64 + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.return } - // CHECK-LABEL: load_nd_tdesc - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] - // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - // CHECK-SAME: -> vector<32x32xf32> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + // CHECK-LABEL: load_nd + gpu.func @load_nd(%src: memref<256x128xf32>) { + //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + //CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + //CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index + //CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C4]] + //CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C4]] + //CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index + //CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C8]] + //CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index + //CHECK-DAG: %[[L_OFF_Y:.*]] = arith.muli %[[SGIDY]], %[[C32]] : index + //CHECK-DAG: %[[L_OFF_X:.*]] = arith.muli %[[SGIDX]], %[[C32_1:.*]] : index + //CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index + //CHECK-DAG: %[[OFF_Y:.*]] = arith.remui %[[L_OFF_Y]], %[[C256]] : index + //CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index + //CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[L_OFF_X]], %[[C128]] : index + //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> gpu.return @@ -54,102 +49,87 @@ gpu.module @test_1_1_assignment { // CHECK-LABEL: store_nd // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @store_nd(%src: memref<256x128xf32>) { - // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] - // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - // CHECK-SAME: -> vector<32x32xf32> - // CHECK: xegpu.store_nd %[[LOAD]], %[[TDESC]] - // CHECK-SAME: : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> -> vector<256x128xf32> - xegpu.store_nd %load, %tdesc + xegpu.store_nd %load, %tdesc[0, 0] {layout = #xegpu.layout} : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> gpu.return - } +} - // CHECK-LABEL: update_nd + // CHECK-LABEL: prefetch_nd // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @update_nd(%src: memref<256x128xf32>){ - // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] - // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + gpu.func @prefetch_nd(%src: memref<256x128xf32>) { + //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %cst0 = arith.constant 0 : index + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %update = xegpu.update_nd_offset %tdesc, [0, 16] + xegpu.prefetch_nd %tdesc[%cst0, %cst0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> gpu.return } // CHECK-LABEL: dpas gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b - {layout_a = #xegpu.layout, - layout_b = #xegpu.layout, - layout_cd = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } // CHECK-LABEL: dpas_no_sg_data gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + %tdesc_a = xegpu.create_nd_tdesc %a : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_a = xegpu.load_nd %tdesc_a {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout} + : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<128x128xf16> + %tdesc_b = xegpu.create_nd_tdesc %b : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %load_b = xegpu.load_nd %tdesc_b {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout } + : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b - {layout_a = #xegpu.layout, - layout_b = #xegpu.layout, - layout_cd = #xegpu.layout} + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } - // CHECK-LABEL: prefetch_nd_tdesc - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> - gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - // CHECK: xegpu.prefetch_nd %[[TDESC]] - // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - xegpu.prefetch_nd %tdesc - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - gpu.return - } - // CHECK-LABEL: broadcast_dim1 // CHECK-SAME: %[[ARG_0:.*]]: memref<256x1xf32> gpu.func @broadcast_dim1(%src: memref<256x1xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x1xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32> -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256x1xf32, #xegpu.layout> -> vector<256x1xf32> - // CHECK: vector.broadcast {{.*}} : vector<32x1xf32> to vector<32x32xf32> + // CHECK: vector.broadcast {{.*}} : vector<32x1xf32> to vector<32x32xf32> %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout} : vector<256x1xf32> to vector<256x32xf32> @@ -159,9 +139,9 @@ gpu.module @test_1_1_assignment { // CHECK-LABEL: broadcast_dim0 // CHECK-SAME: %[[ARG_0:.*]]: memref<1x128xf32> gpu.func @broadcast_dim0(%src: memref<1x128xf32>) { - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<1x128xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32> -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<1x128xf32, #xegpu.layout> -> vector<1x128xf32> // CHECK: vector.broadcast {{.*}} : vector<1x32xf32> to vector<32x32xf32> @@ -171,6 +151,824 @@ gpu.module @test_1_1_assignment { gpu.return } + // CHECK-LABEL: gemm_with_load_store_offset + // CHECK-SAME: %[[ARG_0:.*]]: memref<1024x1024xf16>, %[[ARG_1:.*]]: memref<1024x1024xf16>, %[[ARG_2:.*]]: memref<1024x1024xf32> + gpu.func @gemm_with_load_store_offset(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) { + //CHECK: [[c0:%.+]] = arith.constant 0 : index + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[c1024:%.+]] = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %0 = arith.muli %block_id_x, %c128 : index + %1 = arith.muli %block_id_y, %c128 : index + %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + // CHECK: [[DESC_A:%.+]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x128xf16> + // CHECK: [[DESC_B:%.+]] = xegpu.create_nd_tdesc %[[ARG_1]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x16xf16> + %3 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %4 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + // load_nd with offset + %5 = xegpu.load_nd %2[%0, %1] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> + %6 = xegpu.load_nd %3[%0, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %7 = xegpu.load_nd %4[%c0, %1] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + // scf.for loop + // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] + // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> + // CHECK-SAME: (vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32>) + // CHECK: [[c:%.+]] = xegpu.dpas [[arg4]], [[arg5]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> + // CHECK: [[a:%.+]] = xegpu.load_nd [[DESC_A]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> + // CHECK: [[b:%.+]] = xegpu.load_nd [[DESC_B]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> + // CHECK: scf.yield [[a]], [[b]], [[c]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> + %8:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %6, %arg5 = %7, %arg6 = %5) + -> (vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32>) { + // load_nd with offset inside loop + %9 = xegpu.dpas %arg4, %arg5, %arg6 + {layout_a = #xegpu.layout, + layout_b = #xegpu.layout, + layout_cd = #xegpu.layout} + : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> + %10 = xegpu.load_nd %3[%arg3, %c0] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %11 = xegpu.load_nd %4[%c0, %arg3] {layout = #xegpu.layout}: !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + scf.yield %10, %11, %9 : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> + } {layout_result_0 = #xegpu.layout, + layout_result_1 = #xegpu.layout, + layout_result_2 = #xegpu.layout} + // store_nd with offset + xegpu.store_nd %8#2, %2[%0, %1] {layout = #xegpu.layout} : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + gpu.return + } + + // CHECK-LABEL: @subgroup_id_range + gpu.func @subgroup_id_range(%src: memref<256x128xf32>, %src1: memref<128x256xf32>, %src2: memref<128x64xf32>) { + %sg_id = gpu.subgroup_id : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c31 = arith.constant 31 : index + %c3 = arith.constant 3 : index + %cond1 = arith.cmpi sge, %sg_id, %c0 : index + %cond2 = arith.cmpi slt, %sg_id, %c1 : index + %cond = arith.andi %cond1, %cond2 : i1 + scf.if %cond { + // CHECK-NOT: index.sub + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + } {sg_id_range = #xegpu.range<[0, 32]>} + %cond3 = arith.cmpi sge, %sg_id, %c2 : index + %cond4 = arith.cmpi slt, %sg_id, %c31 : index + %cond5 = arith.andi %cond3, %cond4 : i1 + scf.if %cond5 { + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C2:.*]] = arith.constant 2 : index + // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] + %tdesc = xegpu.create_nd_tdesc %src2 : memref<128x64xf32> + -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> + -> vector<128x64xf32> + %exp = math.exp %load {layout_result_0 = #xegpu.layout} : vector<128x64xf32> + }{sg_id_range = #xegpu.range<[2, 18]>} + gpu.return + } + + // CHECK-LABEL: @subgroup_id_range_nested_if + gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) { + %sg_id = gpu.subgroup_id : index + %c1 = arith.constant 1 : i1 + %c3 = arith.constant 3 : index + %c32 = arith.constant 32 : index + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + %cond1 = arith.cmpi sge, %sg_id, %c3 : index + %cond2 = arith.cmpi slt, %sg_id, %c32 : index + %cond = arith.andi %cond1, %cond2 : i1 + scf.if %c1 { + scf.if %cond { + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C3:.*]] = arith.constant 3 : index + // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]] + %td = xegpu.create_nd_tdesc %src1 : memref<128x64xf32> + -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> + %ld = xegpu.load_nd %td[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> + -> vector<128x64xf32> + %exp = math.exp %ld {layout_result_0 = #xegpu.layout} : vector<128x64xf32> + } + } {sg_id_range = #xegpu.range<[3, 19]>} + gpu.return + } + + // CHECK-LABEL: @load_gather + // CHECK-SAME: %[[ARG0:.*]]: memref + gpu.func @load_gather(%src : memref) { + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x4xindex> + // CHECK: %[[MASK:.*]] = arith.constant dense : vector<32x4xi1> + // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> + // CHECK-SAME: : memref, vector<32x4xindex>, vector<32x4xi1> -> vector<32x4xf16> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256x16xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256x16xi1> + %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} + : memref, vector<256x16xindex>, vector<256x16xi1> -> vector<256x16xf16> + gpu.return + } + + // CHECK-LABEL: @store_scatter + // CHECK-SAME: %[[ARG0:.*]]: memref<256xf16> + gpu.func @store_scatter(%dest : memref<256xf16>) { + // CHECK: %[[VAL:.*]] = arith.constant dense<2.550000e+01> : vector<8xf16> + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex> + // CHECK: %[[MASK:.*]] = arith.constant dense : vector<8xi1> + // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint, layout = #xegpu.layout}> + // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> + %val = arith.constant {layout_result_0 = #xegpu.layout} dense<25.5> : vector<256xf16> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256xi1> + xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout = #xegpu.layout, + layout_operand_2 = #xegpu.layout, + layout_operand_3 = #xegpu.layout, + l1_hint = #xegpu.cache_hint} + : vector<256xf16>, memref<256xf16>, vector<256xindex>, vector<256xi1> + gpu.return + } + + // CHECK-LABEL: @load_with_non_unit_chunk_size + // CHECK-SAME: %[[ARG0:.*]]: memref + gpu.func @load_with_non_unit_chunk_size(%src : memref) { + // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xindex> + // CHECK: %[[MASK:.*]] = arith.constant dense : vector<8xi1> + // CHECK: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 4 : i64, l1_hint = #xegpu.cache_hint}> + // CHECK-SAME: : memref, vector<8xindex>, vector<8xi1> -> vector<8x4xf16> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256xi1> + %load = xegpu.load %src[%offset], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} + : memref, vector<256xindex>, vector<256xi1> -> vector<256x4xf16> + gpu.return + } + + // CHECK-LABEL: distribute_load_matrix + // CHECK-SAME: [[arg0:%.+]]: memref<32768xi8, 3> + gpu.func @distribute_load_matrix(%arg0: memref<32768xi8, 3>) { + //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> + //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index + //CHECK: [[c4:%.+]] = arith.constant 4 : index + //CHECK: [[sgidx:%.+]] = arith.remui [[sgid]], [[c4]] : index + //CHECK: [[sgidy_tmp:%.+]] = arith.divui [[sgid]], [[c4]] : index + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = arith.remui [[sgidy_tmp]], [[c2]] : index + //CHECK: [[c32:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_y:%.+]] = arith.muli [[sgidy]], [[c32]] : index + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = arith.muli [[sgidx]], [[c32_0]] : index + //CHECK: [[c64:%.+]] = arith.constant 64 : index + //CHECK: [[off_y:%.+]] = arith.remui [[l_off_y]], [[c64]] : index + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[off_x:%.+]] = arith.remui [[l_off_x]], [[c128]] : index + //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32> + %0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> + %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32> + gpu.return + } + + //CHECK-LABEL: distribute_store_matrix + //CHECK-SAME: [[arg0:%.+]]: memref<32768xi8, 3> + gpu.func @distribute_store_matrix(%arg0 : memref<32768xi8, 3>) { + //CHECK: [[cst:%.+]] = arith.constant dense<1.000000e+00> : vector<32x32xf32> + //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> + //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index + //CHECK: [[c4:%.+]] = arith.constant 4 : index + //CHECK: [[sgidx:%.+]] = arith.remui [[sgid]], [[c4]] : index + //CHECK: [[sgidy_tmp:%.+]] = arith.divui [[sgid]], [[c4]] : index + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = arith.remui [[sgidy_tmp]], [[c2]] : index + //CHECK: [[c32:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_y:%.+]] = arith.muli [[sgidy]], [[c32]] : index + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = arith.muli [[sgidx]], [[c32_0]] : index + //CHECK: [[c64:%.+]] = arith.constant 64 : index + //CHECK: [[off_y:%.+]] = arith.remui [[l_off_y]], [[c64]] : index + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[off_x:%.+]] = arith.remui [[l_off_x]], [[c128]] : index + //CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.0> : vector<64x128xf32> + %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> + xegpu.store_matrix %cst, %mdesc[0, 0] {layout = #xegpu.layout} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32> + gpu.return + } + + // CHECK-LABEL: @vector_reduce_dim_0 + gpu.func @vector_reduce_dim_0(%src: memref<4x128xf32>) { + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<1.0> : vector<128xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32> + -> !xegpu.tensor_desc<4x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<4x128xf32, #xegpu.layout> + -> vector<4x128xf32> + // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32> + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] + : vector<4x128xf32> to vector<128xf32> + gpu.return + } + + // CHECK-LABEL: @vector_reduce_dim_1 + gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) { + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32> + -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> + -> vector<256x64xf32> + // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32> + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] + : vector<256x64xf32> to vector<256xf32> + gpu.return + } + + // CHECK-LABEL: @vector_reduce_4D + gpu.func @vector_reduce_4D(%src: ui64) { + %cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} dense<0.0> : vector<4x2x6xf16> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<4x2x6x32xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<4x2x6x32xi1> + %load = xegpu.load %src[%offset], %mask {layout = #xegpu.layout} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16> + // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16> + %reduce = vector.multi_reduction , %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} [3] + : vector<4x2x6x32xf16> to vector<4x2x6xf16> + gpu.return + } + + // CHECK-LABEL: gpu.func @vector_reduce_scalar_cross_sg + // CHECK-SAME: (%[[ARG0:.*]]: memref<32x32xf32>) + // CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 + // CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<8x8xf32> -> vector<8x8xf32> + // CHECK-DAG: %[[CST_ACC:.*]] = arith.constant 0.000000e+00 : f32 + // CHECK-DAG: %[[LOCAL:.*]] = vector.multi_reduction , %[[LOAD]], %[[CST_ACC]] [0, 1] : vector<8x8xf32> to f32 + // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[LOCAL]] : f32 to vector<1x1xf32> + // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<64xi8, 3> + // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<64xi8, 3> -> !xegpu.mem_desc<4x4xf32> + // CHECK-DAG: xegpu.store_matrix %[[BCAST]], %[[MEM_DESC]]{{.*}} : vector<1x1xf32>, !xegpu.mem_desc<4x4xf32> + // CHECK-DAG: gpu.barrier + // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} -> vector<4x4xf32> + // CHECK-DAG: %[[CST_FINAL:.*]] = arith.constant 0.000000e+00 : f32 + // CHECK-DAG: %[[FINAL:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_FINAL]] [0, 1] : vector<4x4xf32> to f32 + // CHECK-DAG: arith.addf %[[FINAL]], %[[CST]] : f32 + gpu.func @vector_reduce_scalar_cross_sg(%src: memref<32x32xf32>) { + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} 0.0 : f32 + %tdesc = xegpu.create_nd_tdesc %src : memref<32x32xf32> + -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + -> vector<32x32xf32> + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} [0, 1] + : vector<32x32xf32> to f32 + gpu.return + } + + // CHECK-LABEL: vector_step_op + gpu.func @vector_step_op_slice_attr() { + //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[sgidx:%.+]] = arith.remui [[sgId]], [[c8]] : index + //CHECK: [[sgidy_tmp:%.+]] = arith.divui [[sgId]], [[c8]] : index + //CHECK: [[c4:%.+]] = arith.constant 4 : index + //CHECK: [[sgidy:%.+]] = arith.remui [[sgidy_tmp]], [[c4]] : index + //CHECK: [[c32:%.+]] = arith.constant 32 : index + //CHECK: [[LY:%.+]] = arith.muli [[sgidy]], [[c32]] : index + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = arith.remui [[LY]], [[c128]] : index + //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> + //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> + %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}: vector<128xindex> + gpu.return + } + + gpu.func @vector_step_op_layout_attr() { + //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index + //CHECK: [[c16:%.+]] = arith.constant 16 : index + //CHECK: [[sgidx:%.+]] = arith.remui [[sgId]], [[c16]] : index + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[LOCALY:%.+]] = arith.muli [[sgidx]], [[c8]] : index + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = arith.remui [[LOCALY]], [[c128]] : index + //CHECK: [[BASE:%.+]] = vector.step : vector<8xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> + //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex> + %step = vector.step {layout_result_0 = #xegpu.layout}: vector<128xindex> + gpu.return + } + + // CHECK-LABEL: constant_with_slice_attr + gpu.func @constant_with_slice_attr() { + //CHECK: [[cst:%.+]] = arith.constant dense<10> : vector<1xindex> + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1, 2, 3]>} dense<10> : vector<4xindex> + gpu.return + } + + // CHECK-LABEL: vector_shape_cast + gpu.func @vector_shape_cast() { + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1, 2]>} dense<10> : vector<128xindex> + %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1, 2]>} : vector<128xindex> + %muli = arith.muli %cst, %step {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1, 2]>} : vector<128xindex> + //CHECK: vector.shape_cast {{.*}} : vector<32xindex> to vector<1x1x1x32xindex> + %shape_cast = vector.shape_cast %muli {layout_result_0 = #xegpu.layout, layout_operand_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1, 2]>} : vector<128xindex> to vector<1x1x1x128xindex> + gpu.return + } + + // CHECK-LABEL: vector_broadcast + gpu.func @vector_broadcast(%arg0: index, %arg1: index) { + %muli = arith.muli %arg0, %arg1 : index + // CHECK: vector.broadcast {{.*}} : index to vector<1x1x1x32xindex> + %broadcast = vector.broadcast %muli {layout_result_0 = #xegpu.layout} : index to vector<4x2x6x32xindex> + gpu.return + } + + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x32xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32> + -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x32xf32, #xegpu.layout> + -> vector<256x32xf32> + //CHECK: vector.transpose {{.*}}, [1, 0] : vector<64x32xf32> to vector<32x64xf32> + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout} + : vector<256x32xf32> to vector<32x256xf32> + gpu.return + } + + // CHECK-LABEL: non_splat_constant_2D + gpu.func @non_splat_constant_2D() { + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x1xindex> + // CHECK-DAG: %[[T0:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[T1:.*]] = arith.remui %[[T0]], %[[C32:.*]] : index + // CHECK-DAG: %[[T2:.*]] = arith.remui %[[T1]], %[[C32_4:.*]] : index + // CHECK-DAG: %[[T3:.*]] = arith.muli %[[T2]], %[[C16:.*]] : index + // CHECK-DAG: %[[T4:.*]] = arith.addi %[[C0_8:.*]], %[[T3]] : index + // CHECK-DAG: %[[T5:.*]] = arith.muli %[[C0_6:.*]], %[[C0_7:.*]] : index + // CHECK-DAG: %[[T6:.*]] = arith.addi %[[T4]], %[[T5]] : index + // CHECK-DAG: %[[T7:.*]] = vector.broadcast %[[T6]] : index to vector<1x1xindex> + // CHECK-DAG: %[[T8:.*]] = arith.addi %[[CST]], %[[T7]] : vector<1x1xindex> + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> + gpu.return + } + + // CHECK-LABEL: non_splat_constant_2D_non_unit_dim + gpu.func @non_splat_constant_2D_non_unit_dim() { + // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{\[}}{{\[}}0, 16{{\]}}, {{\[}}8, 24{{\]}}{{\]}}> : vector<2x2xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[MULY:.*]] = arith.muli %[[SGIDY]], %[[C2:.*]] : index + // CHECK-DAG: %[[MULX:.*]] = arith.muli %[[SGIDX]], %{{.*}} : index + // CHECK-DAG: %[[REMU_Y:.*]] = arith.remui %[[MULY]], %[[C8:.*]] : index + // CHECK-DAG: %[[REMU_X:.*]] = arith.remui %[[MULX]], %{{.*}} : index + // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %{{.*}} : index + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[MUL5]] : index + // CHECK-DAG: %[[MUL6:.*]] = arith.muli %[[REMU_X]], %[[C16:.*]] : index + // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index + // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<2x2xindex> + // CHECK-DAG: %[[ADDCST:.*]] = arith.addi %[[BASECST]], %[[BCAST]] : vector<2x2xindex> + %cst_8x8 = arith.constant {layout_result_0 = #xegpu.layout} dense<[ + [0, 16, 32, 48, 64, 80, 96, 112], + [8, 24, 40, 56, 72, 88, 104, 120], + [16, 32, 48, 64, 80, 96, 112, 128], + [24, 40, 56, 72, 88, 104, 120, 136], + [32, 48, 64, 80, 96, 112, 128, 144], + [40, 56, 72, 88, 104, 120, 136, 152], + [48, 64, 80, 96, 112, 128, 144, 160], + [56, 72, 88, 104, 120, 136, 152, 168] + ]> : vector<8x8xindex> + gpu.return + } + + // CHECK-LABEL: non_splat_constant + gpu.func @non_splat_constant() { + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[SGID]], %{{.*}} + // CHECK-DAG: %[[REMU2:.*]] = arith.remui %[[REMU]], %{{.*}} + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C16:.*]] : index + // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index + // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex> + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[CST]], %[[BCAST]] : vector<1xindex> + %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex> + // CHECK: arith.constant dense<{{\[}}{{\[}}0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15{{\]}}{{\]}}> : vector<1x16xindex> + %cst_1 = arith.constant {layout_result_0 = #xegpu.layout} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex> + gpu.return + } + + // CHECK-LABEL: scalar_broadcast + gpu.func @scalar_broadcast(%arg0: index) { + // CHECK: vector.broadcast {{.*}} : index to vector<1x1x1xindex> + %broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout} : index to vector<4x1x1xindex> + gpu.return + } + + // CHECK-LABEL: vector_mask_1D + gpu.func @vector_mask_1D() { + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[SGID]], %[[C2:.*]] + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU2:.*]] = arith.remui %[[MUL]], %[[C32:.*]] : index + // CHECK-DAG: %[[SUB:.*]] = arith.subi %[[C8:.*]], %[[REMU2]] : index + // CHECK-DAG: %[[MAX:.*]] = arith.maxsi %[[SUB]], %[[C0:.*]] : index + // CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index + // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1> + %constant_mask = vector.constant_mask [8] {layout_result_0 = #xegpu.layout} : vector<32xi1> + gpu.return + } + + // CHECK-LABEL: vector_mask_2D + gpu.func @vector_mask_2D() { + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C8:.*]] + // CHECK-DAG: %[[ROW:.*]] = arith.muli %[[SGIDY]], %[[C32:.*]] : index + // CHECK-DAG: %[[COL:.*]] = arith.muli %[[SGIDX]], %[[C32:.*]] : index + // CHECK-DAG: %[[MODROW:.*]] = arith.remui %[[ROW]], %[[C256:.*]] : index + // CHECK-DAG: %[[MODCOL:.*]] = arith.remui %[[COL]], %[[C128:.*]] : index + // CHECK-DAG: %[[SUBROW:.*]] = arith.subi %[[C16:.*]], %[[MODROW]] : index + // CHECK-DAG: %[[MAXROW:.*]] = arith.maxsi %[[SUBROW]], %[[C4:.*]] : index + // CHECK-DAG: %[[MINROW:.*]] = arith.minsi %[[MAXROW]], %[[C32:.*]] : index + // CHECK-DAG: %[[SUBCOL:.*]] = arith.subi %[[C16:.*]], %[[MODCOL]] : index + // CHECK-DAG: %[[MAXCOL:.*]] = arith.maxsi %[[SUBCOL]], %[[C7:.*]] : index + // CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index + // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1> + %constant_mask = vector.constant_mask [16, 16] {layout_result_0 = #xegpu.layout} : vector<256x128xi1> + gpu.return + } + + // CHECK-LABEL: vector_create_mask_1D + gpu.func @vector_create_mask_1D() { + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[SGID]], %[[C2:.*]] + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] + // CHECK-DAG: %[[REMU2:.*]] = arith.remui %[[MUL]], %[[C32:.*]] + // CHECK-DAG: %[[SUB:.*]] = arith.subi %[[C8:.*]], %[[REMU2]] : index + // CHECK-DAG: %[[MAX:.*]] = arith.maxsi %[[SUB]], %[[C0:.*]] : index + // CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index + // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MIN]] : vector<16xi1> + %cst8 = arith.constant 8 : index + %constant_mask = vector.create_mask %cst8 {layout_result_0 = #xegpu.layout} : vector<32xi1> + gpu.return + } + + // CHECK-LABEL: vector_create_mask_2D + gpu.func @vector_create_mask_2D() { + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C8:.*]] + // CHECK-DAG: %[[ROW:.*]] = arith.muli %[[SGIDY]], %[[C32:.*]] + // CHECK-DAG: %[[COL:.*]] = arith.muli %[[SGIDX]], %[[C32:.*]] + // CHECK-DAG: %[[MODROW:.*]] = arith.remui %[[ROW]], %[[C256:.*]] + // CHECK-DAG: %[[MODCOL:.*]] = arith.remui %[[COL]], %[[C128:.*]] + // CHECK-DAG: %[[SUBROW:.*]] = arith.subi %[[C16:.*]], %[[MODROW]] : index + // CHECK-DAG: %[[MAXROW:.*]] = arith.maxsi %[[SUBROW]], %[[C0:.*]] : index + // CHECK-DAG: %[[MINROW:.*]] = arith.minsi %[[MAXROW]], %[[C32:.*]] : index + // CHECK-DAG: %[[SUBCOL:.*]] = arith.subi %[[C16:.*]], %[[MODCOL]] : index + // CHECK-DAG: %[[MAXCOL:.*]] = arith.maxsi %[[SUBCOL]], %[[C0:.*]] : index + // CHECK-DAG: %[[MINCOL:.*]] = arith.minsi %[[MAXCOL]], %[[C32:.*]] : index + // CHECK-DAG: %[[MASK:.*]] = vector.create_mask %[[MINROW]], %[[MINCOL]] : vector<32x32xi1> + %cst16 = arith.constant 16 : index + %constant_mask = vector.create_mask %cst16, %cst16 {layout_result_0 = #xegpu.layout} : vector<256x128xi1> + gpu.return + } + + // CHECK-LABEL: distribute_load_slice_attr + gpu.func @distribute_load_slice_attr() { + %2 = memref.alloca() {alignment = 1024} : memref<4096xf32> + %offset = arith.constant {layout_result_0 = #xegpu.layout } dense<0> : vector<256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> + + // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> + // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> + %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> + + // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] : vector<32xf32> to vector<32x32xf32> + %4 = vector.broadcast %3 {layout_result_0 = + #xegpu.layout} : vector<256xf32> to vector<256x256xf32> + gpu.return + } + + // CHECK-LABEL: gpu.func @vector_reduce_cross_sg_dim_1 + // CHECK-SAME: (%[[ARG0:.*]]: memref) + gpu.func @vector_reduce_cross_sg_dim_1(%src: memref) { + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1x32xf32> + // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : vector<1x1x32xindex> + // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense : vector<1x1x32xi1> + // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %[[ARG0:.*]][%[[CST_0]]], %[[CST_1]] <{chunk_size = 1 : i64}> : memref, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32> + // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : vector<1x32xf32> + // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %[[CST_2]] [1] : vector<1x1x32xf32> to vector<1x32xf32> + // CHECK-DAG: %[[CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<1x32xf32> to vector<1x1x32xf32> + // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<4096xi8, 3> + // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<4096xi8, 3> -> !xegpu.mem_desc<1x32x32xf32> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: xegpu.store_matrix %[[CAST]], %[[MEM_DESC]]{{.*}} : vector<1x1x32xf32>, !xegpu.mem_desc<1x32x32xf32>, index, index, index + // CHECK-DAG: gpu.barrier + // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} : !xegpu.mem_desc<1x32x32xf32>, index, index, index -> vector<1x32x32xf32> + // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<1x32xf32> + // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_3]] [1] : vector<1x32x32xf32> to vector<1x32xf32> + // CHECK-DAG: %[[ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<1x32xf32> + // CHECK-DAG: gpu.return + %cst_3 = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<1x32xf32> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<1x32x32xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<1x32x32xi1> + %14 = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout} : memref, vector<1x32x32xindex>, vector<1x32x32xi1> -> vector<1x32x32xf32> + %15 = vector.multi_reduction , %14, %cst_3 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] : vector<1x32x32xf32> to vector<1x32xf32> + gpu.return + } + + // CHECK-LABEL: gpu.func @vector_reduce_cross_sg_dim_0 + // CHECK-SAME: (%[[ARG0:.*]]: memref<256x128xf32>) + gpu.func @vector_reduce_cross_sg_dim_0(%src: memref<256x128xf32>) { + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REM1:.*]] = arith.remui %[[SGID]], %[[C4:.*]] : index + // CHECK-DAG: %[[DIV1:.*]] = arith.divui %[[SGID]], %[[C4:.*]] : index + // CHECK-DAG: %[[REM2:.*]] = arith.remui %[[DIV1]], %[[C8:.*]] : index + // CHECK-DAG: %[[MUL1:.*]] = arith.muli %[[REM2]], %[[C32:.*]] : index + // CHECK-DAG: %[[MUL2:.*]] = arith.muli %[[REM1]], %[[C32_0:.*]] : index + // CHECK-DAG: %[[REM3:.*]] = arith.remui %[[MUL1]], %[[C256:.*]] : index + // CHECK-DAG: %[[REM4:.*]] = arith.remui %[[MUL2]], %[[C128:.*]] : index + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32> + // CHECK-DAG: %[[LOAD_ND:.*]] = xegpu.load_nd %[[TDESC]][{{[^]]*}}] : !xegpu.tensor_desc<32x32xf32> -> vector<32x32xf32> + // CHECK-DAG: %[[CST_LOCAL:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32> + // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_ND]], %[[CST_LOCAL]] [0] : vector<32x32xf32> to vector<32xf32> + // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<32xf32> to vector<1x32xf32> + // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<4096xi8, 3> + // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<4096xi8, 3> -> !xegpu.mem_desc<8x128xf32> + // CHECK-DAG: %[[SGID2:.*]] = gpu.subgroup_id : index + // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]]{{.*}} : vector<1x32xf32>, !xegpu.mem_desc<8x128xf32>, index, index + // CHECK-DAG: gpu.barrier + // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} : !xegpu.mem_desc<8x128xf32>, index, index -> vector<8x32xf32> + // CHECK-DAG: %[[CST_CROSS_SG_1:.*]] = arith.constant dense<0.000000e+00> : vector<32xf32> + // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_CROSS_SG_1]] [0] : vector<8x32xf32> to vector<32xf32> + // CHECK-DAG: arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<32xf32> + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<128xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] + : vector<256x128xf32> to vector<128xf32> + gpu.return + } + + // CHECK-LABEL: gpu.func @vector_reduce_multi_dim + // CHECK-SAME: (%[[ARG0:.*]]: memref) + gpu.func @vector_reduce_multi_dim(%src: memref) { + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x1xf32> + // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : vector<1x1x32x32xindex> + // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense : vector<1x1x32x32xi1> + // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %{{.*}}[%[[CST_0]]], %[[CST_1]] <{chunk_size = 1 : i64}> : memref, vector<1x1x32x32xindex>, vector<1x1x32x32xi1> -> vector<1x1x32x32xf32> + // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : vector<1x1xf32> + // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %[[CST_2]] [2, 3] : vector<1x1x32x32xf32> to vector<1x1xf32> + // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<1x1xf32> to vector<1x1x1x1xf32> + // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<256xi8, 3> + // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<256xi8, 3> -> !xegpu.mem_desc<2x2x4x4xf32> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]]{{.*}} : vector<1x1x1x1xf32>, !xegpu.mem_desc<2x2x4x4xf32>, index, index, index, index + // CHECK-DAG: gpu.barrier + // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} : !xegpu.mem_desc<2x2x4x4xf32>, index, index, index, index -> vector<1x1x4x4xf32> + // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<1x1xf32> + // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_3]] [2, 3] : vector<1x1x4x4xf32> to vector<1x1xf32> + // CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<1x1xf32> + // CHECK-DAG: gpu.return + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [2, 3]>} dense<0.0> : vector<2x2xf32> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<2x2x128x128xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<2x2x128x128xi1> + %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout} : memref, vector<2x2x128x128xindex>, vector<2x2x128x128xi1> -> vector<2x2x128x128xf32> + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [2, 3]>} [2, 3] : vector<2x2x128x128xf32> to vector<2x2xf32> + gpu.return + } + + // CHECK-LABEL: gpu.func @vector_reduce_multi_dim_nou_unit_local_reduction + // CHECK-SAME: (%[[ARG0:.*]]: memref) + gpu.func @vector_reduce_multi_dim_nou_unit_local_reduction(%src: memref) { + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<16x16xf32> + // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<0> : vector<16x16x32x32xindex> + // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense : vector<16x16x32x32xi1> + // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %[[ARG0]][%[[CST_0]]], %[[CST_1]] <{chunk_size = 1 : i64}> : memref, vector<16x16x32x32xindex>, vector<16x16x32x32xi1> -> vector<16x16x32x32xf32> + // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : vector<16x16xf32> + // CHECK-DAG: %[[LOCAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %[[CST_2]] [2, 3] : vector<16x16x32x32xf32> to vector<16x16xf32> + // CHECK-DAG: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[LOCAL_REDUCE]] : vector<16x16xf32> to vector<16x16x1x1xf32> + // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<65536xi8, 3> + // CHECK-DAG: %[[MEM_DESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<65536xi8, 3> -> !xegpu.mem_desc<32x32x4x4xf32> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: xegpu.store_matrix %[[SHAPE_CAST]], %[[MEM_DESC]]{{.*}} : vector<16x16x1x1xf32>, !xegpu.mem_desc<32x32x4x4xf32>, index, index, index, index + // CHECK-DAG: gpu.barrier + // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MEM_DESC]]{{.*}} : !xegpu.mem_desc<32x32x4x4xf32>, index, index, index, index -> vector<16x16x4x4xf32> + // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<0.000000e+00> : vector<16x16xf32> + // CHECK-DAG: %[[FINAL_REDUCE:.*]] = vector.multi_reduction , %[[LOAD_SLM]], %[[CST_3]] [2, 3] : vector<16x16x4x4xf32> to vector<16x16xf32> + // CHECK-DAG: %[[FINAL_ADD:.*]] = arith.addf %[[FINAL_REDUCE]], %[[CST]] : vector<16x16xf32> + // CHECK-DAG: gpu.return + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [2, 3]>} dense<0.0> : vector<32x32xf32> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<32x32x128x128xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<32x32x128x128xi1> + %load = xegpu.load %src[%offset], %mask {chunk_size = 1, layout = #xegpu.layout} : memref, vector<32x32x128x128xindex>, vector<32x32x128x128xi1> -> vector<32x32x128x128xf32> + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [2, 3]>} [2, 3] : vector<32x32x128x128xf32> to vector<32x32xf32> + gpu.return + } + + // CHECK-LABEL: load_nd_tdesc_with_anchor_layout + gpu.func @load_nd_tdesc_with_anchor_layout(%src: memref<256x128xf32>) { + //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + // CHECK: xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] <{layout = #xegpu.layout}> + // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> + %load = xegpu.load_nd %tdesc[0, 0] <{layout = #xegpu.layout}> + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + -> vector<256x128xf32> + gpu.return + } + + // CHECK-LABEL: convert_layout_no_slm + gpu.func @convert_layout_no_slm(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) { + %c32 = arith.constant 32 : index + %c4096 = arith.constant 4096 : index + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %block_id_x = gpu.block_id x + %block_id_y = gpu.block_id y + %0 = arith.muli %block_id_x, %c256 overflow : index + %1 = arith.muli %block_id_y, %c256 overflow : index + %2 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf32> -> !xegpu.tensor_desc<256x256xf32, #xegpu.block_tdesc_attr, #xegpu.layout> + %3 = xegpu.load_nd %2[%0, %1] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<256x256xf32, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<256x256xf32> + %4 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16, #xegpu.block_tdesc_attr, #xegpu.layout> + %5 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x256xf16, #xegpu.block_tdesc_attr, #xegpu.layout> + %6 = scf.for %arg3 = %c0 to %c4096 step %c32 iter_args(%arg4 = %3) -> (vector<256x256xf32>) { + %7 = xegpu.load_nd %4[%0, %arg3] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<256x32xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<256x32xf16> + %8 = xegpu.load_nd %5[%arg3, %1] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<32x256xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<32x256xf16> + // CHECK: %[[CONVERT_A:.*]] = xegpu.convert_layout %{{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x32xf16> + // CHECK: %[[CONVERT_B:.*]] = xegpu.convert_layout %{{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x32xf16> + %9 = xegpu.convert_layout %7 <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<256x32xf16> + %10 = xegpu.convert_layout %8 <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<32x256xf16> + %11 = xegpu.dpas %9, %10, %arg4 {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32> + scf.yield %11 : vector<256x256xf32> + } {layout_result_0 = #xegpu.layout} + xegpu.store_nd %6, %2[%0, %1] <{layout = #xegpu.layout}> : vector<256x256xf32>, !xegpu.tensor_desc<256x256xf32, #xegpu.block_tdesc_attr, #xegpu.layout> + gpu.return + } + + // CHECK-LABEL: convert_layout_slm + // CHECK-SAME: %[[ARG0:.*]]: memref<128x256xf32> + gpu.func @convert_layout_slm(%arg0: memref<128x256xf32>) { + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C16:.*]] : index + // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C16:.*]] : index + // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C4:.*]] : index + // CHECK-DAG: %[[MUL_Y:.*]] = arith.muli %[[SGIDY]], %[[C32:.*]] : index + // CHECK-DAG: %[[MUL_X:.*]] = arith.muli %[[SGIDX]], %[[C16:.*]] : index + // CHECK-DAG: %[[OFF_Y:.*]] = arith.remui %[[MUL_Y]], %[[C128:.*]] : index + // CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[MUL_X]], %[[C256:.*]] : index + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<128x256xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.layout> + // CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{[^]]*}}] <{layout = #xegpu.layout}> : !xegpu.tensor_desc<32x16xf32, #xegpu.layout> -> vector<32x16xf32> + // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<131072xi8, 3> + // CHECK-DAG: %[[MDESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<131072xi8, 3> -> !xegpu.mem_desc<128x256xf32> + // CHECK-DAG: %[[SGID_STORE:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[STORE_X:.*]] = arith.remui %[[SGID_STORE]], %[[C16:.*]] : index + // CHECK-DAG: %[[STORE_Y_TMP:.*]] = arith.divui %[[SGID_STORE]], %[[C16:.*]] : index + // CHECK-DAG: %[[STORE_Y:.*]] = arith.remui %[[STORE_Y_TMP]], %[[C4:.*]] : index + // CHECK-DAG: %[[STORE_MUL_Y:.*]] = arith.muli %[[STORE_Y]], %[[C32:.*]] : index + // CHECK-DAG: %[[STORE_MUL_X:.*]] = arith.muli %[[STORE_X]], %[[C16:.*]] : index + // CHECK-DAG: %[[STORE_OFF_Y:.*]] = arith.remui %[[STORE_MUL_Y]], %[[C128:.*]] : index + // CHECK-DAG: %[[STORE_OFF_X:.*]] = arith.remui %[[STORE_MUL_X]], %[[C256:.*]] : index + // CHECK-DAG: xegpu.store_matrix %[[LOAD]], %[[MDESC]][%[[STORE_OFF_Y]], %[[STORE_OFF_X]]] : vector<32x16xf32>, !xegpu.mem_desc<128x256xf32>, index, index + // CHECK-DAG: gpu.barrier + // CHECK-DAG: %[[LOAD_X:.*]] = arith.remui %[[SGID_STORE]], %[[C8:.*]] : index + // CHECK-DAG: %[[LOAD_Y_TMP:.*]] = arith.divui %[[SGID_STORE]], %[[C8:.*]] : index + // CHECK-DAG: %[[LOAD_Y:.*]] = arith.remui %[[LOAD_Y_TMP]], %[[C8:.*]] : index + // CHECK-DAG: %[[LOAD_MUL_Y:.*]] = arith.muli %[[LOAD_Y]], %[[C16:.*]] : index + // CHECK-DAG: %[[LOAD_MUL_X:.*]] = arith.muli %[[LOAD_X]], %[[C32:.*]] : index + // CHECK-DAG: %[[LOAD_OFF_Y:.*]] = arith.remui %[[LOAD_MUL_Y]], %[[C128:.*]] : index + // CHECK-DAG: %[[LOAD_OFF_X:.*]] = arith.remui %[[LOAD_MUL_X]], %[[C256:.*]] : index + // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MDESC]][%[[LOAD_OFF_Y]], %[[LOAD_OFF_X]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<128x256xf32>, index, index -> vector<16x32xf32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<128x256xf32> -> !xegpu.tensor_desc<128x256xf32, #xegpu.layout> + %1 = xegpu.load_nd %0[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x256xf32, #xegpu.layout> -> vector<128x256xf32> + %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout, + target_layout = #xegpu.layout}> : vector<128x256xf32> + gpu.return + } + + gpu.func @convert_layout_3D(%arg0: memref) { + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x32x16xindex> + // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense : vector<1x32x16xi1> + // CHECK-DAG: %[[LOAD:.*]] = xegpu.load %{{.*}}[%[[CST]]], %[[CST_0]] <{chunk_size = 1 : i64, layout = #xegpu.layout}> : memref, vector<1x32x16xindex>, vector<1x32x16xi1> -> vector<1x32x16xf32> + // CHECK-DAG: %[[ALLOCA:.*]] = memref.alloca() : memref<1048576xi8, 3> + // CHECK-DAG: %[[MDESC:.*]] = xegpu.create_mem_desc %[[ALLOCA]] : memref<1048576xi8, 3> -> !xegpu.mem_desc<8x128x256xf32> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[STORE_X:.*]] = arith.remui %[[SGID]], %[[C16:.*]] : index + // CHECK-DAG: %[[STORE_YZ_TMP:.*]] = arith.divui %[[SGID]], %[[C16:.*]] : index + // CHECK-DAG: %[[STORE_Y:.*]] = arith.remui %[[STORE_YZ_TMP]], %[[C4:.*]] : index + // CHECK-DAG: %[[STORE_Z_TMP:.*]] = arith.divui %[[STORE_YZ_TMP]], %[[C4:.*]] : index + // CHECK-DAG: %[[STORE_Z:.*]] = arith.remui %[[STORE_Z_TMP]], %[[C8:.*]] : index + // CHECK-DAG: %[[STORE_MUL_Y:.*]] = arith.muli %[[STORE_Y]], %[[C32:.*]] : index + // CHECK-DAG: %[[STORE_MUL_X:.*]] = arith.muli %[[STORE_X]], %[[C16:.*]] : index + // CHECK-DAG: %[[STORE_OFF_Z:.*]] = arith.remui %[[STORE_Z]], %[[C8:.*]] : index + // CHECK-DAG: %[[STORE_OFF_Y:.*]] = arith.remui %[[STORE_MUL_Y]], %[[C128:.*]] : index + // CHECK-DAG: %[[STORE_OFF_X:.*]] = arith.remui %[[STORE_MUL_X]], %[[C256:.*]] : index + // CHECK-DAG: xegpu.store_matrix %[[LOAD]], %[[MDESC]][%[[STORE_OFF_Z]], %[[STORE_OFF_Y]], %[[STORE_OFF_X]]] : vector<1x32x16xf32>, !xegpu.mem_desc<8x128x256xf32>, index, index, index + // CHECK-DAG: gpu.barrier + // CHECK-DAG: %[[LOAD_X:.*]] = arith.remui %[[SGID]], %[[C8:.*]] : index + // CHECK-DAG: %[[LOAD_YZ_TMP:.*]] = arith.divui %[[SGID]], %[[C8:.*]] : index + // CHECK-DAG: %[[LOAD_Y:.*]] = arith.remui %[[LOAD_YZ_TMP]], %[[C8:.*]] : index + // CHECK-DAG: %[[LOAD_Z_TMP:.*]] = arith.divui %[[LOAD_YZ_TMP]], %[[C8:.*]] : index + // CHECK-DAG: %[[LOAD_Z:.*]] = arith.remui %[[LOAD_Z_TMP]], %[[C8:.*]] : index + // CHECK-DAG: %[[LOAD_MUL_Y:.*]] = arith.muli %[[LOAD_Y]], %[[C16:.*]] : index + // CHECK-DAG: %[[LOAD_MUL_X:.*]] = arith.muli %[[LOAD_X]], %[[C32:.*]] : index + // CHECK-DAG: %[[LOAD_OFF_Z:.*]] = arith.remui %[[LOAD_Z]], %[[C8:.*]] : index + // CHECK-DAG: %[[LOAD_OFF_Y:.*]] = arith.remui %[[LOAD_MUL_Y]], %[[C128:.*]] : index + // CHECK-DAG: %[[LOAD_OFF_X:.*]] = arith.remui %[[LOAD_MUL_X]], %[[C256:.*]] : index + // CHECK-DAG: %[[LOAD_SLM:.*]] = xegpu.load_matrix %[[MDESC]][%[[LOAD_OFF_Z]], %[[LOAD_OFF_Y]], %[[LOAD_OFF_X]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<8x128x256xf32>, index, index, index -> vector<1x16x32xf32> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<8x128x256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<8x128x256xi1> + %1 = xegpu.load %arg0[%offset], %mask {chunk_size = 1, layout = #xegpu.layout} : memref, vector<8x128x256xindex>, vector<8x128x256xi1> -> vector<8x128x256xf32> + %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout, + target_layout = #xegpu.layout}> : vector<8x128x256xf32> + gpu.return + } + + // CHECK-LABEL: convert_layout_reduce_to_scalar + gpu.func @convert_layout_reduce_to_scalar(%arg0: memref<32x32xf32>) { + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<32x32xi1> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<32x32xindex> + %cst_0 = arith.constant 0.000000e+00 : f32 + %intptr = memref.extract_aligned_pointer_as_index %arg0 : memref<32x32xf32> -> index + %10 = arith.index_cast %intptr : index to i64 + %11 = xegpu.load %10[%offset], %mask <{layout = #xegpu.layout}> {layout_operand_1 = #xegpu.layout, layout_operand_2 = #xegpu.layout} : i64, vector<32x32xindex>, vector<32x32xi1> -> vector<32x32xf32> + %12 = vector.multi_reduction , %11, %cst_0 {layout_operand_0 = #xegpu.layout, layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1]>} [0, 1] : vector<32x32xf32> to f32 + // CHECK-NOT: xegpu.convert_layout + %13 = xegpu.convert_layout %12 <{input_layout = #xegpu.slice<#xegpu.layout, dims = [0, 1]>, target_layout = #xegpu.slice<#xegpu.layout, dims = [0, 1]>}> : f32 + gpu.return + } + + // CHECK-LABEL: distribute_nested_slice + // CHECK: %[[V0:.*]] = vector.shape_cast %{{.*}} : vector<32x32xf32> to vector<32x1x32x1xf32> + // CHECK: %[[V1:.*]] = vector.broadcast %[[V0]] : vector<32x1x32x1xf32> to vector<32x16x32x16xf32> + // CHECK: %[[V2:.*]] = vector.shape_cast %[[V1]] : vector<32x16x32x16xf32> to vector<32x16x32x16x1xf32> + // CHECK: %[[V3:.*]] = vector.broadcast %[[V2]] : vector<32x16x32x16x1xf32> to vector<32x16x32x16x16xf32> + // CHECK: %[[V4:.*]] = vector.shape_cast %[[V3]] : vector<32x16x32x16x16xf32> to vector<32x16x1x32x16x16xf32> + // CHECK: %[[V5:.*]] = vector.broadcast %[[V4]] : vector<32x16x1x32x16x16xf32> to vector<32x16x16x32x16x16xf32> + gpu.func @distribute_nested_slice(%src: memref<256x256xf32>) { + + %tdesc = xegpu.create_nd_tdesc %src : memref<256x256xf32> + -> !xegpu.tensor_desc<256x256xf32, #xegpu.layout> + + %load = xegpu.load_nd %tdesc[0, 0] {layout = #xegpu.layout} + : !xegpu.tensor_desc<256x256xf32, #xegpu.layout> + -> vector<256x256xf32> + + %load2 = xegpu.convert_layout %load <{input_layout = #xegpu.layout, target_layout = #xegpu.slice<#xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>, dims=[1, 3]>}> : vector<256x256xf32> + + %scast = vector.shape_cast %load2 {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>, dims=[1, 3]>} : vector<256x256xf32> to vector<256x1x256x1xf32> + + %bcast = vector.broadcast %scast {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>} : vector<256x1x256x1xf32> to vector<256x16x256x16xf32> + + %scast1 = vector.shape_cast %bcast {layout_result_0 = #xegpu.slice<#xegpu.layout, dims=[2]>, layout_operand_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims=[2]>, dims=[4]>} : vector<256x16x256x16xf32> to vector<256x16x256x16x1xf32> + + %bcast1 = vector.broadcast %scast1 {layout_result_0 = #xegpu.slice<#xegpu.layout, dims=[2]>, layout_operand_0 = #xegpu.slice<#xegpu.layout, dims=[2]>} : vector<256x16x256x16x1xf32> to vector<256x16x256x16x16xf32> + + %scast2 = vector.shape_cast %bcast1 {layout_result_0 = + #xegpu.layout, layout_operand_0 = #xegpu.slice<#xegpu.layout, dims=[2]>} : vector<256x16x256x16x16xf32> to vector<256x16x1x256x16x16xf32> + + %bcast2 = vector.broadcast %scast2 {layout_result_0 = + #xegpu.layout, layout_operand_0 = + #xegpu.layout} : vector<256x16x1x256x16x16xf32> to vector<256x16x16x256x16x16xf32> + gpu.return + } + + // CHECK-LABEL: @preserve_anchor_layout + // CHECK: arith.constant dense<1.000000e+00> : vector<16x128xf32> + // CHECK: xegpu.store_nd %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] <{layout = #xegpu.layout}> + gpu.func @preserve_anchor_layout(%dst: memref<256x128xf32>) { + %val = arith.constant {layout_result_0 = #xegpu.layout} dense<1.0> : vector<256x128xf32> + %tdesc = xegpu.create_nd_tdesc %dst : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + xegpu.store_nd %val, %tdesc[0, 0] <{layout = #xegpu.layout}> + : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.return + } + + // CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref + // CHECK-SAME: %[[ARG_0:.*]]: memref<3x256x128xf32> + gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) { + // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src : memref<3x256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> + gpu.return + } + gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index @@ -182,39 +980,32 @@ gpu.module @test_1_1_assignment { %block_id_y = gpu.block_id y %0 = arith.muli %block_id_x, %c128 : index %1 = arith.muli %block_id_y, %c128 : index - %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> - %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - - // CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]] - // CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) -> - // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>) - // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> - // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> + %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %3 = xegpu.load_nd %2[0, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf32, #xegpu.layout> -> vector<128x128xf32> + %4 = xegpu.create_nd_tdesc %arg0 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + %5 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout> + + // CHECK: %[[SCF:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]] + // CHECK-SAME: iter_args(%[[ARG6:.*]] = {{.*}}) -> + // CHECK-SAME: (vector<16x16xf32>) + // CHECK: %[[A:.*]] = xegpu.load_nd %{{[^]]*}}[{{[^]]*}}] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> + // CHECK: %[[B:.*]] = xegpu.load_nd %{{[^]]*}}[{{[^]]*}}] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> // CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> - // CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16> - // CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16> - // CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> - %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) - -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, vector<128x128xf32>) { - %8 = xegpu.load_nd %arg4 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %9 = xegpu.load_nd %arg5 {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> - %10 = xegpu.dpas %8, %9, %arg6 + // CHECK: scf.yield %[[C]] : vector<16x16xf32> + %6 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg6 = %3) + -> (vector<128x128xf32>) { + %8 = xegpu.load_nd %4[0, %arg3] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %9 = xegpu.load_nd %5[%arg3, 0] {layout = #xegpu.layout} : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> -> vector<128x128xf16> + %10 = xegpu.dpas %8, %9, %arg6 {layout_a = #xegpu.layout, layout_b = #xegpu.layout, layout_cd = #xegpu.layout} : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32> - %11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - %12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout> - scf.yield %11, %12, %10 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, - !xegpu.tensor_desc<128x128xf16, #xegpu.layout>, vector<128x128xf32> - } {layout_result_0 = #xegpu.layout, - layout_result_1 = #xegpu.layout, - layout_result_2 = #xegpu.layout} - %7 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> + scf.yield %10 : vector<128x128xf32> + } {layout_result_0 = #xegpu.layout} + %7 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - xegpu.store_nd %6#2, %7 {layout = #xegpu.layout } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + xegpu.store_nd %6, %7[0, 0] {layout = #xegpu.layout } : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout> gpu.return } @@ -222,9 +1013,10 @@ gpu.module @test_1_1_assignment { %c1_i32 = arith.constant 1 : i32 %c10_i32 = arith.constant 10 : i32 %c0_i32 = arith.constant 0 : i32 - %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> - %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %1 = xegpu.load_nd %0[0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %2 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32) %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) { @@ -234,10 +1026,9 @@ gpu.module @test_1_1_assignment { } do { // CHECK: (%[[ARG2:.*]]: vector<16xf32>, %[[ARG3:.*]]: i32) ^bb0(%arg2: vector<256xf32>, %arg3: i32): - xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> + xegpu.store_nd %arg2, %2[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> %4 = arith.addi %arg3, %c1_i32 : i32 - %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout> - %6 = xegpu.load_nd %5 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %6 = xegpu.load_nd %0[%c256] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> scf.yield %6, %4 : vector<256xf32>, i32 } gpu.return @@ -247,8 +1038,8 @@ gpu.module @test_1_1_assignment { %c10 = arith.constant 10 : index %id = gpu.subgroup_id : index - %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> %4 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if @@ -256,19 +1047,19 @@ gpu.module @test_1_1_assignment { %5 = scf.if %4 -> (vector<256xf32>) { // CHECK-LABEL: xegpu.load_nd // CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32> - %2 = xegpu.load_nd %0 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %2 = xegpu.load_nd %0[0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32> scf.yield %2 : vector<256xf32> } else { // CHECK-LABEL: xegpu.load_nd // CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32> - %3 = xegpu.load_nd %1 {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %3 = xegpu.load_nd %1[0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> // CHECK-LABEL: scf.yield // CHECK-SAME: vector<16xf32> scf.yield %3 : vector<256xf32> } {layout_result_0 = #xegpu.layout} - xegpu.store_nd %5, %0 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> + xegpu.store_nd %5, %0[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> gpu.return } @@ -276,8 +1067,8 @@ gpu.module @test_1_1_assignment { %c10 = arith.constant 10 : index %id = gpu.subgroup_id : index - %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> - %d = xegpu.load_nd %t {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> + %t = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %d = xegpu.load_nd %t[0] {layout = #xegpu.layout} : !xegpu.tensor_desc<256xf32, #xegpu.layout> -> vector<256xf32> %0 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if @@ -285,94 +1076,53 @@ gpu.module @test_1_1_assignment { %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout>) { // CHECK-LABEL: xegpu.create_nd_tdesc // CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32> - %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %2 = xegpu.create_nd_tdesc %arg0 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK-LABEL: scf.yield // CHECK-SAME: !xegpu.tensor_desc<16xf32> scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout> } else { // CHECK-LABEL: xegpu.create_nd_tdesc // CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32> - %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> + %3 = xegpu.create_nd_tdesc %arg1 : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> // CHECK-LABEL: scf.yield // CHECK-SAME: !xegpu.tensor_desc<16xf32> scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout> } - xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> - gpu.return - } - - // CHECK-LABEL: @subgroup_id_range - gpu.func @subgroup_id_range(%src: memref<256x128xf32>, %src1: memref<128x256xf32>, %src2: memref<128x64xf32>) { - %sg_id = gpu.subgroup_id : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c31 = arith.constant 31 : index - %c3 = arith.constant 3 : index - %cond1 = arith.cmpi sge, %sg_id, %c0 : index - %cond2 = arith.cmpi slt, %sg_id, %c1 : index - %cond = arith.andi %cond1, %cond2 : i1 - scf.if %cond { - // CHECK-NOT: index.sub - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - } {sg_id_range = #xegpu.range<[0, 32]>} - %cond3 = arith.cmpi sge, %sg_id, %c2 : index - %cond4 = arith.cmpi slt, %sg_id, %c31 : index - %cond5 = arith.andi %cond3, %cond4 : i1 - scf.if %cond5 { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK: %[[C2:.*]] = arith.constant 2 : index - // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] - %tdesc = xegpu.create_nd_tdesc %src2[0, 0] : memref<128x64xf32> - -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc {layout = #xegpu.layout} - : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - -> vector<128x64xf32> - %exp = math.exp %load {layout_result_0 = #xegpu.layout} : vector<128x64xf32> - }{sg_id_range = #xegpu.range<[2, 18]>} + xegpu.store_nd %d, %1[0] : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout> gpu.return } - // CHECK-LABEL: @subgroup_id_range_nested_if - gpu.func @subgroup_id_range_nested_if(%src: memref<256x128xf32>, %src1: memref<128x64xf32>) { - %sg_id = gpu.subgroup_id : index - %c1 = arith.constant 1 : i1 - %c3 = arith.constant 3 : index - %c32 = arith.constant 32 : index - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - %load = xegpu.load_nd %tdesc - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout> - -> vector<256x128xf32> - %cond1 = arith.cmpi sge, %sg_id, %c3 : index - %cond2 = arith.cmpi slt, %sg_id, %c32 : index - %cond = arith.andi %cond1, %cond2 : i1 - scf.if %c1 { - scf.if %cond { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK: %[[C3:.*]] = arith.constant 3 : index - // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C3]] - %td = xegpu.create_nd_tdesc %src1[0, 0] : memref<128x64xf32> - -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - %ld = xegpu.load_nd %td {layout = #xegpu.layout} - : !xegpu.tensor_desc<128x64xf32, #xegpu.layout> - -> vector<128x64xf32> - %exp = math.exp %ld {layout_result_0 = #xegpu.layout} : vector<128x64xf32> - } - } {sg_id_range = #xegpu.range<[3, 19]>} - gpu.return - } - // CHECK-LABEL: distribute_constant gpu.func @distribute_constant() { // CHECK: arith.constant dense<1.000000e+00> : vector<32x32xf32> %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.0> : vector<256x128xf32> gpu.return } + + // CHECK-LABEL: @shape_cast_used_by_elementwise + gpu.func @shape_cast_used_by_elementwise(%dst: memref<1x1x16xf32>) { + // Regression test: shape_cast expanding unit dimensions can be used by elementwise ops + // This previously failed with "ShapeCast ops that expand unit dimensions and are used by + // non-broadcast operations are not supported." + + // CHECK: vector.step : vector<16xindex> + // CHECK: vector.shape_cast {{.*}} : vector<16xindex> to vector<1x1x16xindex> + // CHECK: arith.addi {{.*}} : vector<1x1x16xindex> + // CHECK: xegpu.store {{.*}} : vector<1x1x16xf32>, i64, vector<1x1x16xindex>, vector<1x1x16xi1> + %step = vector.step : vector<16xindex> + %shape_cast = vector.shape_cast %step : vector<16xindex> to vector<1x1x16xindex> + %cst = arith.constant dense<10> : vector<1x1x16xindex> + %add = arith.addi %shape_cast, %cst : vector<1x1x16xindex> + + %cst_val = arith.constant dense<1.0> : vector<1x1x16xf32> + %intptr = memref.extract_aligned_pointer_as_index %dst : memref<1x1x16xf32> -> index + %ptr = arith.index_cast %intptr : index to i64 + %mask = arith.constant dense : vector<1x1x16xi1> + + xegpu.store %cst_val, %ptr[%add], %mask {layout = #xegpu.layout} : vector<1x1x16xf32>, i64, vector<1x1x16xindex>, vector<1x1x16xi1> + gpu.return + } + } // ----- diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index a3d2560cedf63..a4404c4100a71 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -59,46 +59,29 @@ struct TestXeGPUUnrollingPatterns xegpu::UnrollOptions options; options.setNativeShapeFn([&](Operation *op) -> std::optional> { - if (isa( - op)) { + if (isa(op)) { xegpu::TensorDescType tdescTy; if (auto createNdOp = dyn_cast(op)) { tdescTy = createNdOp.getType(); - } else if (auto updateNdOp = dyn_cast(op)) { - tdescTy = updateNdOp.getTensorDescType(); } else if (auto prefetchNdOp = dyn_cast(op)) { tdescTy = prefetchNdOp.getTensorDescType(); } else if (auto loadNdOp = dyn_cast(op)) { tdescTy = loadNdOp.getTensorDescType(); } else if (auto storeNdOp = dyn_cast(op)) { tdescTy = storeNdOp.getTensorDescType(); - } else if (auto prefetchOp = dyn_cast(op)) { - tdescTy = prefetchOp.getTensorDescType(); - } else if (auto loadOp = dyn_cast(op)) { - if (loadOp.getOffsets()) { - auto layout = xegpu::getDistributeLayoutAttr(loadOp.getResult()); - if (layout && layout.isForSubgroup()) { - auto inst_data = layout.getEffectiveInstDataAsInt(); - if (!inst_data.empty()) - return SmallVector(inst_data.begin(), inst_data.end()); - } - return std::nullopt; - } - tdescTy = loadOp.getTensorDescType(); - } else if (auto storeOp = dyn_cast(op)) { - if (storeOp.getOffsets()) { - auto layout = llvm::dyn_cast_or_null( - op->getAttr("layout")); - if (layout && layout.isForSubgroup()) { - auto inst_data = layout.getEffectiveInstDataAsInt(); - if (!inst_data.empty()) - return SmallVector(inst_data.begin(), inst_data.end()); - } - return std::nullopt; + } else if (isa(op)) { + auto anchorOp = cast(op); + auto layout = + dyn_cast_or_null(anchorOp.getAnchorLayout()); + if (layout && layout.isForSubgroup()) { + auto inst_data = layout.getEffectiveInstDataAsInt(); + if (!inst_data.empty()) + return SmallVector(inst_data.begin(), inst_data.end()); } - tdescTy = storeOp.getTensorDescType(); + return std::nullopt; } if (auto layout = tdescTy.getLayoutAttr()) {