diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index a7bea9881602f..313a4355701a8 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -72,49 +72,6 @@ def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_td } -def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> { - let summary = [{a composite attribute for `TensorDescType`}]; - let description = [{ - `ScatterTensorDesc` is a composite attribute defined for `TensorDescType` - for describing following properties of a `TensorDesc`: - - 1. `memory_space`: It describes where the data block described by the - TensorDesc is located, `Global` device memory or `Shared` local memory. - It is default to `Global`. - - 2. `chunk_size`: Specifies the number of contiguous elements accessed per offset. - The default value is 1. - }]; - - let parameters = (ins - DefaultValuedParameter< - "MemorySpaceAttr", - "MemorySpaceAttr::get($_ctxt, xegpu::MemorySpace::Global)", - "Data memory location" - >: $memory_space, - DefaultValuedParameter< - "IntegerAttr", - "IntegerAttr::get(IntegerType::get($_ctxt, 64), 1)", - "Number of contiguous elements" - >: $chunk_size - ); - - let builders = [ - AttrBuilder<(ins - CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space, - CArg<"int", "1">: $chunk_size - )> - ]; - - let extraClassDeclaration = [{ - int64_t getChunkSizeAsInt() { - return getChunkSize().getInt(); - } - }]; - - let genVerifyDecl = 1; - } - //===----------------------------------------------------------------------===// // XeGPU Memory Scope Enums. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index f9c3c155a32d5..31fe93d209a6d 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -648,107 +648,6 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset", let hasVerifier = 1; } -def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { - let summary = "create scattered tensor descriptors (TensorDesc)."; - let description = [{ - "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates - a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc" - is for creating continuous subviews, "create_tdesc" is for creating non-continuous - (scattered) subviews, allowing each lane in a subgroup specifying their own offset. - It accepts the following parameters: - - Arguments: - - - `source`: a 1D memref or pointer (i64, i32, ui64, ui32) represents the flattened - memory object. - - - `offsets`: a vector containing offsets of each access point. Its size - is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, - implying each element in the vector corresponds to a SIMT lane in the subgroup. - - Results: - - `res`: scattered tensor descriptor - - The first dimension of the result TensorDesc corresponds to lanes, so it should - match the dimension of offsets. It may also has a second dimension corresponding to - the chunk_size if the chunk size is larger than 1. - - Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] - ```mlir - %a = memref.alloc() : memref<1024xf32> - %0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex> - %1 = xegpu.create_tdesc %a, %0: memref<1024xf32>, vector<4xindex> -> TensorDesc<4xf32> - ``` - - Example 2: It assumes subgroup size is 4, and each workitem access 8 elements. - It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] - ```mlir - %0 = memref.alloc() : memref<1024xf32> - %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex> - %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex> - -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr> - ``` - - Example 3: It is similar to Example 2, but there is some overlaps among workitems. - It accesses: a[0:7], a[4:11], a[8:15], a[12:19] - ```mlir - %0 = memref.alloc() : memref<1024xf32> - %off = arith.constant dense<[0, 4, 8, 12]> : vector<4xindex> - %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex> - -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr> - ``` - }]; - - let arguments = (ins XeGPU_GatherScatterBaseAddrType:$source, - XeGPU_OffsetType:$offsets); - let results = (outs XeGPU_TensorDesc:$TensorDesc); - - let builders = [ - OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source, - "llvm::ArrayRef": $offsets)>, - OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source, - "llvm::ArrayRef": $offsets)>, - ]; - - let assemblyFormat = [{ - $source `,` $offsets attr-dict `:` type($source) `,` type($offsets) `->` qualified(type($TensorDesc)) - }]; - - let extraClassDeclaration = [{ - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } - - mlir::VectorType getOffsetsType() { - return getOffsets().getType(); - } - - size_t getNumOffsets() { - return getOffsetsType().getNumElements(); - } - - mlir::Value getViewSource() { return getSource(); } - - unsigned getSourceMemorySpace() { - auto srcTy = getSource().getType(); - if (auto memrefTy = llvm::dyn_cast(srcTy)) { - auto attr = memrefTy.getMemorySpace(); - if (attr) { - if (auto intAttr = llvm::dyn_cast(attr)) - return static_cast(intAttr.getInt()); - if (auto memSpaceAttr = llvm::dyn_cast(attr)) - return static_cast(memSpaceAttr.getValue()); - } - } - // take global as default memory scope. - return static_cast(MemorySpace::Global); - } - - }]; - - let hasVerifier = 1; -} - def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { let summary = "prefetches a set of scattered data points to cache"; @@ -764,11 +663,9 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { Arguments: - `source`: represents the memory region to be loaded from, which can be either a - tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). - In case of tensor_desc, offsets come from the producer create_tdesc op. - tensor_desc cannot be used at lane level. + 1D memref or pointer (ui64, ui32, i64 or i32). - - `offsets`: represents offsets from source. required if `source` in not a TensorDescType. + - `offsets`: represents offsets from source. offsets is a vector of `index` type and vector length is either the subgroup size or 1 at lane level. scalar offset is also valid for lane level. @@ -791,10 +688,8 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", [AnchorLayoutInterface]> { ``` Example 2 (lane level): - A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. - It combines "create scattered TensorTdesc" and "prefetch with scattered TensorTdesc". + A variant accepts memref or integer (raw pointer) as base and offsets directly. The source operand could be a raw pointer (ui64, ui32, i64, i32). - Please refer to create_tdesc for the restriction of memref. ```mlir %a = memref.alloc() : memref<1024xf32> %0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex> @@ -896,11 +791,9 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou Arguments: - `source`: represents the memory region to be loaded from, which can be either a - tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). - In case of tensor_desc, offsets come from the producer create_tdesc op. - tensor_desc cannot be used at lane level. + 1D memref or pointer (ui64, ui32, i64 or i32). - - `offsets`: represents offsets from source. required if `source` in not a TensorDescType. + - `offsets`: represents offsets from source. offsets is a vector of `index` type and vector length is either the subgroup size or 1 at lane level. scalar offset is also valid for lane level. @@ -918,32 +811,9 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou Results: - `res`: represents loaded data - - Example 1 (Workgroup level): - ```mlir - %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint}, - layout = #xegpu.layout> - : !xegpu.tensor_desc<256xf32, #xegpu.scatter_tdesc_attr>, - vector<256xi1> -> vector<256xf32> - ``` - - Example 2 (Subgroup level): - ```mlir - %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint}, - layout = #xegpu.layout> - : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, - vector<16xi1> -> vector<16x8xf32> - ``` - - Example 3 (Subgroup level): - A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. - It combines "create scattered TensorTdesc" and "load with scattered TensorTdesc". - The source operand could be a raw pointer (ui64, ui32, i64, i32). Please refer to create_tdesc - for the restriction of memref. + Example 1 (Subgroup level): + A variant accepts memref as base pointer or the source operand + could be a raw pointer (ui64, ui32, i64, i32). ```mlir %a = memref.alloc() : memref<1024xf32> %offsets = vector.step : vector<16xindex> @@ -955,7 +825,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou : memref<1024xf32>, vector<16xi1>, vector<16xindex> -> vector<16xf32> ``` - Example 4 (lane level): + Example 2 (lane level): lane level only accepts the offsets variant. chunk_size can be inferred from result type. In this example, chunk_size is 8. ```mlir @@ -1067,11 +937,9 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL - `value`: represents the data to be stored. - `dest`: represents the memory region to be stored to, which can be either a - tensor_desc or a 1D memref or pointer (ui64, ui32, i64 or i32). - In case of tensor_desc, offsets come from the producer create_tdesc op. - tensor_desc cannot be used at lane level. + 1D memref or pointer (ui64, ui32, i64 or i32). - - `offsets`: represents offsets from dest. required if `source` in not a TensorDescType. + - `offsets`: represents offsets from dest. offsets is a vector of `index` type and vector length is either the subgroup size or 1 at lane level. scalar offset is also valid for lane level. @@ -1087,29 +955,9 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL to be stored. Only valid at workgroup and subgroup levels. - Example 1 (Workgroup level): - ```mlir - xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint, - layout = #xegpu.layout}> - : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.scattered_tdesc_attr<>>, vector<256xi1> - ``` - - Example 2 (Subgroup level): - ```mlir - xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint, - layout = #xegpu.layout}> - : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr>, vector<16xi1> - ``` - - Example 3 (Subgroup level): - A variant accepts memref as base pointer and an offset instead of scattered TensorTdesc. - It combines "create scattered TensorTdesc" and "store with scattered TensorTdesc". + Example 1 (Subgroup level): + A variant accepts memref as base pointer and an offset. The dest operand could be a raw pointer (uint64_t). - Please refer to create_tdesc for the restriction of memref. ```mlir %a = memref.alloc() : memref<1024xf32> %val = arith.constant dense<0.0> : vector<16xf32> @@ -1122,7 +970,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL : vector<16xf32>, memref<1024xf32>, vector<16xi1>, vector<16xindex> ``` - Example 4 (Lane level): + Example 2 (Lane level): Lane level IR only accepts the offsets variant. chunk_size can be inferred from value type. In this example, chunk_size is 8. ```mlir @@ -1213,59 +1061,6 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL let hasVerifier = 1; } -def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset", - [AllTypesMatch<["TensorDesc", "result"]>]> { - let summary = "It updates the offsets for the given tensor descriptor"; - - let description = [{It behaves similar to `update_nd_offset` in terms that - it updates offset of a TensorDesc, and the offsets are relative offset to - the current position in the number of elements. However, `update_nd_offset` - is to update the start point of a 2D block, so its offset constains two - elements representing the shift in each dimension. `update_offset` is to - update the offset per lane, so its offsets contains values representing - shifts for each lane. - - Example: - ```mlir - %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex> - %2 = xegpu.update_offset %1, %off : - !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr>, vector<4xindex> - ``` - - }]; - - let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - XeGPU_OffsetType: $offsets); - let results = (outs XeGPU_TensorDesc: $result); - - let builders = [ - OpBuilder<(ins "mlir::Value": $TensorDesc, - "llvm::ArrayRef": $offsets)>, - OpBuilder<(ins "mlir::Value": $TensorDesc, - "llvm::ArrayRef": $offsets)> - ]; - - let extraClassDeclaration = [{ - xegpu::TensorDescType getTensorDescType() { - return getTensorDesc().getType(); - } - - mlir::VectorType getOffsetsType() { - return getOffsets().getType(); - } - - size_t getNumOffsets() { - return getOffsetsType().getNumElements(); - } - }]; - - let assemblyFormat = [{ - $TensorDesc `,` $offsets attr-dict `:` qualified(type($TensorDesc)) `,` type($offsets) - }]; - - let hasVerifier = 1; -} - def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>, AnchorLayoutInterface]> { let summary = "It performs mma computation"; diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index b13f5a9f2c9d9..33eab14e9dfd8 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -60,18 +60,13 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", and DPAS (matrix multiplication instruction) on Intel GPUs. It encodes the following information: * shape: the sizes/shape of the interested data block, e.g., 8x16 means 8 rows - and each row contains 16 contiguous data element. The rows could be - either contiguous or not, depends on the encoding attribute. If the - encoding is a BlockTensorDescAttr, rows are contiguous. If the encoding - is a ScatterTensorDescAttr, rows are not necessary to be contiguous. If - encoding is not set, it is considered as a default BlockTensorDescAttr. + and each row contains 16 contiguous data elements. * element_type: the data type of the data element, e.g., f16, f32. Similar to the built-in tensor, it also provides optional attributes for encoding - additional information via either BlockTensorDescAttr or ScatterTensorDescAttr, or - supporting Workgroup, Subgroup, and workitem (or SIMT) level programmings via the - Layout attribute. Please check their definition for details. + additional information via BlockTensorDescAttr, or supporting Workgroup & Subgroup + level programmings via the Layout attribute. Please check their definition for details. Syntax: @@ -81,7 +76,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", dim-list := (static-dim-list `x`)? static-dim-list ::= decimal-literal `x` decimal-literal attr-list = (, encoding-attr)? (, layout-attr)? - enconding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? + enconding-attr = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? layout-attr = DistributeLayoutAttr ``` @@ -127,12 +122,6 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", CArg<"int", "1">: $array_length, CArg<"bool", "true">: $boundary_check, CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space, - CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)>, - TypeBuilderWithInferredContext<(ins - "llvm::ArrayRef": $shape, - "mlir::Type": $elementType, - CArg<"int", "1">: $chunk_size, - CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space, CArg<"mlir::Attribute", "mlir::Attribute()">:$layout)> ]; @@ -150,12 +139,8 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", return llvm::cast(cloneWith(getShape(), elementType)); } - template || - std::is_same_v>> - T getEncodingOfType() const { - return llvm::dyn_cast_if_present(getEncoding()); + BlockTensorDescAttr getBlockAttr() const { + return llvm::dyn_cast_if_present(getEncoding()); } DistributeLayoutAttr getLayoutAttr() const { @@ -163,35 +148,16 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", } xegpu::MemorySpace getMemorySpace() const { - if (auto attr = getEncodingOfType()) - return attr.getMemorySpace().getValue(); - - auto attr = getEncodingOfType(); - return attr.getMemorySpace().getValue(); + return getBlockAttr().getMemorySpace().getValue(); } // get the ArrayLength for blocked TensorDesc int getArrayLength() { - auto attr = getEncodingOfType(); - assert(attr && "invalid on non BlockTensorDescAttr."); - return attr.getArrayLength().getInt(); + return getBlockAttr().getArrayLength().getInt(); } bool getBoundaryCheck() { - auto attr = getEncodingOfType(); - assert(attr && "invalid on non BlockTensorDescAttr."); - return attr.getBoundaryCheck().getValue(); - } - - bool isScattered() { - return bool(getEncodingOfType()); - } - - // get the ChunkSize for scattered TensorDesc - int getChunkSizeAsInt() { - auto attr = getEncodingOfType(); - assert(attr && "invalid on non ScatterTensorDescAttr."); - return attr.getChunkSizeAsInt(); + return getBlockAttr().getBoundaryCheck().getValue(); } /// Helper to drop all layout information from the TensorDesc type. diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp index 93da74e938c84..50eba56a16080 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp +++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp @@ -1118,9 +1118,6 @@ struct ConvertXeGPUToXeVMPass return VectorType::get(sum, elemType); }); typeConverter.addConversion([&](xegpu::TensorDescType type) -> Type { - // Scattered descriptors are not supported in XeVM lowering. - if (type.isScattered()) - return {}; if (type.getRank() == 1) return xevmIndexType; return VectorType::get(8, i32Type); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 80a3fc91f1c4f..811b09b011e47 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -140,28 +140,6 @@ bool BlockTensorDescAttr::hasDefaultsOnly() { getArrayLength().getInt() == 1 && getBoundaryCheck().getValue(); } -//===----------------------------------------------------------------------===// -// XeGPU_ScatterTensorDescAttr -//===----------------------------------------------------------------------===// -ScatterTensorDescAttr -ScatterTensorDescAttr::get(mlir::MLIRContext *context, - xegpu::MemorySpace memory_space, int chunk_size) { - auto scopeAttr = MemorySpaceAttr::get(context, memory_space); - auto chunkSizeAttr = - IntegerAttr::get(IntegerType::get(context, 64), chunk_size); - return Base::get(context, scopeAttr, chunkSizeAttr); -} - -LogicalResult ScatterTensorDescAttr::verify( - llvm::function_ref emitError, - MemorySpaceAttr memory_space, IntegerAttr chunk_size) { - int64_t chunkSize = chunk_size.getInt(); - if (chunkSize <= 0) - return emitError() << "invalid chunk size"; - - return success(); -} - //===----------------------------------------------------------------------===// // XeGPU_LayoutAttr //===----------------------------------------------------------------------===// @@ -1254,7 +1232,7 @@ mlir::Type TensorDescType::parse(AsmParser &parser) { layout = attr; continue; } - if (mlir::isa(attr)) { + if (mlir::isa(attr)) { encoding = attr; continue; } @@ -1309,15 +1287,6 @@ TensorDescType TensorDescType::get(llvm::ArrayRef shape, return Base::get(context, shape, elementType, attr, layout); } -TensorDescType TensorDescType::get(llvm::ArrayRef shape, - mlir::Type elementType, int chunk_size, - MemorySpace memory_space, - mlir::Attribute layout) { - auto *context = elementType.getContext(); - auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size); - return Base::get(context, shape, elementType, attr, layout); -} - LogicalResult TensorDescType::verify(llvm::function_ref emitError, llvm::ArrayRef shape, mlir::Type elementType, @@ -1339,30 +1308,6 @@ TensorDescType::verify(llvm::function_ref emitError, return emitError() << "unsupported element type " << elementType << ": expected integer or float"; - // for gather and scatter ops, Low-precision types are packed in 32-bit - // units. - unsigned bitWidth = elementType.getIntOrFloatBitWidth(); - int chunkAlignmentFactor = - bitWidth < xegpu::uArch::generalPackedFormatBitSize - ? xegpu::uArch::generalPackedFormatBitSize / bitWidth - : 1; - auto scatterAttr = mlir::dyn_cast_if_present(encoding); - if (scatterAttr) { - int64_t chunkSize = scatterAttr.getChunkSizeAsInt(); - if (rank == 1 && chunkSize != 1) - return emitError() << "expected non-contiguous elements for 1D tensor"; - - // If chunk size > 1, the second dimension of the tensor shape must be - // equal to chunk size and it must be a multiple of the - // chunkAlignmentFactor. - if (chunkSize > 1) { - if (shape.back() != chunkSize) - return emitError() << "expected last dim of tensor to match chunk size"; - if (shape.back() % chunkAlignmentFactor != 0) - return emitError() << "expected last dim of tensor to be a multiple of " - << chunkAlignmentFactor; - } - } if (auto layoutAttr = mlir::dyn_cast_if_present(layout)) { if (rank != (size_t)layoutAttr.getRank()) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 43c2e0aa37f22..51ce6ce53a2fe 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -74,53 +74,6 @@ static bool isWriteHintOrNone(const CachePolicyAttr &attr) { kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH; } -static LogicalResult -isValidGatherScatterParams(Type maskTy, VectorType valueTy, - TensorDescType tdescTy, - function_ref emitError) { - - if (!tdescTy.isScattered()) - return emitError() << "Expects a scattered TensorDesc."; - - auto chunkSize = tdescTy.getChunkSizeAsInt(); - if (!valueTy) { - if (chunkSize > 1) - return emitError() << "Expecting chunk size == 1 for scalar result"; - if (dyn_cast(maskTy)) - return emitError() << "Expecting a vector type result."; - return success(); - } - - auto maskShape = getShapeOf(maskTy); - auto valueShape = getShapeOf(valueTy); - auto tdescShape = getShapeOf(tdescTy); - - if (valueTy.getElementType() != tdescTy.getElementType()) - return emitError() - << "Value should have the same element type as TensorDesc."; - - llvm::SmallVector expectedMaskShape(tdescShape); - if (chunkSize > 1) - expectedMaskShape.pop_back(); - if (expectedMaskShape != maskShape) - return emitError() - << "Mask should match TensorDesc except the chunk size dim."; - - // a valid shape for SIMT case - if (valueTy.getRank() == 1 && valueTy.getNumElements() == chunkSize) { - if (tdescTy.getLayoutAttr()) - return emitError() << "TensorDesc doesn't need LayoutAttr for SIMT code"; - return success(); - } - - if (tdescShape != valueShape) - return emitError() << "Value shape " << makeString(valueShape) - << " is neither a valid distribution for SIMT nor " - "consistent with the tensor descriptor for SIMD " - << tdescTy; - return success(); -} - static LogicalResult isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, VectorType valueTy, int64_t chunkSize, @@ -408,9 +361,6 @@ LogicalResult CreateNdDescOp::verify() { return emitOpError("TensorDesc should have the same element " "type with the source if it is a memref.\n"); - if (getType().isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - return success(); } @@ -491,8 +441,6 @@ void PrefetchNdOp::build(OpBuilder &builder, OperationState &state, LogicalResult PrefetchNdOp::verify() { auto tdescTy = getTensorDescType(); - if (tdescTy.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -556,9 +504,6 @@ LogicalResult LoadNdOp::verify() { auto tdescTy = getTensorDescType(); auto valueTy = getType(); - if (tdescTy.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - if (tdescTy.getRank() > 2) return emitOpError("Expects a 1D or 2D TensorDesc.\n"); @@ -682,9 +627,6 @@ LogicalResult StoreNdOp::verify() { auto dstTy = getTensorDescType(); // Tile auto valTy = getValueType(); // Vector - if (dstTy.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - if (dstTy.getRank() > 2) return emitOpError("Expects a 1D or 2D TensorDesc.\n"); @@ -752,8 +694,6 @@ LogicalResult StoreNdOp::verify() { //===----------------------------------------------------------------------===// LogicalResult UpdateNdOffsetOp::verify() { auto ty = getTensorDescType(); - if (ty.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); // number of offsets specified must match the rank of the tensor descriptor if (ty.getRank() != (int64_t)getNumOffsets()) { @@ -762,59 +702,6 @@ LogicalResult UpdateNdOffsetOp::verify() { return success(); } -//===----------------------------------------------------------------------===// -// XeGPU_CreateDescOp -//===----------------------------------------------------------------------===// - -void CreateDescOp::build(OpBuilder &builder, OperationState &state, - TensorDescType TensorDesc, Value source, - llvm::ArrayRef offsets) { - auto loc = source.getLoc(); - int64_t size = static_cast(offsets.size()); - auto type = VectorType::get(size, builder.getIndexType()); - auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); - auto offset = vector::FromElementsOp::create(builder, loc, type, values); - build(builder, state, TensorDesc, source, offset); -} - -void CreateDescOp::build(OpBuilder &builder, OperationState &state, - TensorDescType TensorDesc, Value source, - llvm::ArrayRef offsets) { - auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets); - build(builder, state, TensorDesc, source, ofrs); -} - -LogicalResult CreateDescOp::verify() { - auto tdescTy = getTensorDescType(); - - if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - - // Memory space of created TensorDesc should match with the source. - // Both source and TensorDesc are considered for global memory by default, - // if the memory scope attr is not specified. If source is an integer, - // it is considered as ptr to global memory. - auto srcMemorySpace = getSourceMemorySpace(); - auto tdescMemorySpace = static_cast(tdescTy.getMemorySpace()); - if (srcMemorySpace != tdescMemorySpace) - return emitOpError("Memory space mismatch.") - << " Source: " << srcMemorySpace - << ", TensorDesc: " << tdescMemorySpace; - - // check total size - auto chunkSize = tdescTy.getChunkSizeAsInt(); - SmallVector shape(getOffsetsType().getShape()); - if (chunkSize != 1) - shape.push_back(chunkSize); - - auto tdescShape = getShapeOf(tdescTy); - if (shape != tdescShape) - return emitOpError("Incorrect TensorDesc shape. ") - << "Expected is " << makeString(shape) << "\n"; - - return success(); -} - //===----------------------------------------------------------------------===// // XeGPU_PrefetchOp //===----------------------------------------------------------------------===// @@ -827,9 +714,6 @@ LogicalResult PrefetchOp::verify() { if (tdescTy && getOffsets()) return emitOpError("offsets not allowed."); - if (tdescTy && !tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc."); - if (!isReadHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -881,9 +765,6 @@ LogicalResult LoadGatherOp::verify() { if (tdescTy && getOffsets()) return emitOpError("offsets not allowed."); - if (tdescTy && !tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc."); - if (!isReadHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -893,9 +774,6 @@ LogicalResult LoadGatherOp::verify() { if (!isReadHintOrNone(getL3HintAttr())) return emitOpError("invalid l3_hint: ") << getL3HintAttr(); - if (tdescTy) - return isValidGatherScatterParams(maskTy, valueTy, tdescTy, - [&]() { return emitOpError(); }); auto srcTy = getSourceType(); uint64_t chunkSize = static_cast(getChunkSize().value_or(1)); auto memTy = dyn_cast(srcTy); @@ -969,9 +847,6 @@ LogicalResult StoreScatterOp::verify() { if (tdescTy && getOffsets()) return emitOpError("offsets not allowed."); - if (tdescTy && !tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc."); - if (!isWriteHintOrNone(getL1HintAttr())) return emitOpError("invalid l1_hint: ") << getL1HintAttr(); @@ -981,10 +856,6 @@ LogicalResult StoreScatterOp::verify() { if (!isWriteHintOrNone(getL3HintAttr())) return emitOpError("invalid l3_hint: ") << getL3HintAttr(); - if (tdescTy) - return isValidGatherScatterParams(maskTy, valueTy, tdescTy, - [&]() { return emitOpError(); }); - auto destTy = getDestType(); uint64_t chunkSize = static_cast(getChunkSize().value_or(1)); auto memTy = dyn_cast(destTy); @@ -1045,45 +916,6 @@ void StoreScatterOp::build( l3_hint, layout); } -//===----------------------------------------------------------------------===// -// XeGPU_UpdateOffsetOp -//===----------------------------------------------------------------------===// -void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state, - mlir::Value tensorDesc, - llvm::ArrayRef offsets) { - auto tdescTy = mlir::dyn_cast(tensorDesc.getType()); - assert(tdescTy && "Expecting the source is a TensorDescType value."); - auto loc = tensorDesc.getLoc(); - int64_t size = static_cast(offsets.size()); - auto type = VectorType::get({size}, builder.getIndexType()); - auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); - auto offset = vector::FromElementsOp::create(builder, loc, type, values); - build(builder, state, tdescTy, tensorDesc, offset); -} - -void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state, - Value tensorDesc, llvm::ArrayRef offsets) { - auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets); - build(builder, state, tensorDesc, ofrs); -} - -LogicalResult UpdateOffsetOp::verify() { - auto tdescTy = getTensorDescType(); - if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - - SmallVector expectedOffsetShape = getShapeOf(tdescTy); - SmallVector offsetShape = getShapeOf(getOffsetsType()); - if (tdescTy.getChunkSizeAsInt() > 1) - expectedOffsetShape.pop_back(); - - if (expectedOffsetShape != offsetShape) - return emitOpError( - "Offsets should match TensorDesc except the chunk size dim."); - - return success(); -} - //===----------------------------------------------------------------------===// // XeGPU_DpasOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index ef6a494b76638..7fc5d2fffae51 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -136,8 +136,8 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { std::optional> XeGPUBlockingPass::getTileShape(Operation *op) const { - if (isa(op)) + if (isa( + op)) return getTileShape(op->getOpResult(0)); if (isa(op)) @@ -145,13 +145,8 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { if (isa(op)) return getTileShape(op->getOpOperand(1)); - // Handle LoadGatherOp and StoreScatterOp (with and without offset) - if (auto loadGatherOp = dyn_cast(op)) { - if (loadGatherOp.getOffsets()) - return getTileShape(loadGatherOp->getOpResult(0)); - else - return getTileShape(loadGatherOp->getOpOperand(0)); - } + if (isa(op)) + return getTileShape(op->getOpResult(0)); if (auto convertLayoutOp = dyn_cast(op)) { auto inputInstData = @@ -165,10 +160,8 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { return targetInstData; } - if (auto storeScatterOp = dyn_cast(op)) - return getTileShape(storeScatterOp.getOffsets() - ? storeScatterOp->getOpOperand(0) - : storeScatterOp->getOpOperand(1)); + if (isa(op)) + return getTileShape(op->getOpOperand(0)); if (isa(op)) { std::optional> aTile = @@ -340,23 +333,6 @@ void XeGPUBlockingPass::runOnOperation() { if (auto tdescTy = dyn_cast(type)) { Attribute encoding = tdescTy.getEncoding(); - // If the encoding is a ScatterTensorDescAttr, we need to - // potentially adjust the chunk size based on the inst_data. - if (tdescTy.isScattered()) { - int64_t chunkSize = tdescTy.getChunkSizeAsInt(); - - if (chunkSize > 1) { - int64_t blockedChunkSize = chunkSize; - auto instData = tdescTy.getLayoutAttr().getEffectiveInstDataAsInt(); - if (!instData.empty()) - blockedChunkSize = instData.back(); - - // To create a new attribute with a different chunk_size: - auto newEncoding = xegpu::ScatterTensorDescAttr::get( - ctx, tdescTy.getMemorySpace(), blockedChunkSize); - encoding = newEncoding; - } - } newTy = xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding, diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index ff9ff4937c293..686cb20e1976e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -278,15 +278,6 @@ static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, xegpu::LayoutAttr::get(ctx, {1, uArch->getSubgroupSize()}, {1, 1})); } -static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, - unsigned rank, int subgroupSize) { - assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); - if (rank == 1) { - return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1})); - } - return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1})); -} - /// Helper to get the default layout for 2D block operations. template static LayoutInfo getSIMTLayoutInfoBlockIO(Ty ty, @@ -349,10 +340,6 @@ class LayoutInfoPropagation ArrayRef operands, ArrayRef results); - void visitCreateDescOp(xegpu::CreateDescOp createDesc, - ArrayRef operands, - ArrayRef results); - void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset, ArrayRef operands, ArrayRef results); @@ -451,9 +438,6 @@ LogicalResult LayoutInfoPropagation::visitOperation( .Case([&](xegpu::LoadGatherOp loadGatherOp) { visitLoadGatherOp(loadGatherOp, operands, results); }) - .Case([&](xegpu::CreateDescOp createDescOp) { - visitCreateDescOp(createDescOp, operands, results); - }) .Case([&](xegpu::UpdateNdOffsetOp updateNdOffsetOp) { visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results); }) @@ -1075,28 +1059,9 @@ void LayoutInfoPropagation::visitLoadGatherOp( // Propagate the new layout to the tensor descriptor operand. if (isa(load.getSourceType())) propagateIfChanged(operands[0], operands[0]->meet(loadLayoutInfo)); - // Propagate the new layout to the mask and optional offset operand. + // Propagate the new layout to the offset and mask operands. propagateIfChanged(operands[1], operands[1]->meet(maskLayoutInfo)); - if (load.getOffsets()) - propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo)); -} - -/// Propagate the layout of the descriptor to the vector offset operand in -/// CreateDescOp. -void LayoutInfoPropagation::visitCreateDescOp( - xegpu::CreateDescOp createDesc, ArrayRef operands, - ArrayRef results) { - LayoutInfo descLayout = results[0]->getValue(); - // Need the layout of the descriptor to propagate to the operands. - if (!descLayout.isAssigned()) - return; - const uArch *uArch = getUArch(getChipStr(createDesc).value_or("")); - if (!uArch) - return; - // For offset operand propagate 1D default layout. - LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1, - uArch->getSubgroupSize()); - propagateIfChanged(operands[1], operands[1]->meet(layout)); + propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo)); } /// Set the layout for the value, tensor descriptor, offset and mask operands in @@ -1136,10 +1101,9 @@ void LayoutInfoPropagation::visitStoreScatterOp( // Propagate the destination (if tdesc) operand layout if (isa(storeScatter.getDestType())) propagateIfChanged(operands[1], operands[1]->meet(srcLayoutInfo)); - // Propagate the new layout to the mask and optional offset operand. + // Propagate the new layout to the offset and mask operands. propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo)); - if (storeScatter.getOffsets()) - propagateIfChanged(operands[3], operands[3]->meet(maskLayoutInfo)); + propagateIfChanged(operands[3], operands[3]->meet(maskLayoutInfo)); } void LayoutInfoPropagation::visitLoadMatrixOp( @@ -1420,12 +1384,6 @@ ResolveLayoutConflicts::resolveTensorDescConsumer(OpOperand &operand) { auto currTDescType = dyn_cast(tdescValue.getType()); assert(anchorOp && currTDescType && "Expected anchor layout op and tensor descriptor consumer."); - // TODO: Scattered tensor desc is not supported for now. - if (currTDescType.isScattered()) { - DBGS() << "Scattered tensor descriptor not supported: " << tdescValue - << "\n"; - return failure(); - } Attribute currLayout = currTDescType.getLayout(); Attribute expectedLayout = anchorOp.getAnchorLayout(); // A conflict exists in tensor descriptor operand if tensor descriptor's diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index ca454e632a3ea..9459164e4d48b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -804,8 +804,8 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { auto storeScatterOp = dyn_cast_or_null(lastNode); if (!storeScatterOp) return failure(); - auto offsets = storeScatterOp.getOffsets(); - if (!offsets || !isa(offsets.getType())) + Value offsets = storeScatterOp.getOffsets(); + if (!isa(offsets.getType())) return rewriter.notifyMatchFailure( storeScatterOp, "Store op must have a vector of offsets argument"); VectorType offsetsTy = cast(offsets.getType()); @@ -1109,12 +1109,12 @@ struct LoadDistribution final : public gpu::WarpDistributionPattern { auto loadGatherOp = producedByLastLoad->get().getDefiningOp(); - auto offsets = loadGatherOp.getOffsets(); - if (!offsets || !isa(offsets.getType()) || + Value offsets = loadGatherOp.getOffsets(); + if (!isa(offsets.getType()) || !isa(loadGatherOp.getMask().getType())) return rewriter.notifyMatchFailure( loadGatherOp, - "Load op must have a vector arguments for offsets and mask"); + "Load op must have vector arguments for offsets and mask"); VectorType offsetsTy = cast(offsets.getType()); VectorType maskTy = cast(loadGatherOp.getMask().getType()); VectorType resultVecTy = diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index 36b903c5b4303..51693da389a49 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -477,74 +477,6 @@ struct UnrollDpasOp : public UnrollPattern { } }; -struct UnrollCreateDescOp : public UnrollPattern { - using UnrollPattern::UnrollPattern; - LogicalResult matchAndRewrite(xegpu::CreateDescOp op, - PatternRewriter &rewriter) const override { - Location loc = op.getLoc(); - xegpu::TensorDescType tdescTy = op.getType(); - TypedValue<::mlir::VectorType> indiceVec = op.getOffsets(); - VectorType indiceVecTy = indiceVec.getType(); - - if (!tdescTy.isScattered()) - return failure(); - - std::optional> targetShape = getTargetShape(op); - if (!targetShape) - return failure(); - - SmallVector targetIndiceShape(*targetShape); - int64_t originalChunkSize = tdescTy.getChunkSizeAsInt(); - // IndiceVec is 1 dim lower than tdescTy when chunkSize is larger than 1. - if (originalChunkSize > 1) - targetIndiceShape.pop_back(); - - auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0]; - SmallVector convertedIndiceTypes = - getUnrolledTypes(indiceVecTy, targetIndiceShape); - SmallVector convertedIndiceVec = - pack(indiceVec, convertedIndiceTypes, targetIndiceShape, loc, rewriter); - - SmallVector newOps; - - // More indices is need when chunkSize > 1. Since a big load from one - // address could be break into multiple small loads. - if (originalChunkSize > 1) { - int64_t blockedChunkSize = targetShape->back(); - int64_t numNewChunks = originalChunkSize / blockedChunkSize; - - for (auto [indice, indiceType] : - llvm::zip(convertedIndiceVec, convertedIndiceTypes)) { - for (int64_t i = 0; i < numNewChunks; ++i) { - // Compute the offset - Value inc = arith::ConstantIndexOp::create(rewriter, loc, - i * blockedChunkSize); - Value incVec = - vector::BroadcastOp::create(rewriter, loc, indiceType, inc); - Value offsetIndice = - arith::AddIOp::create(rewriter, loc, indice, incVec); - - auto newOp = xegpu::CreateDescOp::create( - rewriter, loc, newTdescTy, op.getSource(), offsetIndice); - - newOps.push_back(newOp); - } - } - } else { - for (auto indice : convertedIndiceVec) { - auto newOp = xegpu::CreateDescOp::create(rewriter, loc, newTdescTy, - op.getSource(), indice); - newOps.push_back(newOp); - } - } - - Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter); - rewriter.replaceOp(op, castOp); - - return success(); - } -}; - struct UnrollLoadGatherOp : public UnrollPattern { using UnrollPattern::UnrollPattern; LogicalResult matchAndRewrite(xegpu::LoadGatherOp op, @@ -563,7 +495,7 @@ struct UnrollLoadGatherOp : public UnrollPattern { return failure(); SmallVector targetMaskShape(*targetShape); - int64_t originalChunkSize = tdescTy.getChunkSizeAsInt(); + int originalChunkSize = op.getChunkSize().value_or(1); VectorType maskTy = llvm::dyn_cast(op.getMask().getType()); @@ -854,7 +786,7 @@ struct UnrollStoreScatterOp : public UnrollPattern { return failure(); SmallVector targetMaskShape(*targetShape); - int64_t originalChunkSize = tdescTy.getChunkSizeAsInt(); + int originalChunkSize = op.getChunkSize().value_or(1); VectorType maskTy = llvm::dyn_cast(op.getMask().getType()); @@ -900,59 +832,6 @@ struct UnrollStoreScatterOp : public UnrollPattern { } }; -struct UnrollUpdateOffsetOp : public UnrollPattern { - using UnrollPattern::UnrollPattern; - LogicalResult matchAndRewrite(xegpu::UpdateOffsetOp op, - PatternRewriter &rewriter) const override { - Location loc = op.getLoc(); - xegpu::TensorDescType tdescTy = op.getTensorDescType(); - - if (!tdescTy.isScattered()) - return failure(); - - std::optional> targetShape = getTargetShape(op); - if (!targetShape) - return failure(); - - SmallVector convertedTdescTypes = - getUnrolledTypes(tdescTy, *targetShape); - SmallVector convertedTdesc = pack( - op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); - - TypedValue<::mlir::VectorType> offsetVec = op.getOffsets(); - VectorType offsetVecTy = offsetVec.getType(); - SmallVector convertedOffsetTypes; - SmallVector convertedOffsetVec; - SmallVector newOps; - int64_t originalChunkSize = tdescTy.getChunkSizeAsInt(); - if (originalChunkSize > 1) { - auto targetOffsetShape = ArrayRef(*targetShape).drop_back(); - convertedOffsetTypes = getUnrolledTypes(offsetVecTy, targetOffsetShape); - - int64_t blockedChunkSize = targetShape->back(); - int64_t numNewChunks = originalChunkSize / blockedChunkSize; - // the offset is reused across the chunk_size dimension - for (auto offset : pack(offsetVec, convertedOffsetTypes, - targetOffsetShape, loc, rewriter)) - convertedOffsetVec.append(numNewChunks, offset); - - } else { - convertedOffsetTypes = getUnrolledTypes(offsetVecTy, *targetShape); - convertedOffsetVec = - pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter); - } - - for (auto [t, o] : llvm::zip(convertedTdesc, convertedOffsetVec)) { - auto newOp = - xegpu::UpdateOffsetOp::create(rewriter, loc, t.getType(), t, o); - newOps.push_back(newOp); - } - Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); - rewriter.replaceOp(op, castOp); - return success(); - } -}; - struct UnrollLoadMatrixOp : public UnrollPattern { using UnrollPattern::UnrollPattern; LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op, @@ -1094,11 +973,11 @@ struct UnrollConvertLayoutOp : public UnrollPattern { void mlir::xegpu::populateXeGPUUnrollPatterns( RewritePatternSet &patterns, const xegpu::UnrollOptions &options) { - patterns.add( - patterns.getContext(), options); + patterns + .add( + patterns.getContext(), options); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index d637b6828deab..1e867b9c34069 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -951,9 +951,6 @@ struct WgToSgLoadGatherOpWithOffset matchAndRewrite(xegpu::LoadGatherOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - if (!op.getOffsets()) - return failure(); - Location loc = op.getLoc(); VectorType resultType = dyn_cast(op.getResult().getType()); if (!resultType) @@ -1005,9 +1002,6 @@ struct WgToSgStoreScatterOpWithOffset matchAndRewrite(xegpu::StoreScatterOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - if (!op.getOffsets()) - return failure(); - Location loc = op.getLoc(); VectorType valueType = dyn_cast(op.getValue().getType()); if (!valueType) diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index e83f96bb294a9..9098eb7e4815b 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -55,18 +55,6 @@ mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) { // e.g. for 1D layout, sgSize = laneLayout[0] int64_t sgSize = llvm::product_of(laneLayout); - // Case 1: regular loads/stores - auto scatterAttr = tdescTy.getEncodingOfType(); - if (scatterAttr) { - auto chunkSize = scatterAttr.getChunkSize().getInt(); - // Verify if the first dimension of the tensor descriptor shape is - // distributable. - assert(tdescShape[0] == laneLayout[0] && - "tensor descriptor shape is not distributable"); - return VectorType::get({chunkSize}, elementType); - } - - // Case 2: block loads/stores // Check if the tensor descriptor shape is distributable. int64_t tensorSize = 1; for (auto [tdescDim, laneDim, laneDataDim] : diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 82c7879c79d56..42b38c09e0765 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -79,17 +79,6 @@ func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) { return } -// ----- -func.func @prefetch_nd_vc_2(%src: memref<24xf16>) { - %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex> - %1 = xegpu.create_tdesc %src, %0 : memref<24xf16>, vector<8xindex> - -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>> - // expected-error@+1 {{Expects a non-scattered TensorDesc}} - xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>> - return -} - // ----- func.func @load_nd_vc_1(%src: memref<8x16xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -99,17 +88,6 @@ func.func @load_nd_vc_1(%src: memref<8x16xf16>) { return } -// ----- -func.func @load_nd_vc_2(%src: memref<16xf16>) { - %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> - %1 = xegpu.create_tdesc %src, %0 : memref<16xf16>, vector<8xindex> - -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{Expects a non-scattered TensorDesc.}} - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> -> vector<8x2xf16> - return -} - // ----- func.func @load_nd_vc_3(%src: memref<8x16xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> @@ -189,18 +167,6 @@ func.func @store_nd_vc_1(%dst: memref<24x32xf16>) { return } -// ----- -func.func @store_nd_vc_2(%dst: memref<16xf16>) { - %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> - %1 = arith.constant dense<1.0>: vector<8x2xf16> - %2 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex> - -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{Expects a non-scattered TensorDesc}} - xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}> - : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> - return -} - // ----- func.func @store_nd_vc_3(%dst: memref<24x32xf16>) { %1 = arith.constant dense<1.0>: vector<2x24x32xf16> @@ -245,140 +211,73 @@ func.func @store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) { } // ----- -func.func @update_nd_offset_1(%dst: memref<16xf16>) { - %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> - %1 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex> - -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{Expects a non-scattered TensorDesc}} - xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> - return -} - -// ----- -func.func @create_tdesc_vc_1(%src: ui64) { - %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> - // expected-error@+1 {{Expects a scattered TensorDesc}} - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> -> !xegpu.tensor_desc<8xf16> - return -} - -// ----- -func.func @create_tdesc_vc_2(%src: memref) { +func.func @prefetch_vc_2(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> - // expected-error@+1 {{invalid chunk size}} - -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr> + // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} + xegpu.prefetch %src[%0] <{l1_hint = #xegpu.cache_hint}> : memref, vector<4xindex> return } // ----- -func.func @create_tdesc_vc_3(%src: memref) { +func.func @load_gather_vc_2(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - // expected-error@+1 {{Memory space mismatch}} - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> - -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = arith.constant dense<1>: vector<4xi1> + // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} + %2 = xegpu.load %src[%0], %1 <{l1_hint = #xegpu.cache_hint}> + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @create_tdesc_vc_4(%src: memref) { - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> - // expected-error@+1 {{expected last dim of tensor to match chunk size}} - -> !xegpu.tensor_desc<4x5xf32, #xegpu.scatter_tdesc_attr> +func.func @load_gather_vc_3(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<8xi1> + // expected-error@+1 {{Mask should match value except the chunk size dim}} + %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> + : memref, vector<4xindex>, vector<8xi1> -> vector<4x2xf32> return } // ----- -func.func @create_tdesc_vc_5(%src: memref) { +func.func @load_gather_simt_1(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> - // expected-error@+1 {{last dim of tensor to be a multiple of 2}} - -> !xegpu.tensor_desc<4x3xf16, #xegpu.scatter_tdesc_attr> - return -} - - -// ----- -func.func @prefetch_vc_1(%src: memref<24x32xf16>) { - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> - // expected-error@+1 {{Expects a scattered TensorDesc}} - xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<24x32xf16> + %1 = arith.constant dense<1>: vector<4xi1> + // expected-error@+1 {{value elements must match chunk size}} + %2 = xegpu.load %src[%0], %1 <{chunk_size = 2}> + : memref, vector<4xindex>, vector<4xi1> -> vector<6xf32> return } // ----- -func.func @prefetch_vc_2(%src: ui64) { +func.func @store_scatter_vc_2(%dst: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> - -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} - xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - return -} - -// ----- -func.func @create_tdesc_layout_1(%src: ui64) { - %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - // expected-error@+1 {{expected layout rank to match tensor rank}} - %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - return -} - -// ----- -func.func @load_gather_simt_1(%src: ui64) { - %0 = arith.constant dense<1>: vector<4xi1> - %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{Value shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}} - %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<6xf32> - return -} - -// ----- -func.func @store_scatter_simt_1(%src: ui64) { - %0 = arith.constant dense<1>: vector<4xi1> - %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %val = arith.constant dense<2.9>: vector<6xf32> - %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{Value shape [6] is neither a valid distribution for SIMT nor consistent with the tensor descriptor for SIMD}} - xegpu.store %val, %1, %0 <{l1_hint = #xegpu.cache_hint}> : vector<6xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> - return -} - -// ----- -func.func @load_gather_vc_1(%src: memref<24x32xf16>) { - %0 = arith.constant dense<1>: vector<4xi1> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16> - // expected-error@+1 {{Expects a scattered TensorDesc}} - %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<4x2xf16>, vector<4xi1> -> vector<4x2xf16> + %1 = arith.constant dense<1>: vector<4xi1> + %2 = arith.constant dense<2.9>: vector<4xf32> + // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} + xegpu.store %2, %dst[%0], %1 <{l1_hint = #xegpu.cache_hint}> + : vector<4xf32>, memref, vector<4xindex>, vector<4xi1> return } // ----- -func.func @load_gather_vc_2(%src: ui64) { - %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<1>: vector<4xi1> - %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> - -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} - %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> - -> vector<4x2xf32> +func.func @store_scatter_vc_3(%dst: memref) { + %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %1 = arith.constant dense<1>: vector<8xi1> + %2 = arith.constant dense<2.9>: vector<4x2xf32> + // expected-error@+1 {{Mask should match value except the chunk size dim}} + xegpu.store %2, %dst[%0], %1 <{chunk_size = 2}> + : vector<4x2xf32>, memref, vector<4xindex>, vector<8xi1> return } // ----- -func.func @load_gather_vc_3(%src: ui64) { - %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<1>: vector<8xi1> - %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> - -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{Mask should match TensorDesc except the chunk size dim}} - %2 = xegpu.load %1, %0 - : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<8xi1> - -> vector<4x2xf32> +func.func @store_scatter_simt_1(%dst: memref) { + %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %1 = arith.constant dense<1>: vector<4xi1> + %2 = arith.constant dense<2.9>: vector<6xf32> + // expected-error@+1 {{value elements must match chunk size}} + xegpu.store %2, %dst[%0], %1 <{chunk_size = 2}> + : vector<6xf32>, memref, vector<4xindex>, vector<4xi1> return } @@ -390,16 +289,6 @@ func.func @prefetch_offset_wi_1(%src: memref<4x4xf32>) { return } -// ----- -func.func @prefetch_offset_wi_2(%src: memref<16xf32>) { - %offsets = arith.constant dense<[0]> : vector<1xindex> - %1 = xegpu.create_tdesc %src, %offsets : memref<16xf32>, vector<1xindex> - -> !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{offsets not allowed}} - xegpu.prefetch %1[%offsets]: !xegpu.tensor_desc<1x3xf32, #xegpu.scatter_tdesc_attr>, vector<1xindex> - return -} - // ----- func.func @prefetch_offset_wi_3(%src: memref<16xf32>) { // expected-error@+1 {{Expects offsets}} @@ -476,22 +365,22 @@ func.func @store_scatter_offset_wi_3(%src: memref<16xf16>) { } // ----- -func.func @store_scatter_offset_wi_4(%src: !xegpu.tensor_desc<1x1xf32, #xegpu.scatter_tdesc_attr<>>) { +func.func @store_scatter_offset_wi_4(%src: !xegpu.tensor_desc<1x1xf32>) { %val = arith.constant dense<2.9>: vector<1xf16> %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> // expected-error@+1 {{offsets not allowed}} xegpu.store %val, %src[%offsets], %mask - : vector<1xf16>, !xegpu.tensor_desc<1x1xf32, #xegpu.scatter_tdesc_attr<>>, vector<1xindex>, vector<1xi1> + : vector<1xf16>, !xegpu.tensor_desc<1x1xf32>, vector<1xindex>, vector<1xi1> return } // ----- -func.func @load_gather_offset_wi_4(%src: !xegpu.tensor_desc<1x2xf16, #xegpu.scatter_tdesc_attr<>>) { +func.func @load_gather_offset_wi_4(%src: !xegpu.tensor_desc<1x2xf16>) { %mask = arith.constant dense<1>: vector<1xi1> %offsets = arith.constant dense<[0]> : vector<1xindex> // expected-error@+1 {{offsets not allowed}} - %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : !xegpu.tensor_desc<1x2xf16, #xegpu.scatter_tdesc_attr<>>, vector<1xindex>, vector<1xi1> -> vector<2xf16> + %2 = xegpu.load %src[%offsets], %mask <{chunk_size = 2}> : !xegpu.tensor_desc<1x2xf16>, vector<1xindex>, vector<1xi1> -> vector<2xf16> return } @@ -521,43 +410,6 @@ func.func @load_gather_offset_wi_1(%src: memref<4x4xf32>) { return } -// ----- -func.func @store_scatter_vc_1(%src: memref<24x32xf32>) { - %0 = arith.constant dense<1>: vector<4xi1> - %1 = arith.constant dense<2.9>: vector<4x2xf32> - %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32> - // expected-error@+1 {{Expects a scattered TensorDesc}} - xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}> - : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32>, vector<4xi1> - return -} - -// ----- -func.func @store_scatter_vc_2(%src: ui64) { - %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex> - %0 = arith.constant dense<1>: vector<4xi1> - %1 = arith.constant dense<2.9>: vector<4x2xf32> - %2 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> - -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} - xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}> : vector<4x2xf32>, - !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> - return -} - -// ----- -func.func @store_scatter_vc_3(%src: ui64) { - %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex> - %0 = arith.constant dense<1>: vector<8xi1> - %1 = arith.constant dense<2.9>: vector<4x2xf32> - %2 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> - -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{Mask should match TensorDesc except the chunk size dim}} - xegpu.store %1, %2, %0 : vector<4x2xf32>, - !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<8xi1> - return -} - // ----- func.func @dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) { // expected-error@+1 {{K-dimension mismatch}} @@ -600,15 +452,6 @@ func.func @dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) { return } -// ----- -func.func @atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) { - %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> - // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same shape}} - xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>, vector<16x4xf32> -> vector<16x8xf32> - return -} - // ----- func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) { %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> @@ -657,26 +500,6 @@ func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) { return } -// ----- -func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - // expected-error@+1 {{expected non-contiguous elements for 1D tensor}} - !xegpu.tensor_desc<16xf32, - #xegpu.scatter_tdesc_attr, - #xegpu.layout> - return -} - -// ----- -func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - // expected-error@+1 {{expected last dim of tensor to match chunk size}} - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - #xegpu.layout> - return -} - // ----- func.func @convert_layout_unmatch(%a: vector<32x64xf16>) { // expected-error@+1 {{expected input layout and target layout be WgLayout or SgLayout at the same time}} @@ -686,112 +509,121 @@ func.func @convert_layout_unmatch(%a: vector<32x64xf16>) { } // ----- -func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{expected sg_layout and lane_layout to have the same rank}} - #xegpu.layout> +func.func @layout_rank_mismatch_sg_lane(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{expected sg_layout and lane_layout to have the same rank}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{expected sg_layout and inst_data to have the same rank}} - #xegpu.layout> +func.func @layout_rank_mismatch_sg_inst(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{expected sg_layout and inst_data to have the same rank}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{expected inst_data and lane_layout to have the same rank}} - #xegpu.layout> +func.func @layout_rank_mismatch_inst_lane(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{expected inst_data and lane_layout to have the same rank}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{expected lane_data and lane_layout to have the same rank}} - #xegpu.layout> +func.func @layout_rank_mismatch_lane_data(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{expected lane_data and lane_layout to have the same rank}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{expected sg_data and sg_layout to have the same rank}} - #xegpu.layout> +func.func @layout_rank_mismatch_sg_data(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{expected sg_data and sg_layout to have the same rank}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> +func.func @layout_rank_mismatch_tensor(%src: memref<16x32xf32>) { + %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<16x32xf32> -> // expected-error@+1 {{expected layout rank to match tensor rank}} !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, #xegpu.layout> return } // ----- -func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{sg_layout and sg_data must be used together}} - #xegpu.layout> +func.func @layout_sg_data_missing(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{sg_layout and sg_data must be used together}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{lane_layout and lane_data must be used together}} - #xegpu.layout> +func.func @layout_lane_data_missing(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{lane_layout and lane_data must be used together}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{expected sg_layout/lane_layout being used with order}} - #xegpu.layout> +func.func @layout_order_without_layout(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{expected sg_layout/lane_layout being used with order}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @tensor_desc_rank_mismatch(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{expected order and sg_layout to have the same rank}} - #xegpu.layout> +func.func @layout_order_rank_mismatch_sg(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{expected order and sg_layout to have the same rank}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } // ----- -func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) { - %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> - !xegpu.tensor_desc<16x2xf32, - #xegpu.scatter_tdesc_attr, - // expected-error@+1 {{expected order and lane_layout to have the same rank}} - #xegpu.layout> +func.func @layout_order_rank_mismatch_lane(%src: memref) { + %offsets = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %mask = arith.constant dense<1>: vector<4xi1> + %2 = xegpu.load %src[%offsets], %mask + // expected-error@below {{expected order and lane_layout to have the same rank}} + {layout = #xegpu.layout} + : memref, vector<4xindex>, vector<4xi1> -> vector<4xf32> return } diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 520061925f92c..b32e297b60fc8 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -22,7 +22,6 @@ gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : gpu.return } - // CHECK: gpu.func @create_nd_tdesc_3(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr @@ -30,7 +29,6 @@ gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) { gpu.return } - // CHECK: gpu.func @create_nd_tdesc_4(%[[arg0:.*]]: memref<2x24x32xf32>) { gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32> @@ -38,7 +36,6 @@ gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) { gpu.return } - // CHECK: gpu.func @create_nd_tdesc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> @@ -46,7 +43,6 @@ gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) { gpu.return } - // CHECK: gpu.func @create_nd_tdesc_6(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @create_nd_tdesc_6(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> @@ -61,7 +57,6 @@ gpu.func @create_nd_tdesc_7(%src: memref<8x24x32x48x64xf32>) { gpu.return } - // CHECK: gpu.func @test_create_nd_tdesc_7(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index, %[[arg5:.*]]: memref<24x32xf32>) gpu.func @test_create_nd_tdesc_7(%src: ui64, %w : index, %h : index, %x : index, %y : index, %src2: memref<24x32xf32>) { //CHECK: %[[C:.*]] = arith.constant 1 : index @@ -296,7 +291,6 @@ gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) { gpu.return } - // CHECK: func @simt_load_nd_offset_1(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @simt_load_nd_offset_1(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> @@ -390,133 +384,6 @@ gpu.func @update_nd_tdesc_2(%src: memref<8x24x32xf32>) { gpu.return } -// CHECK: gpu.func @create_tdesc(%[[arg0:.*]]: ui64) { -gpu.func @create_tdesc(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - gpu.return -} - - -// CHECK: gpu.func @create_tdesc_1(%[[arg0:.*]]: memref) { -gpu.func @create_tdesc_1(%src: memref) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - gpu.return -} - - -// CHECK: gpu.func @create_tdesc_2(%[[arg0:.*]]: memref) { -gpu.func @create_tdesc_2(%src: memref) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - gpu.return -} - - -// CHECK: gpu.func @create_tdesc_3(%[[arg0:.*]]: ui64) { -gpu.func @create_tdesc_3(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> - gpu.return -} - -// CHECK: gpu.func @create_tdesc_4(%[[arg0:.*]]: ui64) { -gpu.func @create_tdesc_4(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex> - %0 = arith.constant dense<[[0, 8, 16, 24], [32, 40, 48, 56]]> : vector<2x4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x2xf16, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x2xf16, #xegpu.scatter_tdesc_attr> - gpu.return -} - - -// CHECK: gpu.func @subgroup_load(%[[arg0:.*]]: ui64) { -gpu.func @subgroup_load(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<4x2xf32> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<4x2xf32> - gpu.return -} - -// CHECK: gpu.func @simt_load(%[[arg0:.*]]: ui64) { -gpu.func @simt_load(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<2xf32> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<2xf32> - gpu.return -} - -// CHECK: gpu.func @subgroup_load_2(%[[arg0:.*]]: ui64) { -gpu.func @subgroup_load_2(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<4xf32> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<4xf32> - gpu.return -} - -// CHECK: gpu.func @simt_load_2(%[[arg0:.*]]: ui64) { -gpu.func @simt_load_2(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<1xf32> - gpu.return -} - -// CHECK: gpu.func @subgroup_load_3(%[[arg0:.*]]: ui64) { -gpu.func @subgroup_load_3(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<4x8xf16> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<4x8xf16> - gpu.return -} - -// CHECK: gpu.func @simt_load_3(%[[arg0:.*]]: ui64) { -gpu.func @simt_load_3(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<8xf16> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x8xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<8xf16> - gpu.return -} - // CHECK: gpu.func @simt_load_4(%[[arg0:.*]]: memref<256xf16>, %[[arg1:.*]]: vector<1xindex>, %[[arg2:.*]]: vector<1xi1>) { gpu.func @simt_load_4(%arg0: memref<256xf16>, %arg1: vector<1xindex>, %arg2: vector<1xi1>) { // CHECK: %0 = xegpu.load %[[arg0]][%[[arg1]]], %[[arg2]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16> @@ -545,19 +412,6 @@ gpu.func @simt_load_7(%arg0: memref<256xf16>, %arg1: index, %arg2: i1) { gpu.return } -// CHECK: gpu.func @subgroup_load_4(%[[arg0:.*]]: ui64) { -gpu.func @subgroup_load_4(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex> - %0 = arith.constant dense<[[0, 8, 16, 24], [32, 40, 48, 56]]> : vector<2x4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<2x4xi1> - %1 = arith.constant dense<1>: vector<2x4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr> - %2 = xegpu.create_tdesc %src, %0 : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr>, vector<2x4xi1> -> vector<2x4x8xf16> - %3 = xegpu.load %2, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<2x4x8xf16, #xegpu.scatter_tdesc_attr>, vector<2x4xi1> -> vector<2x4x8xf16> - gpu.return -} - // CHECK: gpu.func @subgroup_load_offset_1(%arg0: memref) { gpu.func @subgroup_load_offset_1(%src: memref) { %offset = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> @@ -568,96 +422,6 @@ gpu.func @subgroup_load_offset_1(%src: memref) { gpu.return } -// CHECK: gpu.func @subgroup_store(%[[arg0:.*]]: ui64) { -gpu.func @subgroup_store(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<4x2xf32> - %2 = arith.constant dense<2.9>: vector<4x2xf32> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> - gpu.return -} - -// CHECK: gpu.func @simt_store(%[[arg0:.*]]: ui64) { -gpu.func @simt_store(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<2xf32> - %2 = arith.constant dense<2.9>: vector<2xf32> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> - gpu.return -} - -// CHECK: gpu.func @subgroup_store_2(%[[arg0:.*]]: ui64) { -gpu.func @subgroup_store_2(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<4x2xf16> - %2 = arith.constant dense<2.9>: vector<4x2xf16> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<4x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<4x2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> - gpu.return -} - -// CHECK: gpu.func @simt_store_2(%[[arg0:.*]]: ui64) { -gpu.func @simt_store_2(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2xf16> - %2 = arith.constant dense<2.9>: vector<2xf16> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf16>, !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr>, vector<4xi1> - gpu.return -} - -// CHECK: gpu.func @subgroup_store_3(%[[arg0:.*]]: ui64) { -gpu.func @subgroup_store_3(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<4xf32> - %2 = arith.constant dense<2.9>: vector<4xf32> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<4xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<4xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> - gpu.return -} - -// CHECK: gpu.func @simt_store_3(%[[arg0:.*]]: ui64) { -gpu.func @simt_store_3(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> - %1 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[cst2:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32> - %2 = arith.constant dense<2.9>: vector<1xf32> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> - gpu.return -} - // CHECK: gpu.func @simt_store_4(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: memref<256xf16>, %[[arg2:.*]]: vector<1xindex>, %[[arg3:.*]]: vector<1xi1>) { gpu.func @simt_store_4(%arg0: vector<8xf16>, %arg1: memref<256xf16>, %arg2: vector<1xindex>, %arg3: vector<1xi1>) { // CHECK: xegpu.store %[[arg0]], %[[arg1]][%[[arg2]]], %[[arg3]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1> @@ -686,21 +450,6 @@ gpu.func @simt_store_7(%arg0: f16, %arg1: memref<256xf16>, %arg2: index, %arg3: gpu.return } -// CHECK: gpu.func @subgroup_store_4(%[[arg0:.*]]: ui64) { -gpu.func @subgroup_store_4(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<{{.*}}> : vector<2x4xindex> - %0 = arith.constant dense<[[0, 8, 16, 24], [32, 40, 48, 56]]> : vector<2x4xindex> - //CHECK: %[[cst1:.*]] = arith.constant dense : vector<2x4xi1> - %1 = arith.constant dense<1>: vector<2x4xi1> - //CHECK: %[[cst2:.*]] = arith.constant {{.*}} : vector<2x4xf32> - %2 = arith.constant dense<2.9>: vector<2x4xf32> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>> - %3 = xegpu.create_tdesc %src, %0 : ui64, vector<2x4xindex> -> !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>> - //CHECK: xegpu.store %[[cst2]], %[[R0]], %[[cst1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2x4xf32>, !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>, vector<2x4xi1> - xegpu.store %2, %3, %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2x4xf32>, !xegpu.tensor_desc<2x4xf32, #xegpu.scatter_tdesc_attr<>>, vector<2x4xi1> - gpu.return -} - // CHECK: gpu.func @subgroup_store_offset_1(%arg0: memref) { gpu.func @subgroup_store_offset_1(%dest: memref) { %val = arith.constant dense<2.9>: vector<4x2xf16> @@ -712,17 +461,6 @@ gpu.func @subgroup_store_offset_1(%dest: memref) { gpu.return } -// CHECK: gpu.func @prefetch(%[[arg0:.*]]: ui64) { -gpu.func @prefetch(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - gpu.return -} - // CHECK: gpu.func @prefetch_offset(%[[arg0:.*]]: ui64) { gpu.func @prefetch_offset(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> @@ -732,19 +470,6 @@ gpu.func @prefetch_offset(%src: ui64) { gpu.return } -// CHECK: gpu.func @create_update_tdesc(%[[arg0:.*]]: ui64) { -gpu.func @create_update_tdesc(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex> - //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> - %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex> - %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xindex> - gpu.return -} - // CHECK: gpu.func @subgroup_dpas(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>) gpu.func @subgroup_dpas(%a : vector<8x16xf16>, %b: vector<16x16xf16>) { // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> @@ -766,17 +491,6 @@ gpu.func @subgroup_dpas_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) gpu.return } -// CHECK: gpu.func @subgroup_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>) -gpu.func @subgroup_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) { - //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> - %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src, %c: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32> - xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32> - gpu.return -} - // CHECK: gpu.func @alloc_nbarrier({{.*}}) { gpu.func @alloc_nbarrier() { // CHECK: xegpu.alloc_nbarrier @@ -834,7 +548,6 @@ gpu.func @create_mem_desc_with_stride() { gpu.return } - // CHECK-LABEL: gpu.func @create_mem_desc_from_2d_memref({{.*}}) { gpu.func @create_mem_desc_from_2d_memref() { //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<16x64xf16, 3> @@ -940,3 +653,4 @@ gpu.func @dpas_mx(%a : vector<8x16xf8E5M2>, %b: vector<16x16xf8E5M2>, %acc: vect } } + diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 26936dab2fb38..c87dbf3ec2108 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -155,26 +155,6 @@ func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256 } } -// ----- -gpu.module @test { -// CHECK-LABEL: func.func @load_gather_1d( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>, %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16xf32, #xegpu.layout>) { -// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} -// CHECK-SAME: dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> -// CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> -// CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> -> -// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{layout = #xegpu.layout}> : -// CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<16xi1> -> vector<16xf32> -func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { - %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> - %cst_0 = arith.constant dense : vector<16xi1> - %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.load %0, %cst_0 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> - xegpu.store_nd %1, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> - return -} -} // ----- gpu.module @test { // CHECK-LABEL: func.func @store_scatter_with_chunksize( @@ -191,20 +171,7 @@ func.func @store_scatter_with_chunksize(%arg0: memref<128xf32>) { return } } -// ----- -gpu.module @test { -// CHECK-LABEL: func.func @store_scatter_1d( -// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<256xf32>) { -// CHECK: xegpu.store %[[ARG0]], %{{.*}}, %{{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, -// CHECK-SAME: #xegpu.layout>, vector<16xi1> -func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { - %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> - %cst_0 = arith.constant dense : vector<16xi1> - %0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - xegpu.store %arg0, %0, %cst_0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> - return -} -} + // ----- gpu.module @test { // CHECK-LABEL: func.func @scatter_ops_chunksize( @@ -224,6 +191,7 @@ func.func @scatter_ops_chunksize(%src: memref<256xf16>) { return } } + // ----- gpu.module @test { // CHECK-LABEL: func.func @scatter_ops( diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index 61b8046bd04e5..8b57b262ebddf 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -413,155 +413,6 @@ gpu.module @test_kernel { } } -// ----- - -gpu.module @test_kernel { - // CHECK-LABEL: test_prefetch_load_store_update - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - // CHECK-COUNT-2: xegpu.prefetch {{.*}} - // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> - // CHECK-COUNT-2: xegpu.load {{.*}} - // CHECK-COUNT-2: xegpu.store {{.*}} - - gpu.func @test_prefetch_load_store_update(%src: ui64) { - - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - xegpu.prefetch %tdesc {layout = #xegpu.layout}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - - %delta = arith.constant {layout_result_0 = #xegpu.layout} dense<[ - 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 64, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 256 - ]> : vector<32xindex> - %new_tdesc = xegpu.update_offset %tdesc, %delta - : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xindex> - - %c17 = arith.constant 17: index - %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout} : vector<32xi1> - - %ld_vec = xegpu.load %new_tdesc, %mask {layout = #xegpu.layout}: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> -> vector<32xf32> - - %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #xegpu.layout} : vector<32xf32> - xegpu.store %st_vec, %tdesc, %mask {layout = #xegpu.layout}: - vector<32xf32>, - !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, - vector<32xi1> - - gpu.return - } - -} - -// ----- -gpu.module @test_kernel { - // CHECK-LABEL: test_prefetch_load_store_update_chunk - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> - // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xf32> - // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> - - gpu.func @test_prefetch_load_store_update_chunk(%src: ui64) { - - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - xegpu.prefetch %tdesc {layout = #xegpu.layout}: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - - %delta = arith.constant {layout_result_0 = #xegpu.layout} dense<[ - 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 64, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 256 - ]> : vector<32xindex> - %new_tdesc = xegpu.update_offset %tdesc, %delta - : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xindex> - - %c17 = arith.constant 17: index - %mask = vector.create_mask %c17 {layout_result_0 = #xegpu.layout} : vector<32xi1> - - %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> -> vector<32x4xf32> - - %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #xegpu.layout} : vector<32x4xf32> - xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #xegpu.layout}>: - vector<32x4xf32>, - !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, - vector<32xi1> - - gpu.return - } -} - -// ----- -#l = #xegpu.layout - -// test the blocking pass on a 3D scattered tensor descriptor, -// Ops working 4x8x4xf32 scattered tensor_descs will be unrolled -// into 4 ops working 2x8x2xf32 scattered tensor_descs based on -// the given layout. -gpu.module @test_kernel { - // CHECK-LABEL: test_3d_scattered_tensor_desc - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK: [[cst_1:%.+]] = arith.constant dense<{{.*}}[130, 138, 146, 154, 162, 170, 178, 186], [194, 202, 210, 218, 226, 234, 242, 250]]> : vector<2x8xindex> - // CHECK: [[cst_2:%.+]] = arith.constant dense<{{.*}}[2, 10, 18, 26, 34, 42, 50, 58], [66, 74, 82, 90, 98, 106, 114, 122]]> : vector<2x8xindex> - // CHECK: [[cst_3:%.+]] = arith.constant dense<{{.*}}[0, 8, 16, 24, 32, 40, 48, 56], [64, 72, 80, 88, 96, 104, 112, 120]]> : vector<2x8xindex> - // CHECK: [[cst_4:%.+]] = arith.constant dense<{{.*}}[128, 136, 144, 152, 160, 168, 176, 184], [192, 200, 208, 216, 224, 232, 240, 248]]> : vector<2x8xindex> - // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<2x8xindex> -> !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-4: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr>, vector<2x8xindex> - // CHECK-COUNT-4: xegpu.load {{.*}} : !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr>, vector<2x8xi1> -> vector<2x8x2xf32> - // CHECK-COUNT-4: xegpu.store {{.*}} : vector<2x8x2xf32>, !xegpu.tensor_desc<2x8x2xf32, #xegpu.scatter_tdesc_attr>, vector<2x8xi1> - - - gpu.func @test_3d_scattered_tensor_desc(%src: ui64) { - %cst = arith.constant {layout_result_0 = #l} dense<[ - [0, 8, 16, 24, 32, 40, 48, 56], - [64, 72, 80, 88, 96, 104, 112, 120], - [128, 136, 144, 152, 160, 168, 176, 184], - [192, 200, 208, 216, 224, 232, 240, 248] - ]> : vector<4x8xindex> - - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<4x8xindex> -> !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> - xegpu.prefetch %tdesc {layout = #l}: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l> - - %delta = arith.constant {layout_result_0 = #l} dense<[ - [32, 32, 32, 32, 32, 32, 32, 32], - [32, 32, 32, 32, 32, 32, 32, 64], - [128, 128, 128, 128, 128, 128, 128, 128], - [128, 128, 128, 128, 128, 128, 128, 256] - ]> : vector<4x8xindex> - %new_tdesc = xegpu.update_offset %tdesc, %delta - : !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xindex> - - %c4 = arith.constant 4: index - %mask = vector.create_mask %c4, %c4 {layout_result_0 = #l}: vector<4x8xi1> - - %ld_vec = xegpu.load %new_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #l}>: !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, vector<4x8xi1> -> vector<4x8x4xf32> - - %st_vec = arith.addf %ld_vec, %ld_vec {layout_result_0 = #l} : vector<4x8x4xf32> - xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, layout = #l}>: - vector<4x8x4xf32>, - !xegpu.tensor_desc<4x8x4xf32, #xegpu.scatter_tdesc_attr, #l>, - vector<4x8xi1> - gpu.return - } -} - // ----- #a = #xegpu.layout #b = #xegpu.layout @@ -605,13 +456,13 @@ gpu.module @test_kernel { #a = #xegpu.layout gpu.module @test_kernel { //CHECK-LABEL: gpu.func @convert_layout_scalar + // CHECK-NOT: xegpu.convert_layout gpu.func @convert_layout_scalar(%arg0: memref<16x16xf16>, %arg1: memref<4xf16>) { %acc = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index %a_tdesc = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #a> %a = xegpu.load_nd %a_tdesc {layout = #a}: !xegpu.tensor_desc<16x16xf16, #a> -> vector<16x16xf16> %a_reduce = vector.multi_reduction , %a, %acc {layout_operand_0 = #a, layout_result_0 = #xegpu.slice<#a, dims = [0, 1]>} [0, 1] : vector<16x16xf16> to f16 - // CHECK-NOT: xegpu.convert_layout %13 = xegpu.convert_layout %a_reduce <{input_layout = #xegpu.slice<#a, dims = [0, 1]>, target_layout = #xegpu.slice<#a, dims = [0, 1]>}> : f16 memref.store %13, %arg1[%c0] : memref<4xf16> gpu.return diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir index c3be138fef38a..750007077164f 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir @@ -161,58 +161,12 @@ gpu.module @test { //----- - // CHECK-LABEL: create_tdesc_vec - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - gpu.func @create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> { - %cst = arith.constant dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - } - //----- - // CHECK-LABEL: create_tdesc_step - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - gpu.func @create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> { - %step = arith.constant dense<8> : vector<32xindex> - %seq = vector.step : vector<32xindex> - %cst = arith.muli %seq, %step : vector<32xindex> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - } - //----- - // CHECK-LABEL: load - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - // CHECK-COUNT-2: xegpu.load {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> - gpu.func @load(%src: ui64) -> vector<32xf32> { - %cst = arith.constant dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - - %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - %ld = xegpu.load %tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> -> vector<32xf32> - - gpu.return %ld : vector<32xf32> - } - //----- - // CHECK-LABEL: load_with_offsets // CHECK-SAME: [[arg0:%.+]]: ui64 // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32> @@ -233,48 +187,7 @@ gpu.module @test { //----- - // CHECK-LABEL: prefetch - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - gpu.func @prefetch(%src: ui64) { - - %cst = arith.constant dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - - xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - gpu.return - } - //----- - - // CHECK-LABEL: store - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - // CHECK-COUNT-2: xegpu.store {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> - gpu.func @store(%src: ui64) { - %cst = arith.constant dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - - %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> - - %st_vec = arith.constant dense<1023.0>: vector<32xf32> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> - xegpu.store %st_vec, %tdesc, %mask: vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> - - gpu.return - } //----- @@ -299,68 +212,10 @@ gpu.module @test { } //----- - // CHECK-LABEL: create_tdesc_step_chunk - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #xegpu.scatter_tdesc_attr> - gpu.func @create_tdesc_step_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> { - %step = arith.constant dense<8> : vector<32xindex> - %seq = vector.step : vector<32xindex> - %cst = arith.muli %seq, %step : vector<32xindex> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - } //----- - // CHECK-LABEL: create_tdesc_step_chunk2 - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - gpu.func @create_tdesc_step_chunk2(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> { - %step = arith.constant dense<8> : vector<32xindex> - %seq = vector.step : vector<32xindex> - %cst = arith.muli %seq, %step : vector<32xindex> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - } - -// CHECK-LABEL: create_tdesc_step_chunk3 - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex> - // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex> - // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex> - // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - gpu.func @create_tdesc_step_chunk3(%src: ui64) -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> { - %step = arith.constant dense<8> : vector<16xindex> - %seq = vector.step : vector<16xindex> - %cst = arith.muli %seq, %step : vector<16xindex> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return %tdesc : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - } //----- - // CHECK-LABEL: load_chunk - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<16x2xf32> - - gpu.func @load_chunk(%src: ui64) -> vector<32x4xf32> { - %cst = arith.constant dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - - %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> - - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %ld = xegpu.load %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> -> vector<32x4xf32> - - gpu.return %ld : vector<32x4xf32> - } //----- // CHECK-LABEL: load_with_offsets_chunk @@ -386,27 +241,6 @@ gpu.module @test { } //----- - // CHECK-LABEL: store_chunk - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> - gpu.func @store_chunk(%src: ui64) { - %cst = arith.constant dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - - %c17 = arith.constant 17: index - %mask = vector.create_mask %c17: vector<32xi1> - - %st_vec = arith.constant dense<1023.>: vector<32x4xf32> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<32x4xf32>, !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> - - gpu.return - } //----- // CHECK-LABEL: store_with_offsets_chunk @@ -434,42 +268,7 @@ gpu.module @test { } //----- - // CHECK-LABEL: prefetch_chunk - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - gpu.func @prefetch_chunk(%src: ui64) { - %cst = arith.constant dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - - gpu.return - } //----- - // CHECK-LABEL: update_chunk - // CHECK-SAME: [[arg0:%.+]]: ui64 - // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> - gpu.func @update_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> { - %cst = arith.constant dense<[ - 0, 8, 16, 24, 32, 40, 48, 56, - 64, 72, 80, 88, 96, 104, 112, 120, - 128, 136, 144, 152, 160, 168, 176, 184, - 192, 200, 208, 216, 224, 232, 240, 248 - ]> : vector<32xindex> - %delta = arith.constant dense<32>: vector<32xindex> - %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - - %new_tdesc = xegpu.update_offset %tdesc, %delta - : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xindex> - - gpu.return %new_tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - } } diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 4760016bdcea4..3394d63dcbbdc 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -61,8 +61,8 @@ struct TestXeGPUUnrollingPatterns -> std::optional> { if (isa(op)) { + xegpu::PrefetchOp, xegpu::LoadGatherOp, xegpu::StoreScatterOp>( + op)) { xegpu::TensorDescType tdescTy; if (auto createNdOp = dyn_cast(op)) { tdescTy = createNdOp.getType(); @@ -74,10 +74,6 @@ struct TestXeGPUUnrollingPatterns tdescTy = loadNdOp.getTensorDescType(); } else if (auto storeNdOp = dyn_cast(op)) { tdescTy = storeNdOp.getTensorDescType(); - } else if (auto createOp = dyn_cast(op)) { - tdescTy = createOp.getType(); - } else if (auto updateOp = dyn_cast(op)) { - tdescTy = updateOp.getTensorDescType(); } else if (auto prefetchOp = dyn_cast(op)) { tdescTy = prefetchOp.getTensorDescType(); } else if (auto loadOp = dyn_cast(op)) { @@ -130,24 +126,6 @@ struct TestXeGPUUnrollingPatterns Attribute encoding = tdescTy.getEncoding(); auto layout = tdescTy.getLayoutAttr(); - // If the encoding is a ScatterTensorDescAttr, we need to - // potentially adjust the chunk size based on the inst_data. - if (tdescTy.isScattered()) { - int64_t chunkSize = tdescTy.getChunkSizeAsInt(); - - if (chunkSize > 1) { - int64_t blockedChunkSize = chunkSize; - auto instData = layout.getEffectiveInstDataAsInt(); - if (!instData.empty()) - blockedChunkSize = instData.back(); - - // To create a new attribute with a different chunk_size: - auto newEncoding = xegpu::ScatterTensorDescAttr::get( - ctx, tdescTy.getMemorySpace(), blockedChunkSize); - - encoding = newEncoding; - } - } if (layout) { if (layout.getEffectiveLaneLayoutAsInt().empty()) layout = xegpu::LayoutAttr();