From b766215221d453a97002d6faabc1c387dac3f10b Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Wed, 26 Nov 2025 16:42:44 -0500
Subject: [PATCH 01/28] [mlir][amdgpu] Add make_dma_descriptor op

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 117 ++++++++++++++++--
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |  28 +++++
 mlir/test/Dialect/AMDGPU/invalid.mlir         |  40 ++++++
 mlir/test/Dialect/AMDGPU/ops.mlir             |  59 ++++++++-
 4 files changed, 232 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index e07c72b839e7c..3581b07dc4e3e 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr
     traits = []> : TypeDef {
   let mnemonic = typeMnemonic;
 }
 
-//===----------------------------------------------------------------------===//
-// AMDGPU Type definitions
-//===----------------------------------------------------------------------===//
-
 def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
   let summary = "Pair of base addresses that move data between LDS and global storage.";
   let description = [{
@@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
   let assemblyFormat = "`<` $elementType `>`";
 }
 
+def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
+  let summary = "Descriptors used in tensor store/load operations.";
+  let description = [{
+    This type is opaque and corresponds to the two or four descriptor groups
+    used in tensor_load_to_lds or tensor_store_from_lds.
+  }];
+
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPU Op definitions
 //===----------------------------------------------------------------------===//
@@ -1222,14 +1231,13 @@
 def AMDGPU_MakeDmaBaseOp :
     AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
     Arguments<(ins
       Arg:$src,
-      Variadic:$srcIndices,
+      Variadic:$src_indices,
       Arg:$dst,
-      Variadic:$dstIndices)>,
+      Variadic:$dst_indices)>,
     Results<(outs AMDGPU_TDMBaseType: $base)> {
 
   // TODO:
   // * Add verifiers such that one of the memrefs is from LDS and the other global.
-  // * Add verifiers to make sure that the type is in the correct direction.
   // * Add verifiers to make sure that the number of indices does not exceed the number of dimensions.
 
   let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
   let description = [{
@@ -1240,12 +1248,105 @@ def AMDGPU_MakeDmaBaseOp :
     This operation creates a value corresponding to the tensor descriptor (D#)
     group 0 found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl
     dialect.
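+
+    The base produced here is consumed by `amdgpu.make_dma_descriptor`, which
+    combines it with tensor sizes and strides to build the remaining
+    descriptor groups.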
+    For example:
+
+    ```mlir
+    %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+    %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+    amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+    ```
+
+    to
+
+    ```mlir
+    // pseudocode
+    %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)>
+    %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)>
+    %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct(ptr, ptr)>
+    // type(%base_2) = !llvm.struct<(ptr, ptr) roughly corresponds to amdgpu.tdm_base
+
+    // The base will be used when contructing dgroup0
+    // when lowering amdgpu.make_dma_descriptor
+    %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)>
+    %dgroup0_1 = llvm.insertvalue %base2, %dgroup0_0 : ....
+
+    // When lowering amdgpu.tensor_load_to_lds
+    rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+    ```
+
+    These tensor DMA operations were introduced in gfx1250.
   }];
 
   let assemblyFormat = [{
-    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
+    $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
   }];
 }
 
+def AMDGPU_MakeDmaDescriptorOp :
+    AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
+    Arguments<(ins
+      AMDGPU_TDMBaseType: $base,
+      Variadic: $global_dynamic_sizes,
+      DenseI64ArrayAttr: $global_static_sizes,
+      Variadic: $global_dynamic_strides,
+      DenseI64ArrayAttr: $global_static_strides,
+      Variadic: $shared_dynamic_sizes,
+      DenseI64ArrayAttr: $shared_static_sizes,
+      Optional: $pad,
+      Optional: $pad_every,
+      Optional: $atomic_barrier_address,
+      Variadic: $atomic_barrier_indices,
+      Optional: $global_increment,
+      Optional: $lds_increment,
+      Optional: $iteration_count)>,
+    Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
+
+  let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
+  let description = [{
+    Make all descriptor groups needed by tensor memory operations.
+
+    The $base operand corresponds to the pair of base addresses; one must be an address in LDS
+    while the other must be a global memory location.
+
+    $global_{static/dynamic}_sizes determine the size of the tensor.
+    $global_{static/dynamic}_strides determine the strides of the tensor.
+    $shared_{static/dynamic}_sizes determine the size of the tile.
+
+    Padding can be applied to the LDS address when copying from memory to LDS,
+    but not when copying from LDS to memory.
+    The values in the padded target addresses remain the same as before the operation was applied.
+
+    2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
+    $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
+    $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
+    $iteration_count determines how many times to iterate.
+
+    ```mlir
+    // Example of moving a two-dimensional tensor to LDS.
+    %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+    %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+    amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+
+    // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer.
+    %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+    %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+    amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+    ```
+  }];
+
+  let assemblyFormat = [{
+    $base
+    `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
+    `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
+    `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
+    ( `padShared` `(` $pad^ `every` $pad_every `)` )?
+    ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
+      `:` type($atomic_barrier_address) `)`)?
+    ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
+    attr-dict `:` qualified(type($base)) `->` type(results)
+  }];
+
+  let hasVerifier = 1;
 }
 
 #endif // AMDGPU
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index cdc10c60a42ae..5ff640b5d1596 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -705,6 +705,34 @@ LogicalResult TransposeLoadOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// MakeDmaDescriptorOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult MakeDmaDescriptorOp::verify() {
+  ArrayRef<int64_t> globalStaticStrides = getGlobalStaticStrides();
+
+  if (globalStaticStrides.empty()) {
+    return emitOpError("strides must not be empty.");
+  }
+  if (globalStaticStrides.back() != 1) {
+    return emitOpError("strides for the innermost dimension must be 1.");
+  }
+
+  ArrayRef<int64_t> globalStaticSizes = getGlobalStaticSizes();
+  size_t rank = globalStaticSizes.size();
+  if (rank != globalStaticStrides.size()) {
+    return emitOpError("strides and sizes must have same rank.");
+  }
+
+  ArrayRef<int64_t> sharedStaticSizes = getSharedStaticSizes();
+  if (rank != sharedStaticSizes.size()) {
+    return emitOpError("tensor must have same rank as tile.");
+  }
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // ScaledMFMAOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 61fdf29a78cbd..066f46060f62f 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -354,3 +354,43 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x
   %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
   func.return %0 : vector<16xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_invalid_empty_strides
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base)
+func.func
@make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}} + amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_innermost_stride +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} + amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_size_and_stride_sizes +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}} + amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_shared_and_global_rank +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}} + amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 653f9f64d24f4..a8af06dc5ff0a 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -689,11 +689,62 @@ func.func @memory_counter_wait() { // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space>) { - // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base - amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base - // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base - amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base + amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base func.return } +// CHECK-LABEL: func @make_dma_descriptor +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index) +func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8xi32>, %idx: index) { + + // CHECK: 
amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]]) + padShared(%idx every %idx) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32>) + atomicBarrier(%barrier[%idx] : memref<8xi32>) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] + iterate %idx, %idx, %idx + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + func.return +} From 3c31d68dab4254b01b727bee1bd15e4a2c6fc15e Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 11:28:04 -0500 Subject: [PATCH 02/28] [mlir][amdgpu] Add tensor load store operation --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 30 +++++++++++++++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 10 +++++++ 2 files changed, 40 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 3581b07dc4e3e..12ef5337296a2 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1349,4 +1349,34 @@ def AMDGPU_MakeDmaDescriptorOp : let hasVerifier = 1; } +def AMDGPU_TensorLoadToLDSOp : + AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>, + Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> { + let summary = "Load tensors from global memory to LDS."; + let description = [{ + Load tensors of up to five dimensions from global memory to LDS. + + The operation is fully described by the descriptor operand. + }]; + + let assemblyFormat = [{ + $desc attr-dict `:` qualified(type($desc)) + }]; +} + +def AMDGPU_TensorStoreFromLDSOp : + AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>, + Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> { + let summary = "Store tensors from LDS to global memory."; + let description = [{ + Store tensors of up to five dimensions from LDS to global memory. + + The operation is fully described by the descriptor operand. 
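+
+    Example (a minimal sketch; the `%desc` value is assumed to be produced by
+    `amdgpu.make_dma_descriptor`):
+
+    ```mlir
+    amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor
+    ```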
+  }];
+
+  let assemblyFormat = [{
+    $desc attr-dict `:` qualified(type($desc))
+  }];
+}
+
 #endif // AMDGPU
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index a8af06dc5ff0a..aa6bedc0e1135 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -748,3 +748,13 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x
   func.return
 }
+
+// CHECK-LABEL: @tensor_load_store
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
+func.func @tensor_load_store(%desc: !amdgpu.tdm_descriptor) {
+  // CHECK: amdgpu.tensor_load_to_lds %[[DESC]]
+  amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
+  // CHECK: amdgpu.tensor_store_from_lds %[[DESC]]
+  amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor
+  return
+}

From cb116ea0b0444eb72c26e38bdb6572cdeef97e61 Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Tue, 25 Nov 2025 14:24:50 -0500
Subject: [PATCH 03/28] [mlir][amdgpu] Lower amdgpu.make_dma_base.

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  3 +-
 .../mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h    |  5 ++
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 77 +++++++++++++++++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  | 79 ++++++++++++-------
 ...cvt_scale_pk-gfx1250.mlir => gfx1250.mlir} | 74 +++++++++++++++++
 5 files changed, 206 insertions(+), 32 deletions(-)
 rename mlir/test/Conversion/AMDGPUToROCDL/{cvt_scale_pk-gfx1250.mlir => gfx1250.mlir} (73%)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 12ef5337296a2..9cb0752fba48b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1237,7 +1237,6 @@ def AMDGPU_MakeDmaBaseOp :
     Results<(outs AMDGPU_TDMBaseType: $base)> {
 
   // TODO:
-  // * Add verifiers such that one of the memrefs is from LDS and the other global.
   // * Add verifiers to make sure that the number of indices does not exceed the number of dimensions.
 
   let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
   let description = [{
@@ -1280,6 +1279,8 @@ def AMDGPU_MakeDmaBaseOp :
   let assemblyFormat = [{
     $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
   }];
+
+  let hasVerifier = 1;
 }
 
 def AMDGPU_MakeDmaDescriptorOp :
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
index a7680fb5c3191..958757da0933e 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
@@ -48,6 +48,11 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *,
                                   IntegerAttr m, IntegerAttr n,
                                   IntegerAttr k) {
   printMNKDimensionList(printer, m, n, k);
 }
+
+// Utility functions for querying the address space.
+bool hasGlobalMemorySpace(Attribute memorySpace);
+bool hasWorkgroupMemorySpace(Attribute memorySpace);
+bool hasFatRawBufferMemorySpace(Attribute memorySpace);
 } // namespace mlir::amdgpu
 
 #define GET_ATTRDEF_CLASSES
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index b9a5e7d7f6eac..3316e16a05d5c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2264,6 +2264,76 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern {
   }
 };
 
+struct AMDGPUMakeDmaBaseLowering
+    : public ConvertOpToLLVMPattern<MakeDmaBaseOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  AMDGPUMakeDmaBaseLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern(converter), chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(MakeDmaBaseOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx1250)
+      return op->emitOpError("make_dma_base is only supported on gfx1250");
+
+    Location loc = op.getLoc();
+
+    ValueRange srcIndices = adaptor.getSrcIndices();
+    Value src = adaptor.getSrc();
+    auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+
+    Value srcPtr =
+        getStridedElementPtr(rewriter, loc, srcMemRefType, src, srcIndices);
+
+    ValueRange dstIndices = adaptor.getDstIndices();
+    Value dst = adaptor.getDst();
+    auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
+
+    Value dstPtr =
+        getStridedElementPtr(rewriter, loc, dstMemRefType, dst, dstIndices);
+
+    bool storeFrom = hasWorkgroupMemorySpace(srcMemRefType.getMemorySpace());
+    Value ldsAddr = storeFrom ? srcPtr : dstPtr;
+    Value globalAddr = storeFrom ? dstPtr : srcPtr;
+
+    Type i32 = rewriter.getI32Type();
+    Type i64 = rewriter.getI64Type();
+
+    Value castForLdsAddr =
+        LLVM::PtrToIntOp::create(rewriter, loc, i32, ldsAddr);
+    Value castForGlobalAddr =
+        LLVM::PtrToIntOp::create(rewriter, loc, i64, globalAddr);
+
+    Value mask = createI64Constant(rewriter, loc, 0x1FFFFFFFFFFFFFF);
+    Value first57BitsOfGlobalAddr =
+        LLVM::AndOp::create(rewriter, loc, castForGlobalAddr, mask);
+    Value shift = LLVM::LShrOp::create(rewriter, loc, first57BitsOfGlobalAddr,
+                                       createI64Constant(rewriter, loc, 32));
+
+    Value lowHalf =
+        LLVM::TruncOp::create(rewriter, loc, i32, first57BitsOfGlobalAddr);
+    Value highHalf = LLVM::TruncOp::create(rewriter, loc, i32, shift);
+
+    Value c0 = createI32Constant(rewriter, loc, 0);
+    Value c1 = createI32Constant(rewriter, loc, 1);
+    Value c2 = createI32Constant(rewriter, loc, 2);
+    Value c3 = createI32Constant(rewriter, loc, 3);
+
+    Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
+    Value result = LLVM::UndefOp::create(rewriter, loc, v4i32);
+    result = LLVM::InsertElementOp::create(rewriter, loc, result, c0, c0);
+    result = LLVM::InsertElementOp::create(rewriter, loc, result,
+                                           castForLdsAddr, c1);
+    result = LLVM::InsertElementOp::create(rewriter, loc, result, lowHalf, c2);
+    result = LLVM::InsertElementOp::create(rewriter, loc, result, highHalf, c3);
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+
 struct ConvertAMDGPUToROCDLPass
     : public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
   using Base::Base;
@@ -2278,6 +2348,10 @@ struct ConvertAMDGPUToROCDLPass
     RewritePatternSet patterns(ctx);
     LLVMTypeConverter converter(ctx);
+    converter.addConversion([&](TDMBaseType type) -> Type {
+      Type i32 = IntegerType::get(type.getContext(), 32);
+      return converter.convertType(VectorType::get(4,
                                                    i32));
+    });
     populateAMDGPUToROCDLConversionPatterns(converter, patterns, *maybeChipset);
     LLVMConversionTarget target(getContext());
     target.addIllegalDialect<::mlir::amdgpu::AMDGPUDialect>();
@@ -2333,6 +2407,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
        ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
        PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
        GatherToLDSOpLowering, TransposeLoadOpLowering,
-       AMDGPUPermlaneLowering>(converter, chipset);
+       AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering>(converter,
+                                                          chipset);
   patterns.add(converter);
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 5ff640b5d1596..8fc6220efc6ad 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -41,6 +41,38 @@ using namespace mlir::amdgpu;
 
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc"
 
+namespace mlir::amdgpu {
+bool hasGlobalMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return true;
+  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
+    return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
+  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+    return gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
+  return false;
+}
+
+bool hasWorkgroupMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return false;
+  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
+    return intMemorySpace.getInt() == 3;
+  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+    return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup;
+  return false;
+}
+
+bool hasFatRawBufferMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return false;
+  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
+    return intMemorySpace.getInt() == 7;
+  if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
+    return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer;
+  return false;
+}
+} // namespace mlir::amdgpu
+
 namespace {
 struct AMDGPUInlinerInterface final : DialectInlinerInterface {
   using DialectInlinerInterface::DialectInlinerInterface;
@@ -158,36 +190,6 @@ LogicalResult FatRawBufferCastOp::verify() {
   return success();
 }
 
-static bool hasGlobalMemorySpace(Attribute memorySpace) {
-  if (!memorySpace)
-    return true;
-  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
-    return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
-  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
-    return gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
-  return false;
-}
-
-static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
-  if (!memorySpace)
-    return false;
-  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
-    return intMemorySpace.getInt() == 3;
-  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
-    return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup;
-  return false;
-}
-
-static bool hasFatRawBufferMemorySpace(Attribute memorySpace) {
-  if (!memorySpace)
-    return false;
-  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
-    return intMemorySpace.getInt() == 7;
-  if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
-    return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer;
-  return false;
-}
-
 //===----------------------------------------------------------------------===//
 // RawBuffer*Op
 //===----------------------------------------------------------------------===//
@@ -705,6 +707,23 @@
 LogicalResult TransposeLoadOp::verify() {
   return success();
 }
+//===----------------------------------------------------------------------===//
+// MakeDmaBaseOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult MakeDmaBaseOp::verify() {
+  MemRefType srcType = cast<MemRefType>(getSrc().getType());
+  MemRefType dstType = cast<MemRefType>(getDst().getType());
+  bool store_from_lds = hasWorkgroupMemorySpace(srcType.getMemorySpace()) &&
+                        hasGlobalMemorySpace(dstType.getMemorySpace());
+  bool load_to_lds = hasGlobalMemorySpace(srcType.getMemorySpace()) &&
+                     hasWorkgroupMemorySpace(dstType.getMemorySpace());
+  bool is_valid = store_from_lds != load_to_lds;
+  if (!is_valid)
+    return emitOpError("invalid combination of address spaces.");
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // MakeDmaDescriptorOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
similarity index 73%
rename from mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir
rename to mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index d2391140ce056..96d03a427215f 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -162,3 +162,77 @@ func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M
   %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf64>
   return %ret0: vector<16xf64>
 }
+
+// -----
+
+#gpu_global_addrspace = 1
+#gpu_lds_addrspace = 3
+#amdgpu_fat_buffer_addrspace = 7
+
+// CHECK-LABEL: func @make_dma_base
+// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, 1>, %[[SMEM:.+]]: memref<8xi32, 3>)
+func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xi32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base, !amdgpu.tdm_base) {
+  // CHECK-DAG: %[[INT:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
+  // CHECK-DAG: %[[MEMREF_DESC_MEM:.+]] = builtin.unrealized_conversion_cast %[[MEM]] : memref<8xi32, 1>
+  // CHECK-DAG: %[[MEMREF_DESC_SMEM:.+]] = builtin.unrealized_conversion_cast %[[SMEM]] : memref<8xi32, 3>
+
+  // CHECK-DAG: %[[MEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_MEM]][1] : !llvm.struct<(ptr<1>
+  // CHECK-DAG: %[[SMEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_SMEM]][1] : !llvm.struct<(ptr<3>
+
+  // CHECK-DAG: %[[MEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[MEM_BASE_PTR]][%[[INT]]]
+  // CHECK-DAG: %[[SMEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[SMEM_BASE_PTR]][%[[INT]]]
+
+  // CHECK-DAG: %[[MEM_INT:.+]] = llvm.ptrtoint %[[MEM_BASE_OFFSET]] : !llvm.ptr<1> to i64
+  // CHECK-DAG: %[[SMEM_INT:.+]] = llvm.ptrtoint %[[SMEM_BASE_OFFSET]] : !llvm.ptr<3> to i32
+
+  // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(144115188075855871 : i64) : i64
+  // CHECK: %[[MEM_INT_LOW_57:.+]] = llvm.and %[[MEM_INT]], %[[MASK]]
+  // CHECK: %[[C32:.+]] = llvm.mlir.constant(32 : i64) : i64
+  // CHECK: %[[SHIFT:.+]] = llvm.lshr %[[MEM_INT_LOW_57]], %[[C32]]
+  // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32
+  // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32
+
+  // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
+  // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 :
i32) : i32 + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) : i32 + + // CHECK: %[[V4I32_0_0:.+]] = llvm.mlir.undef : vector<4xi32> + // CHECK: %[[V4I32_0_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_0_0]][%[[C0]] : i32] + // CHECK: %[[V4I32_0_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_0_1]][%[[C1]] : i32] + // CHECK: %[[V4I32_0_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_0_2]][%[[C2]] : i32] + // CHECK: %[[V4I32_0_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_0_3]][%[[C3]] : i32] + + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xi32, #gpu_lds_addrspace> -> !amdgpu.tdm_base + + // CHECK-DAG: %[[MEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_MEM]][1] : !llvm.struct<(ptr<1> + // CHECK-DAG: %[[SMEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_SMEM]][1] : !llvm.struct<(ptr<3> + + // CHECK-DAG: %[[MEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[MEM_BASE_PTR]][%[[INT]]] + // CHECK-DAG: %[[SMEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[SMEM_BASE_PTR]][%[[INT]]] + + // CHECK-DAG: %[[MEM_INT:.+]] = llvm.ptrtoint %[[MEM_BASE_OFFSET]] : !llvm.ptr<1> to i64 + // CHECK-DAG: %[[SMEM_INT:.+]] = llvm.ptrtoint %[[SMEM_BASE_OFFSET]] : !llvm.ptr<3> to i32 + + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(144115188075855871 : i64) : i64 + // CHECK: %[[MEM_INT_LOW_57:.+]] = llvm.and %[[MEM_INT]], %[[MASK]] + // CHECK: %[[C32:.+]] = llvm.mlir.constant(32 : i64) : i64 + // CHECK: %[[SHIFT:.+]] = llvm.lshr %[[MEM_INT_LOW_57]], %[[C32]] + // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32 + // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32 + + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) : i32 + + // CHECK: %[[V4I32_1_0:.+]] = llvm.mlir.undef : vector<4xi32> + // CHECK: %[[V4I32_1_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_1_0]][%[[C0]] : i32] + // CHECK: %[[V4I32_1_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_1_1]][%[[C1]] : i32] + // CHECK: %[[V4I32_1_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_1_2]][%[[C2]] : i32] + // CHECK: %[[V4I32_1_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_1_3]][%[[C3]] : i32] + + %1 = amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu_lds_addrspace>, memref<8xi32, #gpu_global_addrspace> -> !amdgpu.tdm_base + + func.return %0, %1 : !amdgpu.tdm_base, !amdgpu.tdm_base +} From 3ee5464060d2bae3e071b4910ef09e0b0d4f6728 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Thu, 27 Nov 2025 09:41:38 -0500 Subject: [PATCH 04/28] Update documentation --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 9cb0752fba48b..1806c747046b8 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1258,19 +1258,21 @@ def AMDGPU_MakeDmaBaseOp : to ```mlir - // pseudocode - %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)> - %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)> - %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct(ptr, ptr)> - // type(%base_2) = !llvm.struct<(ptr, ptr) roughly corresponds to amdgpu.tdm_base - - // The base will 
be used when contructing dgroup0
-    // when lowering amdgpu.make_dma_descriptor
-    %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)>
-    %dgroup0_1 = llvm.insertvalue %base2, %dgroup0_0 : ....
-
-    // When lowering amdgpu.tensor_load_to_lds
-    rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+    // pseudo-code
+    %global_base = llvm.extractvalue %global_memref[1]
+    %global_address = llvm.getelementptr ...
+
+    %lds_base = llvm.extractvalue %lds_memref[1]
+    %lds_address = llvm.getelementptr ...
+
+    // Definition of %base
+    %undef = llvm.mlir.undef : vector<4xi32>
+    %v0 = llvm.insertelement %zero, %undef[0] : vector<4xi32>
+    %v1 = llvm.insertelement %lds_address, %v0[1] : vector<4xi32>
+    %v2 = llvm.insertelement %global_address_low, %v1[2] : vector<4xi32>
+    %base = llvm.insertelement %global_address_high, %v2[3] : vector<4xi32>
+
+    rocdl.tensor.load.to.lds %base, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
     ```
 
     These tensor DMA operations were introduced in gfx1250.

From 7aa7699e3de3624c58b026ea9087a63b2033ff61 Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Thu, 27 Nov 2025 11:10:03 -0500
Subject: [PATCH 05/28] [amdgpu][mlir] make_dma_base add type information.

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp |  7 ++++++-
 mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir     | 10 ++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 3316e16a05d5c..452c4e96e62c1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2316,6 +2316,10 @@ struct AMDGPUMakeDmaBaseLowering
         LLVM::TruncOp::create(rewriter, loc, i32, first57BitsOfGlobalAddr);
     Value highHalf = LLVM::TruncOp::create(rewriter, loc, i32, shift);
 
+    Value typeMask = createI32Constant(rewriter, loc, 2 << 30);
+    Value highHalfPlusType =
+        LLVM::OrOp::create(rewriter, loc, highHalf, typeMask);
+
     Value c0 = createI32Constant(rewriter, loc, 0);
     Value c1 = createI32Constant(rewriter, loc, 1);
     Value c2 = createI32Constant(rewriter, loc, 2);
     Value c3 = createI32Constant(rewriter, loc, 3);
@@ -2327,7 +2331,8 @@ struct AMDGPUMakeDmaBaseLowering
     result = LLVM::InsertElementOp::create(rewriter, loc, result, lowHalf, c2);
-    result = LLVM::InsertElementOp::create(rewriter, loc, result, highHalf, c3);
+    result = LLVM::InsertElementOp::create(rewriter, loc, result,
+                                           highHalfPlusType, c3);
 
     rewriter.replaceOp(op, result);
     return success();
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 96d03a427215f..514ed9094da53 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -192,6 +192,9 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>
   // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32
   // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32
 
+  // CHECK-DAG: %[[TYPE_MASK:.+]] = llvm.mlir.constant(-2147483648 : i32)
+  // CHECK: %[[MEM_INT_HIGH_TYPE:.+]] = llvm.or %[[MEM_INT_HIGH]], %[[TYPE_MASK]]
+
   // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32
@@ -201,7 +204,7 @@ func.func @make_dma_base(%idx: index,
%mem: memref<8xi32, #gpu_global_addrspace> // CHECK: %[[V4I32_0_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_0_0]][%[[C0]] : i32] // CHECK: %[[V4I32_0_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_0_1]][%[[C1]] : i32] // CHECK: %[[V4I32_0_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_0_2]][%[[C2]] : i32] - // CHECK: %[[V4I32_0_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_0_3]][%[[C3]] : i32] + // CHECK: %[[V4I32_0_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH_TYPE]], %[[V4I32_0_3]][%[[C3]] : i32] %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xi32, #gpu_lds_addrspace> -> !amdgpu.tdm_base @@ -221,6 +224,9 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32 // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32 + // CHECK-DAG: %[[TYPE_MASK:.+]] = llvm.mlir.constant(-2147483648 : i32) + // CHECK: %[[MEM_INT_HIGH_TYPE:.+]] = llvm.or %[[MEM_INT_HIGH]], %[[TYPE_MASK]] + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32 @@ -230,7 +236,7 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> // CHECK: %[[V4I32_1_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_1_0]][%[[C0]] : i32] // CHECK: %[[V4I32_1_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_1_1]][%[[C1]] : i32] // CHECK: %[[V4I32_1_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_1_2]][%[[C2]] : i32] - // CHECK: %[[V4I32_1_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_1_3]][%[[C3]] : i32] + // CHECK: %[[V4I32_1_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH_TYPE]], %[[V4I32_1_3]][%[[C3]] : i32] %1 = amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu_lds_addrspace>, memref<8xi32, #gpu_global_addrspace> -> !amdgpu.tdm_base From 9f37e601e97e024a9c7ed6877acbc22be154a5ab Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Thu, 27 Nov 2025 12:40:30 -0500 Subject: [PATCH 06/28] [mlir][amdgpu] Add AllElementTypesMatch attribute to make_dma_base --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +- mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 1806c747046b8..23eacab216468 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1228,7 +1228,7 @@ def AMDGPU_ScaledMFMAOp : } def AMDGPU_MakeDmaBaseOp : - AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>, + AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments, AllElementTypesMatch<["src", "dst"]>]>, Arguments<(ins Arg:$src, Variadic:$src_indices, diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index 514ed9094da53..272c7b375b9f8 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -169,6 +169,18 @@ func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M #gpu_lds_addrspace = 3 #amdgpu_fat_buffer_addrspace = 7 +func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xf32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) { + // expected-error@+1 
{{'amdgpu.make_dma_base' op failed to verify that all of {src, dst} have same element type}}
+  %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xf32, #gpu_lds_addrspace> -> !amdgpu.tdm_base
+  return %0 : !amdgpu.tdm_base
+}
+
+// -----
+
+#gpu_global_addrspace = 1
+#gpu_lds_addrspace = 3
+#amdgpu_fat_buffer_addrspace = 7
+

From 3a427759bdcafde5905582f3e92300cf6eeab0f1 Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Thu, 27 Nov 2025 13:11:59 -0500
Subject: [PATCH 07/28] [mlir][amdgpu] verify element type sizes for make_dma_base

---
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp    | 17 +++++++++++++++++
 mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 12 ++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 8fc6220efc6ad..75b4fdb3fbdd5 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -721,6 +721,23 @@ LogicalResult MakeDmaBaseOp::verify() {
   bool is_valid = store_from_lds != load_to_lds;
   if (!is_valid)
     return emitOpError("invalid combination of address spaces.");
+
+  Type elementType = srcType.getElementType();
+  int width;
+  if (auto intType = dyn_cast<IntegerType>(elementType)) {
+    width = intType.getWidth();
+  } else if (auto floatType = dyn_cast<FloatType>(elementType)) {
+    width = floatType.getWidth();
+  } else {
+    return emitOpError("element type must be an integer or float type");
+  }
+
+  if (!llvm::is_contained({8, 16, 32, 64}, width)) {
+    return emitOpError(
+               "element type must be 1, 2, 4, or 8 bytes long but type was ")
+           << width << " bits long.";
+  }
+
   return success();
 }
 
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 272c7b375b9f8..172664e8a0e8d 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -181,6 +181,18 @@
 #gpu_lds_addrspace = 3
 #amdgpu_fat_buffer_addrspace = 7
 
+func.func @amdgpu.make_dma_base.invalid_element_types(%idx: index, %mem: memref<8xi7, #gpu_global_addrspace>, %smem: memref<8xi7,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base) {
+  // expected-error@+1 {{'amdgpu.make_dma_base' op element type must be 1, 2, 4, or 8 bytes long but type was 7 bits long.}}
+  %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi7, #gpu_global_addrspace>, memref<8xi7, #gpu_lds_addrspace> -> !amdgpu.tdm_base
+  return %0 : !amdgpu.tdm_base
+}
+
+// -----
+
+#gpu_global_addrspace = 1
+#gpu_lds_addrspace = 3
+#amdgpu_fat_buffer_addrspace = 7
+

From c0cd803d7e3ee06cd89e77d5d5a45b0adc1242ab Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Wed, 26 Nov 2025 14:06:54 -0500
Subject: [PATCH 08/28] [mlir][amdgpu] Lower make_dma_descriptor

Initial lowering for make_dma_descriptor.
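The lowering materializes the four ROCDL descriptor groups: group 0 is the
vector<4xi32> base built by make_dma_base, group 1 is a vector<8xi32> packing
the data size, padding, atomic-barrier, and tensor/tile shape fields, and
groups 2 and 3 are vector<4xi32> values that are only minimally populated for
rank-2 transfers.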
At the moment it only supports tensors of rank 2.
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  53 ++-
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 416 +++++++++++++++++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |  10 +
 .../Conversion/AMDGPUToROCDL/gfx1250.mlir     |  91 ++++
 mlir/test/Dialect/AMDGPU/invalid.mlir         |  19 +-
 mlir/test/Dialect/AMDGPU/ops.mlir             |  48 +-
 6 files changed, 582 insertions(+), 55 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 23eacab216468..28efa246689a1 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1295,8 +1295,8 @@ def AMDGPU_MakeDmaDescriptorOp :
       DenseI64ArrayAttr: $global_static_strides,
       Variadic: $shared_dynamic_sizes,
       DenseI64ArrayAttr: $shared_static_sizes,
-      Optional: $pad,
-      Optional: $pad_every,
+      Optional: $pad_amount,
+      Optional: $pad_interval,
       Optional: $atomic_barrier_address,
       Variadic: $atomic_barrier_indices,
       Optional: $global_increment,
@@ -1332,7 +1332,7 @@ def AMDGPU_MakeDmaDescriptorOp :
 
     // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer.
     %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
-    %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+    %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
     amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
     ```
   }];
@@ -1342,13 +1342,58 @@ def AMDGPU_MakeDmaDescriptorOp :
     `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
     `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
     `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
-    ( `padShared` `(` $pad^ `every` $pad_every `)` )?
+    ( `padShared` `(` $pad_amount^ `every` $pad_interval `)` )?
     ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
      `:` type($atomic_barrier_address) `)`)?
     ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
    attr-dict `:` qualified(type($base)) `->` type(results)
   }];
 
+  let extraClassDeclaration = [{
+    int getRank() {
+      return getGlobalStaticSizes().size();
+    }
+
+    int getElementTypeWidth() {
+      Type elementType = getBase().getType().getElementType();
+      int width;
+      if (auto floatType = dyn_cast<FloatType>(elementType)) {
+        width = floatType.getWidth();
+      } else if (auto intType = dyn_cast<IntegerType>(elementType)) {
+        width = intType.getWidth();
+      } else {
+        llvm_unreachable("element type must have getWidth interface");
+      }
+      return width;
+    }
+
+    SmallVector<OpFoldResult> getMixedList(SmallVector<Value> dynamics,
+                                           ArrayRef<int64_t> statics) {
+      SmallVector<OpFoldResult> result;
+      unsigned ctr = 0;
+      OpBuilder b(getContext());
+      for (int64_t static_elem : statics) {
+        if (ShapedType::isDynamic(static_elem)) {
+          result.push_back(dynamics[ctr++]);
+        } else {
+          result.push_back(b.getIndexAttr(static_elem));
+        }
+      }
+      return result;
+    }
+
+    SmallVector<OpFoldResult> getMixedGlobalSizes() {
+      return getMixedList(getGlobalDynamicSizes(), getGlobalStaticSizes());
+    }
+
+    SmallVector<OpFoldResult> getMixedGlobalStrides() {
+      return getMixedList(getGlobalDynamicStrides(), getGlobalStaticStrides());
+    }
+
+    SmallVector<OpFoldResult> getMixedSharedSizes() {
+      return getMixedList(getSharedDynamicSizes(), getSharedStaticSizes());
+    }
+  }];
+
   let hasVerifier = 1;
 }
 
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 452c4e96e62c1..1e81d339b0ddc 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2275,8 +2275,9 @@ struct AMDGPUMakeDmaBaseLowering
   LogicalResult
   matchAndRewrite(MakeDmaBaseOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    if (chipset < kGfx1250)
+    if (chipset < kGfx1250) {
       return op->emitOpError("make_dma_base is only supported on gfx1250");
+    }
 
     Location loc = op.getLoc();
 
@@ -2339,6 +2340,375 @@ struct AMDGPUMakeDmaBaseLowering
   }
 };
 
+struct AMDGPUMakeDmaDescriptorLowering
+    : public ConvertOpToLLVMPattern<MakeDmaDescriptorOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  AMDGPUMakeDmaDescriptorLowering(const LLVMTypeConverter &converter,
+                                  Chipset chipset)
+      : ConvertOpToLLVMPattern(converter),
+        chipset(chipset) {}
+  Chipset chipset;
+
+  Value getDGroup0(OpAdaptor adaptor) const { return adaptor.getBase(); }
+
+  Value setValueAtOffset(ConversionPatternRewriter &rewriter, Location loc,
+                         Value accumulator, Value value, int shift) const {
+    shift = shift % 32;
+    Value shiftAmount;
+    if (shift != 0) {
+      shiftAmount = createI32Constant(rewriter, loc, shift % 32);
+      value = LLVM::ShlOp::create(rewriter, loc, value, shiftAmount);
+    }
+    return LLVM::OrOp::create(rewriter, loc, accumulator, value);
+  }
+
+  Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                    ConversionPatternRewriter &rewriter, Location loc,
+                    Value sgpr0, const SmallVector<Value> consts) const {
+    // Compute data_size.
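+    // data_size is inserted at bit offset 16 of sgpr0 below and encodes log2
+    // of the element width in bytes (0 = 1 byte, 1 = 2 bytes, 2 = 4 bytes,
+    // 3 = 8 bytes), which is why the switch below picks consts[0] through
+    // consts[3].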
+    int elementTypeWidthInBytes = op.getElementTypeWidth() / 8;
+
+    Value dataSize;
+    switch (elementTypeWidthInBytes) {
+    case 1:
+      dataSize = consts[0];
+      break;
+    case 2:
+      dataSize = consts[1];
+      break;
+    case 4:
+      dataSize = consts[2];
+      break;
+    case 8:
+      dataSize = consts[3];
+      break;
+    default:
+      llvm_unreachable("Invalid element size.");
+    }
+    return setValueAtOffset(rewriter, loc, sgpr0, dataSize, 16);
+  }
+
+  Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                         ConversionPatternRewriter &rewriter, Location loc,
+                         Value sgpr0, const SmallVector<Value> &consts) const {
+    bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
+    if (!atomic_barrier_enable)
+      return sgpr0;
+
+    return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 18);
+  }
+
+  Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                         ConversionPatternRewriter &rewriter, Location loc,
+                         Value sgpr0, const SmallVector<Value> &consts) const {
+    bool iterate_enable = adaptor.getGlobalIncrement() != nullptr;
+    if (!iterate_enable)
+      return sgpr0;
+
+    return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 19);
+  }
+
+  Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                     ConversionPatternRewriter &rewriter, Location loc,
+                     Value sgpr0, const SmallVector<Value> &consts) const {
+    bool pad_enable = op.getPadAmount() != nullptr;
+    if (!pad_enable)
+      return sgpr0;
+
+    return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20);
+  }
+
+  Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                       ConversionPatternRewriter &rewriter, Location loc,
+                       Value sgpr0, const SmallVector<Value> &consts) const {
+    bool pad_enable = op.getPadAmount() != nullptr;
+    if (!pad_enable)
+      return sgpr0;
+
+    IntegerType i32 = rewriter.getI32Type();
+    Value padInterval = adaptor.getPadInterval();
+    // pre-condition: padInterval can be a power of two between 2 and 256
+    padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32,
+                                                     padInterval, false);
+    padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]);
+    // post-condition: padInterval can be a value between 0 and 7
+    return setValueAtOffset(rewriter, loc, sgpr0, padInterval, 22);
+  }
+
+  Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                     ConversionPatternRewriter &rewriter, Location loc,
+                     Value sgpr0, const SmallVector<Value> &consts) const {
+    bool pad_enable = op.getPadAmount() != nullptr;
+    if (!pad_enable)
+      return sgpr0;
+
+    Value padAmount = adaptor.getPadAmount();
+    // pre-condition: padAmount is a value between 1-128
+    padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
+    // post-condition: padAmount is a value between 0-127
+    return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25);
+  }
+
+  Value setAtomicBarrierAddress(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter,
+                                Location loc, Value sgpr1,
+                                const SmallVector<Value> &consts) const {
+    bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
+    if (!atomic_barrier_enable)
+      return sgpr1;
+
+    Value atomicBarrierAddress = adaptor.getAtomicBarrierAddress();
+    IntegerType i32 = rewriter.getI32Type();
+    atomicBarrierAddress =
+        LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress);
+    atomicBarrierAddress =
+        LLVM::LShrOp::create(rewriter, loc, atomicBarrierAddress, consts[3]);
+    Value mask = createI32Constant(rewriter, loc, 0xFFFF);
+    atomicBarrierAddress =
+        LLVM::AndOp::create(rewriter, loc, atomicBarrierAddress, mask);
+    return setValueAtOffset(rewriter, loc, sgpr1,
                            atomicBarrierAddress, 32);
+  }
+
+  std::pair<Value, Value>
+  setTensorDim0(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                ConversionPatternRewriter &rewriter, Location loc, Value sgpr1,
+                Value sgpr2, const SmallVector<Value> &consts) const {
+    SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
+    OpFoldResult tensorDim0OpFoldResult = mixedGlobalSizes.back();
+    Value tensorDim0;
+    if (auto attr = dyn_cast<Attribute>(tensorDim0OpFoldResult)) {
+      tensorDim0 =
+          createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
+    } else {
+      tensorDim0 = cast<Value>(tensorDim0OpFoldResult);
+    }
+    Value c16 = createI32Constant(rewriter, loc, 16);
+    Value tensorDim0High = LLVM::LShrOp::create(rewriter, loc, tensorDim0, c16);
+    sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDim0, 48);
+    sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim0High, 48 + 16);
+    return {sgpr1, sgpr2};
+  }
+
+  std::pair<Value, Value>
+  setTensorDim1(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                ConversionPatternRewriter &rewriter, Location loc, Value sgpr2,
+                Value sgpr3, const SmallVector<Value> &consts) const {
+    SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
+    OpFoldResult tensorDim1OpFoldResult = *(mixedGlobalSizes.rbegin() + 1);
+    Value tensorDim1;
+    if (auto attr = dyn_cast<Attribute>(tensorDim1OpFoldResult)) {
+      tensorDim1 =
+          createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
+    } else {
+      tensorDim1 = cast<Value>(tensorDim1OpFoldResult);
+    }
+    Value c16 = createI32Constant(rewriter, loc, 16);
+    Value tensorDim1High = LLVM::LShrOp::create(rewriter, loc, tensorDim1, c16);
+    sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim1, 80);
+    sgpr3 = setValueAtOffset(rewriter, loc, sgpr3, tensorDim1High, 80 + 16);
+    return {sgpr2, sgpr3};
+  }
+
+  Value setTileDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                    ConversionPatternRewriter &rewriter, Location loc,
+                    Value sgpr, const SmallVector<Value> &consts, size_t dimX,
+                    int offset) const {
+    SmallVector<OpFoldResult> mixedSharedSizes = op.getMixedSharedSizes();
+
+    if (mixedSharedSizes.size() <= dimX) {
+      return sgpr;
+    }
+
+    OpFoldResult tileDimXOpFoldResult = *(mixedSharedSizes.rbegin() + dimX);
+    Value tileDimX;
+    if (auto attr = dyn_cast<Attribute>(tileDimXOpFoldResult)) {
+      tileDimX =
+          createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
+    } else {
+      tileDimX = cast<Value>(tileDimXOpFoldResult);
+    }
+    return setValueAtOffset(rewriter, loc, sgpr, tileDimX, offset);
+  }
+
+  Value setTileDim0(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                    ConversionPatternRewriter &rewriter, Location loc,
+                    Value sgpr3, const SmallVector<Value> &consts) const {
+    return setTileDimX(op, adaptor, rewriter, loc, sgpr3, consts, 0, 112);
+  }
+
+  Value setTileDim1(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                    ConversionPatternRewriter &rewriter, Location loc,
+                    Value sgpr4, const SmallVector<Value> &consts) const {
+    return setTileDimX(op, adaptor, rewriter, loc, sgpr4, consts, 1, 128);
+  }
+
+  Value setTileDim2(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                    ConversionPatternRewriter &rewriter, Location loc,
+                    Value sgpr4, const SmallVector<Value> &consts) const {
+    return setTileDimX(op, adaptor, rewriter, loc, sgpr4, consts, 2, 144);
+  }
+
+  std::pair<Value, Value>
+  setTensorDimXStride(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                      ConversionPatternRewriter &rewriter, Location loc,
+                      Value sgprY, Value sgprZ,
+                      const SmallVector<Value> &consts, size_t dimX,
+                      int offset) const {
+    SmallVector<OpFoldResult> mixedGlobalStrides = op.getMixedGlobalStrides();
+
+    if (mixedGlobalStrides.size() <= dimX) {
+      return {sgprY, sgprZ};
+    }
+
+    OpFoldResult tensorDimXStrideOpFoldResult =
+        *(mixedGlobalStrides.rbegin() + dimX);
+    Value tensorDimXStride;
+    if
(auto attr = dyn_cast<Attribute>(tensorDimXStrideOpFoldResult)) {
+      tensorDimXStride =
+          createI64Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
+    } else {
+      tensorDimXStride = cast<Value>(tensorDimXStrideOpFoldResult);
+    }
+
+    constexpr int64_t first48bits = 0xFFFFFFFFFFFF;
+    Value mask = createI64Constant(rewriter, loc, first48bits);
+    tensorDimXStride =
+        LLVM::AndOp::create(rewriter, loc, mask, tensorDimXStride);
+    IntegerType i32 = rewriter.getI32Type();
+    Value tensorDimXStrideLow =
+        LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStride);
+
+    int shift = (offset % 32) == 0 ? 32 : offset % 32;
+    Value shiftVal = createI64Constant(rewriter, loc, shift);
+    Value tensorDimXStrideHigh =
+        LLVM::LShrOp::create(rewriter, loc, tensorDimXStride, shiftVal);
+    tensorDimXStrideHigh =
+        LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStrideHigh);
+
+    sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset);
+    sgprZ = setValueAtOffset(rewriter, loc, sgprZ, tensorDimXStrideHigh,
+                             offset + shift);
+    return {sgprY, sgprZ};
+  }
+
+  std::pair<Value, Value>
+  setTensorDim0Stride(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                      ConversionPatternRewriter &rewriter, Location loc,
+                      Value sgpr5, Value sgpr6,
+                      const SmallVector<Value> &consts) const {
+    return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts,
+                               0, 160);
+  }
+
+  std::pair<Value, Value>
+  setTensorDim1Stride(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                      ConversionPatternRewriter &rewriter, Location loc,
+                      Value sgpr5, Value sgpr6,
+                      const SmallVector<Value> &consts) const {
+    return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts,
+                               1, 208);
+  }
+
+  Value getDGroup1(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                   ConversionPatternRewriter &rewriter, Location loc,
+                   const SmallVector<Value> &consts) const {
+
+    Value sgpr0, sgpr1, sgpr2, sgpr3, sgpr4, sgpr5, sgpr6, sgpr7;
+    sgpr0 = sgpr1 = sgpr2 = sgpr3 = sgpr4 = sgpr5 = sgpr6 = sgpr7 = consts[0];
+
+    sgpr0 = setDataSize(op, adaptor, rewriter, loc, sgpr0, consts);
+    sgpr0 = setAtomicBarrier(op, adaptor, rewriter, loc, sgpr0, consts);
+    sgpr0 = setIterateEnable(op, adaptor, rewriter, loc, sgpr0, consts);
+    sgpr0 = setPadEnable(op, adaptor, rewriter, loc, sgpr0, consts);
+    sgpr0 = setPadInterval(op, adaptor, rewriter, loc, sgpr0, consts);
+    sgpr0 = setPadAmount(op, adaptor, rewriter, loc, sgpr0, consts);
+
+    sgpr1 = setAtomicBarrierAddress(op, adaptor, rewriter, loc, sgpr1, consts);
+    std::tie(sgpr1, sgpr2) =
+        setTensorDim0(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts);
+    std::tie(sgpr2, sgpr3) =
+        setTensorDim1(op, adaptor, rewriter, loc, sgpr2, sgpr3, consts);
+
+    sgpr3 = setTileDim0(op, adaptor, rewriter, loc, sgpr3, consts);
+    sgpr4 = setTileDim1(op, adaptor, rewriter, loc, sgpr4, consts);
+    sgpr4 = setTileDim2(op, adaptor, rewriter, loc, sgpr4, consts);
+    std::tie(sgpr5, sgpr6) =
+        setTensorDim0Stride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts);
+    std::tie(sgpr6, sgpr7) =
+        setTensorDim1Stride(op, adaptor, rewriter, loc, sgpr6, sgpr7, consts);
+
+    IntegerType i32 = rewriter.getI32Type();
+    Type v8i32 = this->typeConverter->convertType(VectorType::get(8, i32));
+    Value dgroup1 = LLVM::UndefOp::create(rewriter, loc, v8i32);
+
+    dgroup1 =
+        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr0, consts[0]);
+    dgroup1 =
+        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr1, consts[1]);
+    dgroup1 =
+        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr2, consts[2]);
+    dgroup1 =
+        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr3, consts[3]);
+    dgroup1 =
LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr4, consts[4]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr5, consts[5]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr6, consts[6]); + dgroup1 = + LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr7, consts[7]); + + return dgroup1; + } + + LogicalResult + matchAndRewrite(MakeDmaDescriptorOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (chipset < kGfx1250) { + return op->emitOpError( + "make_dma_descriptor is only supported on gfx1250"); + } + + if (op.getRank() != 2) { + return op->emitOpError("unimplemented"); + } + + Location loc = op.getLoc(); + + IntegerType i32 = rewriter.getI32Type(); + Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32)); + + SmallVector consts; + for (int i = 0; i < 8; i++) { + consts.push_back(createI32Constant(rewriter, loc, i)); + } + + Value dgroup0 = this->getDGroup0(adaptor); + Value dgroup1 = this->getDGroup1(op, adaptor, rewriter, loc, consts); + Value undefV4I32 = LLVM::UndefOp::create(rewriter, loc, v4i32); + Value dgroup2 = undefV4I32; + Value dgroup3 = undefV4I32; + + if (op.getRank() == 2) { + Value nullConstant = createI32Constant(rewriter, loc, 0x7c); + dgroup2 = LLVM::InsertElementOp::create(rewriter, loc, dgroup2, + nullConstant, consts[0]); + dgroup2 = LLVM::InsertElementOp::create(rewriter, loc, dgroup2, consts[0], + consts[1]); + dgroup2 = LLVM::InsertElementOp::create(rewriter, loc, dgroup2, consts[0], + consts[2]); + dgroup2 = LLVM::InsertElementOp::create(rewriter, loc, dgroup2, consts[0], + consts[3]); + dgroup3 = dgroup2; + } + + SmallVector results = {dgroup0, dgroup1, dgroup2, dgroup3}; + rewriter.replaceOpWithMultiple(op, {results}); + return success(); + } +}; + struct ConvertAMDGPUToROCDLPass : public impl::ConvertAMDGPUToROCDLPassBase { using Base::Base; @@ -2392,27 +2762,27 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, Chipset chipset) { populateAMDGPUMemorySpaceAttributeConversions(converter); - patterns - .add, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering, - SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering, - WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering, - ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, - PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, - GatherToLDSOpLowering, TransposeLoadOpLowering, - AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering>(converter, - chipset); + patterns.add< + FatRawBufferCastLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering, + SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering, + WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering, + ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, + PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, + GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering, + AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering>(converter, + chipset); patterns.add(converter); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp 
b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 75b4fdb3fbdd5..42797dadbb7e0 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -757,6 +757,9 @@ LogicalResult MakeDmaDescriptorOp::verify() { ArrayRef globalStaticSizes = getGlobalStaticSizes(); size_t rank = globalStaticSizes.size(); + if (rank < 2) { + return emitOpError("tensor and tile must be at least of rank 2."); + } if (rank != globalStaticStrides.size()) { return emitOpError("strides and sizes must have same rank."); } @@ -766,6 +769,13 @@ LogicalResult MakeDmaDescriptorOp::verify() { return emitOpError("tensor must have same rank as tile."); } + int elementTypeWidth = getElementTypeWidth(); + if (!llvm::is_contained({8, 16, 32, 64}, elementTypeWidth)) { + return emitOpError( + "element type width must be 1, 2, 4 or 8 bytes, but was ") + << elementTypeWidth << " bits long"; + } + return success(); } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index 172664e8a0e8d..e774cc8ca4f70 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -266,3 +266,94 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace> func.return %0, %1 : !amdgpu.tdm_base, !amdgpu.tdm_base } + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) -> !amdgpu.tdm_descriptor { + // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]] + + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) + // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) + // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) + // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32) + // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32) + // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32) + // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32) + + // CHECK: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[SIZE:.+]] = llvm.shl %[[C2]], %[[C16]] + // CHECK: %[[SGPR0:.+]] = llvm.or %[[C0]], %[[SIZE]] + + // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32) + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TENSOR_DIM_0_HIGH:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]] + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TENSOR_DIM_0_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]] + // CHECK: %[[SGPR1:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_SHIFTED]] + // CHECK: %[[SGPR2_0:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_HIGH]] + + // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32) + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TENSOR_DIM_1_HIGH:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]] + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]] + // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]] + // CHECK: %[[SGPR3_0:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_1_HIGH]] + + // CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32) + // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]] + // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_0_SHIFTED]] + + // CHECK-DAG: %[[TILE_DIM_1:.+]] = 
llvm.mlir.constant(128 : i32) + // CHECK: %[[SGPR4:.+]] = llvm.or %[[C0]], %[[TILE_DIM_1]] + + // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64 + // CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]] + // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32 + // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64 + // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]] + // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32 + // CHECK: %[[SGPR5:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_STRIDE_LOW]] + // CHECK: %[[SGPR6_0:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_STRIDE_HIGH]] + + // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64) + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64 + // CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]] + // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]] + // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64 + // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]] + // CHECK: %[[TENSOR_DIM_1_STRIDE_HIGH:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32 + // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]] + // CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]] + // CHECK-DAG: %[[SGPR7:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_1_STRIDE_HIGH]] + + // CHECK: %[[V8I32:.+]] = llvm.mlir.undef : vector<8xi32> + // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32] + // CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32] + // CHECK: %[[DGROUP1_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP1_1]][%[[C2]] : i32] + // CHECK: %[[DGROUP1_3:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP1_2]][%[[C3]] : i32] + // CHECK: %[[DGROUP1_4:.+]] = llvm.insertelement %[[SGPR4]], %[[DGROUP1_3]][%[[C4]] : i32] + // CHECK: %[[DGROUP1_5:.+]] = llvm.insertelement %[[SGPR5]], %[[DGROUP1_4]][%[[C5]] : i32] + // CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32] + // CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32] + + // CHECK-DAG: %[[V4I32:.+]] = llvm.mlir.undef : vector<4xi32> + + // CHECK-DAG: %[[NULL:.+]] = llvm.mlir.constant(124 : i32) + + // CHECK: %[[NULL_GROUP_0:.+]] = llvm.insertelement %[[NULL]], %[[V4I32]][%[[C0]] : i32] + // CHECK: %[[NULL_GROUP_1:.+]] = llvm.insertelement %[[C0]], %[[NULL_GROUP_0]][%[[C1]] : i32] + // CHECK: %[[NULL_GROUP_2:.+]] = llvm.insertelement %[[C0]], %[[NULL_GROUP_1]][%[[C2]] : i32] + // CHECK: %[[NULL_GROUP:.+]] = llvm.insertelement %[[C0]], %[[NULL_GROUP_2]][%[[C3]] : i32] + + // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]], %[[NULL_GROUP]], %[[NULL_GROUP]] : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32> to !amdgpu.tdm_descriptor + %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return %descriptor : 
!amdgpu.tdm_descriptor +} + diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 066f46060f62f..2374124e2a083 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -361,7 +361,7 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}} - amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [0, 1] globalStride [] sharedSize [1, 0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } @@ -371,7 +371,7 @@ func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} - amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [1, 0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } @@ -381,7 +381,7 @@ func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base< // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}} - amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [1, 1, 1] globalStride [1, 1] sharedSize [1, 0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } @@ -391,6 +391,17 @@ func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_ // CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base) { // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}} - amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [1, 2, 3] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + + +// CHECK-LABEL: func @make_dma_descriptor_invalid_rank_less_than_two +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_rank_less_than_two(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor and tile must be at least of rank 2.}} + amdgpu.make_dma_descriptor %base globalSize [4] globalStride [1, 1] sharedSize [1, 2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor func.return } diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index aa6bedc0e1135..79cb75d782c05 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -703,45 +703,45 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // 
CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor - sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [64, 64] // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]]) padShared(%idx every %idx) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [64, 64] // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32>) atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor // CHECK: amdgpu.make_dma_descriptor %[[BASE]] amdgpu.make_dma_descriptor %base - // CHECK-SAME: globalSize [0] - globalSize [0] - // CHECK-SAME: globalStride [1] - globalStride [1] - // CHECK-SAME: sharedSize [0] - sharedSize [0] + // CHECK-SAME: globalSize [64, 64] + globalSize [64, 64] + // CHECK-SAME: globalStride [64, 1] + globalStride [64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [64, 64] // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] iterate %idx, %idx, %idx : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor From 2973181cd611bab90235d6f5d0b4c8f28194f560 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Fri, 28 Nov 2025 14:04:00 -0500 Subject: [PATCH 09/28] Folding --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 1 + mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 79 +++++++++++++++++++ .../amdgpu-make-dma-descriptor-fold.mlir | 19 +++++ mlir/test/Dialect/AMDGPU/ops.mlir | 1 + 4 files changed, 100 insertions(+) create mode 100644 mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 28efa246689a1..c072ebdfa5d26 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1395,6 +1395,7 @@ def AMDGPU_MakeDmaDescriptorOp : }]; let hasVerifier = 1; + let hasFolder = 1; } def AMDGPU_TensorLoadToLDSOp : diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 42797dadbb7e0..002381ce8a8eb 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -760,6 +760,9 @@ LogicalResult MakeDmaDescriptorOp::verify() { if (rank < 2) { return emitOpError("tensor and tile must be at least of rank 2."); } + if (rank > 5) { + return emitOpError("tensor and tile must be at most of rank 5."); + } if (rank != globalStaticStrides.size()) { return emitOpError("strides and sizes must have same 
rank."); } @@ -779,6 +782,82 @@ LogicalResult MakeDmaDescriptorOp::verify() { return success(); } +static bool maybeUpdateDynamicIndexList( + ArrayRef staticElements, ArrayRef foldedElements, + SmallVector dynamicElements, SmallVector &newStaticElements, + SmallVector &newDynamicElements) { + bool changed = false; + int index = 0; + + for (int64_t static_element : staticElements) { + if (!ShapedType::isDynamic(static_element)) { + newStaticElements.push_back(static_element); + continue; + } + + Attribute folded_element = foldedElements[index++]; + if (auto attr = dyn_cast(folded_element)) { + newStaticElements.push_back(attr.getInt()); + changed = true; + continue; + } + + newStaticElements.push_back(ShapedType::kDynamic); + newDynamicElements.push_back(dynamicElements[index]); + } + return changed; +} + +OpFoldResult MakeDmaDescriptorOp::fold(FoldAdaptor adaptor) { + ArrayRef oldGlobalStaticStrides = adaptor.getGlobalStaticStrides(); + ArrayRef foldedGlobalDynamicStrides = + adaptor.getGlobalDynamicStrides(); + SmallVector oldGlobalDynamicStrides = getGlobalDynamicStrides(); + + SmallVector newGlobalStaticStrides; + SmallVector newGlobalDynamicStrides; + + bool change = maybeUpdateDynamicIndexList( + oldGlobalStaticStrides, foldedGlobalDynamicStrides, + oldGlobalDynamicStrides, newGlobalStaticStrides, newGlobalDynamicStrides); + + ArrayRef oldGlobalStaticSizes = adaptor.getGlobalStaticSizes(); + ArrayRef foldedGlobalDynamicSizes = + adaptor.getGlobalDynamicSizes(); + SmallVector oldGlobalDynamicSizes = getGlobalDynamicSizes(); + + SmallVector newGlobalStaticSizes; + SmallVector newGlobalDynamicSizes; + + change |= maybeUpdateDynamicIndexList( + oldGlobalStaticSizes, foldedGlobalDynamicSizes, oldGlobalDynamicSizes, + newGlobalStaticSizes, newGlobalDynamicSizes); + + ArrayRef oldSharedStaticSizes = adaptor.getSharedStaticSizes(); + ArrayRef foldedSharedDynamicSizes = + adaptor.getSharedDynamicSizes(); + SmallVector oldSharedDynamicSizes = getSharedDynamicSizes(); + + SmallVector newSharedStaticSizes; + SmallVector newSharedDynamicSizes; + + change |= maybeUpdateDynamicIndexList( + oldSharedStaticSizes, foldedSharedDynamicSizes, oldSharedDynamicSizes, + newSharedStaticSizes, newSharedDynamicSizes); + + if (change) { + setGlobalStaticStrides(newGlobalStaticStrides); + getGlobalDynamicStridesMutable().assign(newGlobalDynamicStrides); + setGlobalStaticSizes(newGlobalStaticSizes); + getGlobalDynamicSizesMutable().assign(newGlobalDynamicSizes); + setSharedStaticSizes(newSharedStaticSizes); + getSharedDynamicSizesMutable().assign(newSharedDynamicSizes); + return getResult(); + } + + return nullptr; +} + //===----------------------------------------------------------------------===// // ScaledMFMAOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir new file mode 100644 index 0000000000000..9d43c9940f8e0 --- /dev/null +++ b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir @@ -0,0 +1,19 @@ +// RUN: mlir-opt --canonicalize %s | FileCheck %s + +// CHECK-LABEL: @make_dma_descriptor_fold +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[IDX:.+]]: index) +func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base, %idx: index) -> !amdgpu.tdm_descriptor { + %c64 = arith.constant 64 : index + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + %0 = amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [64, 64] + 
globalSize [%c64, %c64] + // CHECK-SAME: globalStride [64, 1] + globalStride [%c64, 1] + // CHECK-SAME: sharedSize [64, 64] + sharedSize [%c64, %c64] + iterate %idx, %idx, %idx + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return %0 : !amdgpu.tdm_descriptor +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 79cb75d782c05..7da995df77037 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -758,3 +758,4 @@ func.func @tensor_load_store(%desc: !amdgpu.tdm_descriptor) { amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor return } + From bf0600bd42df5a9c987cf448f6ac28afe67e0d8e Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 2 Dec 2025 10:19:44 -0500 Subject: [PATCH 10/28] const SmallVector& to ArrayRef --- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 1e81d339b0ddc..ac8dc971091a3 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2365,7 +2365,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr0, const SmallVector consts) const { + Value sgpr0, ArrayRef consts) const { // Compute data_size. int elementTypeWidthInBytes = op.getElementTypeWidth() / 8; @@ -2391,7 +2391,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr0, const SmallVector &consts) const { + Value sgpr0, ArrayRef consts) const { bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr; if (!atomic_barrier_enable) return sgpr0; @@ -2401,7 +2401,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr0, const SmallVector &consts) const { + Value sgpr0, ArrayRef consts) const { bool iterate_enable = adaptor.getGlobalIncrement() != nullptr; if (!iterate_enable) return sgpr0; @@ -2411,7 +2411,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr0, const SmallVector &consts) const { + Value sgpr0, ArrayRef consts) const { bool pad_enable = op.getPadAmount() != nullptr; if (!pad_enable) return sgpr0; @@ -2421,7 +2421,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr0, const SmallVector &consts) const { + Value sgpr0, ArrayRef consts) const { bool pad_enable = op.getPadAmount() != nullptr; if (!pad_enable) return sgpr0; @@ -2438,7 +2438,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr0, const SmallVector &consts) const { + Value sgpr0, ArrayRef consts) const { bool pad_enable = op.getPadAmount() != nullptr; if (!pad_enable) return sgpr0; @@ -2453,7 +2453,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value setAtomicBarrierAddress(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, Value sgpr1, - const SmallVector &consts) 
const { + ArrayRef consts) const { bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr; if (!atomic_barrier_enable) return sgpr1; @@ -2470,10 +2470,11 @@ struct AMDGPUMakeDmaDescriptorLowering return setValueAtOffset(rewriter, loc, sgpr1, atomicBarrierAddress, 32); } - std::pair - setTensorDim0(MakeDmaDescriptorOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter, Location loc, Value sgpr1, - Value sgpr2, const SmallVector &consts) const { + std::pair setTensorDim0(MakeDmaDescriptorOp op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + Location loc, Value sgpr1, Value sgpr2, + ArrayRef consts) const { SmallVector mixedGlobalSizes = op.getMixedGlobalSizes(); OpFoldResult tensorDim0OpFoldResult = mixedGlobalSizes.back(); Value tensorDim0; @@ -2490,10 +2491,11 @@ struct AMDGPUMakeDmaDescriptorLowering return {sgpr1, sgpr2}; } - std::pair - setTensorDim1(MakeDmaDescriptorOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter, Location loc, Value sgpr2, - Value sgpr3, const SmallVector &consts) const { + std::pair setTensorDim1(MakeDmaDescriptorOp op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + Location loc, Value sgpr2, Value sgpr3, + ArrayRef consts) const { SmallVector mixedGlobalSizes = op.getMixedGlobalSizes(); OpFoldResult tensorDim1OpFoldResult = *(mixedGlobalSizes.rbegin() + 1); Value tensorDim1; @@ -2512,7 +2514,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value setTileDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr, const SmallVector &consts, size_t dimX, + Value sgpr, ArrayRef consts, size_t dimX, int offset) const { SmallVector mixedSharedSizes = op.getMixedSharedSizes(); @@ -2533,28 +2535,27 @@ struct AMDGPUMakeDmaDescriptorLowering Value setTileDim0(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr3, const SmallVector &consts) const { + Value sgpr3, ArrayRef consts) const { return setTileDimX(op, adaptor, rewriter, loc, sgpr3, consts, 0, 112); } Value setTileDim1(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr4, const SmallVector &consts) const { + Value sgpr4, ArrayRef consts) const { return setTileDimX(op, adaptor, rewriter, loc, sgpr4, consts, 1, 128); } Value setTileDim2(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr4, const SmallVector &consts) const { + Value sgpr4, ArrayRef consts) const { return setTileDimX(op, adaptor, rewriter, loc, sgpr4, consts, 2, 144); } std::pair setTensorDimXStride(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgprY, Value sgprZ, - const SmallVector &consts, size_t dimX, - int offset) const { + Value sgprY, Value sgprZ, ArrayRef consts, + size_t dimX, int offset) const { SmallVector mixedGlobalStrides = op.getMixedGlobalStrides(); if (mixedGlobalStrides.size() <= dimX) { @@ -2595,8 +2596,7 @@ struct AMDGPUMakeDmaDescriptorLowering std::pair setTensorDim0Stride(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr5, Value sgpr6, - const SmallVector &consts) const { + Value sgpr5, Value sgpr6, ArrayRef consts) const { return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts, 0, 160); } @@ -2604,15 +2604,14 @@ struct AMDGPUMakeDmaDescriptorLowering std::pair setTensorDim1Stride(MakeDmaDescriptorOp op, OpAdaptor 
adaptor, ConversionPatternRewriter &rewriter, Location loc, - Value sgpr5, Value sgpr6, - const SmallVector &consts) const { + Value sgpr5, Value sgpr6, ArrayRef consts) const { return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts, 1, 208); } Value getDGroup1(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, - const SmallVector &consts) const { + ArrayRef consts) const { Value sgpr0, sgpr1, sgpr2, sgpr3, sgpr4, sgpr5, sgpr6, sgpr7; sgpr0 = sgpr1 = sgpr2 = sgpr3 = sgpr4 = sgpr5 = sgpr6 = sgpr7 = consts[0]; From 0aa6fe1dbc2236305de8dd234d1ec302eb4e767f Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 2 Dec 2025 10:26:20 -0500 Subject: [PATCH 11/28] change expression for 48-bits --- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index ac8dc971091a3..809ce369ab717 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2572,7 +2572,7 @@ struct AMDGPUMakeDmaDescriptorLowering tensorDimXStride = cast(tensorDimXStrideOpFoldResult); } - constexpr int64_t first48bits = 0xFFFFFFFFFFFF; + constexpr int64_t first48bits = (1ll << 48) - 1; Value mask = createI64Constant(rewriter, loc, first48bits); tensorDimXStride = LLVM::AndOp::create(rewriter, loc, mask, tensorDimXStride); From f482bbb1d53037cc14d95fe1c2372abfdf4cef13 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 2 Dec 2025 15:11:53 -0500 Subject: [PATCH 12/28] int to int64_t --- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index beee56893cee4..0996e40393a74 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2348,7 +2348,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value getDGroup0(OpAdaptor adaptor) const { return adaptor.getBase(); } Value setValueAtOffset(ConversionPatternRewriter &rewriter, Location loc, - Value accumulator, Value value, int shift) const { + Value accumulator, Value value, int64_t shift) const { shift = shift % 32; Value shiftAmount; if (shift != 0) { @@ -2510,7 +2510,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value setTileDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, Value sgpr, ArrayRef consts, size_t dimX, - int offset) const { + int64_t offset) const { SmallVector mixedSharedSizes = op.getMixedSharedSizes(); if (mixedSharedSizes.size() <= dimX) { @@ -2550,7 +2550,7 @@ struct AMDGPUMakeDmaDescriptorLowering setTensorDimXStride(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Location loc, Value sgprY, Value sgprZ, ArrayRef consts, - size_t dimX, int offset) const { + size_t dimX, int64_t offset) const { SmallVector mixedGlobalStrides = op.getMixedGlobalStrides(); if (mixedGlobalStrides.size() <= dimX) { @@ -2575,7 +2575,7 @@ struct AMDGPUMakeDmaDescriptorLowering Value tensorDimXStrideLow = LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStride); - int shift = (offset % 32) == 0 ? 32 : offset % 32; + int64_t shift = (offset % 32) == 0 ? 
32 : offset % 32; Value shiftVal = createI64Constant(rewriter, loc, shift); Value tensorDimXStrideHigh = LLVM::LShrOp::create(rewriter, loc, tensorDimXStride, shiftVal); From ffd56857bf59bf1236411f0de3b91b7100683c8d Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 2 Dec 2025 15:15:34 -0500 Subject: [PATCH 13/28] format --- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 0996e40393a74..5a08aca51f025 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2757,27 +2757,27 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, Chipset chipset) { populateAMDGPUMemorySpaceAttributeConversions(converter); - patterns - .add, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - RawBufferOpLowering, - AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering, - SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering, - WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering, - ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, - PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, - GatherToLDSOpLowering, TransposeLoadOpLowering, - AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering, - AMDGPUMakeDmaDescriptorLowering>(converter, chipset); + patterns.add< + FatRawBufferCastLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering, + AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering, + SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering, + WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering, + ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, + PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, + GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering, + AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering>(converter, + chipset); patterns.add(converter); } From 5d45a721d1aefcf10fe87d3cd2def50041057ca1 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 2 Dec 2025 15:23:24 -0500 Subject: [PATCH 14/28] revert exposing utility functions --- .../mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h | 5 -- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 62 +++++++++---------- 2 files changed, 30 insertions(+), 37 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h index 958757da0933e..a7680fb5c3191 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h @@ -48,11 +48,6 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *, IntegerAttr m, IntegerAttr n, IntegerAttr k) { printMNKDimensionList(printer, m, n, k); } - -// Utility functions for quering the address space. 
-bool hasGlobalMemorySpace(Attribute memorySpace); -bool hasWorkgroupMemorySpace(Attribute memorySpace); -bool hasFatRawBufferMemorySpace(Attribute memorySpace); } // namespace mlir::amdgpu #define GET_ATTRDEF_CLASSES diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index d45a85c9eb68c..6ef28d43df451 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -41,38 +41,6 @@ using namespace mlir::amdgpu; #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc" -namespace mlir::amdgpu { -bool hasGlobalMemorySpace(Attribute memorySpace) { - if (!memorySpace) - return true; - if (auto intMemorySpace = dyn_cast(memorySpace)) - return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1; - if (auto gpuMemorySpace = dyn_cast(memorySpace)) - return gpuMemorySpace.getValue() == gpu::AddressSpace::Global; - return false; -} - -bool hasWorkgroupMemorySpace(Attribute memorySpace) { - if (!memorySpace) - return false; - if (auto intMemorySpace = dyn_cast(memorySpace)) - return intMemorySpace.getInt() == 3; - if (auto gpuMemorySpace = dyn_cast(memorySpace)) - return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup; - return false; -} - -bool hasFatRawBufferMemorySpace(Attribute memorySpace) { - if (!memorySpace) - return false; - if (auto intMemorySpace = dyn_cast(memorySpace)) - return intMemorySpace.getInt() == 7; - if (auto gpuMemorySpace = dyn_cast(memorySpace)) - return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer; - return false; -} -} // namespace mlir::amdgpu - namespace { struct AMDGPUInlinerInterface final : DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; @@ -190,6 +158,36 @@ LogicalResult FatRawBufferCastOp::verify() { return success(); } +static bool hasGlobalMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return true; + if (auto intMemorySpace = dyn_cast(memorySpace)) + return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1; + if (auto gpuMemorySpace = dyn_cast(memorySpace)) + return gpuMemorySpace.getValue() == gpu::AddressSpace::Global; + return false; +} + +static bool hasWorkgroupMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; + if (auto intMemorySpace = dyn_cast(memorySpace)) + return intMemorySpace.getInt() == 3; + if (auto gpuMemorySpace = dyn_cast(memorySpace)) + return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup; + return false; +} + +static bool hasFatRawBufferMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; + if (auto intMemorySpace = dyn_cast(memorySpace)) + return intMemorySpace.getInt() == 7; + if (auto gpuMemorySpace = dyn_cast(memorySpace)) + return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer; + return false; +} + //===----------------------------------------------------------------------===// // RawBuffer*Op //===----------------------------------------------------------------------===// From f187e76c255c7fede998ce98bc7675df93ba8646 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 2 Dec 2025 16:03:13 -0500 Subject: [PATCH 15/28] Avoid or if possible --- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 7 +++++ .../Conversion/AMDGPUToROCDL/gfx1250.mlir | 26 +++++++------------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 5a08aca51f025..36bb8b082c0f2 100644 --- 
a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2355,6 +2355,13 @@ struct AMDGPUMakeDmaDescriptorLowering shiftAmount = createI32Constant(rewriter, loc, shift % 32); value = LLVM::ShlOp::create(rewriter, loc, value, shiftAmount); } + + if (LLVM::ConstantOp op = accumulator.getDefiningOp()) { + if (IntegerAttr attr = dyn_cast(op.getValue()); + attr.getInt() == 0) { + return value; + } + } return LLVM::OrOp::create(rewriter, loc, accumulator, value); } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index ed794631754f6..d4f703c004f11 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -251,43 +251,36 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) -> !amdgpu.tdm_desc // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32) // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32) - // CHECK: %[[C16:.+]] = llvm.mlir.constant(16 : i32) - // CHECK: %[[SIZE:.+]] = llvm.shl %[[C2]], %[[C16]] - // CHECK: %[[SGPR0:.+]] = llvm.or %[[C0]], %[[SIZE]] + // CHECK: %[[SIZE:.+]] = llvm.mlir.constant(16 : i32) + // CHECK: %[[SGPR0:.+]] = llvm.shl %[[C2]], %[[SIZE]] // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32) // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) - // CHECK: %[[TENSOR_DIM_0_HIGH:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]] + // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]] // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) - // CHECK: %[[TENSOR_DIM_0_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]] - // CHECK: %[[SGPR1:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_SHIFTED]] - // CHECK: %[[SGPR2_0:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_HIGH]] + // CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]] // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32) // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) - // CHECK: %[[TENSOR_DIM_1_HIGH:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]] + // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]] // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) // CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]] // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]] - // CHECK: %[[SGPR3_0:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_1_HIGH]] // CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32) // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]] // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_0_SHIFTED]] - // CHECK-DAG: %[[TILE_DIM_1:.+]] = llvm.mlir.constant(128 : i32) - // CHECK: %[[SGPR4:.+]] = llvm.or %[[C0]], %[[TILE_DIM_1]] + // CHECK-DAG: %[[SGPR4:.+]] = llvm.mlir.constant(128 : i32) // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64 // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64 // CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]] - // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32 + // CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32 // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64 // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]] - // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH:.+]] = llvm.trunc 
%[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32 - // CHECK: %[[SGPR5:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_STRIDE_LOW]] - // CHECK: %[[SGPR6_0:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_0_STRIDE_HIGH]] + // CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32 // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64) // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64 @@ -295,11 +288,10 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) -> !amdgpu.tdm_desc // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]] // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64 // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]] - // CHECK: %[[TENSOR_DIM_1_STRIDE_HIGH:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32 + // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32 // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32 // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]] // CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]] - // CHECK-DAG: %[[SGPR7:.+]] = llvm.or %[[C0]], %[[TENSOR_DIM_1_STRIDE_HIGH]] // CHECK: %[[V8I32:.+]] = llvm.mlir.undef : vector<8xi32> // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32] From 661931c2ff126b955dc9f16c9a4836b12c3eccd6 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 2 Dec 2025 16:06:32 -0500 Subject: [PATCH 16/28] Use a single constant --- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 12 ++++++------ mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 36bb8b082c0f2..e54cefd1e3fd2 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2371,24 +2371,24 @@ struct AMDGPUMakeDmaDescriptorLowering // Compute data_size. 
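  // data_size encodes log2 of the element width in bytes, packed into bits
  // [17:16] of SGPR0: 1 byte -> 0, 2 bytes -> 1, 4 bytes -> 2, 8 bytes -> 3.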
int elementTypeWidthInBytes = op.getElementTypeWidth() / 8; - Value dataSize; + int dataSize; switch (elementTypeWidthInBytes) { case 1: - dataSize = consts[0]; + dataSize = 0; break; case 2: - dataSize = consts[1]; + dataSize = 1; break; case 4: - dataSize = consts[2]; + dataSize = 2; break; case 8: - dataSize = consts[3]; + dataSize = 3; break; default: llvm_unreachable("Invalid element size."); } - return setValueAtOffset(rewriter, loc, sgpr0, dataSize, 16); + return createI32Constant(rewriter, loc, dataSize << 16); } Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor, diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index d4f703c004f11..9e24f23f5a4f9 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -251,8 +251,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base) -> !amdgpu.tdm_desc // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32) // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32) - // CHECK: %[[SIZE:.+]] = llvm.mlir.constant(16 : i32) - // CHECK: %[[SGPR0:.+]] = llvm.shl %[[C2]], %[[SIZE]] + // CHECK-DAG: %[[SGPR0:.+]] = llvm.mlir.constant(131072 : i32) // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32) // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32) From a1a82f81e749d402747907dd50149f9aaa99ab66 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 2 Dec 2025 16:39:07 -0500 Subject: [PATCH 17/28] explicit type to auto --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 6ef28d43df451..ca1ed24ebd495 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -711,8 +711,8 @@ LogicalResult TransposeLoadOp::verify() { LogicalResult MakeDmaBaseOp::verify() { - MemRefType ldsType = cast(getLds().getType()); - MemRefType globalType = cast(getGlobal().getType()); + auto ldsType = cast(getLds().getType()); + auto globalType = cast(getGlobal().getType()); if (!hasWorkgroupMemorySpace(ldsType.getMemorySpace())) { return emitOpError( "lds memref must have workgroup address space attribute."); From ac543c2a094ff9ca9a5f7622252afbd946ae642d Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 2 Dec 2025 16:45:55 -0500 Subject: [PATCH 18/28] Remove unnecessary braces --- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 45 +++++++---------- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 49 +++++++------------ 2 files changed, 37 insertions(+), 57 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index e54cefd1e3fd2..159a298951fc1 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2356,12 +2356,11 @@ struct AMDGPUMakeDmaDescriptorLowering value = LLVM::ShlOp::create(rewriter, loc, value, shiftAmount); } - if (LLVM::ConstantOp op = accumulator.getDefiningOp()) { + if (LLVM::ConstantOp op = accumulator.getDefiningOp()) if (IntegerAttr attr = dyn_cast(op.getValue()); - attr.getInt() == 0) { + attr.getInt() == 0) return value; - } - } + return LLVM::OrOp::create(rewriter, loc, accumulator, value); } @@ -2480,12 +2479,12 @@ struct AMDGPUMakeDmaDescriptorLowering SmallVector mixedGlobalSizes = op.getMixedGlobalSizes(); OpFoldResult tensorDim0OpFoldResult = 
mixedGlobalSizes.back(); Value tensorDim0; - if (auto attr = dyn_cast(tensorDim0OpFoldResult)) { + if (auto attr = dyn_cast(tensorDim0OpFoldResult)) tensorDim0 = createI32Constant(rewriter, loc, cast(attr).getInt()); - } else { + else tensorDim0 = cast(tensorDim0OpFoldResult); - } + Value c16 = createI32Constant(rewriter, loc, 16); Value tensorDim0High = LLVM::LShrOp::create(rewriter, loc, tensorDim0, c16); sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDim0, 48); @@ -2501,12 +2500,12 @@ struct AMDGPUMakeDmaDescriptorLowering SmallVector mixedGlobalSizes = op.getMixedGlobalSizes(); OpFoldResult tensorDim1OpFoldResult = *(mixedGlobalSizes.rbegin() + 1); Value tensorDim1; - if (auto attr = dyn_cast(tensorDim1OpFoldResult)) { + if (auto attr = dyn_cast(tensorDim1OpFoldResult)) tensorDim1 = createI32Constant(rewriter, loc, cast(attr).getInt()); - } else { + else tensorDim1 = cast(tensorDim1OpFoldResult); - } + Value c16 = createI32Constant(rewriter, loc, 16); Value tensorDim1High = LLVM::LShrOp::create(rewriter, loc, tensorDim1, c16); sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim1, 80); @@ -2520,18 +2519,17 @@ struct AMDGPUMakeDmaDescriptorLowering int64_t offset) const { SmallVector mixedSharedSizes = op.getMixedSharedSizes(); - if (mixedSharedSizes.size() <= dimX) { + if (mixedSharedSizes.size() <= dimX) return sgpr; - } OpFoldResult tileDimXOpFoldResult = *(mixedSharedSizes.rbegin() + dimX); Value tileDimX; - if (auto attr = dyn_cast(tileDimXOpFoldResult)) { + if (auto attr = dyn_cast(tileDimXOpFoldResult)) tileDimX = createI32Constant(rewriter, loc, cast(attr).getInt()); - } else { + else tileDimX = cast(tileDimXOpFoldResult); - } + return setValueAtOffset(rewriter, loc, sgpr, tileDimX, offset); } @@ -2560,19 +2558,17 @@ struct AMDGPUMakeDmaDescriptorLowering size_t dimX, int64_t offset) const { SmallVector mixedGlobalStrides = op.getMixedGlobalStrides(); - if (mixedGlobalStrides.size() <= dimX) { + if (mixedGlobalStrides.size() <= dimX) return {sgprY, sgprZ}; - } OpFoldResult tensorDimXStrideOpFoldResult = *(mixedGlobalStrides.rbegin() + dimX); Value tensorDimXStride; - if (auto attr = dyn_cast(tensorDimXStrideOpFoldResult)) { + if (auto attr = dyn_cast(tensorDimXStrideOpFoldResult)) tensorDimXStride = createI64Constant(rewriter, loc, cast(attr).getInt()); - } else { + else tensorDimXStride = cast(tensorDimXStrideOpFoldResult); - } constexpr int64_t first48bits = (1ll << 48) - 1; Value mask = createI64Constant(rewriter, loc, first48bits); @@ -2666,14 +2662,12 @@ struct AMDGPUMakeDmaDescriptorLowering LogicalResult matchAndRewrite(MakeDmaDescriptorOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - if (chipset < kGfx1250) { + if (chipset < kGfx1250) return op->emitOpError( "make_dma_descriptor is only supported on gfx1250"); - } - if (op.getRank() != 2) { + if (op.getRank() != 2) return op->emitOpError("unimplemented"); - } Location loc = op.getLoc(); @@ -2681,9 +2675,8 @@ struct AMDGPUMakeDmaDescriptorLowering Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32)); SmallVector consts; - for (int i = 0; i < 8; i++) { + for (int i = 0; i < 8; i++) consts.push_back(createI32Constant(rewriter, loc, i)); - } Value dgroup0 = this->getDGroup0(adaptor); Value dgroup1 = this->getDGroup1(op, adaptor, rewriter, loc, consts); diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index ca1ed24ebd495..a89edc89acd24 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ 
b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -692,15 +692,14 @@ LogicalResult TransposeLoadOp::verify() {
   };
 
   auto validNumElems = kValidLoadSizeMap.find(elementTypeSize);
-  if (validNumElems == kValidLoadSizeMap.end()) {
+  if (validNumElems == kValidLoadSizeMap.end())
     return emitOpError("Unsupported element type size for transpose load: ")
            << elementTypeSize << " bits";
-  }
-  if (numElements != validNumElems->second) {
+
+  if (numElements != validNumElems->second)
     return emitOpError(
                "Transferring type size mismatch: expected num of elements: ")
            << validNumElems->second;
-  }
 
   return success();
 }
@@ -713,30 +712,26 @@ LogicalResult MakeDmaBaseOp::verify() {
   auto ldsType = cast<MemRefType>(getLds().getType());
   auto globalType = cast<MemRefType>(getGlobal().getType());
 
-  if (!hasWorkgroupMemorySpace(ldsType.getMemorySpace())) {
+  if (!hasWorkgroupMemorySpace(ldsType.getMemorySpace()))
     return emitOpError(
         "lds memref must have workgroup address space attribute.");
-  }
-  if (!hasGlobalMemorySpace(globalType.getMemorySpace())) {
+  if (!hasGlobalMemorySpace(globalType.getMemorySpace()))
     return emitOpError(
         "global memref must have global address space attribute.");
-  }
 
   Type elementType = ldsType.getElementType();
   int width;
-  if (auto intType = dyn_cast<IntegerType>(elementType)) {
+  if (auto intType = dyn_cast<IntegerType>(elementType))
     width = intType.getWidth();
-  } else if (auto floatType = dyn_cast<FloatType>(elementType)) {
+  else if (auto floatType = dyn_cast<FloatType>(elementType))
     width = floatType.getWidth();
-  } else {
+  else
     return emitOpError("element type must have type width");
-  }
 
-  if (!llvm::is_contained({8, 16, 32, 64}, width)) {
+  if (!llvm::is_contained({8, 16, 32, 64}, width))
     return emitOpError(
                "element type must be 1, 2, 4, or 8 bytes long but type was ")
            << width << " bits long.";
-  }
 
   return success();
 }
@@ -748,45 +743,37 @@ LogicalResult MakeDmaDescriptorOp::verify() {
   ArrayRef<int64_t> globalStaticStrides = getGlobalStaticStrides();
-  if (globalStaticStrides.empty()) {
+  if (globalStaticStrides.empty())
     return emitOpError("strides must not be empty.");
-  }
-  if (globalStaticStrides.back() != 1) {
+  if (globalStaticStrides.back() != 1)
     return emitOpError("strides for the innermost dimension must be 1.");
-  }
 
   ArrayRef<int64_t> globalStaticSizes = getGlobalStaticSizes();
   size_t rank = globalStaticSizes.size();
-  if (rank < 2) {
+  if (rank < 2)
     return emitOpError("tensor and tile must be at least of rank 2.");
-  }
-  if (rank > 5) {
+  if (rank > 5)
     return emitOpError("tensor and tile must be at most of rank 5.");
-  }
-  if (rank != globalStaticStrides.size()) {
+  if (rank != globalStaticStrides.size())
     return emitOpError("strides and sizes must have same rank.");
-  }
 
   ArrayRef<int64_t> sharedStaticSizes = getSharedStaticSizes();
-  if (rank != sharedStaticSizes.size()) {
+  if (rank != sharedStaticSizes.size())
     return emitOpError("tensor must have same rank as tile.");
-  }
 
   int elementTypeWidth = getElementTypeWidth();
-  if (!llvm::is_contained({8, 16, 32, 64}, elementTypeWidth)) {
+  if (!llvm::is_contained({8, 16, 32, 64}, elementTypeWidth))
     return emitOpError(
                "element type width must be 1, 2, 4 or 8 bytes, but was ")
            << elementTypeWidth << " bits long";
-  }
 
   if (Value atomicBarrierAddress = getAtomicBarrierAddress()) {
-    MemRefType atomicBarrierAddressType =
+    auto atomicBarrierAddressType =
         cast<MemRefType>(atomicBarrierAddress.getType());
     bool barrierInLDS =
         hasWorkgroupMemorySpace(atomicBarrierAddressType.getMemorySpace());
-    if (!barrierInLDS) {
+    if (!barrierInLDS)
       return emitOpError("atomic barrier address must be in LDS.");
-    }
   }
return success(); From 5672371252acd3b2555260bbb6b0eaa3ddf4f4ad Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 3 Dec 2025 09:08:23 -0500 Subject: [PATCH 19/28] Use log2_32 --- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 159a298951fc1..cb050d58961c7 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2368,25 +2368,9 @@ struct AMDGPUMakeDmaDescriptorLowering ConversionPatternRewriter &rewriter, Location loc, Value sgpr0, ArrayRef consts) const { // Compute data_size. - int elementTypeWidthInBytes = op.getElementTypeWidth() / 8; - - int dataSize; - switch (elementTypeWidthInBytes) { - case 1: - dataSize = 0; - break; - case 2: - dataSize = 1; - break; - case 4: - dataSize = 2; - break; - case 8: - dataSize = 3; - break; - default: - llvm_unreachable("Invalid element size."); - } + int elementTypeWidthInBits = op.getElementTypeWidth(); + assert(llvm::is_contained({8, 16, 32, 64}, elementTypeWidthInBits)); + int dataSize = llvm::Log2_32(elementTypeWidthInBits / 8); return createI32Constant(rewriter, loc, dataSize << 16); } From 1549f5bdd6469d60823a07db1aebc16fdb2a61ca Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 3 Dec 2025 09:11:02 -0500 Subject: [PATCH 20/28] !isDynamic -> isStatic --- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index a89edc89acd24..ec95b15fc9404 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -787,7 +787,7 @@ static bool maybeUpdateDynamicIndexList( int index = 0; for (int64_t static_element : staticElements) { - if (!ShapedType::isDynamic(static_element)) { + if (ShapedType::isStatic(static_element)) { newStaticElements.push_back(static_element); continue; } From dc76238c56787d468d3a5039f088bbfb9246cf2d Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 3 Dec 2025 09:13:35 -0500 Subject: [PATCH 21/28] Dot at end of comments --- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index cb050d58961c7..48d0316e79c12 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2413,11 +2413,11 @@ struct AMDGPUMakeDmaDescriptorLowering IntegerType i32 = rewriter.getI32Type(); Value padInterval = adaptor.getPadInterval(); - // pre-condition: padInterval can be a power of two between 2 and 256 + // pre-condition: padInterval can be a power of two between 2 and 256. padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32, padInterval, false); padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]); - // post-condition: padInterval can be a value between 0 and 7 + // post-condition: padInterval can be a value between 0 and 7. return setValueAtOffset(rewriter, loc, sgpr0, padInterval, 22); } @@ -2429,9 +2429,9 @@ struct AMDGPUMakeDmaDescriptorLowering return sgpr0; Value padAmount = adaptor.getPadAmount(); - // pre-condition: padAmount is a value between 1-128 + // pre-condition: padAmount is a value between 1-128. 
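    // The pad amount occupies bits [31:25] of SGPR0 and is biased by one,
    // so subtracting 1 maps the user-facing range [1, 128] onto the
    // encodable range [0, 127].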
     padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
-    // post-condition: padAmount is a value between 0-127
+    // post-condition: padAmount is a value between 0-127.
     return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25);
   }

From 8ee42b1837c8b7bc74ff01c7c66add5bf7736d7d Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Wed, 3 Dec 2025 09:23:29 -0500
Subject: [PATCH 22/28] C-array and for-loop

---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 69 ++++++++-----------
 1 file changed, 30 insertions(+), 39 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 48d0316e79c12..2c3a8a0914b1b 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2595,50 +2595,41 @@ struct AMDGPUMakeDmaDescriptorLowering
                    ConversionPatternRewriter &rewriter, Location loc,
                    ArrayRef<Value> consts) const {
 
-    Value sgpr0, sgpr1, sgpr2, sgpr3, sgpr4, sgpr5, sgpr6, sgpr7;
-    sgpr0 = sgpr1 = sgpr2 = sgpr3 = sgpr4 = sgpr5 = sgpr6 = sgpr7 = consts[0];
-
-    sgpr0 = setDataSize(op, adaptor, rewriter, loc, sgpr0, consts);
-    sgpr0 = setAtomicBarrier(op, adaptor, rewriter, loc, sgpr0, consts);
-    sgpr0 = setIterateEnable(op, adaptor, rewriter, loc, sgpr0, consts);
-    sgpr0 = setPadEnable(op, adaptor, rewriter, loc, sgpr0, consts);
-    sgpr0 = setPadInterval(op, adaptor, rewriter, loc, sgpr0, consts);
-    sgpr0 = setPadAmount(op, adaptor, rewriter, loc, sgpr0, consts);
-
-    sgpr1 = setAtomicBarrierAddress(op, adaptor, rewriter, loc, sgpr1, consts);
-    std::tie(sgpr1, sgpr2) =
-        setTensorDim0(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts);
-    std::tie(sgpr2, sgpr3) =
-        setTensorDim1(op, adaptor, rewriter, loc, sgpr2, sgpr3, consts);
-
-    sgpr3 = setTileDim0(op, adaptor, rewriter, loc, sgpr3, consts);
-    sgpr4 = setTileDim1(op, adaptor, rewriter, loc, sgpr4, consts);
-    sgpr4 = setTileDim2(op, adaptor, rewriter, loc, sgpr4, consts);
-    std::tie(sgpr5, sgpr6) =
-        setTensorDim0Stride(op, adaptor, rewriter, loc, sgpr5, sgpr6, consts);
-    std::tie(sgpr6, sgpr7) =
-        setTensorDim1Stride(op, adaptor, rewriter, loc, sgpr6, sgpr7, consts);
+    Value sgprs[8];
+    for (int i = 0; i < 8; i++) {
+      sgprs[i] = consts[0];
+    }
+
+    sgprs[0] = setDataSize(op, adaptor, rewriter, loc, sgprs[0], consts);
+    sgprs[0] = setAtomicBarrier(op, adaptor, rewriter, loc, sgprs[0], consts);
+    sgprs[0] = setIterateEnable(op, adaptor, rewriter, loc, sgprs[0], consts);
+    sgprs[0] = setPadEnable(op, adaptor, rewriter, loc, sgprs[0], consts);
+    sgprs[0] = setPadInterval(op, adaptor, rewriter, loc, sgprs[0], consts);
+    sgprs[0] = setPadAmount(op, adaptor, rewriter, loc, sgprs[0], consts);
+
+    sgprs[1] =
+        setAtomicBarrierAddress(op, adaptor, rewriter, loc, sgprs[1], consts);
+    std::tie(sgprs[1], sgprs[2]) =
+        setTensorDim0(op, adaptor, rewriter, loc, sgprs[1], sgprs[2], consts);
+    std::tie(sgprs[2], sgprs[3]) =
+        setTensorDim1(op, adaptor, rewriter, loc, sgprs[2], sgprs[3], consts);
+
+    sgprs[3] = setTileDim0(op, adaptor, rewriter, loc, sgprs[3], consts);
+    sgprs[4] = setTileDim1(op, adaptor, rewriter, loc, sgprs[4], consts);
+    sgprs[4] = setTileDim2(op, adaptor, rewriter, loc, sgprs[4], consts);
+    std::tie(sgprs[5], sgprs[6]) = setTensorDim0Stride(
+        op, adaptor, rewriter, loc, sgprs[5], sgprs[6], consts);
+    std::tie(sgprs[6], sgprs[7]) = setTensorDim1Stride(
+        op, adaptor, rewriter, loc, sgprs[6], sgprs[7], consts);
 
     IntegerType i32 = rewriter.getI32Type();
     Type v8i32 = this->typeConverter->convertType(VectorType::get(8, i32));
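+    // consts[0] (the i32 constant 0) seeds every sgpr above; consts[0..7]
+    // also serve as the lane indices used below to pack the eight scalars
+    // into a single vector<8xi32> forming descriptor group 1.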
     Value dgroup1 = LLVM::UndefOp::create(rewriter, loc, v8i32);
-    dgroup1 =
-        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr0, consts[0]);
-    dgroup1 =
-        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr1, consts[1]);
-    dgroup1 =
-        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr2, consts[2]);
-    dgroup1 =
-        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr3, consts[3]);
-    dgroup1 =
-        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr4, consts[4]);
-    dgroup1 =
-        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr5, consts[5]);
-    dgroup1 =
-        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr6, consts[6]);
-    dgroup1 =
-        LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr7, consts[7]);
+    for (auto [sgpr, constant] : llvm::zip_equal(sgprs, consts)) {
+      dgroup1 =
+          LLVM::InsertElementOp::create(rewriter, loc, dgroup1, sgpr, constant);
+    }
 
     return dgroup1;
   }

From 535f8ceaebd78cc9bd1b360fb20341ac17cbe57c Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Wed, 3 Dec 2025 09:24:57 -0500
Subject: [PATCH 23/28] Delete superfluous empty line

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 2c3a8a0914b1b..71056dbe51717 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2594,7 +2594,6 @@ struct AMDGPUMakeDmaDescriptorLowering
   Value getDGroup1(MakeDmaDescriptorOp op, OpAdaptor adaptor,
                    ConversionPatternRewriter &rewriter, Location loc,
                    ArrayRef<Value> consts) const {
-
     Value sgprs[8];
     for (int i = 0; i < 8; i++) {
       sgprs[i] = consts[0];
    }

From be6560dea9c778d39f73aedc536a8c26aa16d04a Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Wed, 3 Dec 2025 09:29:28 -0500
Subject: [PATCH 24/28] Assert type conversion succeeded.
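In MLIR, `TypeConverter::convertType` returns a null `Type` when no
conversion rule matches, so an op built from the result would fail far from
the real cause. A minimal sketch of the guard this patch adds (the names
mirror the lowering code; the assert message is illustrative, the patch
itself uses a bare `assert`):

```cpp
// Sketch: check each converted type before building ops with it.
// TypeConverter::convertType returns a null Type on failure.
Type v4i32 = typeConverter->convertType(VectorType::get(4, i32));
assert(v4i32 && "expected the type converter to lower vector<4xi32>");
Value result = LLVM::PoisonOp::create(rewriter, loc, v4i32);
```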
---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 71056dbe51717..5fb5ab67d4a72 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2322,6 +2322,7 @@ struct AMDGPUMakeDmaBaseLowering
     Value c3 = createI32Constant(rewriter, loc, 3);
 
     Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
+    assert(v4i32);
     Value result = LLVM::PoisonOp::create(rewriter, loc, v4i32);
     result = LLVM::InsertElementOp::create(rewriter, loc, result, c1, c0);
     result = LLVM::InsertElementOp::create(rewriter, loc, result,
@@ -2623,6 +2624,7 @@ struct AMDGPUMakeDmaDescriptorLowering
     IntegerType i32 = rewriter.getI32Type();
 
     Type v8i32 = this->typeConverter->convertType(VectorType::get(8, i32));
+    assert(v8i32);
 
     Value dgroup1 = LLVM::UndefOp::create(rewriter, loc, v8i32);
     for (auto [sgpr, constant] : llvm::zip_equal(sgprs, consts)) {
@@ -2647,6 +2649,7 @@ struct AMDGPUMakeDmaDescriptorLowering
     IntegerType i32 = rewriter.getI32Type();
 
     Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
+    assert(v4i32);
 
     SmallVector<Value> consts;
     for (int i = 0; i < 8; i++)

From 2092acb6b233d4acc840f3ce6ac80e9e8e188422 Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Wed, 3 Dec 2025 09:44:59 -0500
Subject: [PATCH 25/28] Use getIntOrFloatBitWidth

---
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index ec95b15fc9404..772fef83133f3 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -720,15 +720,13 @@ LogicalResult MakeDmaBaseOp::verify() {
         "global memref must have global address space attribute.");
 
   Type elementType = ldsType.getElementType();
-  int width;
-  if (auto intType = dyn_cast<IntegerType>(elementType))
-    width = intType.getWidth();
-  else if (auto floatType = dyn_cast<FloatType>(elementType))
-    width = floatType.getWidth();
+  unsigned width;
+  if (elementType.isIntOrFloat())
+    width = elementType.getIntOrFloatBitWidth();
   else
     return emitOpError("element type must have type width");
 
-  if (!llvm::is_contained({8, 16, 32, 64}, width))
+  if (!llvm::is_contained({8, 16, 32, 64}, width))
     return emitOpError(
                "element type must be 1, 2, 4, or 8 bytes long but type was ")
            << width << " bits long.";

From ea45349259565d7a8b665c94457e26cf3a465fdb Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Wed, 3 Dec 2025 09:50:50 -0500
Subject: [PATCH 26/28] use getIntOrFloatBitWidth

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 18fc87de24919..d98034852554f 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1355,11 +1355,9 @@ def AMDGPU_MakeDmaDescriptorOp :
     int getElementTypeWidth() {
       Type elementType = getBase().getType().getElementType();
-      int width;
-      if (auto floatType = dyn_cast<FloatType>(elementType)) {
-        width = floatType.getWidth();
-      } else if (auto intType = dyn_cast<IntegerType>(elementType)) {
-        width = intType.getWidth();
+      unsigned width;
+      if (elementType.isIntOrFloat()) {
+        width = elementType.getIntOrFloatBitWidth();
       } else {
         llvm_unreachable("element type must have getWidth interface");
      }

From 06343508492cefdb30f657504a5f8531720d06a0 Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Wed, 3 Dec 2025 10:04:29 -0500
Subject: [PATCH 27/28] Add documentation.

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index d98034852554f..5bdac09fb66d6 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1317,6 +1317,8 @@ def AMDGPU_MakeDmaDescriptorOp :
     Padding can be applied to the LDS address when copying from memory to LDS,
     but not when copying from LDS to memory.
     The values in the padded target addresses remain the same as before the operation was applied.
+    $pad_interval must be a power of two in the range [2, 256].
+    $pad_amount must be a value in the range [1, 128].
 
     2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
     $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.

From 2fafa289533829dcf09286a30ad91f9555d0386f Mon Sep 17 00:00:00 2001
From: Erick Ochoa
Date: Wed, 3 Dec 2025 10:09:01 -0500
Subject: [PATCH 28/28] documentation

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td       | 2 ++
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 5bdac09fb66d6..e692d62bf0187 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1320,6 +1320,8 @@ def AMDGPU_MakeDmaDescriptorOp :
     $pad_interval must be a power of two in the range [2, 256].
     $pad_amount must be a value in the range [1, 128].
 
+    $atomic_barrier_address must be aligned to 8 bytes.
+
     2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
     $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
     $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5fb5ab67d4a72..0128da4572a7c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2446,6 +2446,8 @@ struct AMDGPUMakeDmaDescriptorLowering
     Value atomicBarrierAddress = adaptor.getAtomicBarrierAddress();
 
     IntegerType i32 = rewriter.getI32Type();
+    // pre-condition: atomicBarrierAddress is aligned to 8 bytes, which
+    // implies that the 3 LSBs are zero.
     atomicBarrierAddress =
         LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress);
     atomicBarrierAddress =