From b766215221d453a97002d6faabc1c387dac3f10b Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Wed, 26 Nov 2025 16:42:44 -0500 Subject: [PATCH 1/4] [mlir][amdgpu] Add make_dma_descriptor op --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 117 ++++++++++++++++-- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 28 +++++ mlir/test/Dialect/AMDGPU/invalid.mlir | 40 ++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 59 ++++++++- 4 files changed, 232 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index e07c72b839e7c..3581b07dc4e3e 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr traits = []> : TypeDef { let mnemonic = typeMnemonic; } -//===----------------------------------------------------------------------===// -// AMDGPU Type definitions -//===----------------------------------------------------------------------===// - def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { let summary = "Pair of base addresses that move data between LDS and global storage."; let description = [{ @@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> { let assemblyFormat = "`<` $elementType `>`"; } +def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> { + let summary = "Descriptors used in tensor store/load operations."; + let description = [{ + This type is opaque and corresponds to the two or four descriptor groups + used in tensor_load_to_lds or tensor_store_from_lds. 
+ }]; + +} + //===----------------------------------------------------------------------===// // AMDGPU Op definitions //===----------------------------------------------------------------------===// @@ -1222,14 +1231,13 @@ def AMDGPU_MakeDmaBaseOp : AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>, Arguments<(ins Arg:$src, - Variadic:$srcIndices, + Variadic:$src_indices, Arg:$dst, - Variadic:$dstIndices)>, + Variadic:$dst_indices)>, Results<(outs AMDGPU_TDMBaseType: $base)> { // TODO: // * Add verifiers such that one of the memrefs is from LDS and the other global. - // * Add verifiers to make sure that the type is in the correct direction. // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions. let summary = "Pair of based addresses used when moving tiles between LDS and global memory."; @@ -1240,12 +1248,105 @@ def AMDGPU_MakeDmaBaseOp : This operation creates a value corresponding to the tensor descriptor (D#) group 0 found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect. + For example: + + ```mlir + %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + ``` + + to + + ```mlir + // pseudocode + %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)> + %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)> + %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct(ptr, ptr)> + // type(%base_2) = !llvm.struct<(ptr, ptr) roughly corresponds to amdgpu.tdm_base + + // The base will be used when contructing dgroup0 + // when lowering amdgpu.make_dma_descriptor + %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)> + %dgroup0_1 = llvm.insertvalue %base2, %dgroup0_0 : .... 
+ + // When lowering amdgpu.tensor_load_to_lds + rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + ``` + These tensor DMA operations were introduced in gfx1250. }]; let assemblyFormat = [{ - $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results) + $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results) + }]; +} + +def AMDGPU_MakeDmaDescriptorOp : + AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>, + Arguments<(ins + AMDGPU_TDMBaseType: $base, + Variadic: $global_dynamic_sizes, + DenseI64ArrayAttr: $global_static_sizes, + Variadic: $global_dynamic_strides, + DenseI64ArrayAttr: $global_static_strides, + Variadic: $shared_dynamic_sizes, + DenseI64ArrayAttr: $shared_static_sizes, + Optional: $pad, + Optional: $pad_every, + Optional: $atomic_barrier_address, + Variadic: $atomic_barrier_indices, + Optional: $global_increment, + Optional: $lds_increment, + Optional: $iteration_count)>, + Results<(outs AMDGPU_TDMDescriptorType: $desc)> { + + let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS."; + let description = [{ + Make all descriptor groups needed by tensor memory operations. + + The $base operand corresponds to the base pair addresses, one must be an address in LDS + while the other must be a global memory location. + + $global_{static/dynamic}_sizes determine the size of the tensor. + $global_{static/dynamic}_strides determine the strides of the tensor. + $shared_{static/dynamic}_sizes determines the size of the tile. + + Padding can be applied to the LDS address when copying from memory to LDS, + but not when copying from LDS to memory. + The values in the padded target addresses remain the same as before the operation was applied. 
+ + 2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count. + $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type. + $lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type. + $iteration_count determines how many times to iterate. + + ```mlir + // Example of moving a two-dimensional tensor to LDS. + %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + + // Example of moving a two-dimensional tensor to LDS where padding is applied after every integer. + %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space> -> !amdgpu.tdm_base + %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor + ``` + }]; + + let assemblyFormat = [{ + $base + `globalSize` custom($global_dynamic_sizes, $global_static_sizes) + `globalStride` custom($global_dynamic_strides, $global_static_strides) + `sharedSize` custom($shared_dynamic_sizes, $shared_static_sizes) + ( `padShared` `(` $pad^ `every` $pad_every `)` )? + ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]` + `:` type($atomic_barrier_address) `)`)? + ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )? 
+ attr-dict `:` qualified(type($base)) `->` type(results) }]; + + let hasVerifier = 1; } #endif // AMDGPU diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index cdc10c60a42ae..5ff640b5d1596 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -705,6 +705,34 @@ LogicalResult TransposeLoadOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// MakeDmaDescriptorOp +//===----------------------------------------------------------------------===// + +LogicalResult MakeDmaDescriptorOp::verify() { + ArrayRef globalStaticStrides = getGlobalStaticStrides(); + + if (globalStaticStrides.empty()) { + return emitOpError("strides must not be empty."); + } + if (globalStaticStrides.back() != 1) { + return emitOpError("strides for the innermost dimension must be 1."); + } + + ArrayRef globalStaticSizes = getGlobalStaticSizes(); + size_t rank = globalStaticSizes.size(); + if (rank != globalStaticStrides.size()) { + return emitOpError("strides and sizes must have same rank."); + } + + ArrayRef sharedStaticSizes = getSharedStaticSizes(); + if (rank != sharedStaticSizes.size()) { + return emitOpError("tensor must have same rank as tile."); + } + + return success(); +} + //===----------------------------------------------------------------------===// // ScaledMFMAOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 61fdf29a78cbd..066f46060f62f 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -354,3 +354,43 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x %0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, 
vector<32xf4E2M1FN>, vector<16xf32> func.return %0 : vector<16xf32> } + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_empty_strides +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}} + amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_innermost_stride +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}} + amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_size_and_stride_sizes +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}} + amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} + +// ----- + +// CHECK-LABEL: func @make_dma_descriptor_invalid_shared_and_global_rank +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base) +func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base) { + // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}} + amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir 
b/mlir/test/Dialect/AMDGPU/ops.mlir index 653f9f64d24f4..a8af06dc5ff0a 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -689,11 +689,62 @@ func.func @memory_counter_wait() { // CHECK-LABEL: func @make_dma_base // CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space>) func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space>) { - // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base - amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base + amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space> -> !amdgpu.tdm_base - // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base - amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> to !amdgpu.tdm_base + // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base + amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space>, memref<8xi32> -> !amdgpu.tdm_base func.return } +// CHECK-LABEL: func @make_dma_descriptor +// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index) +func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8xi32>, %idx: index) { + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] : 
!amdgpu.tdm_base -> !amdgpu.tdm_descriptor + sharedSize [0] : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]]) + padShared(%idx every %idx) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32>) + atomicBarrier(%barrier[%idx] : memref<8xi32>) + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + // CHECK: amdgpu.make_dma_descriptor %[[BASE]] + amdgpu.make_dma_descriptor %base + // CHECK-SAME: globalSize [0] + globalSize [0] + // CHECK-SAME: globalStride [1] + globalStride [1] + // CHECK-SAME: sharedSize [0] + sharedSize [0] + // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]] + iterate %idx, %idx, %idx + : !amdgpu.tdm_base -> !amdgpu.tdm_descriptor + + func.return +} From 3c31d68dab4254b01b727bee1bd15e4a2c6fc15e Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 11:28:04 -0500 Subject: [PATCH 2/4] [mlir][amdgpu] Add tensor load store operation --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 30 +++++++++++++++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 10 +++++++ 2 files changed, 40 insertions(+) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 3581b07dc4e3e..12ef5337296a2 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1349,4 +1349,34 @@ def AMDGPU_MakeDmaDescriptorOp : let hasVerifier = 1; } +def AMDGPU_TensorLoadToLDSOp : + 
AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>, + Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> { + let summary = "Load tensors from global memory to LDS."; + let description = [{ + Load tensors of up to five dimensions from global memory to LDS. + + The operation is fully described by the descriptor operand. + }]; + + let assemblyFormat = [{ + $desc attr-dict `:` qualified(type($desc)) + }]; +} + +def AMDGPU_TensorStoreFromLDSOp : + AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>, + Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> { + let summary = "Store tensors from LDS to global memory."; + let description = [{ + Store tensors of up to five dimensions from LDS to global memory. + + The operation is fully described by the descriptor operand. + }]; + + let assemblyFormat = [{ + $desc attr-dict `:` qualified(type($desc)) + }]; +} + #endif // AMDGPU diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index a8af06dc5ff0a..aa6bedc0e1135 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -748,3 +748,13 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base, %barrier: memref<8x func.return } + +// CHECK-LABEL: @tensor_load_store +// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor) +func.func @tensor_load_store(%desc: !amdgpu.tdm_descriptor) { + // CHECK: amdgpu.tensor_load_to_lds %[[DESC]] + amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor + // CHECK: amdgpu.tensor_store_from_lds %[[DESC]] + amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor + return +} From cb116ea0b0444eb72c26e38bdb6572cdeef97e61 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Tue, 25 Nov 2025 14:24:50 -0500 Subject: [PATCH 3/4] [mlir][amdgpu] Lower amdgpu.make_dma_base. 
--- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 3 +- .../mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h | 5 ++ .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 77 +++++++++++++++++- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 79 ++++++++++++------- ...cvt_scale_pk-gfx1250.mlir => gfx1250.mlir} | 74 +++++++++++++++++ 5 files changed, 206 insertions(+), 32 deletions(-) rename mlir/test/Conversion/AMDGPUToROCDL/{cvt_scale_pk-gfx1250.mlir => gfx1250.mlir} (73%) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 12ef5337296a2..9cb0752fba48b 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1237,7 +1237,6 @@ def AMDGPU_MakeDmaBaseOp : Results<(outs AMDGPU_TDMBaseType: $base)> { // TODO: - // * Add verifiers such that one of the memrefs is from LDS and the other global. // * Add verifiers to make sure that the number of indices do not exceed the number of dimensions. let summary = "Pair of based addresses used when moving tiles between LDS and global memory."; @@ -1280,6 +1279,8 @@ def AMDGPU_MakeDmaBaseOp : let assemblyFormat = [{ $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results) }]; + + let hasVerifier = 1; } def AMDGPU_MakeDmaDescriptorOp : diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h index a7680fb5c3191..958757da0933e 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h @@ -48,6 +48,11 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *, IntegerAttr m, IntegerAttr n, IntegerAttr k) { printMNKDimensionList(printer, m, n, k); } + +// Utility functions for querying the address space. 
+bool hasGlobalMemorySpace(Attribute memorySpace); +bool hasWorkgroupMemorySpace(Attribute memorySpace); +bool hasFatRawBufferMemorySpace(Attribute memorySpace); } // namespace mlir::amdgpu #define GET_ATTRDEF_CLASSES diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index b9a5e7d7f6eac..3316e16a05d5c 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -2264,6 +2264,76 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern { } }; +struct AMDGPUMakeDmaBaseLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + AMDGPUMakeDmaBaseLowering(const LLVMTypeConverter &converter, Chipset chipset) + : ConvertOpToLLVMPattern(converter), chipset(chipset) {} + Chipset chipset; + + LogicalResult + matchAndRewrite(MakeDmaBaseOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + if (chipset < kGfx1250) + return op->emitOpError("make_dma_base is only supported on gfx1250"); + + Location loc = op.getLoc(); + + ValueRange srcIndices = adaptor.getSrcIndices(); + Value src = adaptor.getSrc(); + auto srcMemRefType = cast(op.getSrc().getType()); + + Value srcPtr = + getStridedElementPtr(rewriter, loc, srcMemRefType, src, srcIndices); + + ValueRange dstIndices = adaptor.getDstIndices(); + Value dst = adaptor.getDst(); + auto dstMemRefType = cast(op.getDst().getType()); + + Value dstPtr = + getStridedElementPtr(rewriter, loc, dstMemRefType, dst, dstIndices); + + bool storeFrom = hasWorkgroupMemorySpace(srcMemRefType.getMemorySpace()); + Value ldsAddr = storeFrom ? srcPtr : dstPtr; + Value globalAddr = storeFrom ? 
dstPtr : srcPtr; + + Type i32 = rewriter.getI32Type(); + Type i64 = rewriter.getI64Type(); + + Value castForLdsAddr = + LLVM::PtrToIntOp::create(rewriter, loc, i32, ldsAddr); + Value castForGlobalAddr = + LLVM::PtrToIntOp::create(rewriter, loc, i64, globalAddr); + + Value mask = createI64Constant(rewriter, loc, 0x1FFFFFFFFFFFFFF); + Value first57BitsOfGlobalAddr = + LLVM::AndOp::create(rewriter, loc, castForGlobalAddr, mask); + Value shift = LLVM::LShrOp::create(rewriter, loc, first57BitsOfGlobalAddr, + createI64Constant(rewriter, loc, 32)); + + Value lowHalf = + LLVM::TruncOp::create(rewriter, loc, i32, first57BitsOfGlobalAddr); + Value highHalf = LLVM::TruncOp::create(rewriter, loc, i32, shift); + + Value c0 = createI32Constant(rewriter, loc, 0); + Value c1 = createI32Constant(rewriter, loc, 1); + Value c2 = createI32Constant(rewriter, loc, 2); + Value c3 = createI32Constant(rewriter, loc, 3); + + Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32)); + Value result = LLVM::UndefOp::create(rewriter, loc, v4i32); + result = LLVM::InsertElementOp::create(rewriter, loc, result, c0, c0); + result = LLVM::InsertElementOp::create(rewriter, loc, result, + castForLdsAddr, c1); + result = LLVM::InsertElementOp::create(rewriter, loc, result, lowHalf, c2); + result = LLVM::InsertElementOp::create(rewriter, loc, result, highHalf, c3); + + rewriter.replaceOp(op, result); + return success(); + } +}; + struct ConvertAMDGPUToROCDLPass : public impl::ConvertAMDGPUToROCDLPassBase { using Base::Base; @@ -2278,6 +2348,10 @@ struct ConvertAMDGPUToROCDLPass RewritePatternSet patterns(ctx); LLVMTypeConverter converter(ctx); + converter.addConversion([&](TDMBaseType type) -> Type { + Type i32 = IntegerType::get(type.getContext(), 32); + return converter.convertType(VectorType::get(4, i32)); + }); populateAMDGPUToROCDLConversionPatterns(converter, patterns, *maybeChipset); LLVMConversionTarget target(getContext()); 
target.addIllegalDialect<::mlir::amdgpu::AMDGPUDialect>(); @@ -2333,6 +2407,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, GatherToLDSOpLowering, TransposeLoadOpLowering, - AMDGPUPermlaneLowering>(converter, chipset); + AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering>(converter, + chipset); patterns.add(converter); } diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 5ff640b5d1596..8fc6220efc6ad 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -41,6 +41,38 @@ using namespace mlir::amdgpu; #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc" +namespace mlir::amdgpu { +bool hasGlobalMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return true; + if (auto intMemorySpace = dyn_cast(memorySpace)) + return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1; + if (auto gpuMemorySpace = dyn_cast(memorySpace)) + return gpuMemorySpace.getValue() == gpu::AddressSpace::Global; + return false; +} + +bool hasWorkgroupMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; + if (auto intMemorySpace = dyn_cast(memorySpace)) + return intMemorySpace.getInt() == 3; + if (auto gpuMemorySpace = dyn_cast(memorySpace)) + return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup; + return false; +} + +bool hasFatRawBufferMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; + if (auto intMemorySpace = dyn_cast(memorySpace)) + return intMemorySpace.getInt() == 7; + if (auto gpuMemorySpace = dyn_cast(memorySpace)) + return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer; + return false; +} +} // namespace mlir::amdgpu + namespace { struct AMDGPUInlinerInterface final : DialectInlinerInterface { using 
DialectInlinerInterface::DialectInlinerInterface; @@ -158,36 +190,6 @@ LogicalResult FatRawBufferCastOp::verify() { return success(); } -static bool hasGlobalMemorySpace(Attribute memorySpace) { - if (!memorySpace) - return true; - if (auto intMemorySpace = dyn_cast(memorySpace)) - return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1; - if (auto gpuMemorySpace = dyn_cast(memorySpace)) - return gpuMemorySpace.getValue() == gpu::AddressSpace::Global; - return false; -} - -static bool hasWorkgroupMemorySpace(Attribute memorySpace) { - if (!memorySpace) - return false; - if (auto intMemorySpace = dyn_cast(memorySpace)) - return intMemorySpace.getInt() == 3; - if (auto gpuMemorySpace = dyn_cast(memorySpace)) - return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup; - return false; -} - -static bool hasFatRawBufferMemorySpace(Attribute memorySpace) { - if (!memorySpace) - return false; - if (auto intMemorySpace = dyn_cast(memorySpace)) - return intMemorySpace.getInt() == 7; - if (auto gpuMemorySpace = dyn_cast(memorySpace)) - return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer; - return false; -} - //===----------------------------------------------------------------------===// // RawBuffer*Op //===----------------------------------------------------------------------===// @@ -705,6 +707,23 @@ LogicalResult TransposeLoadOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// MakeDmaBaseOp +//===----------------------------------------------------------------------===// + +LogicalResult MakeDmaBaseOp::verify() { + MemRefType srcType = cast(getSrc().getType()); + MemRefType dstType = cast(getDst().getType()); + bool store_from_lds = hasWorkgroupMemorySpace(srcType.getMemorySpace()) && + hasGlobalMemorySpace(dstType.getMemorySpace()); + bool load_to_lds = hasGlobalMemorySpace(srcType.getMemorySpace()) && + hasWorkgroupMemorySpace(dstType.getMemorySpace()); + bool 
is_valid = store_from_lds != load_to_lds; + if (!is_valid) + return emitOpError("invalid combination of address spaces."); + return success(); +} + //===----------------------------------------------------------------------===// // MakeDmaDescriptorOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir similarity index 73% rename from mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir rename to mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir index d2391140ce056..96d03a427215f 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir @@ -162,3 +162,77 @@ func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf64> return %ret0: vector<16xf64> } + +// ----- + +#gpu_global_addrspace = 1 +#gpu_lds_addrspace = 3 +#amdgpu_fat_buffer_addrspace = 7 + +// CHECK-LABEL: func @make_dma_base +// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, 1>, %[[SMEM:.+]]: memref<8xi32, 3>) +func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xi32,#gpu_lds_addrspace>) -> (!amdgpu.tdm_base, !amdgpu.tdm_base) { + // CHECK-DAG: %[[INT:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64 + // CHECK-DAG: %[[MEMREF_DESC_MEM:.+]] = builtin.unrealized_conversion_cast %[[MEM]] : memref<8xi32, 1> + // CHECK-DAG: %[[MEMREF_DESC_SMEM:.+]] = builtin.unrealized_conversion_cast %[[SMEM]] : memref<8xi32, 3> + + // CHECK-DAG: %[[MEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_MEM]][1] : !llvm.struct<(ptr<1> + // CHECK-DAG: %[[SMEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_SMEM]][1] : !llvm.struct<(ptr<3> + + // 
CHECK-DAG: %[[MEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[MEM_BASE_PTR]][%[[INT]]] + // CHECK-DAG: %[[SMEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[SMEM_BASE_PTR]][%[[INT]]] + + // CHECK-DAG: %[[MEM_INT:.+]] = llvm.ptrtoint %[[MEM_BASE_OFFSET]] : !llvm.ptr<1> to i64 + // CHECK-DAG: %[[SMEM_INT:.+]] = llvm.ptrtoint %[[SMEM_BASE_OFFSET]] : !llvm.ptr<3> to i32 + + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(144115188075855871 : i64) : i64 + // CHECK: %[[MEM_INT_LOW_57:.+]] = llvm.and %[[MEM_INT]], %[[MASK]] + // CHECK: %[[C32:.+]] = llvm.mlir.constant(32 : i64) : i64 + // CHECK: %[[SHIFT:.+]] = llvm.lshr %[[MEM_INT_LOW_57]], %[[C32]] + // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32 + // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32 + + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) : i32 + + // CHECK: %[[V4I32_0_0:.+]] = llvm.mlir.undef : vector<4xi32> + // CHECK: %[[V4I32_0_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_0_0]][%[[C0]] : i32] + // CHECK: %[[V4I32_0_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_0_1]][%[[C1]] : i32] + // CHECK: %[[V4I32_0_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_0_2]][%[[C2]] : i32] + // CHECK: %[[V4I32_0_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_0_3]][%[[C3]] : i32] + + %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xi32, #gpu_lds_addrspace> -> !amdgpu.tdm_base + + // CHECK-DAG: %[[MEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_MEM]][1] : !llvm.struct<(ptr<1> + // CHECK-DAG: %[[SMEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_SMEM]][1] : !llvm.struct<(ptr<3> + + // CHECK-DAG: %[[MEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[MEM_BASE_PTR]][%[[INT]]] + // CHECK-DAG: %[[SMEM_BASE_OFFSET:.+]] = 
llvm.getelementptr %[[SMEM_BASE_PTR]][%[[INT]]] + + // CHECK-DAG: %[[MEM_INT:.+]] = llvm.ptrtoint %[[MEM_BASE_OFFSET]] : !llvm.ptr<1> to i64 + // CHECK-DAG: %[[SMEM_INT:.+]] = llvm.ptrtoint %[[SMEM_BASE_OFFSET]] : !llvm.ptr<3> to i32 + + // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(144115188075855871 : i64) : i64 + // CHECK: %[[MEM_INT_LOW_57:.+]] = llvm.and %[[MEM_INT]], %[[MASK]] + // CHECK: %[[C32:.+]] = llvm.mlir.constant(32 : i64) : i64 + // CHECK: %[[SHIFT:.+]] = llvm.lshr %[[MEM_INT_LOW_57]], %[[C32]] + // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32 + // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32 + + // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) : i32 + + // CHECK: %[[V4I32_1_0:.+]] = llvm.mlir.undef : vector<4xi32> + // CHECK: %[[V4I32_1_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_1_0]][%[[C0]] : i32] + // CHECK: %[[V4I32_1_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_1_1]][%[[C1]] : i32] + // CHECK: %[[V4I32_1_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_1_2]][%[[C2]] : i32] + // CHECK: %[[V4I32_1_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_1_3]][%[[C3]] : i32] + + %1 = amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu_lds_addrspace>, memref<8xi32, #gpu_global_addrspace> -> !amdgpu.tdm_base + + func.return %0, %1 : !amdgpu.tdm_base, !amdgpu.tdm_base +} From 3ee5464060d2bae3e071b4910ef09e0b0d4f6728 Mon Sep 17 00:00:00 2001 From: Erick Ochoa Date: Thu, 27 Nov 2025 09:41:38 -0500 Subject: [PATCH 4/4] Update documentation --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 
9cb0752fba48b..1806c747046b8 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -1258,19 +1258,21 @@ def AMDGPU_MakeDmaBaseOp : to ```mlir - // pseudocode - %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)> - %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)> - %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct(ptr, ptr)> - // type(%base_2) = !llvm.struct<(ptr, ptr) roughly corresponds to amdgpu.tdm_base - - // The base will be used when contructing dgroup0 - // when lowering amdgpu.make_dma_descriptor - %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)> - %dgroup0_1 = llvm.insertvalue %base2, %dgroup0_0 : .... - - // When lowering amdgpu.tensor_load_to_lds - rocdl.tensor.load.to.lds %dgroup0, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> + // pseudo-code + %global_base = llvm.extractvalue %global_memref[1] + %global_address = llvm.getelementptr ... + + %lds_base = llvm.extractvalue %lds_memref[1] + %lds_address = llvm.getelementptr ... + + // Definition of %base + %undef = llvm.mlir.undef : vector<4xi32> + %v0 = llvm.insertelement %c0, %undef[0] : vector<4xi32> + %v1 = llvm.insertelement %lds_address, %v0[1] : vector<4xi32> + %v2 = llvm.insertelement %global_address_low, %v1[2] : vector<4xi32> + %base = llvm.insertelement %global_address_high, %v2[3] : vector<4xi32> + + rocdl.tensor.load.to.lds %base, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32> ``` These tensor DMA operations were introduced in gfx1250.