From 09f7e0011774c33b688b3444b38014ee96cc0c65 Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Thu, 25 Jan 2024 10:25:21 +0100
Subject: [PATCH] [mlir][nvgpu] Mark TMA descriptor as MemWriteAt in `tma.async.store`

The canonicalizer finds `nvgpu.tma.async.store` trivially dead because the Op
declares no memory side effects. This PR addresses the issue by adding a
`MemWriteAt` effect to the TMA descriptor operand.

The Op copies data `shared memory -> global memory`, but it does so
asynchronously, so the fix might not be optimal: the Op does not mutate the
global memory right away. The asynchronous behavior is controlled by the two
NVVM Ops below:

`nvvm.cp.async.bulk.commit.group`: groups the outstanding
`nvgpu.tma.async.store` Ops together and commits them as one group.
`nvvm.cp.async.bulk.wait_group 1`: waits until at most one of the committed
groups is still pending.

Here's a simplified representation of the code:

```
gpu.func ... {
  // Write something to shared memory
  %shmem = ...

  // Perform asynchronous store from shared memory to global memory
  nvgpu.tma.async.store %shmem to %arg0[%c0, %c0], predicate = %1
    : memref<128x32xf32, #gpu.address_space<workgroup>>
    -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>

  // Control asynchronous execution
  nvvm.cp.async.bulk.commit.group
  nvvm.cp.async.bulk.wait_group 1
}
```
---
 mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td   |  2 +-
 mlir/test/Dialect/NVGPU/canonicalization.mlir | 30 +++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Dialect/NVGPU/canonicalization.mlir

diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 239a5f1e2bc29..a0c0d4cfd8714 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -671,7 +671,7 @@ def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegments]> {
     tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
   }];
   let arguments = (ins  Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
-                        NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
+                        Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
                         Variadic<Index>:$coordinates,
                         Optional<I1>:$predicate);
   let assemblyFormat = [{
diff --git a/mlir/test/Dialect/NVGPU/canonicalization.mlir b/mlir/test/Dialect/NVGPU/canonicalization.mlir
new file mode 100644
index 0000000000000..a7fbfd8067395
--- /dev/null
+++ b/mlir/test/Dialect/NVGPU/canonicalization.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-opt %s | mlir-opt -canonicalize -cse | FileCheck %s
+
+gpu.module @main_kernel {
+
+// CHECK-LABEL: @main_kernel(
+// CHECK-SAME: %[[arg0:.*]]: !nvgpu.tensormap.descriptor
+  gpu.func @main_kernel(%arg0: !nvgpu.tensormap.descriptor<
+                  tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none,
+                  oob = zero, interleave = none>) kernel attributes
+                  { gpu.known_block_size = array<i32: 128, 1, 1>,
+                    gpu.known_grid_size = array<i32: 1, 1, 1>
+                  }
+  {
+    // CHECK: %[[c0:.+]] = arith.constant 0 : index
+    // CHECK: %[[S0:.+]] = gpu.thread_id x
+    // CHECK: %[[S1:.+]] = arith.cmpi eq, %[[S0]], %[[c0]] : index
+    // CHECK: %[[S2:.+]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+    // CHECK: %[[S3:.+]] = memref.view %[[S2]][%[[c0]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>>
+    // CHECK: nvgpu.tma.async.store %[[S3]] to %[[arg0]][%[[c0]], %[[c0]]], predicate = %[[S1]] : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+    %c0 = arith.constant 0 : index
+    %0 = gpu.thread_id x
+    %1 = arith.cmpi eq, %0, %c0 : index
+    %2 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+    %view = memref.view %2[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>>
+    nvgpu.tma.async.store %view to %arg0[%c0, %c0], predicate = %1 : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+    nvvm.cp.async.bulk.commit.group
+    nvvm.cp.async.bulk.wait_group 0
+    gpu.return
+  }
+}
\ No newline at end of file
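
Note (not part of the diff above): a quick way to see the effect of this
change is to run a reduced input through `mlir-opt -canonicalize`. The sketch
below is a hypothetical `repro.mlir` (assumed file and symbol names) that
mirrors the new test; before this patch the `nvgpu.tma.async.store` was erased
as trivially dead, and with the `MemWriteAt` effect on the descriptor operand
it survives canonicalization.

```
// Hypothetical repro.mlir; run with: mlir-opt repro.mlir -canonicalize
// Before this patch the store below was removed; now it is preserved.
gpu.module @repro {
  gpu.func @repro(%desc: !nvgpu.tensormap.descriptor<
        tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none,
        oob = zero, interleave = none>) kernel {
    %c0 = arith.constant 0 : index
    // Shared-memory source of the TMA store.
    %smem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
    %view = memref.view %smem[%c0][] : memref<?xi8, #gpu.address_space<workgroup>>
        to memref<128x32xf32, #gpu.address_space<workgroup>>
    // This op has no results, so with no declared side effects it was
    // trivially dead; the MemWriteAt effect on the descriptor keeps it alive.
    nvgpu.tma.async.store %view to %desc[%c0, %c0]
        : memref<128x32xf32, #gpu.address_space<workgroup>>
        -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none,
           oob = zero, interleave = none>
    nvvm.cp.async.bulk.commit.group
    nvvm.cp.async.bulk.wait_group 0
    gpu.return
  }
}
```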