6 changes: 6 additions & 0 deletions mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -73,6 +73,12 @@ using MemorySpaceMapping = std::function<unsigned(gpu::AddressSpace)>;
/// gpu.address_space to integer values.
void populateGpuMemorySpaceAttributeConversions(
TypeConverter &typeConverter, const MemorySpaceMapping &mapping);

/// Collect patterns that insert a gpu.wait before GPU operations whose
/// lowering supports only a single async dependency (currently just
/// gpu.launch_func), collapsing all original dependencies into one token.
void populateGpuMultipleAsyncDepsConversionPatterns(
RewritePatternSet &patterns);
} // namespace mlir

#endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
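
For illustration, here is a minimal sketch (not part of the diff) of the rewrite these patterns perform; the kernel symbol `@kernels::@bar` is hypothetical, and the syntax follows the gpu dialect forms used in the test below:

```mlir
// Before: the launch carries two async dependencies, which the lowering of
// gpu.launch_func cannot express.
%r = gpu.launch_func async [%t0, %t1] @kernels::@bar
    blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)

// After: a new gpu.wait merges the dependencies into a single token, and the
// launch depends only on that token.
%w = gpu.wait async [%t0, %t1]
%r = gpu.launch_func async [%w] @kernels::@bar
    blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
```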
1 change: 1 addition & 0 deletions mlir/include/mlir/Conversion/Passes.td
@@ -552,6 +552,7 @@ def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> {
"runtime with the kernel bare pointer calling convention, to enable "
"dynamic binding of buffers as arguments without static type info."
>
// TODO: Should a pass option be added to opt in to this feature?
];

let dependentDialects = [
80 changes: 80 additions & 0 deletions mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -483,6 +483,14 @@ class ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern
ConversionPatternRewriter &rewriter) const override;
};

/// Rewrites an async gpu op with more than one async dependency by inserting
/// a gpu.wait that merges the dependencies into a single token.
template <typename Op>
struct ConvertMultipleAsyncDepsToGpuWaitPattern final : OpRewritePattern<Op> {
using OpRewritePattern<Op>::OpRewritePattern;

LogicalResult matchAndRewrite(Op op,
PatternRewriter &rewriter) const override;
};

/// Generic rewriting rule for operation on sparse matrices.
/// Currently supports CUDA (by means of cuSparse and cuSparseLt).
#define DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(op_name) \
@@ -538,6 +546,22 @@ void GpuToLLVMConversionPass::runOnOperation() {
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
return signalPassFailure();
}
// Insert gpu.wait operations before operations whose lowering does not
// support multiple async dependencies.
// TODO: Should this only be enabled via a pass option?
{
RewritePatternSet waitPatterns(&getContext());
populateGpuMultipleAsyncDepsConversionPatterns(waitPatterns);
if (failed(applyPatternsGreedily(getOperation(), std::move(waitPatterns)))) {
return signalPassFailure();
}

LLVM_DEBUG(llvm::dbgs()
<< "--- IR After Adding Additional gpu.waits: ---\n");
LLVM_DEBUG(getOperation()->print(llvm::dbgs()));
LLVM_DEBUG(llvm::dbgs()
<< "---------------------------------------------\n");
}
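
If the TODO above is resolved in favor of a pass option, the guard would presumably look roughly like the following; `insertMultipleAsyncDepsWaits` is a hypothetical option name, not something this patch defines:

```cpp
// Hypothetical pass option guarding the extra rewrite (sketch only).
if (insertMultipleAsyncDepsWaits) {
  RewritePatternSet waitPatterns(&getContext());
  populateGpuMultipleAsyncDepsConversionPatterns(waitPatterns);
  if (failed(applyPatternsGreedily(getOperation(), std::move(waitPatterns))))
    return signalPassFailure();
}
```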

LowerToLLVMOptions options(context);
options.useBarePtrCallConv = hostBarePtrCallConv;
@@ -1787,6 +1811,29 @@ LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}

template <class Op>
LogicalResult ConvertMultipleAsyncDepsToGpuWaitPattern<Op>::matchAndRewrite(
Op op, PatternRewriter &rewriter) const {
if (op.getAsyncDependencies().size() <= 1)
return rewriter.notifyMatchFailure(
op, "Can only convert ops with multiple async dependencies.");

// Create a new gpu.wait with the original async deps.
Type tokenType = rewriter.getType<gpu::AsyncTokenType>();
Value waitToken = gpu::WaitOp::create(rewriter, op.getLoc(), tokenType,
op.getAsyncDependencies())
.getAsyncToken();

// TODO: Would it be safe to call getAsyncDependenciesMutable() on the
// original op and update it in place, instead of cloning?
Operation *newOp = rewriter.clone(*op.getOperation());
auto iface = dyn_cast<Op>(newOp);
assert(iface && "Expected cloned op to have same type as original op.");
iface.getAsyncDependenciesMutable().assign({waitToken});
rewriter.replaceOp(op, newOp);

return success();
}
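
Regarding the TODO above: one possible in-place variant, sketched under the assumption that mutating the matched op is acceptable to the greedy driver (this has not been verified, which is presumably why the patch clones instead):

```cpp
// Sketch: update the matched op's async dependencies in place rather than
// cloning it; modifyOpInPlace notifies the rewriter about the mutation.
rewriter.modifyOpInPlace(op, [&] {
  op.getAsyncDependenciesMutable().assign({waitToken});
});
return success();
```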

void mlir::populateGpuToLLVMConversionPatterns(
LLVMTypeConverter &converter, RewritePatternSet &patterns,
bool kernelBarePtrCallConv, bool kernelIntersperseSizeCallConv) {
@@ -1830,6 +1877,39 @@ void mlir::populateGpuToLLVMConversionPatterns(
kernelIntersperseSizeCallConv);
}

void mlir::populateGpuMultipleAsyncDepsConversionPatterns(
RewritePatternSet &patterns) {
// TODO: Other ops to consider handling:
// - gpu::AllocOp,
// - gpu::DeallocOp,
// - gpu::MemcpyOp,
// - gpu::MemsetOp,
// - gpu::CreateDnTensorOp,
// - gpu::DestroyDnTensorOp,
// - gpu::CreateCooOp,
// - gpu::CreateCooAoSOp,
// - gpu::CreateCsrOp,
// - gpu::Create2To4SpMatOp,
// - gpu::DestroySpMatOp,
// - gpu::SpMVBufferSizeOp,
// - gpu::SpMVOp,
// - gpu::SpMMBufferSizeOp,
// - gpu::SDDMMBufferSizeOp,
// - gpu::SpMMOp,
// - gpu::SDDMMOp,
// - gpu::SpGEMMCreateDescrOp,
// - gpu::SpGEMMDestroyDescrOp,
// - gpu::SpGEMMWorkEstimationOrComputeOp,
// - gpu::SpGEMMCopyOp,
// - gpu::SpMatGetSizeOp,
// - gpu::SetCsrPointersOp,
// - gpu::CreateCscOp,
// - gpu::CreateBsrOp,
// - gpu::LaunchFuncOp
patterns.add<ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::LaunchFuncOp>>(
patterns.getContext());
}
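
If any of the ops in the TODO list above are enabled later, the registration would presumably just extend the same call; a sketch (not part of this change, and assuming those ops expose the same async-dependency accessors):

```cpp
patterns.add<ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::LaunchFuncOp>,
             ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::AllocOp>,
             ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::MemcpyOp>,
             ConvertMultipleAsyncDepsToGpuWaitPattern<gpu::MemsetOp>>(
    patterns.getContext());
```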

//===----------------------------------------------------------------------===//
// GPUModuleOp convert to LLVM op interface
//===----------------------------------------------------------------------===//
81 changes: 81 additions & 0 deletions mlir/test/Conversion/GPUCommon/lower-multiple-async-deps.mlir
@@ -0,0 +1,81 @@
// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s

module attributes {gpu.container_module} {

gpu.module @foo {
gpu.func @bar() kernel {
gpu.return
}
}

// CHECK-LABEL: func @main
func.func @main() {
%c1 = arith.constant 1 : index

// Check that the pass does not modify launch_func ops with a single async dependency:

// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
%t0 = gpu.wait async
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
%t1 = gpu.wait async [%t0]
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
%0 = gpu.wait async [%t0, %t1]
// CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @foo::@bar blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64
%good_call = gpu.launch_func async [%0] @foo::@bar
blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
// CHECK: llvm.call @mgpuStreamSynchronize(%{{.*}}) : (!llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamDestroy(%{{.*}}) : (!llvm.ptr) -> ()
gpu.wait [%good_call]

// Check that launch_func ops with multiple dependencies are properly
// handled and do not result in a failure:

// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
%t2 = gpu.wait async
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
%t3 = gpu.wait async [%t2]
// Inserted gpu.wait:
// CHECK: llvm.call @mgpuEventCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuEventRecord(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamWaitEvent(%{{.*}}, %{{.*}}) : (!llvm.ptr, !llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
// CHECK: llvm.call @mgpuEventDestroy(%{{.*}}) : (!llvm.ptr) -> ()
// The rewritten gpu.launch_func now has only one async dependency:
// CHECK: gpu.launch_func <%{{.*}} : !llvm.ptr> @foo::@bar blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64
%bad_call = gpu.launch_func async [%t2, %t3] @foo::@bar
blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
// CHECK: llvm.call @mgpuStreamSynchronize(%{{.*}}) : (!llvm.ptr) -> ()
// CHECK: llvm.call @mgpuStreamDestroy(%{{.*}}) : (!llvm.ptr) -> ()
gpu.wait [%bad_call]
return
}

// TODO: Enable once gpu.alloc with multiple async dependencies is handled
// (see the TODO list in populateGpuMultipleAsyncDepsConversionPatterns):
// func.func @alloc(%size : index) -> memref<?xf32> {
//   %t0 = gpu.wait async
//   %t1 = gpu.wait async [%t0]
//   %memref, %t2 = gpu.alloc async [%t0, %t1] (%size) : memref<?xf32>
//   gpu.wait [%t2]
//   return %memref : memref<?xf32>
// }

}