diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index 3e99c537d0e02..2b321ee846d36 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -65,8 +65,8 @@ template <typename OpTy> class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> { public: explicit ConvertOpToGpuRuntimeCallPattern( - const LLVMTypeConverter &typeConverter) - : ConvertOpToLLVMPattern<OpTy>(typeConverter) {} + const LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1) + : ConvertOpToLLVMPattern<OpTy>(typeConverter, benefit) {} protected: Value getNumElements(ConversionPatternRewriter &rewriter, Location loc, @@ -382,8 +382,9 @@ class ConvertAsyncYieldToGpuRuntimeCallPattern : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> { public: ConvertAsyncYieldToGpuRuntimeCallPattern( - const LLVMTypeConverter &typeConverter) - : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {} + const LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1) + : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter, + benefit) {} private: LogicalResult @@ -838,6 +839,14 @@ static bool isGpuAsyncTokenType(Value value) { // !gpu.async.token are lowered to stream within the async.execute region, but // are passed as events between them. For each !gpu.async.token operand, we // create an event and record it on the stream. +// +// This pattern is registered with a higher benefit than the structural +// async.yield rewriter from populateAsyncStructuralTypeConversionsAndLegality +// so it wins when both match. Without that benefit override, the structural +// pattern can win and silently retype gpu.async.token operands without +// recording an event, leaving the host await to call cuEventSynchronize on +// a stream pointer (a no-op that returns an error), racing the host against +// the GPU.
LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite( async::YieldOp yieldOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -1828,6 +1837,13 @@ void mlir::populateGpuToLLVMConversionPatterns( addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); + // Higher benefit so this pattern wins over the structural async.yield + // rewriter from populateAsyncStructuralTypeConversionsAndLegality on yields + // with gpu.async.token operands. The structural rewriter would silently + // retype operands without recording an event on the underlying stream. + patterns.add<ConvertAsyncYieldToGpuRuntimeCallPattern>(converter, + /*benefit=*/2); + patterns.add) { + %c1 = arith.constant 1 : index + %t, %r = async.execute -> !async.value<!gpu.async.token> { + %0 = gpu.wait async + %1 = gpu.launch_func async [%0] @kmod::@kernel + blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) + args(%arg : memref) + async.yield %1 : !gpu.async.token + } + return + } + + gpu.module @kmod [#nvvm.target] { + llvm.func @kernel(%a: !llvm.ptr, %b: !llvm.ptr, %c: i64, %d: i64, %e: i64) + attributes {gpu.kernel, nvvm.kernel} { + llvm.return + } + } +}