diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 3e99c537d0e02..2b321ee846d36 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -65,8 +65,8 @@ template <typename OpTy>
 class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
 public:
   explicit ConvertOpToGpuRuntimeCallPattern(
-      const LLVMTypeConverter &typeConverter)
-      : ConvertOpToLLVMPattern<OpTy>(typeConverter) {}
+      const LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1)
+      : ConvertOpToLLVMPattern<OpTy>(typeConverter, benefit) {}
 
 protected:
   Value getNumElements(ConversionPatternRewriter &rewriter, Location loc,
@@ -382,8 +382,9 @@ class ConvertAsyncYieldToGpuRuntimeCallPattern
     : public ConvertOpToGpuRuntimeCallPattern<async::YieldOp> {
 public:
   ConvertAsyncYieldToGpuRuntimeCallPattern(
-      const LLVMTypeConverter &typeConverter)
-      : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter) {}
+      const LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1)
+      : ConvertOpToGpuRuntimeCallPattern<async::YieldOp>(typeConverter,
+                                                         benefit) {}
 
 private:
   LogicalResult
@@ -838,6 +839,14 @@ static bool isGpuAsyncTokenType(Value value) {
 // !gpu.async.token are lowered to stream within the async.execute region, but
 // are passed as events between them. For each !gpu.async.token operand, we
 // create an event and record it on the stream.
+//
+// This pattern is registered with a higher benefit than the structural
+// async.yield rewriter from populateAsyncStructuralTypeConversionsAndLegality
+// so that it wins when both match. Without the higher benefit, the structural
+// pattern can win and merely retype the gpu.async.token operands without
+// recording an event. The host-side await then calls cuEventSynchronize on a
+// stream pointer, which returns an error without waiting, so the host races
+// against the GPU.
 LogicalResult ConvertAsyncYieldToGpuRuntimeCallPattern::matchAndRewrite(
     async::YieldOp yieldOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1828,6 +1837,13 @@ void mlir::populateGpuToLLVMConversionPatterns(
   addOpaquePointerConversion<gpu::SparseSpMatHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseSpGEMMOpHandleType>(converter);
 
+  // Register the GPU-aware async.yield pattern with a higher benefit so that,
+  // on yields with gpu.async.token operands, it wins over the structural
+  // rewriter from populateAsyncStructuralTypeConversionsAndLegality, which
+  // would only retype the operands without recording an event on the stream.
+  patterns.add<ConvertAsyncYieldToGpuRuntimeCallPattern>(converter,
+                                                         /*benefit=*/2);
+
   patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
                ConvertDeallocOpToGpuRuntimeCallPattern,
                ConvertHostRegisterOpToGpuRuntimeCallPattern,
@@ -1837,7 +1853,6 @@ void mlir::populateGpuToLLVMConversionPatterns(
                ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,
                ConvertWaitAsyncOpToGpuRuntimeCallPattern,
                ConvertWaitOpToGpuRuntimeCallPattern,
-               ConvertAsyncYieldToGpuRuntimeCallPattern,
                ConvertCreateDnTensorOpToGpuRuntimeCallPattern,
                ConvertDestroyDnTensorOpToGpuRuntimeCallPattern,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
diff --git a/mlir/test/Conversion/GPUCommon/lower-async-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-async-to-gpu-runtime-calls.mlir
new file mode 100644
index 0000000000000..cbb9df1ad8af3
--- /dev/null
+++ b/mlir/test/Conversion/GPUCommon/lower-async-to-gpu-runtime-calls.mlir
@@ -0,0 +1,46 @@
+// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
+
+// Regression test for https://github.com/llvm/llvm-project/issues/170833.
+//
+// In `gpu-to-llvm`, an `async.yield` operand of type `!gpu.async.token`
+// must be lowered to an *event* recorded on the stream that produced it,
+// not to the stream pointer itself. Otherwise the host await later calls
+// `cuEventSynchronize` on a stream pointer, which returns an error without
+// waiting, and the host races against the GPU.
+//
+// The bug was that two patterns matched `async.yield` with the same
+// benefit: the structural rewriter from
+// `populateAsyncStructuralTypeConversionsAndLegality` (which only retypes
+// operands) and the GPU-aware rewriter (which also creates and records an
+// event). When the IR contained `gpu.launch_func` (so other patterns ran
+// alongside), the dialect-conversion framework picked the structural one
+// for the yield, so the event was never created or recorded.
+
+module attributes {gpu.container_module} {
+
+  // CHECK-LABEL: llvm.func @yield_launch_token
+  // CHECK: %[[stream:.*]] = llvm.call @mgpuStreamCreate
+  // CHECK: gpu.launch_func {{.*}} @kmod::@kernel
+  // CHECK: %[[event:.*]] = llvm.call @mgpuEventCreate
+  // CHECK: llvm.call @mgpuEventRecord(%[[event]], %[[stream]])
+  // CHECK: llvm.call @mgpuStreamDestroy(%[[stream]])
+  // CHECK: async.yield %[[event]] : !llvm.ptr
+  func.func @yield_launch_token(%arg : memref<?xi32>) {
+    %c1 = arith.constant 1 : index
+    %t, %r = async.execute -> !async.value<!gpu.async.token> {
+      %0 = gpu.wait async
+      %1 = gpu.launch_func async [%0] @kmod::@kernel
+          blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
+          args(%arg : memref<?xi32>)
+      async.yield %1 : !gpu.async.token
+    }
+    return
+  }
+
+  gpu.module @kmod [#nvvm.target] {
+    llvm.func @kernel(%a: !llvm.ptr, %b: !llvm.ptr, %c: i64, %d: i64, %e: i64)
+        attributes {gpu.kernel, nvvm.kernel} {
+      llvm.return
+    }
+  }
+}