From 1bd7d9c79e809f579e5f44baeb3ef3d490881c47 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Sat, 4 Oct 2025 16:07:27 +0100
Subject: [PATCH] properly set NVDEC stream and wait on it

---
 .../_core/BetaCudaDeviceInterface.cpp         | 13 ++++--
 src/torchcodec/_core/CudaDeviceInterface.cpp  | 41 ++++++++++---------
 2 files changed, 30 insertions(+), 24 deletions(-)
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 078655462..48e0bb465 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -4,6 +4,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+#include <c10/cuda/CUDAStream.h>
 #include <torch/types.h>
 #include <mutex>
 #include <vector>
@@ -459,14 +460,18 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
   readyFrames_.pop();
 
-  // TODONVDEC P1 we need to set the procParams.output_stream field to the
-  // current CUDA stream and ensure proper synchronization. There's a related
-  // NVDECTODO in CudaDeviceInterface.cpp where we do the necessary
-  // synchronization for NPP.
   CUVIDPROCPARAMS procParams = {};
   procParams.progressive_frame = dispInfo.progressive_frame;
   procParams.top_field_first = dispInfo.top_field_first;
   procParams.unpaired_field = dispInfo.repeat_first_field < 0;
+  // We set the NVDEC stream to the current stream. It will be waited upon by
+  // the NPP stream before any color conversion. Currently, that syncing logic
+  // is in the default interface.
+  // Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
+  // CUstream
+  procParams.output_stream = reinterpret_cast<CUstream>(
+      at::cuda::getCurrentCUDAStream(device_.index()).stream());
+
   CUdeviceptr framePtr = 0;
   unsigned int pitch = 0;
 
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 37e5053d5..2dee80af3 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -426,35 +426,36 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     dst = allocateEmptyHWCTensor(frameDims, device_);
   }
 
-  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device_);
-
-  // Create a CUDA event and attach it to the AVFrame's CUDA stream. That's the
-  // NVDEC stream, i.e. the CUDA stream that the frame was decoded on.
-  // We will be waiting for this event to complete before calling the NPP
-  // functions, to ensure NVDEC has finished decoding the frame before running
-  // the NPP color-conversion.
-  // Note that our code is generic and assumes that the NVDEC's stream can be
-  // arbitrary, but unfortunately we know it's hardcoded to be the default
-  // stream by FFmpeg:
+  // We need to make sure NVDEC has finished decoding a frame before
+  // color-converting it with NPP.
+  // So we make the NPP stream wait for NVDEC to finish.
+  // If we're in the default CUDA interface, we figure out the NVDEC stream from
+  // the avFrame's hardware context. But in reality, we know that this stream is
+  // hardcoded to be the default stream by FFmpeg:
   // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
-  at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
+  // If we're in the BETA CUDA interface, we know the NVDEC stream was set with
+  // getCurrentCUDAStream(), so it's the same as the nppStream.
+  at::cuda::CUDAStream nppStream =
+      at::cuda::getCurrentCUDAStream(device_.index());
+  // We can't create a CUDAStream without assigning it a value so we initialize
+  // it to the nppStream, which is valid for the BETA interface.
+  at::cuda::CUDAStream nvdecStream = nppStream;
   if (hwFramesCtx) {
-    // TODONVDEC P2 this block won't be hit from the beta interface because
-    // there is no hwFramesCtx, but we should still make sure there's no CUDA
-    // stream sync issue in the beta interface.
+    // Default interface path
     TORCH_CHECK(
         hwFramesCtx->device_ctx != nullptr,
         "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
     auto cudaDeviceCtx =
         static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
     TORCH_CHECK(cudaDeviceCtx != nullptr, "The hardware context is null");
-    at::cuda::CUDAEvent nvdecDoneEvent;
-    at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
-        c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
-    nvdecDoneEvent.record(nvdecStream);
-    // Don't start NPP work before NVDEC is done decoding the frame!
-    nvdecDoneEvent.block(nppStream);
+    nvdecStream = // That's always the default stream. Sad.
+        c10::cuda::getStreamFromExternal(
+            cudaDeviceCtx->stream, device_.index());
   }
+  // Don't start NPP work before NVDEC is done decoding the frame!
+  at::cuda::CUDAEvent nvdecDoneEvent;
+  nvdecDoneEvent.record(nvdecStream);
+  nvdecDoneEvent.block(nppStream);
 
   // Create the NPP context if we haven't yet.
   nppCtx_->hStream = nppStream.stream();