From 1bd7d9c79e809f579e5f44baeb3ef3d490881c47 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 4 Oct 2025 16:07:27 +0100 Subject: [PATCH] properly set NVDEC stream and wait on it --- .../_core/BetaCudaDeviceInterface.cpp | 13 ++++-- src/torchcodec/_core/CudaDeviceInterface.cpp | 41 ++++++++++--------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 078655462..48e0bb465 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -4,6 +4,7 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. +#include #include #include #include @@ -459,14 +460,18 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) { CUVIDPARSERDISPINFO dispInfo = readyFrames_.front(); readyFrames_.pop(); - // TODONVDEC P1 we need to set the procParams.output_stream field to the - // current CUDA stream and ensure proper synchronization. There's a related - // NVDECTODO in CudaDeviceInterface.cpp where we do the necessary - // synchronization for NPP. CUVIDPROCPARAMS procParams = {}; procParams.progressive_frame = dispInfo.progressive_frame; procParams.top_field_first = dispInfo.top_field_first; procParams.unpaired_field = dispInfo.repeat_first_field < 0; + // We set the NVDEC stream to the current stream. It will be waited upon by + // the NPP stream before any color conversion. Currently, that syncing logic + // is in the default interface. + // Re types: we get a cudaStream_t from PyTorch but it's interchangeable with + // CUstream + procParams.output_stream = reinterpret_cast( + at::cuda::getCurrentCUDAStream(device_.index()).stream()); + CUdeviceptr framePtr = 0; unsigned int pitch = 0; diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 37e5053d5..2dee80af3 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -426,35 +426,36 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( dst = allocateEmptyHWCTensor(frameDims, device_); } - torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device_); - - // Create a CUDA event and attach it to the AVFrame's CUDA stream. That's the - // NVDEC stream, i.e. the CUDA stream that the frame was decoded on. - // We will be waiting for this event to complete before calling the NPP - // functions, to ensure NVDEC has finished decoding the frame before running - // the NPP color-conversion. - // Note that our code is generic and assumes that the NVDEC's stream can be - // arbitrary, but unfortunately we know it's hardcoded to be the default - // stream by FFmpeg: + // We need to make sure NVDEC has finished decoding a frame before + // color-converting it with NPP. + // So we make the NPP stream wait for NVDEC to finish. + // If we're in the default CUDA interface, we figure out the NVDEC stream from + // the avFrame's hardware context. But in reality, we know that this stream is + // hardcoded to be the default stream by FFmpeg: // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388 - at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex); + // If we're in the BETA CUDA interface, we know the NVDEC stream was set with + // getCurrentCUDAStream(), so it's the same as the nppStream. + at::cuda::CUDAStream nppStream = + at::cuda::getCurrentCUDAStream(device_.index()); + // We can't create a CUDAStream without assigning it a value so we initialize + // it to the nppStream, which is valid for the BETA interface. + at::cuda::CUDAStream nvdecStream = nppStream; if (hwFramesCtx) { - // TODONVDEC P2 this block won't be hit from the beta interface because - // there is no hwFramesCtx, but we should still make sure there's no CUDA - // stream sync issue in the beta interface. + // Default interface path TORCH_CHECK( hwFramesCtx->device_ctx != nullptr, "The AVFrame's hw_frames_ctx does not have a device_ctx. "); auto cudaDeviceCtx = static_cast(hwFramesCtx->device_ctx->hwctx); TORCH_CHECK(cudaDeviceCtx != nullptr, "The hardware context is null"); - at::cuda::CUDAEvent nvdecDoneEvent; - at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad. - c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex); - nvdecDoneEvent.record(nvdecStream); - // Don't start NPP work before NVDEC is done decoding the frame! - nvdecDoneEvent.block(nppStream); + nvdecStream = // That's always the default stream. Sad. + c10::cuda::getStreamFromExternal( + cudaDeviceCtx->stream, device_.index()); } + // Don't start NPP work before NVDEC is done decoding the frame! + at::cuda::CUDAEvent nvdecDoneEvent; + nvdecDoneEvent.record(nvdecStream); + nvdecDoneEvent.block(nppStream); // Create the NPP context if we haven't yet. nppCtx_->hStream = nppStream.stream();