Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#include <mutex>
#include <vector>
Expand Down Expand Up @@ -459,14 +460,18 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
readyFrames_.pop();

// TODONVDEC P1 we need to set the procParams.output_stream field to the
// current CUDA stream and ensure proper synchronization. There's a related
// NVDECTODO in CudaDeviceInterface.cpp where we do the necessary
// synchronization for NPP.
CUVIDPROCPARAMS procParams = {};
procParams.progressive_frame = dispInfo.progressive_frame;
procParams.top_field_first = dispInfo.top_field_first;
procParams.unpaired_field = dispInfo.repeat_first_field < 0;
// We set the NVDEC stream to the current stream. It will be waited upon by
// the NPP stream before any color conversion. Currently, that syncing logic
// is in the default interface.
// Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
// CUstream
procParams.output_stream = reinterpret_cast<CUstream>(
at::cuda::getCurrentCUDAStream(device_.index()).stream());

CUdeviceptr framePtr = 0;
unsigned int pitch = 0;

Expand Down
41 changes: 21 additions & 20 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,35 +426,36 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
dst = allocateEmptyHWCTensor(frameDims, device_);
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note for below: that's introducing yet another dependency relationship between the BETA interface and the default interface. All of this badness is resolved in #931

torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device_);

// Create a CUDA event and attach it to the AVFrame's CUDA stream. That's the
// NVDEC stream, i.e. the CUDA stream that the frame was decoded on.
// We will be waiting for this event to complete before calling the NPP
// functions, to ensure NVDEC has finished decoding the frame before running
// the NPP color-conversion.
// Note that our code is generic and assumes that the NVDEC's stream can be
// arbitrary, but unfortunately we know it's hardcoded to be the default
// stream by FFmpeg:
// We need to make sure NVDEC has finished decoding a frame before
// color-converting it with NPP.
// So we make the NPP stream wait for NVDEC to finish.
// If we're in the default CUDA interface, we figure out the NVDEC stream from
// the avFrame's hardware context. But in reality, we know that this stream is
// hardcoded to be the default stream by FFmpeg:
// https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
// If we're in the BETA CUDA interface, we know the NVDEC stream was set with
// getCurrentCUDAStream(), so it's the same as the nppStream.
at::cuda::CUDAStream nppStream =
at::cuda::getCurrentCUDAStream(device_.index());
// We can't create a CUDAStream without assigning it a value so we initialize
// it to the nppStream, which is valid for the BETA interface.
at::cuda::CUDAStream nvdecStream = nppStream;
if (hwFramesCtx) {
// TODONVDEC P2 this block won't be hit from the beta interface because
// there is no hwFramesCtx, but we should still make sure there's no CUDA
// stream sync issue in the beta interface.
// Default interface path
TORCH_CHECK(
hwFramesCtx->device_ctx != nullptr,
"The AVFrame's hw_frames_ctx does not have a device_ctx. ");
auto cudaDeviceCtx =
static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
TORCH_CHECK(cudaDeviceCtx != nullptr, "The hardware context is null");
at::cuda::CUDAEvent nvdecDoneEvent;
at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
nvdecDoneEvent.record(nvdecStream);
// Don't start NPP work before NVDEC is done decoding the frame!
nvdecDoneEvent.block(nppStream);
nvdecStream = // That's always the default stream. Sad.
c10::cuda::getStreamFromExternal(
cudaDeviceCtx->stream, device_.index());
}
// Don't start NPP work before NVDEC is done decoding the frame!
at::cuda::CUDAEvent nvdecDoneEvent;
nvdecDoneEvent.record(nvdecStream);
nvdecDoneEvent.block(nppStream);

// Create the NPP context if we haven't yet.
nppCtx_->hStream = nppStream.stream();
Expand Down
Loading