Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
78ab058
Let's just commit 3k loc in a single commit
NicolasHug Sep 25, 2025
b45decc
Fixes
NicolasHug Sep 26, 2025
316f218
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Sep 30, 2025
d0192ec
GetCache -> getCache
NicolasHug Sep 30, 2025
515deb5
Make UniqueCUvideodecoder a pointer on CUvideodecoder, not void
NicolasHug Sep 30, 2025
13fad10
Make device and device_variant have a default instead of being std::o…
NicolasHug Sep 30, 2025
eb8de72
Remove old registerDeviceInterface
NicolasHug Sep 30, 2025
4f7a4fb
Call std::memset
NicolasHug Sep 30, 2025
dcf3124
remove unnecessary cuda_runtime.h include, update cmake accordingly
NicolasHug Sep 30, 2025
0ad7370
abstract frameBuffer_ into a FrameBuffer class
NicolasHug Sep 30, 2025
aad142e
Cleanup BSF logic
NicolasHug Sep 30, 2025
2592888
Return int in callback instead of unsigned char
NicolasHug Sep 30, 2025
b5fe9bc
define width and height as unsigned int
NicolasHug Sep 30, 2025
7494259
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Oct 1, 2025
560b376
Fix cuda context initialization
NicolasHug Oct 1, 2025
d1e51b3
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Oct 2, 2025
f9c7297
Skip equality check on ffmepg 4
NicolasHug Oct 2, 2025
7e4dd10
Define constant, add TODO for AVRational
NicolasHug Oct 2, 2025
f614846
Use uint32_t types
NicolasHug Oct 2, 2025
aa6e253
Create packet.reset() and add P0 TODO
NicolasHug Oct 2, 2025
186eaa4
Add TODO
NicolasHug Oct 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
576 changes: 576 additions & 0 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Large diffs are not rendered by default.

129 changes: 129 additions & 0 deletions src/torchcodec/_core/BetaCudaDeviceInterface.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// BETA CUDA device interface that provides direct control over NVDEC
// while keeping FFmpeg for demuxing. A lot of the logic, particularly the use
// of a cache for the decoders, is inspired by DALI's implementation which is
// APACHE 2.0:
// https://github.com/NVIDIA/DALI/blob/c7539676a24a8e9e99a6e8665e277363c5445259/dali/operators/video/frames_decoder_gpu.cc#L1
//
// NVDEC / NVCUVID docs:
// https://docs.nvidia.com/video-technologies/video-codec-sdk/13.0/nvdec-video-decoder-api-prog-guide/index.html#using-nvidia-video-decoder-nvdecode-api

#pragma once

#include "src/torchcodec/_core/Cache.h"
#include "src/torchcodec/_core/DeviceInterface.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"
#include "src/torchcodec/_core/NVDECCache.h"

#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <unordered_map>
#include <vector>

#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"

namespace facebook::torchcodec {

// BETA CUDA device interface: drives NVDEC directly (through the NVCUVID
// parser/decoder handles declared below) while FFmpeg is still used for
// demuxing. It reports canDecodePacketDirectly() == true, so it exposes the
// packet-level sendPacket() / receiveFrame() / flush() overrides.
class BetaCudaDeviceInterface : public DeviceInterface {
 public:
  explicit BetaCudaDeviceInterface(const torch::Device& device);
  virtual ~BetaCudaDeviceInterface();

  void initializeInterface(AVStream* stream) override;

  void convertAVFrameToFrameOutput(
      const VideoStreamOptions& videoStreamOptions,
      const AVRational& timeBase,
      UniqueAVFrame& avFrame,
      FrameOutput& frameOutput,
      std::optional<torch::Tensor> preAllocatedOutputTensor =
          std::nullopt) override;

  // This interface consumes packets itself instead of receiving frames that
  // were already decoded elsewhere.
  bool canDecodePacketDirectly() const override {
    return true;
  }

  // Packet-level decoding hooks overridden from DeviceInterface.
  int sendPacket(ReferenceAVPacket& packet) override;
  int receiveFrame(UniqueAVFrame& avFrame, int64_t desiredPts) override;
  void flush() override;

  // NVDEC callback functions (must be public for C callbacks)
  int streamPropertyChange(CUVIDEOFORMAT* videoFormat);
  int frameReadyForDecoding(CUVIDPICPARAMS* pPicParams);

 private:
  // Apply bitstream filter, modifies packet in-place
  void applyBSF(ReferenceAVPacket& packet);

  // Small pool of slots tracking frames known to the parser, each paired with
  // a guessed pts.
  class FrameBuffer {
   public:
    // One entry of the pool. A default-constructed Slot is empty: dispInfo is
    // all zeros, guessedPts is -1 and occupied is false.
    struct Slot {
      // Value-initialized (every field zero). Using in-class initializers
      // instead of std::memset keeps this header free of a <cstring>
      // dependency it does not include.
      CUVIDPARSERDISPINFO dispInfo = {};
      int64_t guessedPts = -1;
      bool occupied = false;
    };

    // TODONVDEC P1: init size should probably be min_num_decode_surfaces from
    // video format
    FrameBuffer() : frameBuffer_(4) {}

    ~FrameBuffer() = default;

    // Slot lookups, defined in the .cpp.
    // NOTE(review): presumably these return nullptr when nothing matches —
    // confirm against the implementation.
    Slot* findEmptySlot();
    Slot* findFrameWithExactPts(int64_t desiredPts);

    // Iterator support for range-based for loops
    auto begin() {
      return frameBuffer_.begin();
    }

    auto end() {
      return frameBuffer_.end();
    }

   private:
    std::vector<Slot> frameBuffer_;
  };

  UniqueAVFrame convertCudaFrameToAVFrame(
      CUdeviceptr framePtr,
      unsigned int pitch,
      const CUVIDPARSERDISPINFO& dispInfo);

  // NVCUVID parser handle; nullptr until a parser has been created.
  CUvideoparser videoParser_ = nullptr;
  UniqueCUvideodecoder decoder_;
  CUVIDEOFORMAT videoFormat_ = {};

  FrameBuffer frameBuffer_;

  // Queue of packet pts values.
  // NOTE(review): unlike its siblings this member has no trailing underscore;
  // renaming it requires touching the .cpp as well.
  std::queue<int64_t> packetsPtsQueue;

  bool eofSent_ = false;

  // Flush flag to prevent decode operations during flush (like DALI's
  // isFlushing_)
  bool isFlushing_ = false;

  AVRational timeBase_ = {0, 0};

  UniqueAVBSFContext bitstreamFilter_;

  // Default CUDA interface for color conversion.
  // TODONVDEC P2: we shouldn't need to keep a separate instance of the default.
  // See other TODO there about how interfaces should be completely independent.
  std::unique_ptr<DeviceInterface> defaultCudaInterface_;
};

} // namespace facebook::torchcodec
20 changes: 19 additions & 1 deletion src/torchcodec/_core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ function(make_torchcodec_libraries
)

if(ENABLE_CUDA)
list(APPEND core_sources CudaDeviceInterface.cpp)
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp)
endif()

set(core_library_dependencies
Expand All @@ -107,9 +107,27 @@ function(make_torchcodec_libraries
)

if(ENABLE_CUDA)
    # Try to find NVCUVID. Try the normal way first. This should work locally.
    find_library(NVCUVID_LIBRARY NAMES nvcuvid)
    # If not found, try with version suffix, or hardcoded path. Appears
    # to be necessary on the CI. The literal file name is listed as well
    # because "nvcuvid.1" alone expands to libnvcuvid.1.so, not
    # libnvcuvid.so.1.
    if(NOT NVCUVID_LIBRARY)
        find_library(
            NVCUVID_LIBRARY
            NAMES nvcuvid.1 libnvcuvid.so.1
            PATHS /usr/lib64 /usr/lib
        )
    endif()
    # Last resort: the hardcoded path, but only when the file is really there.
    # Setting it unconditionally would make the FATAL_ERROR below unreachable
    # and defer the failure to link time.
    if(NOT NVCUVID_LIBRARY AND EXISTS "/usr/lib64/libnvcuvid.so.1")
        set(NVCUVID_LIBRARY "/usr/lib64/libnvcuvid.so.1")
    endif()

    if(NVCUVID_LIBRARY)
        message(STATUS "Found NVCUVID: ${NVCUVID_LIBRARY}")
    else()
        message(FATAL_ERROR "Could not find NVCUVID library")
    endif()

    list(APPEND core_library_dependencies
        ${CUDA_nppi_LIBRARY}
        ${CUDA_nppicc_LIBRARY}
        ${NVCUVID_LIBRARY}
    )
endif()

Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/_core/CpuDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ namespace facebook::torchcodec {
namespace {

static bool g_cpu = registerDeviceInterface(
torch::kCPU,
DeviceInterfaceKey(torch::kCPU),
[](const torch::Device& device) { return new CpuDeviceInterface(device); });

} // namespace
Expand Down
82 changes: 49 additions & 33 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,21 @@ extern "C" {
#include <libavutil/pixdesc.h>
}

// TODONVDEC P1 Changes were made to this file to accommodate the BETA CUDA
// interface (see other TODONVDEC below). That's because the BETA CUDA interface
// relies on this default CUDA interface to do the color conversion. That's
// hacky, ugly, and leads to complicated code. We should refactor all this so
// that an interface doesn't need to know anything about any other interface.
// Note - this is more than just about the BETA CUDA interface: this default
// interface already relies on the CPU interface to do software decoding when
// needed, and that's already leading to similar complications.

namespace facebook::torchcodec {
namespace {

static bool g_cuda =
registerDeviceInterface(torch::kCUDA, [](const torch::Device& device) {
static bool g_cuda = registerDeviceInterface(
DeviceInterfaceKey(torch::kCUDA),
[](const torch::Device& device) {
return new CudaDeviceInterface(device);
});

Expand Down Expand Up @@ -216,10 +226,11 @@ std::unique_ptr<FiltersContext> CudaDeviceInterface::initializeFiltersContext(
return nullptr;
}

TORCH_CHECK(
avFrame->hw_frames_ctx != nullptr,
"The AVFrame does not have a hw_frames_ctx. "
"That's unexpected, please report this to the TorchCodec repo.");
if (avFrame->hw_frames_ctx == nullptr) {
// TODONVDEC P2 return early for the beta interface where avFrames don't
// have a hw_frames_ctx. We should get rid of this or improve the logic.
return nullptr;
}

auto hwFramesCtx =
reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
Expand Down Expand Up @@ -347,22 +358,23 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
// Above we checked that the AVFrame was on GPU, but that's not enough, we
// also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
// because this is what the NPP color conversion routines expect.
TORCH_CHECK(
avFrame->hw_frames_ctx != nullptr,
"The AVFrame does not have a hw_frames_ctx. "
"That's unexpected, please report this to the TorchCodec repo.");

auto hwFramesCtx =
reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
AVPixelFormat actualFormat = hwFramesCtx->sw_format;
// TODONVDEC P2 this can be hit from the beta interface, but there's no
// hw_frames_ctx in this case. We should try to understand how that affects
// this validation.
AVHWFramesContext* hwFramesCtx = nullptr;
if (avFrame->hw_frames_ctx != nullptr) {
hwFramesCtx =
reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
AVPixelFormat actualFormat = hwFramesCtx->sw_format;

TORCH_CHECK(
actualFormat == AV_PIX_FMT_NV12,
"The AVFrame is ",
(av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
: "unknown"),
", but we expected AV_PIX_FMT_NV12. "
"That's unexpected, please report this to the TorchCodec repo.");
TORCH_CHECK(
actualFormat == AV_PIX_FMT_NV12,
"The AVFrame is ",
(av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
: "unknown"),
", but we expected AV_PIX_FMT_NV12. "
"That's unexpected, please report this to the TorchCodec repo.");
}

auto frameDims =
getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
Expand Down Expand Up @@ -396,19 +408,23 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
// arbitrary, but unfortunately we know it's hardcoded to be the default
// stream by FFmpeg:
// https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
TORCH_CHECK(
hwFramesCtx->device_ctx != nullptr,
"The AVFrame's hw_frames_ctx does not have a device_ctx. ");
auto cudaDeviceCtx =
static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
at::cuda::CUDAEvent nvdecDoneEvent;
at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
nvdecDoneEvent.record(nvdecStream);

// Don't start NPP work before NVDEC is done decoding the frame!
at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
nvdecDoneEvent.block(nppStream);
if (hwFramesCtx) {
// TODONVDEC P2 this block won't be hit from the beta interface because
// there is no hwFramesCtx, but we should still make sure there's no CUDA
// stream sync issue in the beta interface.
TORCH_CHECK(
hwFramesCtx->device_ctx != nullptr,
"The AVFrame's hw_frames_ctx does not have a device_ctx. ");
auto cudaDeviceCtx =
static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
at::cuda::CUDAEvent nvdecDoneEvent;
at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
nvdecDoneEvent.record(nvdecStream);
// Don't start NPP work before NVDEC is done decoding the frame!
nvdecDoneEvent.block(nppStream);
}

// Create the NPP context if we haven't yet.
nppCtx_->hStream = nppStream.stream();
Expand Down
63 changes: 43 additions & 20 deletions src/torchcodec/_core/DeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
namespace facebook::torchcodec {

namespace {
using DeviceInterfaceMap = std::map<torch::DeviceType, CreateDeviceInterfaceFn>;
using DeviceInterfaceMap =
std::map<DeviceInterfaceKey, CreateDeviceInterfaceFn>;
static std::mutex g_interface_mutex;

DeviceInterfaceMap& getDeviceMap() {
Expand All @@ -30,50 +31,72 @@ std::string getDeviceType(const std::string& device) {
} // namespace

bool registerDeviceInterface(
torch::DeviceType deviceType,
const DeviceInterfaceKey& key,
CreateDeviceInterfaceFn createInterface) {
std::scoped_lock lock(g_interface_mutex);
DeviceInterfaceMap& deviceMap = getDeviceMap();

TORCH_CHECK(
deviceMap.find(deviceType) == deviceMap.end(),
"Device interface already registered for ",
deviceType);
deviceMap.insert({deviceType, createInterface});
deviceMap.find(key) == deviceMap.end(),
"Device interface already registered for device type ",
key.deviceType,
" variant '",
key.variant,
"'");
deviceMap.insert({key, createInterface});

return true;
}

torch::Device createTorchDevice(const std::string device) {
void validateDeviceInterface(
const std::string device,
const std::string variant) {
std::scoped_lock lock(g_interface_mutex);
std::string deviceType = getDeviceType(device);

DeviceInterfaceMap& deviceMap = getDeviceMap();

// Find device interface that matches device type and variant
torch::DeviceType deviceTypeEnum = torch::Device(deviceType).type();

auto deviceInterface = std::find_if(
deviceMap.begin(),
deviceMap.end(),
[&](const std::pair<torch::DeviceType, CreateDeviceInterfaceFn>& arg) {
return device.rfind(
torch::DeviceTypeName(arg.first, /*lcase*/ true), 0) == 0;
[&](const std::pair<DeviceInterfaceKey, CreateDeviceInterfaceFn>& arg) {
return arg.first.deviceType == deviceTypeEnum &&
arg.first.variant == variant;
});
TORCH_CHECK(
deviceInterface != deviceMap.end(), "Unsupported device: ", device);

return torch::Device(device);
TORCH_CHECK(
deviceInterface != deviceMap.end(),
"Unsupported device: ",
device,
" (device type: ",
deviceType,
", variant: ",
variant,
")");
}

std::unique_ptr<DeviceInterface> createDeviceInterface(
const torch::Device& device) {
auto deviceType = device.type();
const torch::Device& device,
const std::string_view variant) {
DeviceInterfaceKey key(device.type(), variant);
std::scoped_lock lock(g_interface_mutex);
DeviceInterfaceMap& deviceMap = getDeviceMap();

TORCH_CHECK(
deviceMap.find(deviceType) != deviceMap.end(),
"Unsupported device: ",
device);
auto it = deviceMap.find(key);
if (it != deviceMap.end()) {
return std::unique_ptr<DeviceInterface>(it->second(device));
}

return std::unique_ptr<DeviceInterface>(deviceMap[deviceType](device));
TORCH_CHECK(
false,
"No device interface found for device type: ",
device.type(),
" variant: '",
variant,
"'");
}

} // namespace facebook::torchcodec
Loading
Loading