Merged

Changes from all commits
10 changes: 5 additions & 5 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -216,12 +216,11 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
// unclear.
flush();
unmapPreviousFrame();
-  NVDECCache::getCache(device_.index())
-      .returnDecoder(&videoFormat_, std::move(decoder_));
+  NVDECCache::getCache(device_).returnDecoder(
+      &videoFormat_, std::move(decoder_));
Contributor Author:
Above and everywhere else in the PR, anything related to "device index" relates to the new getDeviceIndex(), which now lives in CUDACommon.cpp and replaces the old (buggy!) getNonNegativeDeviceIndex().

}

if (videoParser_) {
// TODONVDEC P2: consider caching this? Does DALI do that?
Contributor Author:
Answer: No, DALI doesn't cache the parser.

cuvidDestroyVideoParser(videoParser_);
videoParser_ = nullptr;
}
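For context, a minimal sketch of what per-instance (uncached) parser setup and teardown looks like with the nvcuvid API — the codec, parameter values, and callback stubs below are illustrative assumptions, not the actual BetaCudaDeviceInterface code:

```cpp
#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"

// Stub callbacks for illustration only; the real ones drive decode/display.
static int CUDAAPI onSequence(void*, CUVIDEOFORMAT*) { return 1; }
static int CUDAAPI onDecode(void*, CUVIDPICPARAMS*) { return 1; }
static int CUDAAPI onDisplay(void*, CUVIDPARSERDISPINFO*) { return 1; }

void parserLifecycleSketch() {
  CUVIDPARSERPARAMS parserParams = {};
  parserParams.CodecType = cudaVideoCodec_H264;  // assumed codec
  parserParams.ulMaxNumDecodeSurfaces = 1;  // typically raised in onSequence
  parserParams.pfnSequenceCallback = onSequence;
  parserParams.pfnDecodePicture = onDecode;
  parserParams.pfnDisplayPicture = onDisplay;

  CUvideoparser parser = nullptr;
  if (cuvidCreateVideoParser(&parser, &parserParams) != CUDA_SUCCESS) {
    return;
  }
  // ... feed demuxed packets via cuvidParseVideoData(parser, &packet) ...
  cuvidDestroyVideoParser(parser);  // torn down with its owner, not cached
}
```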
@@ -362,11 +361,12 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
}

if (!decoder_) {
-    decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);
+    decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);

if (!decoder_) {
// TODONVDEC P2: consider re-configuring an existing decoder instead of
-      // re-creating one. See docs, see DALI.
+      // re-creating one. See docs, see DALI. Re-configuration doesn't seem to
+      // be enabled in DALI by default.
decoder_ = createDecoder(videoFormat);
}
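A rough sketch of what the re-configuration path mentioned in the TODO above could look like, assuming the Video Codec SDK's cuvidReconfigureDecoder; this is not part of the PR, and re-configuration is only valid when the new stream fits within the decoder's creation-time maximum dimensions:

```cpp
// Hypothetical re-configuration path (not in this PR): reuse decoder_ when
// the new format fits, otherwise fall back to full re-creation.
CUVIDRECONFIGUREDECODERINFO reconfig = {};
reconfig.ulWidth = videoFormat->coded_width;
reconfig.ulHeight = videoFormat->coded_height;
reconfig.ulTargetWidth = videoFormat->coded_width;
reconfig.ulTargetHeight = videoFormat->coded_height;
reconfig.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;

if (cuvidReconfigureDecoder(decoder_.get(), &reconfig) != CUDA_SUCCESS) {
  decoder_ = createDecoder(videoFormat);  // fall back to re-creation
}
```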

27 changes: 21 additions & 6 deletions src/torchcodec/_core/CUDACommon.cpp
@@ -5,14 +5,12 @@
// LICENSE file in the root directory of this source tree.

#include "src/torchcodec/_core/CUDACommon.h"
#include "src/torchcodec/_core/Cache.h" // for PerGpuCache

namespace facebook::torchcodec {

namespace {

-// Pytorch can only handle up to 128 GPUs.
-// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
-const int MAX_CUDA_GPUS = 128;
// Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
// Set to a positive number to have a cache of that size.
const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
@@ -249,7 +247,7 @@ torch::Tensor convertNV12FrameToRGB(
}

UniqueNppContext getNppStreamContext(const torch::Device& device) {
-  torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);
Contributor Author:
Going from torch::DeviceIndex to int is a fix: cudaGetDeviceProperties and nppCtx->nCudaDeviceId below both expect an int, not a torch::DeviceIndex, which is an int8_t.
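A minimal illustration of the width mismatch, assuming current PyTorch where torch::DeviceIndex is an int8_t:

```cpp
#include <cstdint>
#include <type_traits>
#include <cuda_runtime.h>
#include <torch/types.h>

// torch::DeviceIndex is an int8_t; CUDA and NPP APIs take int. Converting
// once up front avoids relying on implicit integral promotions everywhere.
static_assert(std::is_same_v<torch::DeviceIndex, int8_t>);

void widthSketch(const torch::Device& device) {
  int deviceIndex = static_cast<int>(device.index());  // explicit widening
  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, deviceIndex);  // int parameter, no narrowing
}
```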


UniqueNppContext nppCtx = g_cached_npp_ctxs.get(device);
if (nppCtx) {
@@ -266,13 +264,13 @@ UniqueNppContext getNppStreamContext(const torch::Device& device) {

nppCtx = std::make_unique<NppStreamContext>();
cudaDeviceProp prop{};
-  cudaError_t err = cudaGetDeviceProperties(&prop, nonNegativeDeviceIndex);
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
TORCH_CHECK(
err == cudaSuccess,
"cudaGetDeviceProperties failed: ",
cudaGetErrorString(err));

-  nppCtx->nCudaDeviceId = nonNegativeDeviceIndex;
+  nppCtx->nCudaDeviceId = deviceIndex;
nppCtx->nMultiProcessorCount = prop.multiProcessorCount;
nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
@@ -312,4 +310,21 @@ void validatePreAllocatedTensorShape(
}
}

+int getDeviceIndex(const torch::Device& device) {
+  // PyTorch uses int8_t as its torch::DeviceIndex, but FFmpeg and CUDA
+  // libraries use int. So we use int, too.
+  int deviceIndex = static_cast<int>(device.index());
+  TORCH_CHECK(
+      deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
+      "Invalid device index = ",
+      deviceIndex);
+
+  if (deviceIndex == -1) {
+    TORCH_CHECK(
+        cudaGetDevice(&deviceIndex) == cudaSuccess,
+        "Failed to get current CUDA device.");
+  }
+  return deviceIndex;
Contributor Author:
Another fix: getNonNegativeDeviceIndex() used to map -1 to 0, which is incorrect. -1 should be mapped to the "current device", which can be arbitrary and set by a context manager from Python.
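A sketch of the behavioral difference, under the assumption that the ambient device was set elsewhere (e.g. by a torch.cuda.device context manager on the Python side) and that at least two GPUs are present:

```cpp
#include <cuda_runtime.h>
#include <torch/types.h>

// Sketch: a bare "cuda" device carries index -1 and must resolve to the
// *current* device, which may have been set to any GPU.
void currentDeviceSketch() {
  torch::Device device("cuda");  // device.index() == -1
  cudaSetDevice(1);              // ambient/current device is now GPU 1

  // Old: getNonNegativeDeviceIndex(device) clamped -1 to 0 -> wrong GPU.
  // New: getDeviceIndex(device) calls cudaGetDevice -> correctly returns 1.
  int idx = getDeviceIndex(device);
  (void)idx;
}
```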

+}

} // namespace facebook::torchcodec
7 changes: 6 additions & 1 deletion src/torchcodec/_core/CUDACommon.h
@@ -11,7 +11,6 @@
#include <npp.h>
#include <torch/types.h>

#include "src/torchcodec/_core/Cache.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"
#include "src/torchcodec/_core/Frame.h"

@@ -22,6 +21,10 @@ extern "C" {

namespace facebook::torchcodec {

+// Pytorch can only handle up to 128 GPUs.
+// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
+constexpr int MAX_CUDA_GPUS = 128;

void initializeCudaContextWithPytorch(const torch::Device& device);

// Unique pointer type for NPP stream context
@@ -43,4 +46,6 @@
const std::optional<torch::Tensor>& preAllocatedOutputTensor,
const UniqueAVFrame& avFrame);

+int getDeviceIndex(const torch::Device& device);

} // namespace facebook::torchcodec
26 changes: 6 additions & 20 deletions src/torchcodec/_core/Cache.h
@@ -95,30 +95,16 @@ class PerGpuCache {
std::vector<std::unique_ptr<Cache<T, D>>> cache_;
};

-// Note: this function is inline for convenience, not performance. Because the
-// rest of this file is template functions, they must all be defined in this
-// header. This function is not a template function, and should, in principle,
-// be defined in a .cpp file to preserve the One Definition Rule. That's
-// annoying for such a small amount of code, so we just inline it. If this file
-// grows, and there are more such functions, we should break them out into a
-// .cpp file.
-inline torch::DeviceIndex getNonNegativeDeviceIndex(
-    const torch::Device& device) {
-  torch::DeviceIndex deviceIndex = device.index();
-  // For single GPU machines libtorch returns -1 for the device index. So for
-  // that case we set the device index to 0. That's used in per-gpu cache
-  // implementation and during initialization of CUDA and FFmpeg contexts
-  // which require non negative indices.
-  deviceIndex = std::max<at::DeviceIndex>(deviceIndex, 0);
Contributor Author:
Note: this was wrong! -1 should be mapped to the current device, not to 0.

-  TORCH_CHECK(deviceIndex >= 0, "Device index out of range");
-  return deviceIndex;
-}
+// Forward declaration of getDeviceIndex, which exists in CUDACommon.h.
+// This avoids a circular dependency between Cache.h and CUDACommon.cpp, which
+// also needs to include Cache.h.
+int getDeviceIndex(const torch::Device& device);
Contributor Author (@NicolasHug, Oct 15, 2025):
Not thrilled by this, happy to consider alternatives.

We could:

  • include Cache.h within CUDACommon.cpp and not bother about the potential circular issue
  • have a forward declaration of PerGpuCache within CUDACommon.cpp instead (not sure if that would work, since it's a template class).

Contributor:
Forward declarations of templates are possible, but awkward. But if Cache.h needs to know CUDA stuff, then it's not really generic. The fact we have a circular dependency hints to me that we're not quite modeling things correctly. I can think of two alternatives:

  1. We add the ability to convert a torch::Device into a device-specific index to the DeviceInterface. That's still a little awkward, though, as we'd want to tell PerGpuCache the DeviceInterface so it would know where to get the index.
  2. PerGpuCache accepts the device index itself, instead of a torch::Device. That then assumes the device logic does the conversion itself, which I think is reasonable.

I prefer 2, but that might be a bigger refactor than this PR. So I'm good with this for now, and we can file a task for option 2, if you also agree it makes sense.

Contributor Author:
I think 2 is reasonable; we could align both caches to just take an index as input. Let me follow up on that.
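A rough sketch of what option 2 might look like (a hypothetical follow-up refactor, not part of this PR): PerGpuCache takes a plain int index, and the device-interface layer owns the torch::Device-to-index conversion:

```cpp
// Hypothetical option-2 shape: Cache.h never sees a torch::Device, so it
// needs no knowledge of CUDA device resolution and the circular dependency
// goes away.
template <typename T, typename D>
bool PerGpuCache<T, D>::addIfCacheHasCapacity(
    int deviceIndex,
    element_type&& obj) {
  TORCH_CHECK(
      deviceIndex >= 0 && static_cast<size_t>(deviceIndex) < cache_.size(),
      "Device index out of range");
  return cache_[deviceIndex]->addIfCacheHasCapacity(std::move(obj));
}

// Call sites in CUDA code would do the conversion themselves, e.g.:
//   g_cached_hw_device_ctxs.get(getDeviceIndex(device));
```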


template <typename T, typename D>
bool PerGpuCache<T, D>::addIfCacheHasCapacity(
const torch::Device& device,
element_type&& obj) {
-  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);
TORCH_CHECK(
static_cast<size_t>(deviceIndex) < cache_.size(),
"Device index out of range");
@@ -128,7 +114,7 @@
template <typename T, typename D>
typename PerGpuCache<T, D>::element_type PerGpuCache<T, D>::get(
const torch::Device& device) {
-  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);
TORCH_CHECK(
static_cast<size_t>(deviceIndex) < cache_.size(),
"Device index out of range");
9 changes: 3 additions & 6 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -32,9 +32,6 @@ static bool g_cuda = registerDeviceInterface(
// from
// the cache. If the cache is empty we create a new cuda context.

-// Pytorch can only handle up to 128 GPUs.
-// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
-const int MAX_CUDA_GPUS = 128;
// Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
// Set to a positive number to have a cache of that size.
const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
@@ -54,7 +51,7 @@ int getFlagsAVHardwareDeviceContextCreate() {
UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) {
enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
-  torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);

UniqueAVBufferRef hardwareDeviceCtx = g_cached_hw_device_ctxs.get(device);
if (hardwareDeviceCtx) {
@@ -68,9 +65,9 @@ UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) {
// So we ensure the deviceIndex is not negative.
// We set the device because we may be called from a different thread than
// the one that initialized the cuda context.
-  cudaSetDevice(nonNegativeDeviceIndex);
+  cudaSetDevice(deviceIndex);
Contributor Author:
Here as well: cudaSetDevice expects an int, not an implicitly-cast torch::DeviceIndex.

AVBufferRef* hardwareDeviceCtxRaw = nullptr;
-  std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex);
+  std::string deviceOrdinal = std::to_string(deviceIndex);

int err = av_hwdevice_ctx_create(
&hardwareDeviceCtxRaw,
16 changes: 3 additions & 13 deletions src/torchcodec/_core/NVDECCache.cpp
@@ -7,6 +7,7 @@
#include <torch/types.h>
#include <mutex>

#include "src/torchcodec/_core/CUDACommon.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"
#include "src/torchcodec/_core/NVDECCache.h"

@@ -19,20 +20,9 @@ extern "C" {

namespace facebook::torchcodec {

-NVDECCache& NVDECCache::getCache(int deviceIndex) {
-  const int MAX_CUDA_GPUS = 128;
-  TORCH_CHECK(
-      deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
-      "Invalid device index = ",
-      deviceIndex);
+NVDECCache& NVDECCache::getCache(const torch::Device& device) {
   static NVDECCache cacheInstances[MAX_CUDA_GPUS];
-  if (deviceIndex == -1) {
-    // TODO NVDEC P3: Unify with existing getNonNegativeDeviceIndex()
-    TORCH_CHECK(
-        cudaGetDevice(&deviceIndex) == cudaSuccess,
-        "Failed to get current CUDA device.");
-  }
-  return cacheInstances[deviceIndex];
+  return cacheInstances[getDeviceIndex(device)];
}

UniqueCUvideodecoder NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
8 changes: 2 additions & 6 deletions src/torchcodec/_core/NVDECCache.h
@@ -11,6 +11,7 @@
#include <mutex>

#include <cuda.h>
+#include <torch/types.h>
#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"

@@ -36,7 +37,7 @@ using UniqueCUvideodecoder =
// per GPU device, and it is accessed through the static getCache() method.
class NVDECCache {
public:
-  static NVDECCache& getCache(int deviceIndex);
+  static NVDECCache& getCache(const torch::Device& device);

// Get decoder from cache - returns nullptr if none available
UniqueCUvideodecoder getDecoder(CUVIDEOFORMAT* videoFormat);
@@ -68,11 +69,6 @@ class NVDECCache {
CacheKey(const CacheKey&) = default;
CacheKey& operator=(const CacheKey&) = default;

-  // TODONVDEC P2: we only implement operator< which is enough for std::map,
-  // but:
-  // - we should consider using std::unordered_map
-  // - we should consider a more sophisticated and potentially less strict
-  //   cache key comparison logic
Contributor Author:
I realized that using an unordered_map would force us to define our own hash function, so that's really not worth it. The log(n) lookup complexity of map isn't relevant anyway considering the very small sizes we deal with.

On the potentially less strict cache logic: I looked in more detail at DALI's implementation and they do the same thing, modulo decoder re-configuration, which is tracked in another TODO. So we can remove this one.
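For illustration, the kind of hand-written hash std::unordered_map would require — every CacheKey field below other than codecType is an assumed name, not the actual struct layout:

```cpp
// Hypothetical hash for std::unordered_map<CacheKey, ...>. std::map avoids
// all of this: operator< via std::tie comes for free, and log(n) lookups are
// irrelevant at these cache sizes.
struct CacheKeyHash {
  size_t operator()(const CacheKey& key) const {
    size_t seed = std::hash<int>{}(static_cast<int>(key.codecType));
    auto combine = [&seed](size_t h) {
      seed ^= h + 0x9e3779b9 + (seed << 6) + (seed >> 2);  // boost-style mix
    };
    combine(std::hash<unsigned>{}(key.width));   // assumed field
    combine(std::hash<unsigned>{}(key.height));  // assumed field
    combine(std::hash<int>{}(static_cast<int>(key.chromaFormat)));  // assumed
    return seed;
  }
};
```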

bool operator<(const CacheKey& other) const {
return std::tie(
codecType,
9 changes: 3 additions & 6 deletions test/utils.py
@@ -41,9 +41,6 @@ def unsplit_device_str(device_str: str) -> str:
# It is used:
# - before calling `.to(device)` where device can't be "cuda:0:beta"
# - before calling add_video_stream(device=device, device_variant=device_variant)
-#
-# TODONVDEC P2: Find a less clunky way to test the BETA CUDA interface. It
-# will ultimately depend on how we want to publicly expose it.
Contributor Author:
This was addressed by #959 and there is another TODO to remove the legacy use of cuda:0:beta, which will transitively resolve this one.

if device_str == "cuda:0:beta":
return "cuda", "beta"
else:
@@ -750,7 +747,7 @@ def sample_format(self) -> str:


def supports_approximate_mode(asset: TestVideo) -> bool:
-    # TODONVDEC P2: open an issue about his. That's actually not related to
-    # NVDEC at all, those don't support approximate mode because they don't set
-    # a duration. CPU decoder fails too!
+    # Those are missing the `duration` field so they fail in approximate mode (on all devices).
+    # TODO: we should address this, see
+    # https://github.com/meta-pytorch/torchcodec/issues/945
Contributor Author:
This isn't an NVDEC TODO, it's a general TODO.

return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8)