Merged

Changes from all commits
10 changes: 5 additions & 5 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -216,12 +216,11 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
// unclear.
flush();
unmapPreviousFrame();
-  NVDECCache::getCache(device_.index())
-      .returnDecoder(&videoFormat_, std::move(decoder_));
+  NVDECCache::getCache(device_).returnDecoder(
+      &videoFormat_, std::move(decoder_));
Contributor Author:
Above and everywhere else in the PR, anything related to "device index" relates to the new getDeviceIndex(), which now lives in CUDACommon.cpp and replaces the old (buggy!) getNonNegativeDeviceIndex().

}

if (videoParser_) {
// TODONVDEC P2: consider caching this? Does DALI do that?
Contributor Author:
Answer: No, DALI doesn't cache the parser.

cuvidDestroyVideoParser(videoParser_);
videoParser_ = nullptr;
}
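For context, a minimal sketch of what per-instance (uncached) parser setup and teardown looks like with the nvcuvid API — the codec, parameter values, and callback stubs below are illustrative assumptions, not the actual BetaCudaDeviceInterface code:

```cpp
#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"

// Stub callbacks for illustration only; the real ones drive decode/display.
static int CUDAAPI onSequence(void*, CUVIDEOFORMAT*) { return 1; }
static int CUDAAPI onDecode(void*, CUVIDPICPARAMS*) { return 1; }
static int CUDAAPI onDisplay(void*, CUVIDPARSERDISPINFO*) { return 1; }

void parserLifecycleSketch() {
  CUVIDPARSERPARAMS parserParams = {};
  parserParams.CodecType = cudaVideoCodec_H264;  // assumed codec
  parserParams.ulMaxNumDecodeSurfaces = 1;  // typically raised in onSequence
  parserParams.pfnSequenceCallback = onSequence;
  parserParams.pfnDecodePicture = onDecode;
  parserParams.pfnDisplayPicture = onDisplay;

  CUvideoparser parser = nullptr;
  if (cuvidCreateVideoParser(&parser, &parserParams) != CUDA_SUCCESS) {
    return;
  }
  // ... feed demuxed packets via cuvidParseVideoData(parser, &packet) ...
  cuvidDestroyVideoParser(parser);  // torn down with its owner, not cached
}
```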
@@ -362,11 +361,12 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
}

if (!decoder_) {
-    decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);
+    decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);

if (!decoder_) {
// TODONVDEC P2: consider re-configuring an existing decoder instead of
-      // re-creating one. See docs, see DALI.
+      // re-creating one. See docs, see DALI. Re-configuration doesn't seem to
+      // be enabled in DALI by default.
decoder_ = createDecoder(videoFormat);
}
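A rough sketch of what the re-configuration path mentioned in the TODO above could look like, assuming the Video Codec SDK's cuvidReconfigureDecoder; this is not part of the PR, and re-configuration is only valid when the new stream fits within the decoder's creation-time maximum dimensions:

```cpp
// Hypothetical re-configuration path (not in this PR): reuse decoder_ when
// the new format fits, otherwise fall back to full re-creation.
CUVIDRECONFIGUREDECODERINFO reconfig = {};
reconfig.ulWidth = videoFormat->coded_width;
reconfig.ulHeight = videoFormat->coded_height;
reconfig.ulTargetWidth = videoFormat->coded_width;
reconfig.ulTargetHeight = videoFormat->coded_height;
reconfig.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;

if (cuvidReconfigureDecoder(decoder_.get(), &reconfig) != CUDA_SUCCESS) {
  decoder_ = createDecoder(videoFormat);  // fall back to re-creation
}
```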

27 changes: 21 additions & 6 deletions src/torchcodec/_core/CUDACommon.cpp
@@ -5,14 +5,12 @@
// LICENSE file in the root directory of this source tree.

#include "src/torchcodec/_core/CUDACommon.h"
#include "src/torchcodec/_core/Cache.h" // for PerGpuCache

namespace facebook::torchcodec {

namespace {

-// Pytorch can only handle up to 128 GPUs.
-// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
-const int MAX_CUDA_GPUS = 128;
// Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
// Set to a positive number to have a cache of that size.
const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
@@ -249,7 +247,7 @@ torch::Tensor convertNV12FrameToRGB(
}

UniqueNppContext getNppStreamContext(const torch::Device& device) {
-  torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);
Contributor Author:
Going from torch::DeviceIndex to int is a fix: cudaGetDeviceProperties and nppCtx->nCudaDeviceId below both expect an int, not a torch::DeviceIndex, which is an int8_t.
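A minimal illustration of the width mismatch, assuming current PyTorch where torch::DeviceIndex is an int8_t:

```cpp
#include <cstdint>
#include <type_traits>
#include <cuda_runtime.h>
#include <torch/types.h>

// torch::DeviceIndex is an int8_t; CUDA and NPP APIs take int. Converting
// once up front avoids relying on implicit integral promotions everywhere.
static_assert(std::is_same_v<torch::DeviceIndex, int8_t>);

void widthSketch(const torch::Device& device) {
  int deviceIndex = static_cast<int>(device.index());  // explicit widening
  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, deviceIndex);  // int parameter, no narrowing
}
```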


UniqueNppContext nppCtx = g_cached_npp_ctxs.get(device);
if (nppCtx) {
@@ -266,13 +264,13 @@ UniqueNppContext getNppStreamContext(const torch::Device& device) {

nppCtx = std::make_unique<NppStreamContext>();
cudaDeviceProp prop{};
-  cudaError_t err = cudaGetDeviceProperties(&prop, nonNegativeDeviceIndex);
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
TORCH_CHECK(
err == cudaSuccess,
"cudaGetDeviceProperties failed: ",
cudaGetErrorString(err));

-  nppCtx->nCudaDeviceId = nonNegativeDeviceIndex;
+  nppCtx->nCudaDeviceId = deviceIndex;
nppCtx->nMultiProcessorCount = prop.multiProcessorCount;
nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
@@ -312,4 +310,21 @@ void validatePreAllocatedTensorShape(
}
}

+int getDeviceIndex(const torch::Device& device) {
+  // PyTorch uses int8_t as its torch::DeviceIndex, but FFmpeg and CUDA
+  // libraries use int. So we use int, too.
+  int deviceIndex = static_cast<int>(device.index());
+  TORCH_CHECK(
+      deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
+      "Invalid device index = ",
+      deviceIndex);
+
+  if (deviceIndex == -1) {
+    TORCH_CHECK(
+        cudaGetDevice(&deviceIndex) == cudaSuccess,
+        "Failed to get current CUDA device.");
+  }
+  return deviceIndex;
Contributor Author:
Another fix: getNonNegativeDeviceIndex() used to map -1 to 0, which is incorrect. -1 should be mapped to the "current device", which can be arbitrary and set by a context manager from Python.
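A sketch of the behavioral difference, under the assumption that the ambient device was set elsewhere (e.g. by a torch.cuda.device context manager on the Python side) and that at least two GPUs are present:

```cpp
#include <cuda_runtime.h>
#include <torch/types.h>

// Sketch: a bare "cuda" device carries index -1 and must resolve to the
// *current* device, which may have been set to any GPU.
void currentDeviceSketch() {
  torch::Device device("cuda");  // device.index() == -1
  cudaSetDevice(1);              // ambient/current device is now GPU 1

  // Old: getNonNegativeDeviceIndex(device) clamped -1 to 0 -> wrong GPU.
  // New: getDeviceIndex(device) calls cudaGetDevice -> correctly returns 1.
  int idx = getDeviceIndex(device);
  (void)idx;
}
```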

+}

} // namespace facebook::torchcodec
7 changes: 6 additions & 1 deletion src/torchcodec/_core/CUDACommon.h
@@ -11,7 +11,6 @@
#include <npp.h>
#include <torch/types.h>

#include "src/torchcodec/_core/Cache.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"
#include "src/torchcodec/_core/Frame.h"

@@ -22,6 +21,10 @@ extern "C" {

namespace facebook::torchcodec {

+// Pytorch can only handle up to 128 GPUs.
+// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
+constexpr int MAX_CUDA_GPUS = 128;

void initializeCudaContextWithPytorch(const torch::Device& device);

// Unique pointer type for NPP stream context
@@ -43,4 +46,6 @@
const std::optional<torch::Tensor>& preAllocatedOutputTensor,
const UniqueAVFrame& avFrame);

+int getDeviceIndex(const torch::Device& device);

} // namespace facebook::torchcodec
26 changes: 6 additions & 20 deletions src/torchcodec/_core/Cache.h
@@ -95,30 +95,16 @@ class PerGpuCache {
std::vector<std::unique_ptr<Cache<T, D>>> cache_;
};

-// Note: this function is inline for convenience, not performance. Because the
-// rest of this file is template functions, they must all be defined in this
-// header. This function is not a template function, and should, in principle,
-// be defined in a .cpp file to preserve the One Definition Rule. That's
-// annoying for such a small amount of code, so we just inline it. If this file
-// grows, and there are more such functions, we should break them out into a
-// .cpp file.
-inline torch::DeviceIndex getNonNegativeDeviceIndex(
-    const torch::Device& device) {
-  torch::DeviceIndex deviceIndex = device.index();
-  // For single GPU machines libtorch returns -1 for the device index. So for
-  // that case we set the device index to 0. That's used in per-gpu cache
-  // implementation and during initialization of CUDA and FFmpeg contexts
-  // which require non negative indices.
-  deviceIndex = std::max<at::DeviceIndex>(deviceIndex, 0);
Contributor Author:
Note: this was wrong! -1 should be mapped to the current device, not to 0.

-  TORCH_CHECK(deviceIndex >= 0, "Device index out of range");
-  return deviceIndex;
-}
+// Forward declaration of getDeviceIndex, which exists in CUDACommon.h.
+// This avoids a circular dependency between Cache.h and CUDACommon.cpp, which
+// also needs to include Cache.h.
+int getDeviceIndex(const torch::Device& device);
Contributor Author (@NicolasHug, Oct 15, 2025):
Not thrilled by this, happy to consider alternatives.

We could:

  • include Cache.h within CUDACommon.cpp and not bother about the potential circular issue
  • have a forward declaration of PerGpuCache within CUDACommon.cpp instead (not sure if that would work, since it's a template class).

Contributor:
Forward declarations of templates are possible, but awkward. But if Cache.h needs to know CUDA stuff, then it's not really generic. The fact we have a circular dependency hints to me that we're not quite modeling things correctly. I can think of two alternatives:

  1. We add the ability to convert a torch::Device into a device-specific index to the DeviceInterface. That's still a little awkward, though, as we'd want to tell PerGpuCache the DeviceInterface so it would know where to get the index.
  2. PerGpuCache accepts the device index itself, instead of a torch::Device. That then assumes the device logic does the conversion itself, which I think is reasonable.

I prefer 2, but that might be a bigger refactor than this PR. So I'm good with this for now, and we can file a task for option 2, if you also agree it makes sense.

Contributor Author:
I think 2 is reasonable; we could align both caches to just take an index as input. Let me follow up on that.
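A rough sketch of what option 2 might look like (a hypothetical follow-up refactor, not part of this PR): PerGpuCache takes a plain int index, and the device-interface layer owns the torch::Device-to-index conversion:

```cpp
// Hypothetical option-2 shape: Cache.h never sees a torch::Device, so it
// needs no knowledge of CUDA device resolution and the circular dependency
// goes away.
template <typename T, typename D>
bool PerGpuCache<T, D>::addIfCacheHasCapacity(
    int deviceIndex,
    element_type&& obj) {
  TORCH_CHECK(
      deviceIndex >= 0 && static_cast<size_t>(deviceIndex) < cache_.size(),
      "Device index out of range");
  return cache_[deviceIndex]->addIfCacheHasCapacity(std::move(obj));
}

// Call sites in CUDA code would do the conversion themselves, e.g.:
//   g_cached_hw_device_ctxs.get(getDeviceIndex(device));
```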


template <typename T, typename D>
bool PerGpuCache<T, D>::addIfCacheHasCapacity(
const torch::Device& device,
element_type&& obj) {
-  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);
TORCH_CHECK(
static_cast<size_t>(deviceIndex) < cache_.size(),
"Device index out of range");
@@ -128,7 +114,7 @@
template <typename T, typename D>
typename PerGpuCache<T, D>::element_type PerGpuCache<T, D>::get(
const torch::Device& device) {
-  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);
TORCH_CHECK(
static_cast<size_t>(deviceIndex) < cache_.size(),
"Device index out of range");
9 changes: 3 additions & 6 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -32,9 +32,6 @@ static bool g_cuda = registerDeviceInterface(
// from
// the cache. If the cache is empty we create a new cuda context.

-// Pytorch can only handle up to 128 GPUs.
-// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
-const int MAX_CUDA_GPUS = 128;
// Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
// Set to a positive number to have a cache of that size.
const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
@@ -54,7 +51,7 @@ int getFlagsAVHardwareDeviceContextCreate() {
UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) {
enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
-  torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);

UniqueAVBufferRef hardwareDeviceCtx = g_cached_hw_device_ctxs.get(device);
if (hardwareDeviceCtx) {
@@ -68,9 +65,9 @@ UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) {
// So we ensure the deviceIndex is not negative.
// We set the device because we may be called from a different thread than
// the one that initialized the cuda context.
-  cudaSetDevice(nonNegativeDeviceIndex);
+  cudaSetDevice(deviceIndex);
Contributor Author:
Here as well: cudaSetDevice expects an int, not an implicitly-cast torch::DeviceIndex.

AVBufferRef* hardwareDeviceCtxRaw = nullptr;
-  std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex);
+  std::string deviceOrdinal = std::to_string(deviceIndex);

int err = av_hwdevice_ctx_create(
&hardwareDeviceCtxRaw,
16 changes: 3 additions & 13 deletions src/torchcodec/_core/NVDECCache.cpp
@@ -7,6 +7,7 @@
#include <torch/types.h>
#include <mutex>

#include "src/torchcodec/_core/CUDACommon.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"
#include "src/torchcodec/_core/NVDECCache.h"

@@ -19,20 +20,9 @@ extern "C" {

namespace facebook::torchcodec {

-NVDECCache& NVDECCache::getCache(int deviceIndex) {
-  const int MAX_CUDA_GPUS = 128;
-  TORCH_CHECK(
-      deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
-      "Invalid device index = ",
-      deviceIndex);
+NVDECCache& NVDECCache::getCache(const torch::Device& device) {
   static NVDECCache cacheInstances[MAX_CUDA_GPUS];
-  if (deviceIndex == -1) {
-    // TODO NVDEC P3: Unify with existing getNonNegativeDeviceIndex()
-    TORCH_CHECK(
-        cudaGetDevice(&deviceIndex) == cudaSuccess,
-        "Failed to get current CUDA device.");
-  }
-  return cacheInstances[deviceIndex];
+  return cacheInstances[getDeviceIndex(device)];
}

UniqueCUvideodecoder NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
8 changes: 2 additions & 6 deletions src/torchcodec/_core/NVDECCache.h
@@ -11,6 +11,7 @@
#include <mutex>

#include <cuda.h>
+#include <torch/types.h>
#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"

@@ -36,7 +37,7 @@ using UniqueCUvideodecoder =
// per GPU device, and it is accessed through the static getCache() method.
class NVDECCache {
public:
-  static NVDECCache& getCache(int deviceIndex);
+  static NVDECCache& getCache(const torch::Device& device);

// Get decoder from cache - returns nullptr if none available
UniqueCUvideodecoder getDecoder(CUVIDEOFORMAT* videoFormat);
@@ -68,11 +69,6 @@ class NVDECCache {
CacheKey(const CacheKey&) = default;
CacheKey& operator=(const CacheKey&) = default;

-  // TODONVDEC P2: we only implement operator< which is enough for std::map,
-  // but:
-  // - we should consider using std::unordered_map
-  // - we should consider a more sophisticated and potentially less strict
-  //   cache key comparison logic
Contributor Author:
I realized that using an unordered_map would force us to define our own hash function, so that's really not worth it. The log(n) lookup complexity of map isn't relevant anyway considering the very small sizes we deal with.

On the potentially less strict cache logic: I looked in more detail at DALI's implementation and they do the same thing, modulo decoder re-configuration, which is tracked in another TODO. So we can remove this one.
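For illustration, the kind of hand-written hash std::unordered_map would require — every CacheKey field below other than codecType is an assumed name, not the actual struct layout:

```cpp
// Hypothetical hash for std::unordered_map<CacheKey, ...>. std::map avoids
// all of this: operator< via std::tie comes for free, and log(n) lookups are
// irrelevant at these cache sizes.
struct CacheKeyHash {
  size_t operator()(const CacheKey& key) const {
    size_t seed = std::hash<int>{}(static_cast<int>(key.codecType));
    auto combine = [&seed](size_t h) {
      seed ^= h + 0x9e3779b9 + (seed << 6) + (seed >> 2);  // boost-style mix
    };
    combine(std::hash<unsigned>{}(key.width));   // assumed field
    combine(std::hash<unsigned>{}(key.height));  // assumed field
    combine(std::hash<int>{}(static_cast<int>(key.chromaFormat)));  // assumed
    return seed;
  }
};
```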

bool operator<(const CacheKey& other) const {
return std::tie(
codecType,
9 changes: 3 additions & 6 deletions test/utils.py
@@ -41,9 +41,6 @@ def unsplit_device_str(device_str: str) -> str:
# It is used:
# - before calling `.to(device)` where device can't be "cuda:0:beta"
# - before calling add_video_stream(device=device, device_variant=device_variant)
-#
-# TODONVDEC P2: Find a less clunky way to test the BETA CUDA interface. It
-# will ultimately depend on how we want to publicly expose it.
Contributor Author:
This was addressed by #959 and there is another TODO to remove the legacy use of cuda:0:beta, which will transitively resolve this one.

if device_str == "cuda:0:beta":
return "cuda", "beta"
else:
@@ -750,7 +747,7 @@ def sample_format(self) -> str:


def supports_approximate_mode(asset: TestVideo) -> bool:
-    # TODONVDEC P2: open an issue about his. That's actually not related to
-    # NVDEC at all, those don't support approximate mode because they don't set
-    # a duration. CPU decoder fails too!
+    # Those are missing the `duration` field so they fail in approximate mode (on all devices).
+    # TODO: we should address this, see
+    # https://github.com/meta-pytorch/torchcodec/issues/945
Contributor Author:
This isn't an NVDEC TODO, it's a general TODO.

return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8)