From 886f64ad984169c8c0ea31c13905e15d8971ff17 Mon Sep 17 00:00:00 2001
From: Serge Panev <spanev@nvidia.com>
Date: Mon, 7 Jul 2025 11:31:16 +0900
Subject: [PATCH 01/14] Update NPP calls for CUDA >= 12.9

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 60 ++++++++++++++------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 8086d0b4b..c56fe788b 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -224,41 +224,67 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   // Use the user-requested GPU for running the NPP kernel.
   c10::cuda::CUDAGuard deviceGuard(device_);
 
-  NppiSize oSizeROI = {width, height};
-  Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};
+  cudaStream_t rawStream = at::cuda::getCurrentCUDAStream().stream();
+
+  // Build an NppStreamContext, either via the old helper or by hand on CUDA 12.9+
+  NppStreamContext nppCtx{};
+  #if CUDA_VERSION < 12090
+    NppStatus ctxStat = nppGetStreamContext(&nppCtx);
+    TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed");
+    // override if you want to force a particular stream
+    nppCtx.hStream = rawStream;
+  #else
+    // CUDA 12.9+: helper was removed, we need to build it manually
+    int dev = 0;
+    cudaError_t err = cudaGetDevice(&dev);
+    TORCH_CHECK(err == cudaSuccess, "cudaGetDevice failed");
+    cudaDeviceProp prop{};
+    err = cudaGetDeviceProperties(&prop, dev);
+    TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed");
+
+    nppCtx.nCudaDeviceId                       = dev;
+    nppCtx.nMultiProcessorCount                = prop.multiProcessorCount;
+    nppCtx.nMaxThreadsPerMultiProcessor        = prop.maxThreadsPerMultiProcessor;
+    nppCtx.nMaxThreadsPerBlock                 = prop.maxThreadsPerBlock;
+    nppCtx.nSharedMemPerBlock                  = prop.sharedMemPerBlock;
+    nppCtx.nCudaDevAttrComputeCapabilityMajor  = prop.major;
+    nppCtx.nCudaDevAttrComputeCapabilityMinor  = prop.minor;
+    nppCtx.nStreamFlags                        = 0;
+    nppCtx.hStream                             = rawStream;
+  #endif
+
+  // Prepare ROI + pointers
+  NppiSize oSizeROI = { width, height };
+  Npp8u* input[2]  = { avFrame->data[0], avFrame->data[1] };
 
   auto start = std::chrono::high_resolution_clock::now();
   NppStatus status;
+
   if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
-    status = nppiNV12ToRGB_709CSC_8u_P2C3R(
+    status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(
         input,
         avFrame->linesize[0],
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
-        oSizeROI);
+        oSizeROI,
+        nppCtx);
   } else {
-    status = nppiNV12ToRGB_8u_P2C3R(
+    status = nppiNV12ToRGB_8u_P2C3R_Ctx(
         input,
         avFrame->linesize[0],
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
-        oSizeROI);
+        oSizeROI,
+        nppCtx);
   }
   TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
 
-  // Make the pytorch stream wait for the npp kernel to finish before using the
-  // output.
-  at::cuda::CUDAEvent nppDoneEvent;
-  at::cuda::CUDAStream nppStreamWrapper =
-      c10::cuda::getStreamFromExternal(nppGetStream(), device_.index());
-  nppDoneEvent.record(nppStreamWrapper);
-  nppDoneEvent.block(at::cuda::getCurrentCUDAStream());
-
   auto end = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration<double, std::micro>(end - start);
+  VLOG(9) << "NPP Conversion of frame h=" << height
+          << " w=" << width
+          << " took: " << duration.count() << "us";
 
-  std::chrono::duration<double, std::micro> duration = end - start;
-  VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width
-          << " took: " << duration.count() << "us" << std::endl;
 }
 
 // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9

From 6023ea395edf83b2162c85cf32a3f6c0466cd221 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Mon, 7 Jul 2025 11:34:50 +0100
Subject: [PATCH 02/14] Add testing against CUDA 12.9

---
 .github/workflows/linux_cuda_wheel.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
index bd57cac5e..8bc8c3bc1 100644
--- a/.github/workflows/linux_cuda_wheel.yaml
+++ b/.github/workflows/linux_cuda_wheel.yaml
@@ -67,7 +67,9 @@ jobs:
           # For the actual release we should add that label and change this to
           # include more python versions.
         python-version: ['3.9']
-        cuda-version: ['12.6', '12.8']
+        # We test against 12.6 and 12.9 to avoid having too big of a CI matrix,
+        # but for releases we should add 12.8.
+        cuda-version: ['12.6', '12.9']
         # TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325
         ffmpeg-version-for-tests: ['4.4.2', '6', '7']
 

From d66cb33bb7d25399cfbba692e30186a930e9e27b Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Mon, 7 Jul 2025 12:39:37 +0100
Subject: [PATCH 03/14] Linter

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 59 ++++++++++----------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index c56fe788b..4ee0bf690 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -226,36 +226,37 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
 
   cudaStream_t rawStream = at::cuda::getCurrentCUDAStream().stream();
 
-  // Build an NppStreamContext, either via the old helper or by hand on CUDA 12.9+
+  // Build an NppStreamContext, either via the old helper or by hand on
+  // CUDA 12.9+
   NppStreamContext nppCtx{};
-  #if CUDA_VERSION < 12090
-    NppStatus ctxStat = nppGetStreamContext(&nppCtx);
-    TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed");
-    // override if you want to force a particular stream
-    nppCtx.hStream = rawStream;
-  #else
-    // CUDA 12.9+: helper was removed, we need to build it manually
-    int dev = 0;
-    cudaError_t err = cudaGetDevice(&dev);
-    TORCH_CHECK(err == cudaSuccess, "cudaGetDevice failed");
-    cudaDeviceProp prop{};
-    err = cudaGetDeviceProperties(&prop, dev);
-    TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed");
-
-    nppCtx.nCudaDeviceId                       = dev;
-    nppCtx.nMultiProcessorCount                = prop.multiProcessorCount;
-    nppCtx.nMaxThreadsPerMultiProcessor        = prop.maxThreadsPerMultiProcessor;
-    nppCtx.nMaxThreadsPerBlock                 = prop.maxThreadsPerBlock;
-    nppCtx.nSharedMemPerBlock                  = prop.sharedMemPerBlock;
-    nppCtx.nCudaDevAttrComputeCapabilityMajor  = prop.major;
-    nppCtx.nCudaDevAttrComputeCapabilityMinor  = prop.minor;
-    nppCtx.nStreamFlags                        = 0;
-    nppCtx.hStream                             = rawStream;
-  #endif
+#if CUDA_VERSION < 12090
+  NppStatus ctxStat = nppGetStreamContext(&nppCtx);
+  TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed");
+  // override if you want to force a particular stream
+  nppCtx.hStream = rawStream;
+#else
+  // CUDA 12.9+: helper was removed, we need to build it manually
+  int dev = 0;
+  cudaError_t err = cudaGetDevice(&dev);
+  TORCH_CHECK(err == cudaSuccess, "cudaGetDevice failed");
+  cudaDeviceProp prop{};
+  err = cudaGetDeviceProperties(&prop, dev);
+  TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed");
+
+  nppCtx.nCudaDeviceId = dev;
+  nppCtx.nMultiProcessorCount = prop.multiProcessorCount;
+  nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
+  nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
+  nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
+  nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
+  nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
+  nppCtx.nStreamFlags = 0;
+  nppCtx.hStream = rawStream;
+#endif
 
   // Prepare ROI + pointers
-  NppiSize oSizeROI = { width, height };
-  Npp8u* input[2]  = { avFrame->data[0], avFrame->data[1] };
+  NppiSize oSizeROI = {width, height};
+  Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};
 
   auto start = std::chrono::high_resolution_clock::now();
   NppStatus status;
@@ -281,10 +282,8 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
 
   auto end = std::chrono::high_resolution_clock::now();
   auto duration = std::chrono::duration<double, std::micro>(end - start);
-  VLOG(9) << "NPP Conversion of frame h=" << height
-          << " w=" << width
+  VLOG(9) << "NPP Conversion of frame h=" << height << " w=" << width
           << " took: " << duration.count() << "us";
-
 }
 
 // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9

From ecf01a91f19f75908922199eb15a1208237f6fd3 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Sat, 12 Jul 2025 11:33:11 +0100
Subject: [PATCH 04/14] Move nppContext creation into separate function. Also
 rely on device_

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 55 +++++++++++---------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 4ee0bf690..7aeb3b357 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -161,6 +161,35 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
       device, nonNegativeDeviceIndex, type);
 #endif
 }
+
+NppStreamContext createNppStreamContext(
+    cudaStream_t rawStream,
+    int deviceIndex) {
+  NppStreamContext nppCtx{};
+#if CUDA_VERSION < 12090
+  NppStatus ctxStat = nppGetStreamContext(&nppCtx);
+  TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed");
+  // override if you want to force a particular stream
+  nppCtx.hStream = rawStream;
+#else
+  // CUDA 12.9+: helper was removed, we need to build it manually
+  cudaDeviceProp prop{};
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
+  TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed");
+
+  nppCtx.nCudaDeviceId = deviceIndex;
+  nppCtx.nMultiProcessorCount = prop.multiProcessorCount;
+  nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
+  nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
+  nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
+  nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
+  nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
+  nppCtx.nStreamFlags = 0;
+  nppCtx.hStream = rawStream;
+#endif
+  return nppCtx;
+}
+
 } // namespace
 
 CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
@@ -228,31 +257,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
 
   // Build an NppStreamContext, either via the old helper or by hand on
   // CUDA 12.9+
-  NppStreamContext nppCtx{};
-#if CUDA_VERSION < 12090
-  NppStatus ctxStat = nppGetStreamContext(&nppCtx);
-  TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed");
-  // override if you want to force a particular stream
-  nppCtx.hStream = rawStream;
-#else
-  // CUDA 12.9+: helper was removed, we need to build it manually
-  int dev = 0;
-  cudaError_t err = cudaGetDevice(&dev);
-  TORCH_CHECK(err == cudaSuccess, "cudaGetDevice failed");
-  cudaDeviceProp prop{};
-  err = cudaGetDeviceProperties(&prop, dev);
-  TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed");
-
-  nppCtx.nCudaDeviceId = dev;
-  nppCtx.nMultiProcessorCount = prop.multiProcessorCount;
-  nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
-  nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
-  nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
-  nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
-  nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
-  nppCtx.nStreamFlags = 0;
-  nppCtx.hStream = rawStream;
-#endif
+  NppStreamContext nppCtx = createNppStreamContext(rawStream, device_.index());
 
   // Prepare ROI + pointers
   NppiSize oSizeROI = {width, height};

From 565896e4935113eab2a50dc66124548b0acb0025 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Sat, 12 Jul 2025 11:54:35 +0100
Subject: [PATCH 05/14] Use cache for nppContext object

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 24 +++++++++-----------
 src/torchcodec/_core/CudaDeviceInterface.h   |  3 +++
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 7aeb3b357..b384f1818 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -162,15 +162,14 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
 #endif
 }
 
-NppStreamContext createNppStreamContext(
-    cudaStream_t rawStream,
-    int deviceIndex) {
+NppStreamContext createNppStreamContext(int deviceIndex) {
+  // Build an NppStreamContext, either via the old helper or by hand on CUDA
+  // 12.9+
+
   NppStreamContext nppCtx{};
 #if CUDA_VERSION < 12090
   NppStatus ctxStat = nppGetStreamContext(&nppCtx);
   TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed");
-  // override if you want to force a particular stream
-  nppCtx.hStream = rawStream;
 #else
   // CUDA 12.9+: helper was removed, we need to build it manually
   cudaDeviceProp prop{};
@@ -185,7 +184,6 @@ NppStreamContext createNppStreamContext(
   nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
   nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
   nppCtx.nStreamFlags = 0;
-  nppCtx.hStream = rawStream;
 #endif
   return nppCtx;
 }
@@ -253,11 +251,11 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   // Use the user-requested GPU for running the NPP kernel.
   c10::cuda::CUDAGuard deviceGuard(device_);
 
-  cudaStream_t rawStream = at::cuda::getCurrentCUDAStream().stream();
-
-  // Build an NppStreamContext, either via the old helper or by hand on
-  // CUDA 12.9+
-  NppStreamContext nppCtx = createNppStreamContext(rawStream, device_.index());
+  if (!nppCtxInitialized_) {
+    nppCtx_ = createNppStreamContext(device_.index());
+    nppCtxInitialized_ = true;
+  }
+  nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream();
 
   // Prepare ROI + pointers
   NppiSize oSizeROI = {width, height};
@@ -273,7 +271,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI,
-        nppCtx);
+        nppCtx_);
   } else {
     status = nppiNV12ToRGB_8u_P2C3R_Ctx(
         input,
@@ -281,7 +279,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI,
-        nppCtx);
+        nppCtx_);
   }
   TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
 
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index 01f3b19b5..d62250132 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include <npp.h>
 #include "src/torchcodec/_core/DeviceInterface.h"
 
 namespace facebook::torchcodec {
@@ -30,6 +31,8 @@ class CudaDeviceInterface : public DeviceInterface {
 
  private:
   AVBufferRef* ctx_ = nullptr;
+  NppStreamContext nppCtx_{};
+  bool nppCtxInitialized_ = false;
 };
 
 } // namespace facebook::torchcodec

From 9a6d3d3693b5a2b53a81e0eff5f151c5311bfdad Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Sat, 12 Jul 2025 14:07:19 +0100
Subject: [PATCH 06/14] Add maybe_unused

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index b384f1818..fa5583130 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -162,7 +162,7 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
 #endif
 }
 
-NppStreamContext createNppStreamContext(int deviceIndex) {
+NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) {
   // Build an NppStreamContext, either via the old helper or by hand on CUDA
   // 12.9+
 

From 2d681ad191d66982165766824d255aceed5f6f88 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Sat, 12 Jul 2025 20:59:46 +0100
Subject: [PATCH 07/14] Pass positive index

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index fa5583130..75372b583 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -174,7 +174,10 @@ NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) {
   // CUDA 12.9+: helper was removed, we need to build it manually
   cudaDeviceProp prop{};
   cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
-  TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed");
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "cudaGetDeviceProperties failed: ",
+      cudaGetErrorString(err));
 
   nppCtx.nCudaDeviceId = deviceIndex;
   nppCtx.nMultiProcessorCount = prop.multiProcessorCount;
@@ -252,7 +255,8 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   c10::cuda::CUDAGuard deviceGuard(device_);
 
   if (!nppCtxInitialized_) {
-    nppCtx_ = createNppStreamContext(device_.index());
+    nppCtx_ = createNppStreamContext(
+        static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
     nppCtxInitialized_ = true;
   }
   nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream();

From 320c06002900af01babe8e5ee0cd47123c0a5e2b Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Sun, 27 Jul 2025 09:28:36 +0100
Subject: [PATCH 08/14] Try manual creation for all CUDA versions

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 59a5b3f53..072a37009 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -163,15 +163,15 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
 }
 
 NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) {
-  // Build an NppStreamContext, either via the old helper or by hand on CUDA
-  // 12.9+
+  // From 12.9, NPP recommends using a user-created NppStreamContext and using
+  // the `_Ctx()` calls:
+  // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1
+  // And the nppGetStreamContext() helper is deprecated. We are explicitly
+  // supposed to create the NppStreamContext manually from the CUDA device
+  // properties:
+  // https://github.com/NVIDIA/CUDALibrarySamples/blob/d97803a40fab83c058bb3d68b6c38bd6eebfff43/NPP/README.md?plain=1#L54-L72
 
   NppStreamContext nppCtx{};
-#if CUDA_VERSION < 12090
-  NppStatus ctxStat = nppGetStreamContext(&nppCtx);
-  TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed");
-#else
-  // CUDA 12.9+: helper was removed, we need to build it manually
   cudaDeviceProp prop{};
   cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
   TORCH_CHECK(
@@ -187,7 +187,6 @@ NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) {
   nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
   nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
   nppCtx.nStreamFlags = 0;
-#endif
   return nppCtx;
 }
 

From 7056fc0e5c990d1340eb5ff7f5b9916331b6f51d Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Sun, 27 Jul 2025 11:14:39 +0100
Subject: [PATCH 09/14] remove cache, it should be per-device not per decoder
 instance. Leaving for later.

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 33 ++++++++++++--------
 src/torchcodec/_core/CudaDeviceInterface.h   |  2 --
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 072a37009..2d09ee245 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -162,7 +162,7 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
 #endif
 }
 
-NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) {
+NppStreamContext createNppStreamContext(int deviceIndex) {
   // From 12.9, NPP recommends using a user-created NppStreamContext and using
   // the `_Ctx()` calls:
   // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1
@@ -186,7 +186,16 @@ NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) {
   nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
   nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
   nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
-  nppCtx.nStreamFlags = 0;
+
+  // TODO when implementing the cache logic, move these out. See other TODO
+  // below.
+  nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream();
+  err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags);
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "cudaStreamGetFlags failed: ",
+      cudaGetErrorString(err));
+
   return nppCtx;
 }
 
@@ -250,15 +259,13 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     dst = allocateEmptyHWCTensor(height, width, device_);
   }
 
-  // Use the user-requested GPU for running the NPP kernel.
-  c10::cuda::CUDAGuard deviceGuard(device_);
-
-  if (!nppCtxInitialized_) {
-    nppCtx_ = createNppStreamContext(
-        static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
-    nppCtxInitialized_ = true;
-  }
-  nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream();
+  // TODO cache the NppStreamContext! It currently gets re-recated for every
+  // single frame. The cache should be per-device, similar to the existing
+  // hw_device_ctx cache. When implementing the cache logic, the
+  // NppStreamContext hStream and nStreamFlags should not be part of the cache
+  // because they may change across calls.
+  NppStreamContext nppCtx = createNppStreamContext(
+      static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
 
   // Prepare ROI + pointers
   NppiSize oSizeROI = {width, height};
@@ -273,7 +280,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI,
-        nppCtx_);
+        nppCtx);
   } else {
     status = nppiNV12ToRGB_8u_P2C3R_Ctx(
         input,
@@ -281,7 +288,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI,
-        nppCtx_);
+        nppCtx);
   }
   TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
 }
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index d62250132..526f4a977 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -31,8 +31,6 @@ class CudaDeviceInterface : public DeviceInterface {
 
  private:
   AVBufferRef* ctx_ = nullptr;
-  NppStreamContext nppCtx_{};
-  bool nppCtxInitialized_ = false;
 };
 
 } // namespace facebook::torchcodec

From 9ddc670a902cc1f71ca9fcc3dc2ad365d2611870 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Wed, 30 Jul 2025 09:02:40 +0100
Subject: [PATCH 10/14] Revert "remove cache, it should be per-device not per
 decoder instance. Leaving"

This reverts commit 7056fc0e5c990d1340eb5ff7f5b9916331b6f51d.
---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 33 ++++++++------------
 src/torchcodec/_core/CudaDeviceInterface.h   |  2 ++
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 2d09ee245..072a37009 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -162,7 +162,7 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
 #endif
 }
 
-NppStreamContext createNppStreamContext(int deviceIndex) {
+NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) {
   // From 12.9, NPP recommends using a user-created NppStreamContext and using
   // the `_Ctx()` calls:
   // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1
@@ -186,16 +186,7 @@ NppStreamContext createNppStreamContext(int deviceIndex) {
   nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
   nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
   nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
-
-  // TODO when implementing the cache logic, move these out. See other TODO
-  // below.
-  nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream();
-  err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags);
-  TORCH_CHECK(
-      err == cudaSuccess,
-      "cudaStreamGetFlags failed: ",
-      cudaGetErrorString(err));
-
+  nppCtx.nStreamFlags = 0;
   return nppCtx;
 }
 
@@ -259,13 +250,15 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     dst = allocateEmptyHWCTensor(height, width, device_);
   }
 
-  // TODO cache the NppStreamContext! It currently gets re-recated for every
-  // single frame. The cache should be per-device, similar to the existing
-  // hw_device_ctx cache. When implementing the cache logic, the
-  // NppStreamContext hStream and nStreamFlags should not be part of the cache
-  // because they may change across calls.
-  NppStreamContext nppCtx = createNppStreamContext(
-      static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
+  // Use the user-requested GPU for running the NPP kernel.
+  c10::cuda::CUDAGuard deviceGuard(device_);
+
+  if (!nppCtxInitialized_) {
+    nppCtx_ = createNppStreamContext(
+        static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
+    nppCtxInitialized_ = true;
+  }
+  nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream();
 
   // Prepare ROI + pointers
   NppiSize oSizeROI = {width, height};
@@ -280,7 +273,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI,
-        nppCtx);
+        nppCtx_);
   } else {
     status = nppiNV12ToRGB_8u_P2C3R_Ctx(
         input,
@@ -288,7 +281,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI,
-        nppCtx);
+        nppCtx_);
   }
   TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
 }
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index 526f4a977..d62250132 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -31,6 +31,8 @@ class CudaDeviceInterface : public DeviceInterface {
 
  private:
   AVBufferRef* ctx_ = nullptr;
+  NppStreamContext nppCtx_{};
+  bool nppCtxInitialized_ = false;
 };
 
 } // namespace facebook::torchcodec

From c27d4b5d068f2f9b8154ea47fe10e67b5f0b31f1 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Wed, 30 Jul 2025 09:11:42 +0100
Subject: [PATCH 11/14] Remove deviceGuard

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 4616aedf9..1ad519ca7 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -294,9 +294,6 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     dst = allocateEmptyHWCTensor(height, width, device_);
   }
 
-  // Use the user-requested GPU for running the NPP kernel.
-  c10::cuda::CUDAGuard deviceGuard(device_);
-
   if (!nppCtxInitialized_) {
     nppCtx_ = createNppStreamContext(
         static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));

From 5306ca4ac5a9574c407ad1c08bf8daf9da296ffc Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Wed, 30 Jul 2025 09:26:22 +0100
Subject: [PATCH 12/14] Revert "Remove deviceGuard"

This reverts commit c27d4b5d068f2f9b8154ea47fe10e67b5f0b31f1.
---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 1ad519ca7..4616aedf9 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -294,6 +294,9 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     dst = allocateEmptyHWCTensor(height, width, device_);
   }
 
+  // Use the user-requested GPU for running the NPP kernel.
+  c10::cuda::CUDAGuard deviceGuard(device_);
+
   if (!nppCtxInitialized_) {
     nppCtx_ = createNppStreamContext(
         static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));

From b454c0c38803e5556893393b4968ec0bb5d0f87d Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Wed, 30 Jul 2025 09:26:33 +0100
Subject: [PATCH 13/14] Reapply "remove cache, it should be per-device not per
 decoder instance. Leaving"

This reverts commit 9ddc670a902cc1f71ca9fcc3dc2ad365d2611870.
---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 33 ++++++++++++--------
 src/torchcodec/_core/CudaDeviceInterface.h   |  2 --
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 4616aedf9..63c28d263 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -162,7 +162,7 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
 #endif
 }
 
-NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) {
+NppStreamContext createNppStreamContext(int deviceIndex) {
   // From 12.9, NPP recommends using a user-created NppStreamContext and using
   // the `_Ctx()` calls:
   // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1
@@ -186,7 +186,16 @@ NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) {
   nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
   nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
   nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
-  nppCtx.nStreamFlags = 0;
+
+  // TODO when implementing the cache logic, move these out. See other TODO
+  // below.
+  nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream();
+  err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags);
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "cudaStreamGetFlags failed: ",
+      cudaGetErrorString(err));
+
   return nppCtx;
 }
 
@@ -294,15 +303,13 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     dst = allocateEmptyHWCTensor(height, width, device_);
   }
 
-  // Use the user-requested GPU for running the NPP kernel.
-  c10::cuda::CUDAGuard deviceGuard(device_);
-
-  if (!nppCtxInitialized_) {
-    nppCtx_ = createNppStreamContext(
-        static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
-    nppCtxInitialized_ = true;
-  }
-  nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream();
+  // TODO cache the NppStreamContext! It currently gets re-recated for every
+  // single frame. The cache should be per-device, similar to the existing
+  // hw_device_ctx cache. When implementing the cache logic, the
+  // NppStreamContext hStream and nStreamFlags should not be part of the cache
+  // because they may change across calls.
+  NppStreamContext nppCtx = createNppStreamContext(
+      static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
 
   // Prepare ROI + pointers
   NppiSize oSizeROI = {width, height};
@@ -317,7 +324,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI,
-        nppCtx_);
+        nppCtx);
   } else {
     status = nppiNV12ToRGB_8u_P2C3R_Ctx(
         input,
@@ -325,7 +332,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
         oSizeROI,
-        nppCtx_);
+        nppCtx);
   }
   TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
 }
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index d62250132..526f4a977 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -31,8 +31,6 @@ class CudaDeviceInterface : public DeviceInterface {
 
  private:
   AVBufferRef* ctx_ = nullptr;
-  NppStreamContext nppCtx_{};
-  bool nppCtxInitialized_ = false;
 };
 
 } // namespace facebook::torchcodec

From 8cedbde0113a4fce6e4dd17b5e7b905d94d598b0 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@meta.com>
Date: Wed, 30 Jul 2025 09:28:23 +0100
Subject: [PATCH 14/14] remove comment

---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 63c28d263..9bfea4e52 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -311,7 +311,6 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   NppStreamContext nppCtx = createNppStreamContext(
       static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
 
-  // Prepare ROI + pointers
   NppiSize oSizeROI = {width, height};
   Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};