diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
index 61d6f50d0..a381c1669 100644
--- a/.github/workflows/linux_cuda_wheel.yaml
+++ b/.github/workflows/linux_cuda_wheel.yaml
@@ -67,7 +67,9 @@ jobs:
         # For the actual release we should add that label and change this to
         # include more python versions.
         python-version: ['3.9']
-        cuda-version: ['12.6', '12.8']
+        # We test against 12.6 and 12.9 to avoid having too big of a CI matrix,
+        # but for releases we should add 12.8.
+        cuda-version: ['12.6', '12.9']
         # TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325
         ffmpeg-version-for-tests: ['4.4.2', '6', '7']
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 9ea7807d7..9bfea4e52 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -161,6 +161,44 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
       device, nonNegativeDeviceIndex, type);
 #endif
 }
+
+NppStreamContext createNppStreamContext(int deviceIndex) {
+  // From 12.9, NPP recommends using a user-created NppStreamContext and using
+  // the `_Ctx()` calls:
+  // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1
+  // And the nppGetStreamContext() helper is deprecated. We are explicitly
+  // supposed to create the NppStreamContext manually from the CUDA device
+  // properties:
+  // https://github.com/NVIDIA/CUDALibrarySamples/blob/d97803a40fab83c058bb3d68b6c38bd6eebfff43/NPP/README.md?plain=1#L54-L72
+
+  NppStreamContext nppCtx{};
+  cudaDeviceProp prop{};
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "cudaGetDeviceProperties failed: ",
+      cudaGetErrorString(err));
+
+  nppCtx.nCudaDeviceId = deviceIndex;
+  nppCtx.nMultiProcessorCount = prop.multiProcessorCount;
+  nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
+  nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
+  nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
+  nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
+  nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
+
+  // TODO when implementing the cache logic, move these out. See other TODO
+  // below.
+  nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream();
+  err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags);
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "cudaStreamGetFlags failed: ",
+      cudaGetErrorString(err));
+
+  return nppCtx;
+}
+
 } // namespace
 
 CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
@@ -265,37 +303,37 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     dst = allocateEmptyHWCTensor(height, width, device_);
   }
 
-  // Use the user-requested GPU for running the NPP kernel.
-  c10::cuda::CUDAGuard deviceGuard(device_);
+  // TODO cache the NppStreamContext! It currently gets re-created for every
+  // single frame. The cache should be per-device, similar to the existing
+  // hw_device_ctx cache. When implementing the cache logic, the
+  // NppStreamContext hStream and nStreamFlags should not be part of the cache
+  // because they may change across calls.
+  NppStreamContext nppCtx = createNppStreamContext(
+      static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
 
   NppiSize oSizeROI = {width, height};
   Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};
 
   NppStatus status;
+
   if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
-    status = nppiNV12ToRGB_709CSC_8u_P2C3R(
+    status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(
         input,
         avFrame->linesize[0],
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
-        oSizeROI);
+        oSizeROI,
+        nppCtx);
   } else {
-    status = nppiNV12ToRGB_8u_P2C3R(
+    status = nppiNV12ToRGB_8u_P2C3R_Ctx(
         input,
         avFrame->linesize[0],
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
-        oSizeROI);
+        oSizeROI,
+        nppCtx);
   }
   TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
-
-  // Make the pytorch stream wait for the npp kernel to finish before using the
-  // output.
-  at::cuda::CUDAEvent nppDoneEvent;
-  at::cuda::CUDAStream nppStreamWrapper =
-      c10::cuda::getStreamFromExternal(nppGetStream(), device_.index());
-  nppDoneEvent.record(nppStreamWrapper);
-  nppDoneEvent.block(at::cuda::getCurrentCUDAStream());
 }
 
 // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index 01f3b19b5..526f4a977 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include <npp.h>
 #include "src/torchcodec/_core/DeviceInterface.h"
 
 namespace facebook::torchcodec {
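Note on the caching TODO above: below is a minimal sketch of what the per-device `NppStreamContext` cache could look like, assuming a mutex-guarded map keyed on device index. The names `getCachedNppStreamContext` and `g_nppCtxCache` are hypothetical and not part of this diff. Only the device properties are cached; `hStream` and `nStreamFlags` are refreshed on every call, as the TODO requires.

```cpp
#include <mutex>
#include <unordered_map>

#include <cuda_runtime.h>
#include <npp.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/util/Exception.h>

namespace {

std::mutex g_nppCtxMutex;
// Cached per-device contexts, keyed by CUDA device index. The cached entries
// hold only the device properties, never the stream state.
std::unordered_map<int, NppStreamContext> g_nppCtxCache;

} // namespace

NppStreamContext getCachedNppStreamContext(int deviceIndex) {
  NppStreamContext nppCtx{};
  {
    std::lock_guard<std::mutex> lock(g_nppCtxMutex);
    auto it = g_nppCtxCache.find(deviceIndex);
    if (it != g_nppCtxCache.end()) {
      nppCtx = it->second;
    } else {
      // Same device-property setup as createNppStreamContext() in the diff,
      // but performed only once per device.
      cudaDeviceProp prop{};
      cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
      TORCH_CHECK(
          err == cudaSuccess,
          "cudaGetDeviceProperties failed: ",
          cudaGetErrorString(err));

      nppCtx.nCudaDeviceId = deviceIndex;
      nppCtx.nMultiProcessorCount = prop.multiProcessorCount;
      nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
      nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
      nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
      nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
      nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
      g_nppCtxCache[deviceIndex] = nppCtx;
    }
  }

  // Stream state is deliberately not cached: the current PyTorch stream may
  // change between calls, so refresh it every time.
  nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream();
  cudaError_t err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags);
  TORCH_CHECK(
      err == cudaSuccess,
      "cudaStreamGetFlags failed: ",
      cudaGetErrorString(err));

  return nppCtx;
}
```

Because `nppCtx.hStream` is set to the current PyTorch stream, the `_Ctx()` NPP kernels launch on that stream directly, which is why the diff can drop the old `nppGetStream()` event synchronization; the cached version above preserves that property.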