From 886f64ad984169c8c0ea31c13905e15d8971ff17 Mon Sep 17 00:00:00 2001 From: Serge Panev Date: Mon, 7 Jul 2025 11:31:16 +0900 Subject: [PATCH 01/14] Update NPP calls for CUDA >= 12.9 --- src/torchcodec/_core/CudaDeviceInterface.cpp | 60 ++++++++++++++------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 8086d0b4b..c56fe788b 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -224,41 +224,67 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( // Use the user-requested GPU for running the NPP kernel. c10::cuda::CUDAGuard deviceGuard(device_); - NppiSize oSizeROI = {width, height}; - Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]}; + cudaStream_t rawStream = at::cuda::getCurrentCUDAStream().stream(); + + // Build an NppStreamContext, either via the old helper or by hand on CUDA 12.9+ + NppStreamContext nppCtx{}; + #if CUDA_VERSION < 12090 + NppStatus ctxStat = nppGetStreamContext(&nppCtx); + TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed"); + // override if you want to force a particular stream + nppCtx.hStream = rawStream; + #else + // CUDA 12.9+: helper was removed, we need to build it manually + int dev = 0; + cudaError_t err = cudaGetDevice(&dev); + TORCH_CHECK(err == cudaSuccess, "cudaGetDevice failed"); + cudaDeviceProp prop{}; + err = cudaGetDeviceProperties(&prop, dev); + TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed"); + + nppCtx.nCudaDeviceId = dev; + nppCtx.nMultiProcessorCount = prop.multiProcessorCount; + nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor; + nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock; + nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock; + nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; + nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; + nppCtx.nStreamFlags = 0; + nppCtx.hStream = rawStream; + #endif + + // Prepare ROI + pointers + NppiSize oSizeROI = { width, height }; + Npp8u* input[2] = { avFrame->data[0], avFrame->data[1] }; auto start = std::chrono::high_resolution_clock::now(); NppStatus status; + if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) { - status = nppiNV12ToRGB_709CSC_8u_P2C3R( + status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx( input, avFrame->linesize[0], static_cast(dst.data_ptr()), dst.stride(0), - oSizeROI); + oSizeROI, + nppCtx); } else { - status = nppiNV12ToRGB_8u_P2C3R( + status = nppiNV12ToRGB_8u_P2C3R_Ctx( input, avFrame->linesize[0], static_cast(dst.data_ptr()), dst.stride(0), - oSizeROI); + oSizeROI, + nppCtx); } TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame."); - // Make the pytorch stream wait for the npp kernel to finish before using the - // output. - at::cuda::CUDAEvent nppDoneEvent; - at::cuda::CUDAStream nppStreamWrapper = - c10::cuda::getStreamFromExternal(nppGetStream(), device_.index()); - nppDoneEvent.record(nppStreamWrapper); - nppDoneEvent.block(at::cuda::getCurrentCUDAStream()); - auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration(end - start); + VLOG(9) << "NPP Conversion of frame h=" << height + << " w=" << width + << " took: " << duration.count() << "us"; - std::chrono::duration duration = end - start; - VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width - << " took: " << duration.count() << "us" << std::endl; } // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9 From 6023ea395edf83b2162c85cf32a3f6c0466cd221 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 7 Jul 2025 11:34:50 +0100 Subject: [PATCH 02/14] Add testing against CUDA 12.9 --- .github/workflows/linux_cuda_wheel.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml index bd57cac5e..8bc8c3bc1 100644 --- a/.github/workflows/linux_cuda_wheel.yaml +++ b/.github/workflows/linux_cuda_wheel.yaml @@ -67,7 +67,9 @@ jobs: # For the actual release we should add that label and change this to # include more python versions. python-version: ['3.9'] - cuda-version: ['12.6', '12.8'] + # We test against 12.6 and 12.9 to avoid having too big of a CI matrix, + # but for releases we should add 12.8. + cuda-version: ['12.6', '12.9'] # TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325 ffmpeg-version-for-tests: ['4.4.2', '6', '7'] From d66cb33bb7d25399cfbba692e30186a930e9e27b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 7 Jul 2025 12:39:37 +0100 Subject: [PATCH 03/14] Linter --- src/torchcodec/_core/CudaDeviceInterface.cpp | 59 ++++++++++---------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index c56fe788b..4ee0bf690 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -226,36 +226,37 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( cudaStream_t rawStream = at::cuda::getCurrentCUDAStream().stream(); - // Build an NppStreamContext, either via the old helper or by hand on CUDA 12.9+ + // Build an NppStreamContext, either via the old helper or by hand on + // CUDA 12.9+ NppStreamContext nppCtx{}; - #if CUDA_VERSION < 12090 - NppStatus ctxStat = nppGetStreamContext(&nppCtx); - TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed"); - // override if you want to force a particular stream - nppCtx.hStream = rawStream; - #else - // CUDA 12.9+: helper was removed, we need to build it manually - int dev = 0; - cudaError_t err = cudaGetDevice(&dev); - TORCH_CHECK(err == cudaSuccess, "cudaGetDevice failed"); - cudaDeviceProp prop{}; - err = cudaGetDeviceProperties(&prop, dev); - TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed"); - - nppCtx.nCudaDeviceId = dev; - nppCtx.nMultiProcessorCount = prop.multiProcessorCount; - nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor; - nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock; - nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock; - nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; - nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; - nppCtx.nStreamFlags = 0; - nppCtx.hStream = rawStream; - #endif +#if CUDA_VERSION < 12090 + NppStatus ctxStat = nppGetStreamContext(&nppCtx); + TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed"); + // override if you want to force a particular stream + nppCtx.hStream = rawStream; +#else + // CUDA 12.9+: helper was removed, we need to build it manually + int dev = 0; + cudaError_t err = cudaGetDevice(&dev); + TORCH_CHECK(err == cudaSuccess, "cudaGetDevice failed"); + cudaDeviceProp prop{}; + err = cudaGetDeviceProperties(&prop, dev); + TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed"); + + nppCtx.nCudaDeviceId = dev; + nppCtx.nMultiProcessorCount = prop.multiProcessorCount; + nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor; + nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock; + nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock; + nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; + nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; + nppCtx.nStreamFlags = 0; + nppCtx.hStream = rawStream; +#endif // Prepare ROI + pointers - NppiSize oSizeROI = { width, height }; - Npp8u* input[2] = { avFrame->data[0], avFrame->data[1] }; + NppiSize oSizeROI = {width, height}; + Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]}; auto start = std::chrono::high_resolution_clock::now(); NppStatus status; @@ -281,10 +282,8 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration(end - start); - VLOG(9) << "NPP Conversion of frame h=" << height - << " w=" << width + VLOG(9) << "NPP Conversion of frame h=" << height << " w=" << width << " took: " << duration.count() << "us"; - } // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9 From ecf01a91f19f75908922199eb15a1208237f6fd3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 12 Jul 2025 11:33:11 +0100 Subject: [PATCH 04/14] Move nppContext creation into separate function. Also rely on device_ --- src/torchcodec/_core/CudaDeviceInterface.cpp | 55 +++++++++++--------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 4ee0bf690..7aeb3b357 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -161,6 +161,35 @@ AVBufferRef* getCudaContext(const torch::Device& device) { device, nonNegativeDeviceIndex, type); #endif } + +NppStreamContext createNppStreamContext( + cudaStream_t rawStream, + int deviceIndex) { + NppStreamContext nppCtx{}; +#if CUDA_VERSION < 12090 + NppStatus ctxStat = nppGetStreamContext(&nppCtx); + TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed"); + // override if you want to force a particular stream + nppCtx.hStream = rawStream; +#else + // CUDA 12.9+: helper was removed, we need to build it manually + cudaDeviceProp prop{}; + cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex); + TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed"); + + nppCtx.nCudaDeviceId = deviceIndex; + nppCtx.nMultiProcessorCount = prop.multiProcessorCount; + nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor; + nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock; + nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock; + nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; + nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; + nppCtx.nStreamFlags = 0; + nppCtx.hStream = rawStream; +#endif + return nppCtx; +} + } // namespace CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device) @@ -228,31 +257,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( // Build an NppStreamContext, either via the old helper or by hand on // CUDA 12.9+ - NppStreamContext nppCtx{}; -#if CUDA_VERSION < 12090 - NppStatus ctxStat = nppGetStreamContext(&nppCtx); - TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed"); - // override if you want to force a particular stream - nppCtx.hStream = rawStream; -#else - // CUDA 12.9+: helper was removed, we need to build it manually - int dev = 0; - cudaError_t err = cudaGetDevice(&dev); - TORCH_CHECK(err == cudaSuccess, "cudaGetDevice failed"); - cudaDeviceProp prop{}; - err = cudaGetDeviceProperties(&prop, dev); - TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed"); - - nppCtx.nCudaDeviceId = dev; - nppCtx.nMultiProcessorCount = prop.multiProcessorCount; - nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor; - nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock; - nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock; - nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; - nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; - nppCtx.nStreamFlags = 0; - nppCtx.hStream = rawStream; -#endif + NppStreamContext nppCtx = createNppStreamContext(rawStream, device_.index()); // Prepare ROI + pointers NppiSize oSizeROI = {width, height}; From 565896e4935113eab2a50dc66124548b0acb0025 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 12 Jul 2025 11:54:35 +0100 Subject: [PATCH 05/14] Use cache for nppContext object --- src/torchcodec/_core/CudaDeviceInterface.cpp | 24 +++++++++----------- src/torchcodec/_core/CudaDeviceInterface.h | 3 +++ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 7aeb3b357..b384f1818 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -162,15 +162,14 @@ AVBufferRef* getCudaContext(const torch::Device& device) { #endif } -NppStreamContext createNppStreamContext( - cudaStream_t rawStream, - int deviceIndex) { +NppStreamContext createNppStreamContext(int deviceIndex) { + // Build an NppStreamContext, either via the old helper or by hand on CUDA + // 12.9+ + NppStreamContext nppCtx{}; #if CUDA_VERSION < 12090 NppStatus ctxStat = nppGetStreamContext(&nppCtx); TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed"); - // override if you want to force a particular stream - nppCtx.hStream = rawStream; #else // CUDA 12.9+: helper was removed, we need to build it manually cudaDeviceProp prop{}; @@ -185,7 +184,6 @@ NppStreamContext createNppStreamContext( nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; nppCtx.nStreamFlags = 0; - nppCtx.hStream = rawStream; #endif return nppCtx; } @@ -253,11 +251,11 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( // Use the user-requested GPU for running the NPP kernel. c10::cuda::CUDAGuard deviceGuard(device_); - cudaStream_t rawStream = at::cuda::getCurrentCUDAStream().stream(); - - // Build an NppStreamContext, either via the old helper or by hand on - // CUDA 12.9+ - NppStreamContext nppCtx = createNppStreamContext(rawStream, device_.index()); + if (!nppCtxInitialized_) { + nppCtx_ = createNppStreamContext(device_.index()); + nppCtxInitialized_ = true; + } + nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream(); // Prepare ROI + pointers NppiSize oSizeROI = {width, height}; @@ -273,7 +271,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( static_cast(dst.data_ptr()), dst.stride(0), oSizeROI, - nppCtx); + nppCtx_); } else { status = nppiNV12ToRGB_8u_P2C3R_Ctx( input, @@ -281,7 +279,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( static_cast(dst.data_ptr()), dst.stride(0), oSizeROI, - nppCtx); + nppCtx_); } TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame."); diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h index 01f3b19b5..d62250132 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.h +++ b/src/torchcodec/_core/CudaDeviceInterface.h @@ -6,6 +6,7 @@ #pragma once +#include #include "src/torchcodec/_core/DeviceInterface.h" namespace facebook::torchcodec { @@ -30,6 +31,8 @@ class CudaDeviceInterface : public DeviceInterface { private: AVBufferRef* ctx_ = nullptr; + NppStreamContext nppCtx_{}; + bool nppCtxInitialized_ = false; }; } // namespace facebook::torchcodec From 9a6d3d3693b5a2b53a81e0eff5f151c5311bfdad Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 12 Jul 2025 14:07:19 +0100 Subject: [PATCH 06/14] Add maybe_unused --- src/torchcodec/_core/CudaDeviceInterface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index b384f1818..fa5583130 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -162,7 +162,7 @@ AVBufferRef* getCudaContext(const torch::Device& device) { #endif } -NppStreamContext createNppStreamContext(int deviceIndex) { +NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) { // Build an NppStreamContext, either via the old helper or by hand on CUDA // 12.9+ From 2d681ad191d66982165766824d255aceed5f6f88 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 12 Jul 2025 20:59:46 +0100 Subject: [PATCH 07/14] Pass positive index --- src/torchcodec/_core/CudaDeviceInterface.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index fa5583130..75372b583 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -174,7 +174,10 @@ NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) { // CUDA 12.9+: helper was removed, we need to build it manually cudaDeviceProp prop{}; cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex); - TORCH_CHECK(err == cudaSuccess, "cudaGetDeviceProperties failed"); + TORCH_CHECK( + err == cudaSuccess, + "cudaGetDeviceProperties failed: ", + cudaGetErrorString(err)); nppCtx.nCudaDeviceId = deviceIndex; nppCtx.nMultiProcessorCount = prop.multiProcessorCount; @@ -252,7 +255,8 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( c10::cuda::CUDAGuard deviceGuard(device_); if (!nppCtxInitialized_) { - nppCtx_ = createNppStreamContext(device_.index()); + nppCtx_ = createNppStreamContext( + static_cast(getFFMPEGCompatibleDeviceIndex(device_))); nppCtxInitialized_ = true; } nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream(); From 320c06002900af01babe8e5ee0cd47123c0a5e2b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 27 Jul 2025 09:28:36 +0100 Subject: [PATCH 08/14] Try manual creation for all CUDA versions --- src/torchcodec/_core/CudaDeviceInterface.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 59a5b3f53..072a37009 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -163,15 +163,15 @@ AVBufferRef* getCudaContext(const torch::Device& device) { } NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) { - // Build an NppStreamContext, either via the old helper or by hand on CUDA - // 12.9+ + // From 12.9, NPP recommends using a user-created NppStreamContext and using + // the `_Ctx()` calls: + // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1 + // And the nppGetStreamContext() helper is deprecated. We are explicitly + // supposed to create the NppStreamContext manually from the CUDA device + // properties: + // https://github.com/NVIDIA/CUDALibrarySamples/blob/d97803a40fab83c058bb3d68b6c38bd6eebfff43/NPP/README.md?plain=1#L54-L72 NppStreamContext nppCtx{}; -#if CUDA_VERSION < 12090 - NppStatus ctxStat = nppGetStreamContext(&nppCtx); - TORCH_CHECK(ctxStat == NPP_SUCCESS, "nppGetStreamContext failed"); -#else - // CUDA 12.9+: helper was removed, we need to build it manually cudaDeviceProp prop{}; cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex); TORCH_CHECK( @@ -187,7 +187,6 @@ NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) { nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; nppCtx.nStreamFlags = 0; -#endif return nppCtx; } From 7056fc0e5c990d1340eb5ff7f5b9916331b6f51d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 27 Jul 2025 11:14:39 +0100 Subject: [PATCH 09/14] remove cache, it should be per-device not per decoder instance. Leaving for later. --- src/torchcodec/_core/CudaDeviceInterface.cpp | 33 ++++++++++++-------- src/torchcodec/_core/CudaDeviceInterface.h | 2 -- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 072a37009..2d09ee245 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -162,7 +162,7 @@ AVBufferRef* getCudaContext(const torch::Device& device) { #endif } -NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) { +NppStreamContext createNppStreamContext(int deviceIndex) { // From 12.9, NPP recommends using a user-created NppStreamContext and using // the `_Ctx()` calls: // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1 @@ -186,7 +186,16 @@ NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) { nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock; nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; - nppCtx.nStreamFlags = 0; + + // TODO when implementing the cache logic, move these out. See other TODO + // below. + nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream(); + err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags); + TORCH_CHECK( + err == cudaSuccess, + "cudaStreamGetFlags failed: ", + cudaGetErrorString(err)); + return nppCtx; } @@ -250,15 +259,13 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( dst = allocateEmptyHWCTensor(height, width, device_); } - // Use the user-requested GPU for running the NPP kernel. - c10::cuda::CUDAGuard deviceGuard(device_); - - if (!nppCtxInitialized_) { - nppCtx_ = createNppStreamContext( - static_cast(getFFMPEGCompatibleDeviceIndex(device_))); - nppCtxInitialized_ = true; - } - nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream(); + // TODO cache the NppStreamContext! It currently gets re-recated for every + // single frame. The cache should be per-device, similar to the existing + // hw_device_ctx cache. When implementing the cache logic, the + // NppStreamContext hStream and nStreamFlags should not be part of the cache + // because they may change across calls. + NppStreamContext nppCtx = createNppStreamContext( + static_cast(getFFMPEGCompatibleDeviceIndex(device_))); // Prepare ROI + pointers NppiSize oSizeROI = {width, height}; @@ -273,7 +280,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( static_cast(dst.data_ptr()), dst.stride(0), oSizeROI, - nppCtx_); + nppCtx); } else { status = nppiNV12ToRGB_8u_P2C3R_Ctx( input, @@ -281,7 +288,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( static_cast(dst.data_ptr()), dst.stride(0), oSizeROI, - nppCtx_); + nppCtx); } TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame."); } diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h index d62250132..526f4a977 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.h +++ b/src/torchcodec/_core/CudaDeviceInterface.h @@ -31,8 +31,6 @@ class CudaDeviceInterface : public DeviceInterface { private: AVBufferRef* ctx_ = nullptr; - NppStreamContext nppCtx_{}; - bool nppCtxInitialized_ = false; }; } // namespace facebook::torchcodec From 9ddc670a902cc1f71ca9fcc3dc2ad365d2611870 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 30 Jul 2025 09:02:40 +0100 Subject: [PATCH 10/14] Revert "remove cache, it should be per-device not per decoder instance. Leaving" This reverts commit 7056fc0e5c990d1340eb5ff7f5b9916331b6f51d. --- src/torchcodec/_core/CudaDeviceInterface.cpp | 33 ++++++++------------ src/torchcodec/_core/CudaDeviceInterface.h | 2 ++ 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 2d09ee245..072a37009 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -162,7 +162,7 @@ AVBufferRef* getCudaContext(const torch::Device& device) { #endif } -NppStreamContext createNppStreamContext(int deviceIndex) { +NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) { // From 12.9, NPP recommends using a user-created NppStreamContext and using // the `_Ctx()` calls: // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1 @@ -186,16 +186,7 @@ NppStreamContext createNppStreamContext(int deviceIndex) { nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock; nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; - - // TODO when implementing the cache logic, move these out. See other TODO - // below. - nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream(); - err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags); - TORCH_CHECK( - err == cudaSuccess, - "cudaStreamGetFlags failed: ", - cudaGetErrorString(err)); - + nppCtx.nStreamFlags = 0; return nppCtx; } @@ -259,13 +250,15 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( dst = allocateEmptyHWCTensor(height, width, device_); } - // TODO cache the NppStreamContext! It currently gets re-recated for every - // single frame. The cache should be per-device, similar to the existing - // hw_device_ctx cache. When implementing the cache logic, the - // NppStreamContext hStream and nStreamFlags should not be part of the cache - // because they may change across calls. - NppStreamContext nppCtx = createNppStreamContext( - static_cast(getFFMPEGCompatibleDeviceIndex(device_))); + // Use the user-requested GPU for running the NPP kernel. + c10::cuda::CUDAGuard deviceGuard(device_); + + if (!nppCtxInitialized_) { + nppCtx_ = createNppStreamContext( + static_cast(getFFMPEGCompatibleDeviceIndex(device_))); + nppCtxInitialized_ = true; + } + nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream(); // Prepare ROI + pointers NppiSize oSizeROI = {width, height}; @@ -280,7 +273,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( static_cast(dst.data_ptr()), dst.stride(0), oSizeROI, - nppCtx); + nppCtx_); } else { status = nppiNV12ToRGB_8u_P2C3R_Ctx( input, @@ -288,7 +281,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( static_cast(dst.data_ptr()), dst.stride(0), oSizeROI, - nppCtx); + nppCtx_); } TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame."); } diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h index 526f4a977..d62250132 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.h +++ b/src/torchcodec/_core/CudaDeviceInterface.h @@ -31,6 +31,8 @@ class CudaDeviceInterface : public DeviceInterface { private: AVBufferRef* ctx_ = nullptr; + NppStreamContext nppCtx_{}; + bool nppCtxInitialized_ = false; }; } // namespace facebook::torchcodec From c27d4b5d068f2f9b8154ea47fe10e67b5f0b31f1 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 30 Jul 2025 09:11:42 +0100 Subject: [PATCH 11/14] Remove deviceGuard --- src/torchcodec/_core/CudaDeviceInterface.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 4616aedf9..1ad519ca7 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -294,9 +294,6 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( dst = allocateEmptyHWCTensor(height, width, device_); } - // Use the user-requested GPU for running the NPP kernel. - c10::cuda::CUDAGuard deviceGuard(device_); - if (!nppCtxInitialized_) { nppCtx_ = createNppStreamContext( static_cast(getFFMPEGCompatibleDeviceIndex(device_))); From 5306ca4ac5a9574c407ad1c08bf8daf9da296ffc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 30 Jul 2025 09:26:22 +0100 Subject: [PATCH 12/14] Revert "Remove deviceGuard" This reverts commit c27d4b5d068f2f9b8154ea47fe10e67b5f0b31f1. --- src/torchcodec/_core/CudaDeviceInterface.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 1ad519ca7..4616aedf9 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -294,6 +294,9 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( dst = allocateEmptyHWCTensor(height, width, device_); } + // Use the user-requested GPU for running the NPP kernel. + c10::cuda::CUDAGuard deviceGuard(device_); + if (!nppCtxInitialized_) { nppCtx_ = createNppStreamContext( static_cast(getFFMPEGCompatibleDeviceIndex(device_))); From b454c0c38803e5556893393b4968ec0bb5d0f87d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 30 Jul 2025 09:26:33 +0100 Subject: [PATCH 13/14] Reapply "remove cache, it should be per-device not per decoder instance. Leaving" This reverts commit 9ddc670a902cc1f71ca9fcc3dc2ad365d2611870. --- src/torchcodec/_core/CudaDeviceInterface.cpp | 33 ++++++++++++-------- src/torchcodec/_core/CudaDeviceInterface.h | 2 -- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 4616aedf9..63c28d263 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -162,7 +162,7 @@ AVBufferRef* getCudaContext(const torch::Device& device) { #endif } -NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) { +NppStreamContext createNppStreamContext(int deviceIndex) { // From 12.9, NPP recommends using a user-created NppStreamContext and using // the `_Ctx()` calls: // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1 @@ -186,7 +186,16 @@ NppStreamContext createNppStreamContext([[maybe_unused]] int deviceIndex) { nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock; nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major; nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor; - nppCtx.nStreamFlags = 0; + + // TODO when implementing the cache logic, move these out. See other TODO + // below. + nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream(); + err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags); + TORCH_CHECK( + err == cudaSuccess, + "cudaStreamGetFlags failed: ", + cudaGetErrorString(err)); + return nppCtx; } @@ -294,15 +303,13 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( dst = allocateEmptyHWCTensor(height, width, device_); } - // Use the user-requested GPU for running the NPP kernel. - c10::cuda::CUDAGuard deviceGuard(device_); - - if (!nppCtxInitialized_) { - nppCtx_ = createNppStreamContext( - static_cast(getFFMPEGCompatibleDeviceIndex(device_))); - nppCtxInitialized_ = true; - } - nppCtx_.hStream = at::cuda::getCurrentCUDAStream().stream(); + // TODO cache the NppStreamContext! It currently gets re-recated for every + // single frame. The cache should be per-device, similar to the existing + // hw_device_ctx cache. When implementing the cache logic, the + // NppStreamContext hStream and nStreamFlags should not be part of the cache + // because they may change across calls. + NppStreamContext nppCtx = createNppStreamContext( + static_cast(getFFMPEGCompatibleDeviceIndex(device_))); // Prepare ROI + pointers NppiSize oSizeROI = {width, height}; @@ -317,7 +324,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( static_cast(dst.data_ptr()), dst.stride(0), oSizeROI, - nppCtx_); + nppCtx); } else { status = nppiNV12ToRGB_8u_P2C3R_Ctx( input, @@ -325,7 +332,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( static_cast(dst.data_ptr()), dst.stride(0), oSizeROI, - nppCtx_); + nppCtx); } TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame."); } diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h index d62250132..526f4a977 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.h +++ b/src/torchcodec/_core/CudaDeviceInterface.h @@ -31,8 +31,6 @@ class CudaDeviceInterface : public DeviceInterface { private: AVBufferRef* ctx_ = nullptr; - NppStreamContext nppCtx_{}; - bool nppCtxInitialized_ = false; }; } // namespace facebook::torchcodec From 8cedbde0113a4fce6e4dd17b5e7b905d94d598b0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 30 Jul 2025 09:28:23 +0100 Subject: [PATCH 14/14] remove comment --- src/torchcodec/_core/CudaDeviceInterface.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 63c28d263..9bfea4e52 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -311,7 +311,6 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( NppStreamContext nppCtx = createNppStreamContext( static_cast(getFFMPEGCompatibleDeviceIndex(device_))); - // Prepare ROI + pointers NppiSize oSizeROI = {width, height}; Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};