diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
index 61d6f50d0..a381c1669 100644
--- a/.github/workflows/linux_cuda_wheel.yaml
+++ b/.github/workflows/linux_cuda_wheel.yaml
@@ -67,7 +67,9 @@ jobs:
         # For the actual release we should add that label and change this to
         # include more python versions.
         python-version: ['3.9']
-        cuda-version: ['12.6', '12.8']
+        # We test against 12.6 and 12.9 to avoid having too big of a CI matrix,
+        # but for releases we should add 12.8.
+        cuda-version: ['12.6', '12.9']
         # TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325
         ffmpeg-version-for-tests: ['4.4.2', '6', '7']
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 9ea7807d7..9bfea4e52 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -161,6 +161,44 @@ AVBufferRef* getCudaContext(const torch::Device& device) {
       device, nonNegativeDeviceIndex, type);
 #endif
 }
+
+NppStreamContext createNppStreamContext(int deviceIndex) {
+  // From 12.9, NPP recommends using a user-created NppStreamContext and using
+  // the `_Ctx()` calls:
+  // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1
+  // And the nppGetStreamContext() helper is deprecated. We are explicitly
+  // supposed to create the NppStreamContext manually from the CUDA device
+  // properties:
+  // https://github.com/NVIDIA/CUDALibrarySamples/blob/d97803a40fab83c058bb3d68b6c38bd6eebfff43/NPP/README.md?plain=1#L54-L72
+
+  NppStreamContext nppCtx{};
+  cudaDeviceProp prop{};
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "cudaGetDeviceProperties failed: ",
+      cudaGetErrorString(err));
+
+  nppCtx.nCudaDeviceId = deviceIndex;
+  nppCtx.nMultiProcessorCount = prop.multiProcessorCount;
+  nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
+  nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
+  nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
+  nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
+  nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
+
+  // TODO when implementing the cache logic, move these out. See other TODO
+  // below.
+  nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream();
+  err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags);
+  TORCH_CHECK(
+      err == cudaSuccess,
+      "cudaStreamGetFlags failed: ",
+      cudaGetErrorString(err));
+
+  return nppCtx;
+}
+
 } // namespace
 
 CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
@@ -265,37 +303,37 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
     dst = allocateEmptyHWCTensor(height, width, device_);
   }
 
-  // Use the user-requested GPU for running the NPP kernel.
-  c10::cuda::CUDAGuard deviceGuard(device_);
+  // TODO cache the NppStreamContext! It currently gets re-created for every
+  // single frame. The cache should be per-device, similar to the existing
+  // hw_device_ctx cache. When implementing the cache logic, the
+  // NppStreamContext hStream and nStreamFlags should not be part of the cache
+  // because they may change across calls.
+  NppStreamContext nppCtx = createNppStreamContext(
+      static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
 
   NppiSize oSizeROI = {width, height};
   Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};
 
   NppStatus status;
+
   if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
-    status = nppiNV12ToRGB_709CSC_8u_P2C3R(
+    status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(
         input,
         avFrame->linesize[0],
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
-        oSizeROI);
+        oSizeROI,
+        nppCtx);
   } else {
-    status = nppiNV12ToRGB_8u_P2C3R(
+    status = nppiNV12ToRGB_8u_P2C3R_Ctx(
         input,
         avFrame->linesize[0],
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
-        oSizeROI);
+        oSizeROI,
+        nppCtx);
   }
   TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
-
-  // Make the pytorch stream wait for the npp kernel to finish before using the
-  // output.
-  at::cuda::CUDAEvent nppDoneEvent;
-  at::cuda::CUDAStream nppStreamWrapper =
-      c10::cuda::getStreamFromExternal(nppGetStream(), device_.index());
-  nppDoneEvent.record(nppStreamWrapper);
-  nppDoneEvent.block(at::cuda::getCurrentCUDAStream());
 }
 
 // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index 01f3b19b5..526f4a977 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include <npp.h>
 #include "src/torchcodec/_core/DeviceInterface.h"
 
 namespace facebook::torchcodec {
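Note on the caching TODO above: below is a minimal sketch of what the per-device `NppStreamContext` cache could look like, assuming a mutex-guarded map keyed on device index. The names `getCachedNppStreamContext` and `g_nppCtxCache` are hypothetical and not part of this diff. Only the device properties are cached; `hStream` and `nStreamFlags` are refreshed on every call, as the TODO requires.

```cpp
#include <mutex>
#include <unordered_map>

#include <cuda_runtime.h>
#include <npp.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/util/Exception.h>

namespace {

std::mutex g_nppCtxMutex;
// Cached per-device contexts, keyed by CUDA device index. The cached entries
// hold only the device properties, never the stream state.
std::unordered_map<int, NppStreamContext> g_nppCtxCache;

} // namespace

NppStreamContext getCachedNppStreamContext(int deviceIndex) {
  NppStreamContext nppCtx{};
  {
    std::lock_guard<std::mutex> lock(g_nppCtxMutex);
    auto it = g_nppCtxCache.find(deviceIndex);
    if (it != g_nppCtxCache.end()) {
      nppCtx = it->second;
    } else {
      // Same device-property setup as createNppStreamContext() in the diff,
      // but performed only once per device.
      cudaDeviceProp prop{};
      cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
      TORCH_CHECK(
          err == cudaSuccess,
          "cudaGetDeviceProperties failed: ",
          cudaGetErrorString(err));

      nppCtx.nCudaDeviceId = deviceIndex;
      nppCtx.nMultiProcessorCount = prop.multiProcessorCount;
      nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
      nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
      nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock;
      nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major;
      nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor;
      g_nppCtxCache[deviceIndex] = nppCtx;
    }
  }

  // Stream state is deliberately not cached: the current PyTorch stream may
  // change between calls, so refresh it every time.
  nppCtx.hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream();
  cudaError_t err = cudaStreamGetFlags(nppCtx.hStream, &nppCtx.nStreamFlags);
  TORCH_CHECK(
      err == cudaSuccess,
      "cudaStreamGetFlags failed: ",
      cudaGetErrorString(err));

  return nppCtx;
}
```

Because `nppCtx.hStream` is set to the current PyTorch stream, the `_Ctx()` NPP kernels launch on that stream directly, which is why the diff can drop the old `nppGetStream()` event synchronization; the cached version above preserves that property.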