diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 095c82e27..a21668955 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -109,32 +109,52 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
       caps.nMaxMBCount);
 
   // Decoder creation parameters, taken from DALI
-  CUVIDDECODECREATEINFO decoder_info = {};
-  decoder_info.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
-  decoder_info.ChromaFormat = videoFormat->chroma_format;
-  decoder_info.CodecType = videoFormat->codec;
-  decoder_info.ulHeight = videoFormat->coded_height;
-  decoder_info.ulWidth = videoFormat->coded_width;
-  decoder_info.ulMaxHeight = videoFormat->coded_height;
-  decoder_info.ulMaxWidth = videoFormat->coded_width;
-  decoder_info.ulTargetHeight =
+  CUVIDDECODECREATEINFO decoderParams = {};
+  decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
+  decoderParams.ChromaFormat = videoFormat->chroma_format;
+  decoderParams.OutputFormat = cudaVideoSurfaceFormat_NV12;
+  decoderParams.ulCreationFlags = cudaVideoCreate_Default;
+  decoderParams.CodecType = videoFormat->codec;
+  decoderParams.ulHeight = videoFormat->coded_height;
+  decoderParams.ulWidth = videoFormat->coded_width;
+  decoderParams.ulMaxHeight = videoFormat->coded_height;
+  decoderParams.ulMaxWidth = videoFormat->coded_width;
+  decoderParams.ulTargetHeight =
       videoFormat->display_area.bottom - videoFormat->display_area.top;
-  decoder_info.ulTargetWidth =
+  decoderParams.ulTargetWidth =
       videoFormat->display_area.right - videoFormat->display_area.left;
-  decoder_info.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
-  decoder_info.ulNumOutputSurfaces = 2;
-  decoder_info.display_area.left = videoFormat->display_area.left;
-  decoder_info.display_area.right = videoFormat->display_area.right;
-  decoder_info.display_area.top = videoFormat->display_area.top;
-  decoder_info.display_area.bottom = videoFormat->display_area.bottom;
+  decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
+  decoderParams.ulNumOutputSurfaces = 2;
+  decoderParams.display_area.left = videoFormat->display_area.left;
+  decoderParams.display_area.right = videoFormat->display_area.right;
+  decoderParams.display_area.top = videoFormat->display_area.top;
+  decoderParams.display_area.bottom = videoFormat->display_area.bottom;
 
   CUvideodecoder* decoder = new CUvideodecoder();
-  result = cuvidCreateDecoder(decoder, &decoder_info);
+  result = cuvidCreateDecoder(decoder, &decoderParams);
   TORCH_CHECK(
       result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
 
   return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
 }
 
+cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
+  switch (codecId) {
+    case AV_CODEC_ID_H264:
+      return cudaVideoCodec_H264;
+    case AV_CODEC_ID_HEVC:
+      return cudaVideoCodec_HEVC;
+    // TODONVDEC P0: support more codecs
+    // case AV_CODEC_ID_AV1: return cudaVideoCodec_AV1;
+    // case AV_CODEC_ID_MPEG4: return cudaVideoCodec_MPEG4;
+    // case AV_CODEC_ID_VP8: return cudaVideoCodec_VP8;
+    // case AV_CODEC_ID_VP9: return cudaVideoCodec_VP9;
+    // case AV_CODEC_ID_MJPEG: return cudaVideoCodec_JPEG;
+    default: {
+      TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
+    }
+  }
+}
+
 } // namespace
 
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -160,36 +180,100 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
   }
 }
 
-void BetaCudaDeviceInterface::initialize(const AVStream* avStream) {
+void BetaCudaDeviceInterface::initialize(
+    const AVStream* avStream,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
   torch::Tensor dummyTensorForCudaInitialization = torch::empty(
       {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
 
-  TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
-  timeBase_ = avStream->time_base;
-
   auto cudaDevice = torch::Device(torch::kCUDA);
   defaultCudaInterface_ = std::unique_ptr(createDeviceInterface(cudaDevice));
   AVCodecContext dummyCodecContext = {};
-  defaultCudaInterface_->initialize(avStream);
+  defaultCudaInterface_->initialize(avStream, avFormatCtx);
   defaultCudaInterface_->registerHardwareDeviceWithCodec(&dummyCodecContext);
 
-  const AVCodecParameters* codecpar = avStream->codecpar;
-  TORCH_CHECK(codecpar != nullptr, "CodecParameters cannot be null");
+  TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
+  timeBase_ = avStream->time_base;
+
+  const AVCodecParameters* codecPar = avStream->codecpar;
+  TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");
+
+  initializeBSF(codecPar, avFormatCtx);
+
+  // Create parser. Default values that aren't obvious are taken from DALI.
+  CUVIDPARSERPARAMS parserParams = {};
+  parserParams.CodecType = validateCodecSupport(codecPar->codec_id);
+  parserParams.ulMaxNumDecodeSurfaces = 8;
+  parserParams.ulMaxDisplayDelay = 0;
+  // Callback setup, all are triggered by the parser within a call
+  // to cuvidParseVideoData
+  parserParams.pUserData = this;
+  parserParams.pfnSequenceCallback = pfnSequenceCallback;
+  parserParams.pfnDecodePicture = pfnDecodePictureCallback;
+  parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;
+
+  CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
   TORCH_CHECK(
-      // TODONVDEC P0 support more
-      avStream->codecpar->codec_id == AV_CODEC_ID_H264,
-      "Can only do H264 for now");
+      result == CUDA_SUCCESS, "Failed to create video parser: ", result);
+}
 
+void BetaCudaDeviceInterface::initializeBSF(
+    const AVCodecParameters* codecPar,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
   // Setup bit stream filters (BSF):
   // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
-  // This is only needed for some formats, like H264 or HEVC. TODONVDEC P1: For
-  // now we apply BSF unconditionally, but it should be optional and dependent
-  // on codec and container.
-  const AVBitStreamFilter* avBSF = av_bsf_get_by_name("h264_mp4toannexb");
+  // This is only needed for some formats, like H264 or HEVC.
+
+  TORCH_CHECK(codecPar != nullptr, "codecPar cannot be null");
+  TORCH_CHECK(avFormatCtx != nullptr, "AVFormatContext cannot be null");
+  TORCH_CHECK(
+      avFormatCtx->iformat != nullptr,
+      "AVFormatContext->iformat cannot be null");
+  std::string filterName;
+
+  // Matching logic is taken from DALI
+  switch (codecPar->codec_id) {
+    case AV_CODEC_ID_H264: {
+      const std::string formatName = avFormatCtx->iformat->long_name
+          ? avFormatCtx->iformat->long_name
+          : "";
+
+      if (formatName == "QuickTime / MOV" ||
+          formatName == "FLV (Flash Video)" ||
+          formatName == "Matroska / WebM" || formatName == "raw H.264 video") {
+        filterName = "h264_mp4toannexb";
+      }
+      break;
+    }
+
+    case AV_CODEC_ID_HEVC: {
+      const std::string formatName = avFormatCtx->iformat->long_name
+          ? avFormatCtx->iformat->long_name
+          : "";
+
+      if (formatName == "QuickTime / MOV" ||
+          formatName == "FLV (Flash Video)" ||
+          formatName == "Matroska / WebM" || formatName == "raw HEVC video") {
+        filterName = "hevc_mp4toannexb";
+      }
+      break;
+    }
+
+    default:
+      // No bitstream filter needed for other codecs
+      // TODONVDEC P1 MPEG4 will need one!
+      break;
+  }
+
+  if (filterName.empty()) {
+    // Only initialize BSF if we actually need one
+    return;
+  }
+
+  const AVBitStreamFilter* avBSF = av_bsf_get_by_name(filterName.c_str());
   TORCH_CHECK(
-      avBSF != nullptr, "Failed to find h264_mp4toannexb bitstream filter");
+      avBSF != nullptr, "Failed to find bitstream filter: ", filterName);
 
   AVBSFContext* avBSFContext = nullptr;
   int retVal = av_bsf_alloc(avBSF, &avBSFContext);
@@ -200,7 +284,7 @@ void BetaCudaDeviceInterface::initialize(const AVStream* avStream) {
 
   bitstreamFilter_.reset(avBSFContext);
 
-  retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecpar);
+  retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecPar);
   TORCH_CHECK(
       retVal >= AVSUCCESS,
       "Failed to copy codec parameters: ",
@@ -211,22 +295,6 @@ void BetaCudaDeviceInterface::initialize(const AVStream* avStream) {
       retVal == AVSUCCESS,
       "Failed to initialize bitstream filter: ",
       getFFMPEGErrorStringFromErrorCode(retVal));
-
-  // Create parser. Default values that aren't obvious are taken from DALI.
-  CUVIDPARSERPARAMS parserParams = {};
-  parserParams.CodecType = cudaVideoCodec_H264;
-  parserParams.ulMaxNumDecodeSurfaces = 8;
-  parserParams.ulMaxDisplayDelay = 0;
-  // Callback setup, all are triggered by the parser within a call
-  // to cuvidParseVideoData
-  parserParams.pUserData = this;
-  parserParams.pfnSequenceCallback = pfnSequenceCallback;
-  parserParams.pfnDecodePicture = pfnDecodePictureCallback;
-  parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;
-
-  CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
-  TORCH_CHECK(
-      result == CUDA_SUCCESS, "Failed to create video parser: ", result);
 }
 
 // This callback is called by the parser within cuvidParseVideoData when there
@@ -360,6 +428,10 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
   readyFrames_.pop();
 
+  // TODONVDEC P1 we need to set the procParams.output_stream field to the
+  // current CUDA stream and ensure proper synchronization. There's a related
+  // NVDECTODO in CudaDeviceInterface.cpp where we do the necessary
+  // synchronization for NPP.
   CUVIDPROCPARAMS procParams = {};
   procParams.progressive_frame = dispInfo.progressive_frame;
   procParams.top_field_first = dispInfo.top_field_first;
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
index d5f436b14..fd6138af8 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.h
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -37,7 +37,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   explicit BetaCudaDeviceInterface(const torch::Device& device);
   virtual ~BetaCudaDeviceInterface();
 
-  void initialize(const AVStream* avStream) override;
+  void initialize(
+      const AVStream* avStream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) override;
 
   void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
@@ -61,6 +63,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
  private:
   // Apply bitstream filter, modifies packet in-place
   void applyBSF(ReferenceAVPacket& packet);
+  void initializeBSF(
+      const AVCodecParameters* codecPar,
+      const UniqueDecodingAVFormatContext& avFormatCtx);
 
   UniqueAVFrame convertCudaFrameToAVFrame(
       CUdeviceptr framePtr,
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
index 8c85c2fcf..e6b96e3e4 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.cpp
+++ b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -46,7 +46,9 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
       device_.type() == torch::kCPU, "Unsupported device: ", device_.str());
 }
 
-void CpuDeviceInterface::initialize(const AVStream* avStream) {
+void CpuDeviceInterface::initialize(
+    const AVStream* avStream,
+    [[maybe_unused]] const UniqueDecodingAVFormatContext& avFormatCtx) {
   TORCH_CHECK(avStream != nullptr, "avStream is null");
   timeBase_ = avStream->time_base;
 }
diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
index 305b5ae14..399b0c6be 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.h
+++ b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -23,7 +23,9 @@ class CpuDeviceInterface : public DeviceInterface {
     return std::nullopt;
   }
 
-  virtual void initialize(const AVStream* avStream) override;
+  virtual void initialize(
+      const AVStream* avStream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) override;
 
   virtual void initializeVideo(
       const VideoStreamOptions& videoStreamOptions,
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index c7f02185a..37e5053d5 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -203,14 +203,16 @@ CudaDeviceInterface::~CudaDeviceInterface() {
   }
 }
 
-void CudaDeviceInterface::initialize(const AVStream* avStream) {
+void CudaDeviceInterface::initialize(
+    const AVStream* avStream,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
   TORCH_CHECK(avStream != nullptr, "avStream is null");
   timeBase_ = avStream->time_base;
 
   cpuInterface_ = createDeviceInterface(torch::kCPU);
   TORCH_CHECK(
       cpuInterface_ != nullptr, "Failed to create CPU device interface");
-  cpuInterface_->initialize(avStream);
+  cpuInterface_->initialize(avStream, avFormatCtx);
   cpuInterface_->initializeVideo(
       VideoStreamOptions(),
       {},
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index 42d517a72..88a8e5b9c 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -20,7 +20,9 @@ class CudaDeviceInterface
     : public DeviceInterface {
   std::optional findCodec(const AVCodecID& codecId) override;
 
-  void initialize(const AVStream* avStream) override;
+  void initialize(
+      const AVStream* avStream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) override;
 
   void initializeVideo(
       const VideoStreamOptions& videoStreamOptions,
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
index b7d5ef07a..284d83378 100644
--- a/src/torchcodec/_core/DeviceInterface.h
+++ b/src/torchcodec/_core/DeviceInterface.h
@@ -52,7 +52,9 @@ class DeviceInterface {
   };
 
   // Initialize the device with parameters generic to all kinds of decoding.
-  virtual void initialize(const AVStream* avStream) = 0;
+  virtual void initialize(
+      const AVStream* avStream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) = 0;
 
   // Initialize the device with parameters specific to video decoding. There is
   // a default empty implementation.
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index afa852e48..a6239f3a3 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -439,7 +439,7 @@ void SingleStreamDecoder::addStream(
   TORCH_CHECK(
       deviceInterface_ != nullptr,
       "Failed to create device interface. This should never happen, please report.");
-  deviceInterface_->initialize(streamInfo.stream);
+  deviceInterface_->initialize(streamInfo.stream, formatContext_);
 
   // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within
   // addStream() which is supposed to be generic
diff --git a/test/resources/testsrc2_h265.mp4 b/test/resources/testsrc2_h265.mp4
new file mode 100644
index 000000000..0563d1999
Binary files /dev/null and b/test/resources/testsrc2_h265.mp4 differ
diff --git a/test/test_decoders.py b/test/test_decoders.py
index 7ffc67566..c1319cb61 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -44,6 +44,7 @@
     SINE_MONO_S32_44100,
     SINE_MONO_S32_8000,
     TEST_SRC_2_720P,
+    TEST_SRC_2_720P_H265,
 )
 
 
@@ -1415,7 +1416,9 @@ def test_get_frames_at_tensor_indices(self):
         # assert_tensor_close_on_at_least or something like that.
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frame_at(
@@ -1445,7 +1448,9 @@ def test_beta_cuda_interface_get_frame_at(
         assert beta_frame.duration_seconds == ref_frame.duration_seconds
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frames_at(
@@ -1476,7 +1481,9 @@ def test_beta_cuda_interface_get_frames_at(
         )
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
@@ -1498,7 +1505,9 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
         assert beta_frame.duration_seconds == ref_frame.duration_seconds
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
@@ -1521,7 +1530,9 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
         )
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_backwards(self, asset, seek_mode):
 
@@ -1541,12 +1552,24 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
             assert beta_frame.pts_seconds == ref_frame.pts_seconds
             assert beta_frame.duration_seconds == ref_frame.duration_seconds
 
+    @needs_cuda
+    def test_beta_cuda_interface_small_h265(self):
+        # TODONVDEC P2 investigate why/how the default interface can decode this
+        # video.
+
+        # This is fine on the default interface - why?
+        VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
+
+        # But it fails on the beta interface due to input validation checks, which we took from DALI!
+        with pytest.raises(
+            RuntimeError,
+            match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144",
+        ):
+            VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0)
+
     @needs_cuda
     def test_beta_cuda_interface_error(self):
-        with pytest.raises(RuntimeError, match="Can only do H264 for now"):
+        with pytest.raises(RuntimeError, match="Unsupported codec type: av1"):
             VideoDecoder(AV1_VIDEO.path, device="cuda:0:beta")
 
-        with pytest.raises(RuntimeError, match="Can only do H264 for now"):
-            VideoDecoder(H265_VIDEO.path, device="cuda:0:beta")
-
         with pytest.raises(RuntimeError, match="Unsupported device"):
             VideoDecoder(NASA_VIDEO.path, device="cuda:0:bad_variant")
diff --git a/test/utils.py b/test/utils.py
index 3ce603a9f..644dc0bce 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -688,3 +688,12 @@ def sample_format(self) -> str:
     },
     frames={0: {}},  # Not needed for now
 )
+
+# ffmpeg -f lavfi -i testsrc2=duration=10:size=1280x720:rate=30 -c:v libx265 -crf 23 -preset medium output.mp4
+TEST_SRC_2_720P_H265 = TestVideo(
+    filename="testsrc2_h265.mp4",
+    default_stream_index=0,
+    stream_infos={
+        0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
+    },
+    frames={0: {}},  # Not needed for now
+)
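
A few illustrative notes on the NVDEC and FFmpeg APIs this patch builds on. The parser created in initialize() is entirely callback-driven: pUserData is set to this, and the three pfn callbacks run synchronously from inside cuvidParseVideoData() as each packet is parsed. The sketch below shows that wiring with static trampolines that recover the object from pUserData; the class and handler names are made up for illustration and are not the ones used in BetaCudaDeviceInterface.

// Sketch of the NVCUVID parser wiring used by the beta interface:
// static trampolines recover `this` from pUserData, and sendPacket()
// drives the callbacks via cuvidParseVideoData(). Names are illustrative.
#include <cstdint>
#include <nvcuvid.h>

class ParserSketch {
 public:
  bool init(cudaVideoCodec codec) {
    CUVIDPARSERPARAMS params = {};
    params.CodecType = codec;
    params.ulMaxNumDecodeSurfaces = 8;
    params.ulMaxDisplayDelay = 0;  // low latency: display as soon as decoded
    params.pUserData = this;       // passed back to every callback
    params.pfnSequenceCallback = &ParserSketch::onSequence;
    params.pfnDecodePicture = &ParserSketch::onDecode;
    params.pfnDisplayPicture = &ParserSketch::onDisplay;
    return cuvidCreateVideoParser(&parser_, &params) == CUDA_SUCCESS;
  }

  // Feed one demuxed (and, if needed, Annex B-filtered) packet. The three
  // callbacks below run synchronously inside this call.
  bool sendPacket(const uint8_t* data, unsigned long size, long long pts) {
    CUVIDSOURCEDATAPACKET packet = {};
    packet.payload = data;
    packet.payload_size = size;
    packet.flags = CUVID_PKT_TIMESTAMP;
    packet.timestamp = pts;
    return cuvidParseVideoData(parser_, &packet) == CUDA_SUCCESS;
  }

 private:
  static int CUDAAPI onSequence(void* user, CUVIDEOFORMAT* format) {
    // New sequence header: (re)create the decoder from `format`.
    return static_cast<ParserSketch*>(user)->handleSequence(format);
  }
  static int CUDAAPI onDecode(void* user, CUVIDPICPARAMS* picParams) {
    // A picture is ready to be decoded with cuvidDecodePicture().
    return static_cast<ParserSketch*>(user)->handleDecode(picParams);
  }
  static int CUDAAPI onDisplay(void* user, CUVIDPARSERDISPINFO* dispInfo) {
    // A decoded picture is available in display order: queue it for mapping.
    return static_cast<ParserSketch*>(user)->handleDisplay(dispInfo);
  }

  int handleSequence(CUVIDEOFORMAT*) { return 1; }  // 1 == proceed
  int handleDecode(CUVIDPICPARAMS*) { return 1; }
  int handleDisplay(CUVIDPARSERDISPINFO*) { return 1; }

  CUvideoparser parser_ = nullptr;
};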
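
initializeBSF() only creates a bitstream filter when the codec/container combination needs conversion to Annex B, and applyBSF() (declared in the header above) then pushes each packet through it. For reference, here is a minimal, self-contained sketch of the underlying FFmpeg bitstream-filter flow using only public libavcodec calls. The helper name is invented, and unlike the patch, which allocates the filter once at initialize time and reuses it, this sketch creates and frees it per call for brevity.

// Minimal sketch of the FFmpeg bitstream-filter flow that initializeBSF()
// and applyBSF() rely on. Error handling is reduced to a bool return;
// torchcodec uses TORCH_CHECK instead. Illustrative only.
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavcodec/bsf.h>  // on old FFmpeg these declarations live in avcodec.h
}

// Convert one packet in-place from MP4/MOV style (length-prefixed NALs) to
// Annex B (start codes), assuming `filterName` is e.g. "h264_mp4toannexb".
static bool annexbFilterPacket(
    const AVCodecParameters* codecPar,
    const char* filterName,
    AVPacket* packet) {
  const AVBitStreamFilter* bsf = av_bsf_get_by_name(filterName);
  if (bsf == nullptr) {
    return false;
  }

  AVBSFContext* ctx = nullptr;
  if (av_bsf_alloc(bsf, &ctx) < 0) {
    return false;
  }
  // The filter needs the stream's extradata (SPS/PPS) to do the conversion.
  if (avcodec_parameters_copy(ctx->par_in, codecPar) < 0 ||
      av_bsf_init(ctx) < 0) {
    av_bsf_free(&ctx);
    return false;
  }

  // Push the packet in, then pull the filtered packet back out.
  // mp4toannexb is one-in/one-out, so a single send/receive pair suffices.
  bool ok = av_bsf_send_packet(ctx, packet) >= 0 &&
      av_bsf_receive_packet(ctx, packet) >= 0;
  av_bsf_free(&ctx);
  return ok;
}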
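
The container matching in initializeBSF() keys off avFormatCtx->iformat->long_name, comparing against strings such as "QuickTime / MOV" and "Matroska / WebM". A quick way to see which long_name a given file resolves to, and therefore whether the mp4toannexb filter would be selected, is a small libavformat probe like the one below; the program itself is not part of torchcodec.

// Print the demuxer long_name that initializeBSF() matches on.
// Build against libavformat; usage: ./probe_format <video file>.
extern "C" {
#include <libavformat/avformat.h>
}
#include <cstdio>

int main(int argc, char** argv) {
  if (argc < 2) {
    std::fprintf(stderr, "usage: %s <video file>\n", argv[0]);
    return 1;
  }

  AVFormatContext* ctx = nullptr;
  if (avformat_open_input(&ctx, argv[1], nullptr, nullptr) < 0) {
    std::fprintf(stderr, "could not open %s\n", argv[1]);
    return 1;
  }

  // For an .mp4 this prints "QuickTime / MOV", which is one of the strings
  // that selects h264_mp4toannexb / hevc_mp4toannexb in the patch.
  std::printf("iformat->name: %s\n", ctx->iformat->name);
  std::printf(
      "iformat->long_name: %s\n",
      ctx->iformat->long_name ? ctx->iformat->long_name : "(null)");

  avformat_close_input(&ctx);
  return 0;
}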
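
The new test_beta_cuda_interface_small_h265 test pins down the DALI-style capability validation: H265_VIDEO is 128x128, below the 144x144 minimum that NVDEC reports for HEVC, so the beta interface rejects it while the default CUDA interface still decodes it. The check itself lives earlier in createDecoder() than the hunk shown above (which only shows its tail, caps.nMaxMBCount). The sketch below shows what such a cuvidGetDecoderCaps()-based check typically looks like; the error wording is assumed to match the message asserted in the test, and this is not the exact torchcodec code.

// Sketch of a DALI-style capability check with cuvidGetDecoderCaps().
// The real check lives earlier in createDecoder(); the wording here is
// assumed to match the message asserted in test_beta_cuda_interface_small_h265.
#include <nvcuvid.h>
#include <torch/torch.h>  // for TORCH_CHECK

static void checkDimensions(const CUVIDEOFORMAT* videoFormat) {
  CUVIDDECODECAPS caps = {};
  caps.eCodecType = videoFormat->codec;
  caps.eChromaFormat = videoFormat->chroma_format;
  caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8;

  CUresult result = cuvidGetDecoderCaps(&caps);
  TORCH_CHECK(result == CUDA_SUCCESS, "cuvidGetDecoderCaps failed: ", result);
  TORCH_CHECK(caps.bIsSupported, "Codec not supported on this GPU");

  // For HEVC most NVDEC generations report nMinWidth = nMinHeight = 144,
  // hence "Provided: 128x128 vs supported:144x144" for the 128x128 asset.
  TORCH_CHECK(
      videoFormat->coded_width >= caps.nMinWidth &&
          videoFormat->coded_height >= caps.nMinHeight,
      "Video is too small in at least one dimension. Provided: ",
      videoFormat->coded_width, "x", videoFormat->coded_height,
      " vs supported:", caps.nMinWidth, "x", caps.nMinHeight);
}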