diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index a21668955..a502de0de 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -143,8 +143,9 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) { return cudaVideoCodec_H264; case AV_CODEC_ID_HEVC: return cudaVideoCodec_HEVC; + case AV_CODEC_ID_AV1: + return cudaVideoCodec_AV1; // TODONVDEC P0: support more codecs - // case AV_CODEC_ID_AV1: return cudaVideoCodec_AV1; // case AV_CODEC_ID_MPEG4: return cudaVideoCodec_MPEG4; // case AV_CODEC_ID_VP8: return cudaVideoCodec_VP8; // case AV_CODEC_ID_VP9: return cudaVideoCodec_VP9; @@ -195,6 +196,7 @@ void BetaCudaDeviceInterface::initialize( TORCH_CHECK(avStream != nullptr, "AVStream cannot be null"); timeBase_ = avStream->time_base; + frameRateAvgFromFFmpeg_ = avStream->r_frame_rate; const AVCodecParameters* codecPar = avStream->codecpar; TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null"); @@ -494,14 +496,19 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame( avFrame->format = AV_PIX_FMT_CUDA; avFrame->pts = dispInfo.timestamp; - // TODONVDEC P0: Zero division error!!! - // TODONVDEC P0: Move AVRational arithmetic to FFMPEGCommon, and put the - // similar SingleStreamDecoder stuff there too. - unsigned int frameRateNum = videoFormat_.frame_rate.numerator; - unsigned int frameRateDen = videoFormat_.frame_rate.denominator; - int64_t duration = static_cast((frameRateDen * timeBase_.den)) / - (frameRateNum * timeBase_.num); - setDuration(avFrame, duration); + // TODONVDEC P2: We compute the duration based on average frame rate info: + // either from NVCUVID if it's valid, otherwise from FFmpeg as fallback. But + // both of these are based on average frame rate, so if the video has + // variable frame rate, the durations may be off. We should try to see if we + // can set the duration more accurately. Unfortunately it's not given by + // dispInfo. One option would be to set it based on the pts difference between + // consecutive frames, if the next frame is already available. + int frameRateNum = static_cast(videoFormat_.frame_rate.numerator); + int frameRateDen = static_cast(videoFormat_.frame_rate.denominator); + AVRational frameRate = (frameRateNum > 0 && frameRateDen > 0) + ? AVRational{frameRateNum, frameRateDen} + : frameRateAvgFromFFmpeg_; + setDuration(avFrame, computeSafeDuration(frameRate, timeBase_)); // We need to assign the frame colorspace. This is crucial for proper color // conversion. NVCUVID stores that in the matrix_coefficients field, but diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index fd6138af8..9c452c48c 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -84,7 +84,8 @@ class BetaCudaDeviceInterface : public DeviceInterface { // isFlushing_) bool isFlushing_ = false; - AVRational timeBase_ = {0, 0}; + AVRational timeBase_ = {0, 1}; + AVRational frameRateAvgFromFFmpeg_ = {0, 1}; UniqueAVBSFContext bitstreamFilter_; diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp index 200fc9359..93176917c 100644 --- a/src/torchcodec/_core/FFMPEGCommon.cpp +++ b/src/torchcodec/_core/FFMPEGCommon.cpp @@ -501,4 +501,26 @@ AVIOContext* avioAllocContext( seek); } +double ptsToSeconds(int64_t pts, const AVRational& timeBase) { + // To perform the multiplication before the division, av_q2d is not used + return static_cast(pts) * timeBase.num / timeBase.den; +} + +int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) { + return static_cast( + std::round(seconds * timeBase.den / timeBase.num)); +} + +int64_t computeSafeDuration( + const AVRational& frameRate, + const AVRational& timeBase) { + if (frameRate.num <= 0 || frameRate.den <= 0 || timeBase.num <= 0 || + timeBase.den <= 0) { + return 0; + } else { + return (static_cast(frameRate.den) * timeBase.den) / + (static_cast(timeBase.num) * frameRate.num); + } +} + } // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h index ac40f079a..6bdcffc28 100644 --- a/src/torchcodec/_core/FFMPEGCommon.h +++ b/src/torchcodec/_core/FFMPEGCommon.h @@ -232,4 +232,10 @@ AVIOContext* avioAllocContext( AVIOWriteFunction write_packet, AVIOSeekFunction seek); +double ptsToSeconds(int64_t pts, const AVRational& timeBase); +int64_t secondsToClosestPts(double seconds, const AVRational& timeBase); +int64_t computeSafeDuration( + const AVRational& frameRate, + const AVRational& timeBase); + } // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index a6239f3a3..53c15cd33 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -17,16 +17,6 @@ namespace facebook::torchcodec { namespace { -double ptsToSeconds(int64_t pts, const AVRational& timeBase) { - // To perform the multiplication before the division, av_q2d is not used - return static_cast(pts) * timeBase.num / timeBase.den; -} - -int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) { - return static_cast( - std::round(seconds * timeBase.den / timeBase.num)); -} - // Some videos aren't properly encoded and do not specify pts values for // packets, and thus for frames. Unset values correspond to INT64_MIN. When that // happens, we fallback to the dts value which hopefully exists and is correct. diff --git a/test/test_decoders.py b/test/test_decoders.py index c1319cb61..260a7baa2 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -1417,13 +1417,23 @@ def test_get_frames_at_tensor_indices(self): @needs_cuda @pytest.mark.parametrize( - "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265) + "asset", + ( + NASA_VIDEO, + TEST_SRC_2_720P, + BT709_FULL_RANGE, + TEST_SRC_2_720P_H265, + AV1_VIDEO, + ), ) @pytest.mark.parametrize("contiguous_indices", (True, False)) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_get_frame_at( self, asset, contiguous_indices, seek_mode ): + if asset == AV1_VIDEO and seek_mode == "approximate": + pytest.skip("AV1 asset doesn't work with approximate mode") + ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode) beta_decoder = VideoDecoder( asset.path, device="cuda:0:beta", seek_mode=seek_mode @@ -1449,13 +1459,23 @@ def test_beta_cuda_interface_get_frame_at( @needs_cuda @pytest.mark.parametrize( - "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265) + "asset", + ( + NASA_VIDEO, + TEST_SRC_2_720P, + BT709_FULL_RANGE, + TEST_SRC_2_720P_H265, + AV1_VIDEO, + ), ) @pytest.mark.parametrize("contiguous_indices", (True, False)) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_get_frames_at( self, asset, contiguous_indices, seek_mode ): + if asset == AV1_VIDEO and seek_mode == "approximate": + pytest.skip("AV1 asset doesn't work with approximate mode") + ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode) beta_decoder = VideoDecoder( asset.path, device="cuda:0:beta", seek_mode=seek_mode @@ -1482,10 +1502,20 @@ def test_beta_cuda_interface_get_frames_at( @needs_cuda @pytest.mark.parametrize( - "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265) + "asset", + ( + NASA_VIDEO, + TEST_SRC_2_720P, + BT709_FULL_RANGE, + TEST_SRC_2_720P_H265, + AV1_VIDEO, + ), ) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode): + if asset == AV1_VIDEO and seek_mode == "approximate": + pytest.skip("AV1 asset doesn't work with approximate mode") + ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode) beta_decoder = VideoDecoder( asset.path, device="cuda:0:beta", seek_mode=seek_mode @@ -1499,17 +1529,30 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode): for pts in timestamps: ref_frame = ref_decoder.get_frame_played_at(pts) beta_frame = beta_decoder.get_frame_played_at(pts) - torch.testing.assert_close(beta_frame.data, ref_frame.data, rtol=0, atol=0) + if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above + torch.testing.assert_close( + beta_frame.data, ref_frame.data, rtol=0, atol=0 + ) assert beta_frame.pts_seconds == ref_frame.pts_seconds assert beta_frame.duration_seconds == ref_frame.duration_seconds @needs_cuda @pytest.mark.parametrize( - "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265) + "asset", + ( + NASA_VIDEO, + TEST_SRC_2_720P, + BT709_FULL_RANGE, + TEST_SRC_2_720P_H265, + AV1_VIDEO, + ), ) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode): + if asset == AV1_VIDEO and seek_mode == "approximate": + pytest.skip("AV1 asset doesn't work with approximate mode") + ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode) beta_decoder = VideoDecoder( asset.path, device="cuda:0:beta", seek_mode=seek_mode @@ -1523,7 +1566,10 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode): ref_frames = ref_decoder.get_frames_played_at(timestamps) beta_frames = beta_decoder.get_frames_played_at(timestamps) - torch.testing.assert_close(beta_frames.data, ref_frames.data, rtol=0, atol=0) + if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above + torch.testing.assert_close( + beta_frames.data, ref_frames.data, rtol=0, atol=0 + ) torch.testing.assert_close(beta_frames.pts_seconds, ref_frames.pts_seconds) torch.testing.assert_close( beta_frames.duration_seconds, ref_frames.duration_seconds @@ -1531,10 +1577,19 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode): @needs_cuda @pytest.mark.parametrize( - "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265) + "asset", + ( + NASA_VIDEO, + TEST_SRC_2_720P, + BT709_FULL_RANGE, + TEST_SRC_2_720P_H265, + AV1_VIDEO, + ), ) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_backwards(self, asset, seek_mode): + if asset == AV1_VIDEO and seek_mode == "approximate": + pytest.skip("AV1 asset doesn't work with approximate mode") ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode) beta_decoder = VideoDecoder( @@ -1543,11 +1598,20 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode): assert ref_decoder.metadata == beta_decoder.metadata - for frame_index in [0, 100, 10, 50, 20, 200, 150, 389]: + for frame_index in [0, 1, 2, 1, 0, 100, 10, 50, 20, 200, 150, 150, 150, 389, 2]: + # This is ugly, but OK: the indices values above are relevant for + # the NASA_VIDEO. We need to avoid going out of bounds for other + # videos so we cap the frame_index. This test still serves its + # purpose: no matter what the range of the video, we're still doing + # backwards seeks. frame_index = min(frame_index, len(ref_decoder) - 1) + ref_frame = ref_decoder.get_frame_at(frame_index) beta_frame = beta_decoder.get_frame_at(frame_index) - torch.testing.assert_close(beta_frame.data, ref_frame.data, rtol=0, atol=0) + if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above + torch.testing.assert_close( + beta_frame.data, ref_frame.data, rtol=0, atol=0 + ) assert beta_frame.pts_seconds == ref_frame.pts_seconds assert beta_frame.duration_seconds == ref_frame.duration_seconds @@ -1568,8 +1632,6 @@ def test_beta_cuda_interface_small_h265(self): @needs_cuda def test_beta_cuda_interface_error(self): - with pytest.raises(RuntimeError, match="Unsupported codec type: av1"): - VideoDecoder(AV1_VIDEO.path, device="cuda:0:beta") with pytest.raises(RuntimeError, match="Unsupported device"): VideoDecoder(NASA_VIDEO.path, device="cuda:0:bad_variant")