From 389c0789332e3e3e3e0ed6bcc21f09212e7f2f25 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Mon, 20 Oct 2025 14:03:01 -0700 Subject: [PATCH 1/3] fallback to container duration in approximate mode when stream duration is unavailable --- src/torchcodec/_core/SingleStreamDecoder.cpp | 10 ++++++++++ test/test_decoders.py | 11 ----------- test/utils.py | 7 ------- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 2fbc111c1..e21cca49b 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -158,6 +158,16 @@ void SingleStreamDecoder::initializeDecoder() { ptsToSeconds(formatContext_->duration, defaultTimeBase); } + // Use container duration as fallback for streams missing stream duration + if (containerMetadata_.durationSecondsFromHeader.has_value()) { + for (auto& streamMetadata : containerMetadata_.allStreamMetadata) { + if (!streamMetadata.durationSecondsFromHeader.has_value()) { + streamMetadata.durationSecondsFromHeader = + containerMetadata_.durationSecondsFromHeader; + } + } + } + if (formatContext_->bit_rate > 0) { containerMetadata_.bitRate = formatContext_->bit_rate; } diff --git a/test/test_decoders.py b/test/test_decoders.py index 098e4e969..2000a1572 100644 --- a/test/test_decoders.py +++ b/test/test_decoders.py @@ -46,7 +46,6 @@ SINE_MONO_S32, SINE_MONO_S32_44100, SINE_MONO_S32_8000, - supports_approximate_mode, TEST_SRC_2_720P, TEST_SRC_2_720P_H265, TEST_SRC_2_720P_MPEG4, @@ -1492,8 +1491,6 @@ def test_get_frames_at_tensor_indices(self): def test_beta_cuda_interface_get_frame_at( self, asset, contiguous_indices, seek_mode ): - if seek_mode == "approximate" and not supports_approximate_mode(asset): - pytest.skip("asset doesn't work with approximate mode") if in_fbcode() and asset is AV1_VIDEO: pytest.skip("AV1 CUDA not supported internally") @@ -1540,8 +1537,6 @@ def test_beta_cuda_interface_get_frame_at( def test_beta_cuda_interface_get_frames_at( self, asset, contiguous_indices, seek_mode ): - if seek_mode == "approximate" and not supports_approximate_mode(asset): - pytest.skip("asset doesn't work with approximate mode") if in_fbcode() and asset is AV1_VIDEO: pytest.skip("AV1 CUDA not supported internally") @@ -1585,8 +1580,6 @@ def test_beta_cuda_interface_get_frames_at( ) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode): - if seek_mode == "approximate" and not supports_approximate_mode(asset): - pytest.skip("asset doesn't work with approximate mode") if in_fbcode() and asset is AV1_VIDEO: pytest.skip("AV1 CUDA not supported internally") @@ -1627,8 +1620,6 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode): ) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode): - if seek_mode == "approximate" and not supports_approximate_mode(asset): - pytest.skip("asset doesn't work with approximate mode") if in_fbcode() and asset is AV1_VIDEO: pytest.skip("AV1 CUDA not supported internally") @@ -1670,8 +1661,6 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode): ) @pytest.mark.parametrize("seek_mode", ("exact", "approximate")) def test_beta_cuda_interface_backwards(self, asset, seek_mode): - if seek_mode == "approximate" and not supports_approximate_mode(asset): - pytest.skip("asset doesn't work with approximate mode") if in_fbcode() and asset is AV1_VIDEO: pytest.skip("AV1 CUDA not supported internally") diff --git a/test/utils.py b/test/utils.py index b59681b37..08e120c06 100644 --- a/test/utils.py +++ b/test/utils.py @@ -798,10 +798,3 @@ def sample_format(self) -> str: }, frames={0: {}}, # Not needed for now ) - - -def supports_approximate_mode(asset: TestVideo) -> bool: - # Those are missing the `duration` field so they fail in approximate mode (on all devices). - # TODO: we should address this, see - # https://github.com/meta-pytorch/torchcodec/issues/945 - return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8) From 27718be8f0f800377787458169e3d0fe20ef14ef Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Thu, 13 Nov 2025 11:18:14 -0800 Subject: [PATCH 2/3] update pr to reflect refactor --- src/torchcodec/_core/Metadata.cpp | 9 ++++++--- src/torchcodec/_core/Metadata.h | 3 +++ src/torchcodec/_core/SingleStreamDecoder.cpp | 17 +++++++---------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/torchcodec/_core/Metadata.cpp b/src/torchcodec/_core/Metadata.cpp index 58a115dcf..c717815df 100644 --- a/src/torchcodec/_core/Metadata.cpp +++ b/src/torchcodec/_core/Metadata.cpp @@ -29,6 +29,9 @@ std::optional StreamMetadata::getDurationSeconds( return static_cast(numFramesFromHeader.value()) / averageFpsFromHeader.value(); } + if (durationSecondsFromContainer.has_value()) { + return durationSecondsFromContainer.value(); + } return std::nullopt; default: TORCH_CHECK(false, "Unknown SeekMode"); @@ -80,13 +83,13 @@ std::optional StreamMetadata::getNumFrames(SeekMode seekMode) const { numFramesFromContent.has_value(), "Missing numFramesFromContent"); return numFramesFromContent.value(); case SeekMode::approximate: { + auto durationSeconds = getDurationSeconds(seekMode); if (numFramesFromHeader.has_value()) { return numFramesFromHeader.value(); } - if (averageFpsFromHeader.has_value() && - durationSecondsFromHeader.has_value()) { + if (averageFpsFromHeader.has_value() && durationSeconds.has_value()) { return static_cast( - averageFpsFromHeader.value() * durationSecondsFromHeader.value()); + averageFpsFromHeader.value() * durationSeconds.value()); } return std::nullopt; } diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h index e138d5dc0..c3289868d 100644 --- a/src/torchcodec/_core/Metadata.h +++ b/src/torchcodec/_core/Metadata.h @@ -35,6 +35,9 @@ struct StreamMetadata { std::optional averageFpsFromHeader; std::optional bitRate; + // Used as fallback in approximate mode when stream duration is unavailable. + std::optional durationSecondsFromContainer; + // More accurate duration, obtained by scanning the file. // These presentation timestamps are in time base. std::optional beginStreamPtsFromContent; diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 53c73b5ae..508fc4bf4 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -158,16 +158,6 @@ void SingleStreamDecoder::initializeDecoder() { ptsToSeconds(formatContext_->duration, defaultTimeBase); } - // Use container duration as fallback for streams missing stream duration - if (containerMetadata_.durationSecondsFromHeader.has_value()) { - for (auto& streamMetadata : containerMetadata_.allStreamMetadata) { - if (!streamMetadata.durationSecondsFromHeader.has_value()) { - streamMetadata.durationSecondsFromHeader = - containerMetadata_.durationSecondsFromHeader; - } - } - } - if (formatContext_->bit_rate > 0) { containerMetadata_.bitRate = formatContext_->bit_rate; } @@ -182,6 +172,13 @@ void SingleStreamDecoder::initializeDecoder() { containerMetadata_.bestAudioStreamIndex = bestAudioStream; } + if (containerMetadata_.durationSecondsFromHeader.has_value()) { + for (auto& streamMetadata : containerMetadata_.allStreamMetadata) { + streamMetadata.durationSecondsFromContainer = + containerMetadata_.durationSecondsFromHeader; + } + } + if (seekMode_ == SeekMode::exact) { scanFileAndUpdateMetadataAndIndex(); } From 68e48db4b7506fc1269c524c3481c2991e71d827 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Tue, 18 Nov 2025 21:51:49 -0800 Subject: [PATCH 3/3] address feedback --- src/torchcodec/_core/SingleStreamDecoder.cpp | 50 +++++++++----------- src/torchcodec/_core/_metadata.py | 3 +- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 71056a546..22aca7bcd 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -100,6 +100,26 @@ void SingleStreamDecoder::initializeDecoder() { "Failed to find stream info: ", getFFMPEGErrorStringFromErrorCode(status)); + if (formatContext_->duration > 0) { + AVRational defaultTimeBase{1, AV_TIME_BASE}; + containerMetadata_.durationSecondsFromHeader = + ptsToSeconds(formatContext_->duration, defaultTimeBase); + } + + if (formatContext_->bit_rate > 0) { + containerMetadata_.bitRate = formatContext_->bit_rate; + } + + int bestVideoStream = getBestStreamIndex(AVMEDIA_TYPE_VIDEO); + if (bestVideoStream >= 0) { + containerMetadata_.bestVideoStreamIndex = bestVideoStream; + } + + int bestAudioStream = getBestStreamIndex(AVMEDIA_TYPE_AUDIO); + if (bestAudioStream >= 0) { + containerMetadata_.bestAudioStreamIndex = bestAudioStream; + } + for (unsigned int i = 0; i < formatContext_->nb_streams; i++) { AVStream* avStream = formatContext_->streams[i]; StreamMetadata streamMetadata; @@ -149,34 +169,10 @@ void SingleStreamDecoder::initializeDecoder() { containerMetadata_.numAudioStreams++; } - containerMetadata_.allStreamMetadata.push_back(streamMetadata); - } - - if (formatContext_->duration > 0) { - AVRational defaultTimeBase{1, AV_TIME_BASE}; - containerMetadata_.durationSecondsFromHeader = - ptsToSeconds(formatContext_->duration, defaultTimeBase); - } - - if (formatContext_->bit_rate > 0) { - containerMetadata_.bitRate = formatContext_->bit_rate; - } - - int bestVideoStream = getBestStreamIndex(AVMEDIA_TYPE_VIDEO); - if (bestVideoStream >= 0) { - containerMetadata_.bestVideoStreamIndex = bestVideoStream; - } + streamMetadata.durationSecondsFromContainer = + containerMetadata_.durationSecondsFromHeader; - int bestAudioStream = getBestStreamIndex(AVMEDIA_TYPE_AUDIO); - if (bestAudioStream >= 0) { - containerMetadata_.bestAudioStreamIndex = bestAudioStream; - } - - if (containerMetadata_.durationSecondsFromHeader.has_value()) { - for (auto& streamMetadata : containerMetadata_.allStreamMetadata) { - streamMetadata.durationSecondsFromContainer = - containerMetadata_.durationSecondsFromHeader; - } + containerMetadata_.allStreamMetadata.push_back(streamMetadata); } if (seekMode_ == SeekMode::exact) { diff --git a/src/torchcodec/_core/_metadata.py b/src/torchcodec/_core/_metadata.py index 08bcf2b55..1d5a7d103 100644 --- a/src/torchcodec/_core/_metadata.py +++ b/src/torchcodec/_core/_metadata.py @@ -44,7 +44,8 @@ class StreamMetadata: from the actual frames if a :term:`scan` was performed. Otherwise we fall back to ``duration_seconds_from_header``. If that value is also None, we instead calculate the duration from ``num_frames_from_header`` and - ``average_fps_from_header``. + ``average_fps_from_header``. If all of those are unavailable, we fall back + to the container-level ``duration_seconds_from_header``. """ begin_stream_seconds: Optional[float] """Beginning of the stream, in seconds (float). Conceptually, this