From 329a34b626ec26ab6bb7e7f6f8536892a4cb950e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2024 03:24:37 -0700 Subject: [PATCH 1/6] Add 'scan' term to glossary and update docs --- docs/source/glossary.rst | 5 ++ src/torchcodec/decoders/_core/_metadata.py | 52 ++++++++++--------- .../decoders/_simple_video_decoder.py | 7 ++- 3 files changed, 38 insertions(+), 26 deletions(-) diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index cb125f6e4..7afb74a72 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -12,3 +12,8 @@ Glossary `_: *The best stream is determined according to various heuristics as the most likely to be what the user expects.* + + scan + A scan corresponds to an entire pass over a video file, with the purpose + of retrieving metadata about the different streams and frames. **It does + not involve decoding**, so it is a lot cheaper than decoding the file. diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py index 83ed807c7..54a8d24c6 100644 --- a/src/torchcodec/decoders/_core/_metadata.py +++ b/src/torchcodec/decoders/_core/_metadata.py @@ -24,7 +24,7 @@ class VideoStreamMetadata: """Metadata of a single video stream.""" duration_seconds_from_header: Optional[float] - """Duration of the stream, in seconds obtained from the header (float or + """Duration of the stream, in seconds, obtained from the header (float or None). This could be inaccurate.""" bit_rate: Optional[float] """Bit rate of the stream, in seconds (float or None).""" @@ -38,19 +38,17 @@ class VideoStreamMetadata: than ``num_frames_from_header``. We recommend using the ``num_frames`` attribute instead. (int or None).""" begin_stream_from_content_seconds: Optional[float] - """Beginning of the stream in seconds (float or None). - This is min(frame.pts) for all frames in this stream.""" + """Beginning of the stream, in seconds (float or None). + This corresponds to the first frame's :term:`pts`. 
Usually, this is equal to + 0.""" end_stream_from_content_seconds: Optional[float] - """End of the stream in seconds (float or None). - This is max(frame.pts + frame.duration) for all frames in this stream. - Note that frames have a pts and duration and the interval defined by - [pts, pts + duration) is a half-open interval (the right boundary is open). - Therefore no frame is displayed at this time value. - Calling - SimpleVideoDecoder.get_frame_displayed_at(end_stream_from_content_seconds) - will raise a StopIteration exception. - If you want to get the last frame you can use [-1] on a SimpleVideoDecoder - object.""" + """End of the stream, in seconds (float or None). + This is last_frame.pts + last_frame.duration, so according to our + convention, no frame is displayed at this time: read more in + :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at`. + Retrieving the last frame is best done by simply indexing the + :class:`~torchcodec.decoders.SimpleVideoDecoder` object with ``[-1]``. + """ codec: Optional[str] """Codec (str or None).""" width: Optional[int] @@ -58,15 +56,16 @@ class VideoStreamMetadata: height: Optional[int] """Height of the frames (int or None).""" average_fps_from_header: Optional[float] - """Averate fps of the stream (float or None).""" + """Average fps of the stream, obtained from the header (float or None). + We recommend using the ``average_fps`` attribute instead.""" stream_index: int """Index of the stream within the video (int).""" @property def num_frames(self) -> Optional[int]: """Number of frames in the stream. This corresponds to - ``num_frames_from_content`` if it's not None, otherwise it corresponds - to ``num_frames_from_header``. + ``num_frames_from_content`` if a :term:`scan` was made, otherwise it + corresponds to ``num_frames_from_header``. 
""" if self.num_frames_from_content is not None: return self.num_frames_from_content @@ -76,8 +75,8 @@ def num_frames(self) -> Optional[int]: @property def duration_seconds(self) -> Optional[float]: """Duration of the stream in seconds. We try to calculate the duration - from the actual frames if we scanned the frames. Otherwise we fall back - to the duration obtained from the header. + from the actual frames if a :term:`scan` wsa performed. Otherwise we + fall back to ``duration_seconds_from_header``. """ if ( self.end_stream_from_content_seconds is None @@ -91,14 +90,17 @@ def duration_seconds(self) -> Optional[float]: @property def average_fps(self) -> Optional[float]: - """Average fps of the stream. We try to get the average fps from the - actual frames if we scanned the frames. Otherwise we fall back to the - fps obtained from the header. + """Average fps of the stream. If a :term:`scan` was performed, this is + computed from the number of frames and the duration of the stream. + Otherwise we fall back to ``average_fps_from_header``. """ - if ( - self.end_stream_from_content_seconds is None - or self.begin_stream_from_content_seconds is None - or self.num_frames is None + if any( + attr is None + for attr in ( + self.end_stream_from_content_seconds, + self.begin_stream_from_content_seconds, + self.num_frames, + ) ): return self.average_fps_from_header return self.num_frames / ( diff --git a/src/torchcodec/decoders/_simple_video_decoder.py b/src/torchcodec/decoders/_simple_video_decoder.py index 349fda94f..295164b2f 100644 --- a/src/torchcodec/decoders/_simple_video_decoder.py +++ b/src/torchcodec/decoders/_simple_video_decoder.py @@ -77,7 +77,8 @@ def __repr__(self): class SimpleVideoDecoder: """A single-stream video decoder. - If the video contains multiple video streams, the :term:`best stream` is used. + If the video contains multiple video streams, the :term:`best stream` is + used. This decoder always performs a :term:`scan` of the video. 
Args: source (str, ``Pathlib.path``, ``torch.Tensor``, or bytes): The source of the video. @@ -266,6 +267,10 @@ def get_frames_at(self, start: int, stop: int, step: int = 1) -> FrameBatch: def get_frame_displayed_at(self, seconds: float) -> Frame: """Return a single frame displayed at the given timestamp in seconds. + Each frame has a :term:`pts` and a duration. The convention is that a + given frame is displayed during the [frame_pts, frame_pts + + frame_duration) interval. + Args: seconds (float): The time stamp in seconds when the frame is displayed, i.e. seconds is in From 9eedbd73430669516371cc8630f568ca2e5a7b2d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2024 03:37:51 -0700 Subject: [PATCH 2/6] mypy garbage --- src/torchcodec/decoders/_core/_metadata.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py index 54a8d24c6..811f8d939 100644 --- a/src/torchcodec/decoders/_core/_metadata.py +++ b/src/torchcodec/decoders/_core/_metadata.py @@ -94,13 +94,10 @@ def average_fps(self) -> Optional[float]: computed from the number of frames and the duration of the stream. Otherwise we fall back to ``average_fps_from_header``. 
""" - if any( - attr is None - for attr in ( - self.end_stream_from_content_seconds, - self.begin_stream_from_content_seconds, - self.num_frames, - ) + if ( + self.end_stream_from_content_seconds is None + or self.begin_stream_from_content_seconds is None + or self.num_frames is None ): return self.average_fps_from_header return self.num_frames / ( From 6374cc977ac15891f9e465a9c3ccd1edf1c19a83 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2024 03:39:29 -0700 Subject: [PATCH 3/6] readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 71c920895..344c33b03 100644 --- a/README.md +++ b/README.md @@ -23,12 +23,13 @@ from torchcodec.decoders import SimpleVideoDecoder decoder = SimpleVideoDecoder("path/to/video.mp4") decoder.metadata -# VideoStreamMetadata: (Truncated output) +# VideoStreamMetadata: # num_frames: 250 # duration_seconds: 10.0 # bit_rate: 31315.0 # codec: h264 # average_fps: 25.0 +# ... (truncated output) len(decoder) # == decoder.metadata.num_frames! # 250 From 8faccc65d348755f90d309dd0e1625ca35ce561c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2024 09:20:34 -0700 Subject: [PATCH 4/6] Rename stuff --- src/torchcodec/decoders/_core/_metadata.py | 24 +++++++++---------- .../decoders/_simple_video_decoder.py | 20 ++++++++-------- test/decoders/test_metadata.py | 4 ++-- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py index 811f8d939..d2fe7fe94 100644 --- a/src/torchcodec/decoders/_core/_metadata.py +++ b/src/torchcodec/decoders/_core/_metadata.py @@ -37,11 +37,11 @@ class VideoStreamMetadata: content (the scan doesn't involve decoding). This is more accurate than ``num_frames_from_header``. We recommend using the ``num_frames`` attribute instead. 
(int or None).""" - begin_stream_from_content_seconds: Optional[float] + begin_stream_seconds: Optional[float] """Beginning of the stream, in seconds (float or None). This corresponds to the first frame's :term:`pts`. Usually, this is equal to 0.""" - end_stream_from_content_seconds: Optional[float] + end_stream_seconds: Optional[float] """End of the stream, in seconds (float or None). This is last_frame.pts + last_frame.duration, so according to our convention, no frame is displayed at this time: read more in @@ -79,13 +79,13 @@ def duration_seconds(self) -> Optional[float]: fall back to ``duration_seconds_from_header``. """ if ( - self.end_stream_from_content_seconds is None - or self.begin_stream_from_content_seconds is None + self.end_stream_seconds is None + or self.begin_stream_seconds is None ): return self.duration_seconds_from_header return ( - self.end_stream_from_content_seconds - - self.begin_stream_from_content_seconds + self.end_stream_seconds + - self.begin_stream_seconds ) @property @@ -95,14 +95,14 @@ def average_fps(self) -> Optional[float]: Otherwise we fall back to ``average_fps_from_header``. 
""" if ( - self.end_stream_from_content_seconds is None - or self.begin_stream_from_content_seconds is None + self.end_stream_seconds is None + or self.begin_stream_seconds is None or self.num_frames is None ): return self.average_fps_from_header return self.num_frames / ( - self.end_stream_from_content_seconds - - self.begin_stream_from_content_seconds + self.end_stream_seconds + - self.begin_stream_seconds ) def __repr__(self): @@ -160,10 +160,10 @@ def get_video_metadata(decoder: torch.Tensor) -> VideoMetadata: # keys with the Python names num_frames_from_header=stream_dict.get("numFrames"), num_frames_from_content=stream_dict.get("numFramesFromScan"), - begin_stream_from_content_seconds=stream_dict.get( + begin_stream_seconds=stream_dict.get( "minPtsSecondsFromScan" ), - end_stream_from_content_seconds=stream_dict.get( + end_stream_seconds=stream_dict.get( "maxPtsSecondsFromScan" ), codec=stream_dict.get("codec"), diff --git a/src/torchcodec/decoders/_simple_video_decoder.py b/src/torchcodec/decoders/_simple_video_decoder.py index 295164b2f..6d1b29f5f 100644 --- a/src/torchcodec/decoders/_simple_video_decoder.py +++ b/src/torchcodec/decoders/_simple_video_decoder.py @@ -141,22 +141,22 @@ def __init__( ) self._num_frames = self.metadata.num_frames_from_content - if self.metadata.begin_stream_from_content_seconds is None: + if self.metadata.begin_stream_seconds is None: raise ValueError( "The minimum pts value in seconds is unknown. " + _ERROR_REPORTING_INSTRUCTIONS ) - self._begin_stream_from_content_seconds = ( - self.metadata.begin_stream_from_content_seconds + self._begin_stream_seconds = ( + self.metadata.begin_stream_seconds ) - if self.metadata.end_stream_from_content_seconds is None: + if self.metadata.end_stream_seconds is None: raise ValueError( "The maximum pts value in seconds is unknown. 
" + _ERROR_REPORTING_INSTRUCTIONS ) - self._end_stream_from_content_seconds = ( - self.metadata.end_stream_from_content_seconds + self._end_stream_seconds = ( + self.metadata.end_stream_seconds ) def __len__(self) -> int: @@ -280,14 +280,14 @@ def get_frame_displayed_at(self, seconds: float) -> Frame: Frame: The frame that is displayed at ``seconds``. """ if ( - not self._begin_stream_from_content_seconds + not self._begin_stream_seconds <= seconds - < self._end_stream_from_content_seconds + < self._end_stream_seconds ): raise IndexError( f"Invalid pts in seconds: {seconds}. " - f"It must be greater than or equal to {self._begin_stream_from_content_seconds} " - f"and less than {self._end_stream_from_content_seconds}." + f"It must be greater than or equal to {self._begin_stream_seconds} " + f"and less than {self._end_stream_seconds}." ) data, pts_seconds, duration_seconds = core.get_frame_at_pts( self._decoder, seconds diff --git a/test/decoders/test_metadata.py b/test/decoders/test_metadata.py index a9375fea8..1ccceb627 100644 --- a/test/decoders/test_metadata.py +++ b/test/decoders/test_metadata.py @@ -92,8 +92,8 @@ def test_num_frames_fallback( bit_rate=123, num_frames_from_header=num_frames_from_header, num_frames_from_content=num_frames_from_content, - begin_stream_from_content_seconds=0, - end_stream_from_content_seconds=4, + begin_stream_seconds=0, + end_stream_seconds=4, codec="whatever", width=123, height=321, From cfb8918fa14673371d71fd913bb2875da9f9b002 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2024 09:28:43 -0700 Subject: [PATCH 5/6] More stuff --- examples/basic_example.py | 9 ++--- src/torchcodec/decoders/_core/_metadata.py | 40 +++++++------------ .../decoders/_simple_video_decoder.py | 22 ++-------- 3 files changed, 22 insertions(+), 49 deletions(-) diff --git a/examples/basic_example.py b/examples/basic_example.py index 31f54825b..212f73638 100644 --- a/examples/basic_example.py +++ b/examples/basic_example.py @@ -150,14 +150,11 
@@ def plot(frames: torch.Tensor, title : Optional[str] = None): # ------------------------- # # So far, we have retrieved frames based on their index. We can also retrieve -# frames based on *when* they are displayed. The available method are -# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at` and -# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frames_displayed_at`, which -# also return :class:`~torchcodec.decoders.Frame` and -# :class:`~torchcodec.decoders.FrameBatch` objects respectively. +# frames based on *when* they are displayed with +# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at`, which +# also returns :class:`~torchcodec.decoders.Frame` and frame_at_2_seconds = decoder.get_frame_displayed_at(seconds=2) print(f"{type(frame_at_2_seconds) = }") print(frame_at_2_seconds) plot(frame_at_2_seconds.data, "Frame displayed at 2 seconds") -# TODO_BEFORE_RELEASE: illustrate get_frames_displayed_at diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py index d2fe7fe94..8218a8b85 100644 --- a/src/torchcodec/decoders/_core/_metadata.py +++ b/src/torchcodec/decoders/_core/_metadata.py @@ -39,15 +39,18 @@ class VideoStreamMetadata: ``num_frames`` attribute instead. (int or None).""" begin_stream_seconds: Optional[float] """Beginning of the stream, in seconds (float or None). - This corresponds to the first frame's :term:`pts`. Usually, this is equal to - 0.""" + Conceptually, this corresponds to the first frame's :term:`pts`. It is + computed as min(frame.pts) across all frames in the stream. Usually, this is + equal to 0.""" end_stream_seconds: Optional[float] """End of the stream, in seconds (float or None). - This is last_frame.pts + last_frame.duration, so according to our - convention, no frame is displayed at this time: read more in - :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at`. 
- Retrieving the last frame is best done by simply indexing the - :class:`~torchcodec.decoders.SimpleVideoDecoder` object with ``[-1]``. + Conceptually, this corresponds to last_frame.pts + last_frame.duration. It + is computed as max(frame.pts + frame.duration) across all frames in the + stream. Note that no frame is displayed at this time value, so calling + :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at` with + this value would result in an error. Retrieving the last frame is best done + by simply indexing the :class:`~torchcodec.decoders.SimpleVideoDecoder` + object with ``[-1]``. """ codec: Optional[str] """Codec (str or None).""" @@ -78,15 +81,9 @@ def duration_seconds(self) -> Optional[float]: from the actual frames if a :term:`scan` wsa performed. Otherwise we fall back to ``duration_seconds_from_header``. """ - if ( - self.end_stream_seconds is None - or self.begin_stream_seconds is None - ): + if self.end_stream_seconds is None or self.begin_stream_seconds is None: return self.duration_seconds_from_header - return ( - self.end_stream_seconds - - self.begin_stream_seconds - ) + return self.end_stream_seconds - self.begin_stream_seconds @property def average_fps(self) -> Optional[float]: @@ -100,10 +97,7 @@ def average_fps(self) -> Optional[float]: or self.num_frames is None ): return self.average_fps_from_header - return self.num_frames / ( - self.end_stream_seconds - - self.begin_stream_seconds - ) + return self.num_frames / (self.end_stream_seconds - self.begin_stream_seconds) def __repr__(self): # Overridden because properites are not printed by default. 
@@ -160,12 +154,8 @@ def get_video_metadata(decoder: torch.Tensor) -> VideoMetadata: # keys with the Python names num_frames_from_header=stream_dict.get("numFrames"), num_frames_from_content=stream_dict.get("numFramesFromScan"), - begin_stream_seconds=stream_dict.get( - "minPtsSecondsFromScan" - ), - end_stream_seconds=stream_dict.get( - "maxPtsSecondsFromScan" - ), + begin_stream_seconds=stream_dict.get("minPtsSecondsFromScan"), + end_stream_seconds=stream_dict.get("maxPtsSecondsFromScan"), codec=stream_dict.get("codec"), width=stream_dict.get("width"), height=stream_dict.get("height"), diff --git a/src/torchcodec/decoders/_simple_video_decoder.py b/src/torchcodec/decoders/_simple_video_decoder.py index 6d1b29f5f..4a8ec4494 100644 --- a/src/torchcodec/decoders/_simple_video_decoder.py +++ b/src/torchcodec/decoders/_simple_video_decoder.py @@ -146,18 +146,14 @@ def __init__( "The minimum pts value in seconds is unknown. " + _ERROR_REPORTING_INSTRUCTIONS ) - self._begin_stream_seconds = ( - self.metadata.begin_stream_seconds - ) + self._begin_stream_seconds = self.metadata.begin_stream_seconds if self.metadata.end_stream_seconds is None: raise ValueError( "The maximum pts value in seconds is unknown. " + _ERROR_REPORTING_INSTRUCTIONS ) - self._end_stream_seconds = ( - self.metadata.end_stream_seconds - ) + self._end_stream_seconds = self.metadata.end_stream_seconds def __len__(self) -> int: return self._num_frames @@ -267,23 +263,13 @@ def get_frames_at(self, start: int, stop: int, step: int = 1) -> FrameBatch: def get_frame_displayed_at(self, seconds: float) -> Frame: """Return a single frame displayed at the given timestamp in seconds. - Each frame has a :term:`pts` and a duration. The convention is that a - given frame is displayed during the [frame_pts, frame_pts + - frame_duration) interval. - Args: - seconds (float): The time stamp in seconds when the frame is - displayed, i.e. seconds is in - [:term:`pts`, :term:`pts` + duration). 
+ seconds (float): The time stamp in seconds when the frame is displayed. Returns: Frame: The frame that is displayed at ``seconds``. """ - if ( - not self._begin_stream_seconds - <= seconds - < self._end_stream_seconds - ): + if not self._begin_stream_seconds <= seconds < self._end_stream_seconds: raise IndexError( f"Invalid pts in seconds: {seconds}. " f"It must be greater than or equal to {self._begin_stream_seconds} " From f299effdea9c7d912671040a6e999a9e0cb0fd05 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 30 Jul 2024 09:57:07 -0700 Subject: [PATCH 6/6] Address comments --- examples/basic_example.py | 2 +- src/torchcodec/decoders/_core/_metadata.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/basic_example.py b/examples/basic_example.py index 212f73638..735e1464d 100644 --- a/examples/basic_example.py +++ b/examples/basic_example.py @@ -152,7 +152,7 @@ def plot(frames: torch.Tensor, title : Optional[str] = None): # So far, we have retrieved frames based on their index. We can also retrieve # frames based on *when* they are displayed with # :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at`, which -# also returns :class:`~torchcodec.decoders.Frame` and +# also returns :class:`~torchcodec.decoders.Frame`. frame_at_2_seconds = decoder.get_frame_displayed_at(seconds=2) print(f"{type(frame_at_2_seconds) = }") diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py index 8218a8b85..8db223959 100644 --- a/src/torchcodec/decoders/_core/_metadata.py +++ b/src/torchcodec/decoders/_core/_metadata.py @@ -78,7 +78,7 @@ def num_frames(self) -> Optional[int]: @property def duration_seconds(self) -> Optional[float]: """Duration of the stream in seconds. We try to calculate the duration - from the actual frames if a :term:`scan` wsa performed. Otherwise we + from the actual frames if a :term:`scan` was performed. Otherwise we fall back to ``duration_seconds_from_header``. 
""" if self.end_stream_seconds is None or self.begin_stream_seconds is None: