diff --git a/README.md b/README.md index 71c920895..344c33b03 100644 --- a/README.md +++ b/README.md @@ -23,12 +23,13 @@ from torchcodec.decoders import SimpleVideoDecoder decoder = SimpleVideoDecoder("path/to/video.mp4") decoder.metadata -# VideoStreamMetadata: (Truncated output) +# VideoStreamMetadata: # num_frames: 250 # duration_seconds: 10.0 # bit_rate: 31315.0 # codec: h264 # average_fps: 25.0 +# ... (truncated output) len(decoder) # == decoder.metadata.num_frames! # 250 diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index cb125f6e4..7afb74a72 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -12,3 +12,8 @@ Glossary `_: *The best stream is determined according to various heuristics as the most likely to be what the user expects.* + + scan + A scan corresponds to an entire pass over a video file, with the purpose + of retrieving metadata about the different streams and frames. **It does + not involve decoding**, so it is a lot cheaper than decoding the file. diff --git a/examples/basic_example.py b/examples/basic_example.py index 31f54825b..735e1464d 100644 --- a/examples/basic_example.py +++ b/examples/basic_example.py @@ -150,14 +150,11 @@ def plot(frames: torch.Tensor, title : Optional[str] = None): # ------------------------- # # So far, we have retrieved frames based on their index. We can also retrieve -# frames based on *when* they are displayed. The available method are -# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at` and -# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frames_displayed_at`, which -# also return :class:`~torchcodec.decoders.Frame` and -# :class:`~torchcodec.decoders.FrameBatch` objects respectively. +# frames based on *when* they are displayed with +# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at`, which +# also returns :class:`~torchcodec.decoders.Frame`. 
frame_at_2_seconds = decoder.get_frame_displayed_at(seconds=2) print(f"{type(frame_at_2_seconds) = }") print(frame_at_2_seconds) plot(frame_at_2_seconds.data, "Frame displayed at 2 seconds") -# TODO_BEFORE_RELEASE: illustrate get_frames_displayed_at diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py index 83ed807c7..8db223959 100644 --- a/src/torchcodec/decoders/_core/_metadata.py +++ b/src/torchcodec/decoders/_core/_metadata.py @@ -24,7 +24,7 @@ class VideoStreamMetadata: """Metadata of a single video stream.""" duration_seconds_from_header: Optional[float] - """Duration of the stream, in seconds obtained from the header (float or + """Duration of the stream, in seconds, obtained from the header (float or None). This could be inaccurate.""" bit_rate: Optional[float] """Bit rate of the stream, in seconds (float or None).""" @@ -37,20 +37,21 @@ class VideoStreamMetadata: content (the scan doesn't involve decoding). This is more accurate than ``num_frames_from_header``. We recommend using the ``num_frames`` attribute instead. (int or None).""" - begin_stream_from_content_seconds: Optional[float] - """Beginning of the stream in seconds (float or None). - This is min(frame.pts) for all frames in this stream.""" - end_stream_from_content_seconds: Optional[float] - """End of the stream in seconds (float or None). - This is max(frame.pts + frame.duration) for all frames in this stream. - Note that frames have a pts and duration and the interval defined by - [pts, pts + duration) is a half-open interval (the right boundary is open). - Therefore no frame is displayed at this time value. - Calling - SimpleVideoDecoder.get_frame_displayed_at(end_stream_from_content_seconds) - will raise a StopIteration exception. - If you want to get the last frame you can use [-1] on a SimpleVideoDecoder - object.""" + begin_stream_seconds: Optional[float] + """Beginning of the stream, in seconds (float or None). 
+ Conceptually, this corresponds to the first frame's :term:`pts`. It is + computed as min(frame.pts) across all frames in the stream. Usually, this is + equal to 0.""" + end_stream_seconds: Optional[float] + """End of the stream, in seconds (float or None). + Conceptually, this corresponds to last_frame.pts + last_frame.duration. It + is computed as max(frame.pts + frame.duration) across all frames in the + stream. Note that no frame is displayed at this time value, so calling + :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at` with + this value would result in an error. Retrieving the last frame is best done + by simply indexing the :class:`~torchcodec.decoders.SimpleVideoDecoder` + object with ``[-1]``. + """ codec: Optional[str] """Codec (str or None).""" width: Optional[int] @@ -58,15 +59,16 @@ class VideoStreamMetadata: height: Optional[int] """Height of the frames (int or None).""" average_fps_from_header: Optional[float] - """Averate fps of the stream (float or None).""" + """Average fps of the stream, obtained from the header (float or None). + We recommend using the ``average_fps`` attribute instead.""" stream_index: int """Index of the stream within the video (int).""" @property def num_frames(self) -> Optional[int]: """Number of frames in the stream. This corresponds to - ``num_frames_from_content`` if it's not None, otherwise it corresponds - to ``num_frames_from_header``. + ``num_frames_from_content`` if a :term:`scan` was made, otherwise it + corresponds to ``num_frames_from_header``. """ if self.num_frames_from_content is not None: return self.num_frames_from_content @@ -76,35 +78,26 @@ def num_frames(self) -> Optional[int]: @property def duration_seconds(self) -> Optional[float]: """Duration of the stream in seconds. We try to calculate the duration - from the actual frames if we scanned the frames. Otherwise we fall back - to the duration obtained from the header. + from the actual frames if a :term:`scan` was performed. 
Otherwise we + fall back to ``duration_seconds_from_header``. """ - if ( - self.end_stream_from_content_seconds is None - or self.begin_stream_from_content_seconds is None - ): + if self.end_stream_seconds is None or self.begin_stream_seconds is None: return self.duration_seconds_from_header - return ( - self.end_stream_from_content_seconds - - self.begin_stream_from_content_seconds - ) + return self.end_stream_seconds - self.begin_stream_seconds @property def average_fps(self) -> Optional[float]: - """Average fps of the stream. We try to get the average fps from the - actual frames if we scanned the frames. Otherwise we fall back to the - fps obtained from the header. + """Average fps of the stream. If a :term:`scan` was performed, this is + computed from the number of frames and the duration of the stream. + Otherwise we fall back to ``average_fps_from_header``. """ if ( - self.end_stream_from_content_seconds is None - or self.begin_stream_from_content_seconds is None + self.end_stream_seconds is None + or self.begin_stream_seconds is None or self.num_frames is None ): return self.average_fps_from_header - return self.num_frames / ( - self.end_stream_from_content_seconds - - self.begin_stream_from_content_seconds - ) + return self.num_frames / (self.end_stream_seconds - self.begin_stream_seconds) def __repr__(self): # Overridden because properites are not printed by default. 
@@ -161,12 +154,8 @@ def get_video_metadata(decoder: torch.Tensor) -> VideoMetadata: # keys with the Python names num_frames_from_header=stream_dict.get("numFrames"), num_frames_from_content=stream_dict.get("numFramesFromScan"), - begin_stream_from_content_seconds=stream_dict.get( - "minPtsSecondsFromScan" - ), - end_stream_from_content_seconds=stream_dict.get( - "maxPtsSecondsFromScan" - ), + begin_stream_seconds=stream_dict.get("minPtsSecondsFromScan"), + end_stream_seconds=stream_dict.get("maxPtsSecondsFromScan"), codec=stream_dict.get("codec"), width=stream_dict.get("width"), height=stream_dict.get("height"), diff --git a/src/torchcodec/decoders/_simple_video_decoder.py b/src/torchcodec/decoders/_simple_video_decoder.py index 349fda94f..4a8ec4494 100644 --- a/src/torchcodec/decoders/_simple_video_decoder.py +++ b/src/torchcodec/decoders/_simple_video_decoder.py @@ -77,7 +77,8 @@ def __repr__(self): class SimpleVideoDecoder: """A single-stream video decoder. - If the video contains multiple video streams, the :term:`best stream` is used. + If the video contains multiple video streams, the :term:`best stream` is + used. This decoder always performs a :term:`scan` of the video. Args: source (str, ``Pathlib.path``, ``torch.Tensor``, or bytes): The source of the video. @@ -140,23 +141,19 @@ def __init__( ) self._num_frames = self.metadata.num_frames_from_content - if self.metadata.begin_stream_from_content_seconds is None: + if self.metadata.begin_stream_seconds is None: raise ValueError( "The minimum pts value in seconds is unknown. " + _ERROR_REPORTING_INSTRUCTIONS ) - self._begin_stream_from_content_seconds = ( - self.metadata.begin_stream_from_content_seconds - ) + self._begin_stream_seconds = self.metadata.begin_stream_seconds - if self.metadata.end_stream_from_content_seconds is None: + if self.metadata.end_stream_seconds is None: raise ValueError( "The maximum pts value in seconds is unknown. 
" + _ERROR_REPORTING_INSTRUCTIONS ) - self._end_stream_from_content_seconds = ( - self.metadata.end_stream_from_content_seconds - ) + self._end_stream_seconds = self.metadata.end_stream_seconds def __len__(self) -> int: return self._num_frames @@ -267,22 +264,16 @@ def get_frame_displayed_at(self, seconds: float) -> Frame: """Return a single frame displayed at the given timestamp in seconds. Args: - seconds (float): The time stamp in seconds when the frame is - displayed, i.e. seconds is in - [:term:`pts`, :term:`pts` + duration). + seconds (float): The time stamp in seconds when the frame is displayed. Returns: Frame: The frame that is displayed at ``seconds``. """ - if ( - not self._begin_stream_from_content_seconds - <= seconds - < self._end_stream_from_content_seconds - ): + if not self._begin_stream_seconds <= seconds < self._end_stream_seconds: raise IndexError( f"Invalid pts in seconds: {seconds}. " - f"It must be greater than or equal to {self._begin_stream_from_content_seconds} " - f"and less than {self._end_stream_from_content_seconds}." + f"It must be greater than or equal to {self._begin_stream_seconds} " + f"and less than {self._end_stream_seconds}." ) data, pts_seconds, duration_seconds = core.get_frame_at_pts( self._decoder, seconds diff --git a/test/decoders/test_metadata.py b/test/decoders/test_metadata.py index a9375fea8..1ccceb627 100644 --- a/test/decoders/test_metadata.py +++ b/test/decoders/test_metadata.py @@ -92,8 +92,8 @@ def test_num_frames_fallback( bit_rate=123, num_frames_from_header=num_frames_from_header, num_frames_from_content=num_frames_from_content, - begin_stream_from_content_seconds=0, - end_stream_from_content_seconds=4, + begin_stream_seconds=0, + end_stream_seconds=4, codec="whatever", width=123, height=321,