meta-pytorch · NicolasHug · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024
diff --git a/README.md b/README.md
@@ -23,12 +23,13 @@ from torchcodec.decoders import SimpleVideoDecoder
 decoder = SimpleVideoDecoder("path/to/video.mp4")
 
 decoder.metadata
-# VideoStreamMetadata:  (Truncated output)
+# VideoStreamMetadata:
 #   num_frames: 250
 #   duration_seconds: 10.0
 #   bit_rate: 31315.0
 #   codec: h264
 #   average_fps: 25.0
+#   ... (truncated output)
 
 len(decoder)  # == decoder.metadata.num_frames!
 # 250

diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
@@ -12,3 +12,8 @@ Glossary
        <https://ffmpeg.org/doxygen/trunk/group__lavf__decoding.html#ga757780d38f482deb4d809c6c521fbcc2>`_:
 
         *The best stream is determined according to various heuristics as the most likely to be what the user expects.*
+
+    scan
+       A scan corresponds to an entire pass over a video file, with the purpose
+       of retrieving metadata about the different streams and frames. **It does
+       not involve decoding**, so it is a lot cheaper than decoding the file.
diff --git a/examples/basic_example.py b/examples/basic_example.py
@@ -150,14 +150,11 @@ def plot(frames: torch.Tensor, title : Optional[str] = None):
 # -------------------------
 #
 # So far, we have retrieved frames based on their index. We can also retrieve
-# frames based on *when* they are displayed.  The available method are
-# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at` and
-# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frames_displayed_at`, which
-# also return :class:`~torchcodec.decoders.Frame` and
-# :class:`~torchcodec.decoders.FrameBatch` objects respectively.
+# frames based on *when* they are displayed with
+# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at`, which
+# also returns :class:`~torchcodec.decoders.Frame`.
 
 frame_at_2_seconds = decoder.get_frame_displayed_at(seconds=2)
 print(f"{type(frame_at_2_seconds) = }")
 print(frame_at_2_seconds)
 plot(frame_at_2_seconds.data, "Frame displayed at 2 seconds")
-# TODO_BEFORE_RELEASE: illustrate get_frames_displayed_at
diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py
@@ -24,7 +24,7 @@ class VideoStreamMetadata:
     """Metadata of a single video stream."""
 
     duration_seconds_from_header: Optional[float]
-    """Duration of the stream, in seconds obtained from the header (float or
+    """Duration of the stream, in seconds, obtained from the header (float or
     None). This could be inaccurate."""
     bit_rate: Optional[float]
     """Bit rate of the stream, in seconds (float or None)."""
@@ -37,36 +37,38 @@ class VideoStreamMetadata:
     content (the scan doesn't involve decoding). This is more accurate
     than ``num_frames_from_header``. We recommend using the
     ``num_frames`` attribute instead. (int or None)."""
-    begin_stream_from_content_seconds: Optional[float]
-    """Beginning of the stream in seconds (float or None).
-    This is min(frame.pts) for all frames in this stream."""
-    end_stream_from_content_seconds: Optional[float]
-    """End of the stream in seconds (float or None).
-    This is max(frame.pts + frame.duration) for all frames in this stream.
-    Note that frames have a pts and duration and the interval defined by
-    [pts, pts + duration) is a half-open interval (the right boundary is open).
-    Therefore no frame is displayed at this time value.
-    Calling
-    SimpleVideoDecoder.get_frame_displayed_at(end_stream_from_content_seconds)
-    will raise a StopIteration exception.
-    If you want to get the last frame you can use [-1] on a SimpleVideoDecoder
-    object."""
+    begin_stream_seconds: Optional[float]
+    """Beginning of the stream, in seconds (float or None).
+    Conceptually, this corresponds to the first frame's :term:`pts`. It is
+    computed as min(frame.pts) across all frames in the stream. Usually, this is
+    equal to 0."""
+    end_stream_seconds: Optional[float]
+    """End of the stream, in seconds (float or None).
+    Conceptually, this corresponds to last_frame.pts + last_frame.duration. It
+    is computed as max(frame.pts + frame.duration) across all frames in the
+    stream. Note that no frame is displayed at this time value, so calling
+    :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at` with
+    this value would result in an error. Retrieving the last frame is best done
+    by simply indexing the :class:`~torchcodec.decoders.SimpleVideoDecoder`
+    object with ``[-1]``.
+    """
     codec: Optional[str]
     """Codec (str or None)."""
     width: Optional[int]
     """Width of the frames (int or None)."""
     height: Optional[int]
     """Height of the frames (int or None)."""
     average_fps_from_header: Optional[float]
-    """Averate fps of the stream (float or None)."""
+    """Average fps of the stream, obtained from the header (float or None).
+    We recommend using the ``average_fps`` attribute instead."""
     stream_index: int
     """Index of the stream within the video (int)."""
 
     @property
     def num_frames(self) -> Optional[int]:
         """Number of frames in the stream. This corresponds to
-        ``num_frames_from_content`` if it's not None, otherwise it corresponds
-        to ``num_frames_from_header``.
+        ``num_frames_from_content`` if a :term:`scan` was made, otherwise it
+        corresponds to ``num_frames_from_header``.
         """
         if self.num_frames_from_content is not None:
             return self.num_frames_from_content
@@ -76,35 +78,26 @@ def num_frames(self) -> Optional[int]:
     @property
     def duration_seconds(self) -> Optional[float]:
         """Duration of the stream in seconds. We try to calculate the duration
-        from the actual frames if we scanned the frames. Otherwise we fall back
-        to the duration obtained from the header.
+        from the actual frames if a :term:`scan` was performed. Otherwise we
+        fall back to ``duration_seconds_from_header``.
         """
-        if (
-            self.end_stream_from_content_seconds is None
-            or self.begin_stream_from_content_seconds is None
-        ):
+        if self.end_stream_seconds is None or self.begin_stream_seconds is None:
             return self.duration_seconds_from_header
-        return (
-            self.end_stream_from_content_seconds
-            - self.begin_stream_from_content_seconds
-        )
+        return self.end_stream_seconds - self.begin_stream_seconds
 
     @property
     def average_fps(self) -> Optional[float]:
-        """Average fps of the stream. We try to get the average fps from the
-        actual frames if we scanned the frames. Otherwise we fall back to the
-        fps obtained from the header.
+        """Average fps of the stream. If a :term:`scan` was performed, this is
+        computed from the number of frames and the duration of the stream.
+        Otherwise we fall back to ``average_fps_from_header``.
         """
         if (
-            self.end_stream_from_content_seconds is None
-            or self.begin_stream_from_content_seconds is None
+            self.end_stream_seconds is None
+            or self.begin_stream_seconds is None
             or self.num_frames is None
         ):
             return self.average_fps_from_header
-        return self.num_frames / (
-            self.end_stream_from_content_seconds
-            - self.begin_stream_from_content_seconds
-        )
+        return self.num_frames / (self.end_stream_seconds - self.begin_stream_seconds)
 
     def __repr__(self):
         # Overridden because properites are not printed by default.
@@ -161,12 +154,8 @@ def get_video_metadata(decoder: torch.Tensor) -> VideoMetadata:
                 # keys with the Python names
                 num_frames_from_header=stream_dict.get("numFrames"),
                 num_frames_from_content=stream_dict.get("numFramesFromScan"),
-                begin_stream_from_content_seconds=stream_dict.get(
-                    "minPtsSecondsFromScan"
-                ),
-                end_stream_from_content_seconds=stream_dict.get(
-                    "maxPtsSecondsFromScan"
-                ),
+                begin_stream_seconds=stream_dict.get("minPtsSecondsFromScan"),
+                end_stream_seconds=stream_dict.get("maxPtsSecondsFromScan"),
                 codec=stream_dict.get("codec"),
                 width=stream_dict.get("width"),
                 height=stream_dict.get("height"),

diff --git a/src/torchcodec/decoders/_simple_video_decoder.py b/src/torchcodec/decoders/_simple_video_decoder.py
@@ -77,7 +77,8 @@ def __repr__(self):
 class SimpleVideoDecoder:
     """A single-stream video decoder.
 
-    If the video contains multiple video streams, the :term:`best stream` is used.
+    If the video contains multiple video streams, the :term:`best stream` is
+    used. This decoder always performs a :term:`scan` of the video.
 
     Args:
         source (str, ``Pathlib.path``, ``torch.Tensor``, or bytes): The source of the video.
@@ -140,23 +141,19 @@ def __init__(
             )
         self._num_frames = self.metadata.num_frames_from_content
 
-        if self.metadata.begin_stream_from_content_seconds is None:
+        if self.metadata.begin_stream_seconds is None:
             raise ValueError(
                 "The minimum pts value in seconds is unknown. "
                 + _ERROR_REPORTING_INSTRUCTIONS
             )
-        self._begin_stream_from_content_seconds = (
-            self.metadata.begin_stream_from_content_seconds
-        )
+        self._begin_stream_seconds = self.metadata.begin_stream_seconds
 
-        if self.metadata.end_stream_from_content_seconds is None:
+        if self.metadata.end_stream_seconds is None:
             raise ValueError(
                 "The maximum pts value in seconds is unknown. "
                 + _ERROR_REPORTING_INSTRUCTIONS
             )
-        self._end_stream_from_content_seconds = (
-            self.metadata.end_stream_from_content_seconds
-        )
+        self._end_stream_seconds = self.metadata.end_stream_seconds
 
     def __len__(self) -> int:
         return self._num_frames
@@ -267,22 +264,16 @@ def get_frame_displayed_at(self, seconds: float) -> Frame:
         """Return a single frame displayed at the given timestamp in seconds.
 
         Args:
-            seconds (float): The time stamp in seconds when the frame is
-                displayed, i.e. seconds is in
-                [:term:`pts`, :term:`pts` + duration).
+            seconds (float): The time stamp in seconds when the frame is displayed.
 
         Returns:
             Frame: The frame that is displayed at ``seconds``.
         """
-        if (
-            not self._begin_stream_from_content_seconds
-            <= seconds
-            < self._end_stream_from_content_seconds
-        ):
+        if not self._begin_stream_seconds <= seconds < self._end_stream_seconds:
             raise IndexError(
                 f"Invalid pts in seconds: {seconds}. "
-                f"It must be greater than or equal to {self._begin_stream_from_content_seconds} "
-                f"and less than {self._end_stream_from_content_seconds}."
+                f"It must be greater than or equal to {self._begin_stream_seconds} "
+                f"and less than {self._end_stream_seconds}."
             )
         data, pts_seconds, duration_seconds = core.get_frame_at_pts(
             self._decoder, seconds

diff --git a/test/decoders/test_metadata.py b/test/decoders/test_metadata.py
@@ -92,8 +92,8 @@ def test_num_frames_fallback(
         bit_rate=123,
         num_frames_from_header=num_frames_from_header,
         num_frames_from_content=num_frames_from_content,
-        begin_stream_from_content_seconds=0,
-        end_stream_from_content_seconds=4,
+        begin_stream_seconds=0,
+        end_stream_seconds=4,
         codec="whatever",
         width=123,
         height=321,