meta-pytorch · facebook-github-bot · Aug 14, 2024 · Aug 13, 2024
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -940,6 +940,16 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestampNoDemux(
         double frameStartTime = ptsToSeconds(frame->pts, stream.timeBase);
         double frameEndTime =
             ptsToSeconds(frame->pts + getDuration(frame), stream.timeBase);
+        if (frameStartTime > seconds) {
+          // FFMPEG seeked past the frame we are looking for even though we
+          // set max_ts to be our needed timestamp in avformat_seek_file()
+          // in maybeSeekToBeforeDesiredPts().
+          // This could be a bug in FFMPEG: https://trac.ffmpeg.org/ticket/11137
+          // In this case we return the very next frame instead of throwing an
+          // exception.
+          // TODO: Maybe log to stderr for Debug builds?
+          return true;
+        }
         return seconds >= frameStartTime && seconds < frameEndTime;
       });
 }

diff --git a/test/decoders/test_simple_video_decoder.py b/test/decoders/test_simple_video_decoder.py
@@ -9,7 +9,7 @@
 
 from torchcodec.decoders import _core, SimpleVideoDecoder
 
-from ..utils import assert_tensor_close, assert_tensor_equal, NASA_VIDEO
+from ..utils import assert_tensor_close, assert_tensor_equal, H265_VIDEO, NASA_VIDEO
 
 
 class TestSimpleDecoder:
@@ -320,6 +320,15 @@ def test_get_frame_displayed_at(self):
         assert isinstance(decoder.get_frame_displayed_at(6.02).pts_seconds, float)
         assert isinstance(decoder.get_frame_displayed_at(6.02).duration_seconds, float)
 
+    def test_get_frame_displayed_at_h265(self):
+        decoder = SimpleVideoDecoder(H265_VIDEO.path)
+        # Note that for H265, FFMPEG's seeking is not precise. Even though we ask to
+        # seek with a max_ts=0.5, FFMPEG will seek beyond that point.
+        # TODO: Revert to using frame5 in the test below once it's fixed upstream:
+        # https://trac.ffmpeg.org/ticket/11137
+        ref_frame6 = H265_VIDEO.get_frame_by_name("frame000006")
+        assert_tensor_equal(ref_frame6, decoder.get_frame_displayed_at(0.5).data)
+
     def test_get_frame_displayed_at_fails(self):
         decoder = SimpleVideoDecoder(NASA_VIDEO.path)
 

diff --git a/test/generate_reference_resources.sh b/test/generate_reference_resources.sh
@@ -42,8 +42,19 @@ ffmpeg -y -i "$VIDEO_PATH" -b:a 192K -vn "$VIDEO_PATH.audio.mp3"
 
 # TODO: Add frames decoded by Nvidia's NVDEC.
 
+# This video was generated by running the following (./configure is run from an FFmpeg source tree, built with libx265):
+# conda install -c conda-forge x265
+# ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265  --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz
+# ffmpeg -f lavfi -i color=size=128x128:duration=1:rate=10:color=blue -vf "drawtext=fontsize=30:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:text='Frame %{frame_num}'" -vcodec libx265 -pix_fmt yuv420p -g 2 -crf 10 h265_video.mp4 -y
+VIDEO_PATH=$RESOURCES_DIR/h265_video.mp4
+FRAMES=(6)
+for frame in "${FRAMES[@]}"; do
+  frame_name=$(printf "%06d" "$frame")
+  ffmpeg -y -i "$VIDEO_PATH" -vf select="eq(n\,$frame)" -vsync vfr -q:v 2 "$VIDEO_PATH.frame$frame_name.bmp"
+done
+
 for bmp in "$RESOURCES_DIR"/*.bmp
 do
-  python3 convert_image_to_tensor.py "$bmp"
+  python3 "$TORCHCODEC_PATH/test/convert_image_to_tensor.py" "$bmp"
   rm -f "$bmp"
 done
diff --git a/test/resources/h265_video.mp4 b/test/resources/h265_video.mp4
diff --git a/test/resources/h265_video.mp4.frame000006.pt b/test/resources/h265_video.mp4.frame000006.pt
diff --git a/test/utils.py b/test/utils.py
@@ -152,3 +152,17 @@ def empty_chw_tensor(self) -> torch.Tensor:
 # When we start actually decoding audio-only files, we'll probably need to define
 # a TestAudio class with audio specific values. Until then, we only need a filename.
 NASA_AUDIO = TestContainerFile(filename="nasa_13013.mp4.audio.mp3", frames={})
+
+H265_VIDEO = TestVideo(
+    filename="h265_video.mp4",
+    height=128,
+    width=128,
+    num_color_channels=3,
+    # TODO_OPEN_ISSUE Scott: improve the testing framework so that these values are loaded from a JSON
+    # file and not hardcoded. These values were copied over by hand from the JSON
+    # output from the following command:
+    #  $ ffprobe -v error -hide_banner -select_streams v:0 -show_frames -of json test/resources/h265_video.mp4 > out.json
+    frames={
+        6: TestFrameInfo(pts_seconds=0.6, duration_seconds=0.1),
+    },
+)