From 179b59d638bf25bc4db4d2d2a00f09d87f9c81fc Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 6 Aug 2024 11:42:41 -0700 Subject: [PATCH] [torchcodec] Add a NoDemux suffix to functions that do not demux (#147) Summary: Pull Request resolved: https://github.com/pytorch/torchcodec/pull/147 This more clearly indicates that the function does not perform demuxing (separating frames into separate streams) and returns outputs from any available stream. Reviewed By: scotts Differential Revision: D60687942 fbshipit-source-id: af8d5e6e3a5949bcaa4e91f5ee170bf594d7373a --- benchmarks/decoders/BenchmarkDecodersMain.cpp | 4 +-- .../decoders/_core/VideoDecoder.cpp | 8 ++--- src/torchcodec/decoders/_core/VideoDecoder.h | 19 ++++++----- .../decoders/_core/VideoDecoderOps.cpp | 4 +-- test/decoders/VideoDecoderTest.cpp | 33 ++++++++++--------- 5 files changed, 35 insertions(+), 33 deletions(-) diff --git a/benchmarks/decoders/BenchmarkDecodersMain.cpp b/benchmarks/decoders/BenchmarkDecodersMain.cpp index a9762a0b5..3a4d44b2f 100644 --- a/benchmarks/decoders/BenchmarkDecodersMain.cpp +++ b/benchmarks/decoders/BenchmarkDecodersMain.cpp @@ -63,7 +63,7 @@ void runNDecodeIterations( decoder->addVideoStreamDecoder(-1); for (double pts : ptsList) { decoder->setCursorPtsInSeconds(pts); - torch::Tensor tensor = decoder->getNextDecodedOutput().frame; + torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; } if (i + 1 == warmupIterations) { start = std::chrono::high_resolution_clock::now(); @@ -95,7 +95,7 @@ void runNdecodeIterationsGrabbingConsecutiveFrames( VideoDecoder::createFromFilePath(videoPath); decoder->addVideoStreamDecoder(-1); for (int j = 0; j < consecutiveFrameCount; ++j) { - torch::Tensor tensor = decoder->getNextDecodedOutput().frame; + torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; } if (i + 1 == warmupIterations) { start = std::chrono::high_resolution_clock::now(); diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index 84e5d794e..5aaffbd1b 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -651,7 +651,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter( if (activeStreamIndices_.size() == 0) { throw std::runtime_error("No active streams configured."); } - VLOG(9) << "Starting getNextDecodedOutput()"; + VLOG(9) << "Starting getNextDecodedOutputNoDemux()"; resetDecodeStats(); if (maybeDesiredPts_.has_value()) { VLOG(9) << "maybeDesiredPts_=" << *maybeDesiredPts_; @@ -792,7 +792,7 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput( return output; } -VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestamp( +VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestampNoDemux( double seconds) { for (auto& [streamIndex, stream] : streams_) { double frameStartTime = ptsToSeconds(stream.currentPts, stream.timeBase); @@ -867,7 +867,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex( int64_t pts = stream.allFrames[frameIndex].pts; setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase)); - return getNextDecodedOutput(); + return getNextDecodedOutputNoDemux(); } VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndexes( @@ -1020,7 +1020,7 @@ VideoDecoder::getFramesDisplayedByTimestampInRange( return output; } -VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutput() { +VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux() { return getDecodedOutputWithFilter( [this](int frameStreamIndex, AVFrame* frame) { StreamInfo& activeStream = streams_[frameStreamIndex]; diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index e54731554..ca9b6a181 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.h +++ b/src/torchcodec/decoders/_core/VideoDecoder.h @@ -150,8 +150,8 @@ class VideoDecoder { // ---- SINGLE FRAME SEEK AND DECODING API ---- // Places the cursor at the first frame on or after the position in seconds. - // Calling getNextFrameAsTensor() will return the first frame at or after this - // position. + // Calling getNextDecodedOutputNoDemux() will return the first frame at or + // after this position. void setCursorPtsInSeconds(double seconds); struct DecodedOutput { // The actual decoded output as a Tensor. @@ -177,13 +177,14 @@ class VideoDecoder { }; // Decodes the frame where the current cursor position is. It also advances // the cursor to the next frame. - DecodedOutput getNextDecodedOutput(); - // Decodes the frame that is visible at a given timestamp. Frames in the video - // have a presentation timestamp and a duration. For example, if a frame has - // presentation timestamp of 5.0s and a duration of 1.0s, it will be visible - // in the timestamp range [5.0, 6.0). i.e. it will be returned when this - // function is called with seconds=5.0 or seconds=5.999, etc. - DecodedOutput getFrameDisplayedAtTimestamp(double seconds); + DecodedOutput getNextDecodedOutputNoDemux(); + // Decodes the first frame in any added stream that is visible at a given + // timestamp. Frames in the video have a presentation timestamp and a + // duration. For example, if a frame has presentation timestamp of 5.0s and a + // duration of 1.0s, it will be visible in the timestamp range [5.0, 6.0). + // i.e. it will be returned when this function is called with seconds=5.0 or + // seconds=5.999, etc. + DecodedOutput getFrameDisplayedAtTimestampNoDemux(double seconds); DecodedOutput getFrameAtIndex(int streamIndex, int64_t frameIndex); struct BatchDecodedOutput { torch::Tensor frames; diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp index 4b271f2eb..c2251fc79 100644 --- a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp @@ -142,7 +142,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) { auto videoDecoder = unwrapTensorToGetDecoder(decoder); VideoDecoder::DecodedOutput result; try { - result = videoDecoder->getNextDecodedOutput(); + result = videoDecoder->getNextDecodedOutputNoDemux(); } catch (const VideoDecoder::EndOfFileException& e) { throw pybind11::stop_iteration(e.what()); } @@ -156,7 +156,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) { OpsDecodedOutput get_frame_at_pts(at::Tensor& decoder, double seconds) { auto videoDecoder = unwrapTensorToGetDecoder(decoder); - auto result = videoDecoder->getFrameDisplayedAtTimestamp(seconds); + auto result = videoDecoder->getFrameDisplayedAtTimestampNoDemux(seconds); return makeOpsDecodedOutput(result); } diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp index 057148b36..148d32a1f 100644 --- a/test/decoders/VideoDecoderTest.cpp +++ b/test/decoders/VideoDecoderTest.cpp @@ -148,7 +148,7 @@ TEST(VideoDecoderTest, RespectsWidthAndHeightFromOptions) { streamOptions.width = 100; streamOptions.height = 120; decoder->addVideoStreamDecoder(-1, streamOptions); - torch::Tensor tensor = decoder->getNextDecodedOutput().frame; + torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; EXPECT_EQ(tensor.sizes(), std::vector({3, 120, 100})); } @@ -159,7 +159,7 @@ TEST(VideoDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) { VideoDecoder::VideoStreamDecoderOptions streamOptions; streamOptions.dimensionOrder = "NHWC"; decoder->addVideoStreamDecoder(-1, streamOptions); - torch::Tensor tensor = decoder->getNextDecodedOutput().frame; + torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; EXPECT_EQ(tensor.sizes(), std::vector({270, 480, 3})); } @@ -168,12 +168,12 @@ TEST_P(VideoDecoderTest, ReturnsFirstTwoFramesOfVideo) { std::unique_ptr ourDecoder = createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); - auto output = ourDecoder->getNextDecodedOutput(); + auto output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor0FromOurDecoder = output.frame; EXPECT_EQ(tensor0FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 0.0); EXPECT_EQ(output.pts, 0); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor1FromOurDecoder = output.frame; EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000); @@ -254,11 +254,11 @@ TEST_P(VideoDecoderTest, SeeksCloseToEof) { createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); ourDecoder->setCursorPtsInSeconds(388388. / 30'000); - auto output = ourDecoder->getNextDecodedOutput(); + auto output = ourDecoder->getNextDecodedOutputNoDemux(); EXPECT_EQ(output.ptsSeconds, 388'388. / 30'000); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000); - EXPECT_THROW(ourDecoder->getNextDecodedOutput(), std::exception); + EXPECT_THROW(ourDecoder->getNextDecodedOutputNoDemux(), std::exception); } TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) { @@ -266,18 +266,19 @@ TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) { std::unique_ptr ourDecoder = createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); - auto output = ourDecoder->getFrameDisplayedAtTimestamp(6.006); + auto output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(6.006); EXPECT_EQ(output.ptsSeconds, 6.006); // The frame's duration is 0.033367 according to ffprobe, // so the next frame is displayed at timestamp=6.039367. const double kNextFramePts = 6.039366666666667; // The frame that is displayed a microsecond before the next frame is still // the previous frame. - output = ourDecoder->getFrameDisplayedAtTimestamp(kNextFramePts - 1e-6); + output = + ourDecoder->getFrameDisplayedAtTimestampNoDemux(kNextFramePts - 1e-6); EXPECT_EQ(output.ptsSeconds, 6.006); // The frame that is displayed at the exact pts of the frame is the next // frame. - output = ourDecoder->getFrameDisplayedAtTimestamp(kNextFramePts); + output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(kNextFramePts); EXPECT_EQ(output.ptsSeconds, kNextFramePts); // This is the timestamp of the last frame in this video. @@ -287,7 +288,7 @@ TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) { kPtsOfLastFrameInVideoStream + kDurationOfLastFrameInVideoStream; // Sanity check: make sure duration is strictly positive. EXPECT_GT(kPtsPlusDurationOfLastFrame, kPtsOfLastFrameInVideoStream); - output = ourDecoder->getFrameDisplayedAtTimestamp( + output = ourDecoder->getFrameDisplayedAtTimestampNoDemux( kPtsPlusDurationOfLastFrame - 1e-6); EXPECT_EQ(output.ptsSeconds, kPtsOfLastFrameInVideoStream); } @@ -298,7 +299,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); ourDecoder->setCursorPtsInSeconds(6.0); - auto output = ourDecoder->getNextDecodedOutput(); + auto output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor6FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000); torch::Tensor tensor6FromFFMPEG = @@ -314,7 +315,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 180); ourDecoder->setCursorPtsInSeconds(6.1); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor61FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 183'183. / 30'000); torch::Tensor tensor61FromFFMPEG = @@ -334,7 +335,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_LT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 10); ourDecoder->setCursorPtsInSeconds(10.0); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor10FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 300'300. / 30'000); torch::Tensor tensor10FromFFMPEG = @@ -351,7 +352,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 60); ourDecoder->setCursorPtsInSeconds(6.0); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); tensor6FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000); EXPECT_TRUE(torch::equal(tensor6FromOurDecoder, tensor6FromFFMPEG)); @@ -366,7 +367,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { constexpr double kPtsOfLastFrameInVideoStream = 389'389. / 30'000; // ~12.9 ourDecoder->setCursorPtsInSeconds(kPtsOfLastFrameInVideoStream); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor7FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000); torch::Tensor tensor7FromFFMPEG =