From 6121df98e707a8d08b37f0ae6c6ce3693ca4018a Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 2 Aug 2024 15:09:14 -0700 Subject: [PATCH] [torchcodec] Add a NoDemux suffix to functions that do not demux (#147) Summary: Pull Request resolved: https://github.com/pytorch/torchcodec/pull/147 This more clearly indicates that the function does not perform demuxing (separating frames into separate streams) and returns outputs from any available stream. Reviewed By: scotts Differential Revision: D60687942 --- benchmarks/decoders/BenchmarkDecodersMain.cpp | 4 +- .../decoders/_core/VideoDecoder.cpp | 8 ++-- src/torchcodec/decoders/_core/VideoDecoder.h | 19 +++++----- .../decoders/_core/VideoDecoderOps.cpp | 4 +- test/decoders/VideoDecoderTest.cpp | 37 ++++++++++--------- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/benchmarks/decoders/BenchmarkDecodersMain.cpp b/benchmarks/decoders/BenchmarkDecodersMain.cpp index 5be64e6d6..c1b15bafb 100644 --- a/benchmarks/decoders/BenchmarkDecodersMain.cpp +++ b/benchmarks/decoders/BenchmarkDecodersMain.cpp @@ -63,7 +63,7 @@ void runNDecodeIterations( decoder->addVideoStreamDecoder(-1); for (double pts : ptsList) { decoder->setCursorPtsInSeconds(pts); - torch::Tensor tensor = decoder->getNextDecodedOutput().frame; + torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; } if (i + 1 == warmupIterations) { start = std::chrono::high_resolution_clock::now(); @@ -95,7 +95,7 @@ void runNdecodeIterationsGrabbingConsecutiveFrames( VideoDecoder::createFromFilePath(videoPath); decoder->addVideoStreamDecoder(-1); for (int j = 0; j < consecutiveFrameCount; ++j) { - torch::Tensor tensor = decoder->getNextDecodedOutput().frame; + torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; } if (i + 1 == warmupIterations) { start = std::chrono::high_resolution_clock::now(); diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index faccce3cb..0d0821648 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -761,7 +761,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter( if (activeStreamIndices_.size() == 0) { throw std::runtime_error("No active streams configured."); } - VLOG(9) << "Starting getNextDecodedOutput()"; + VLOG(9) << "Starting getNextDecodedOutputNoDemux()"; resetDecodeStats(); if (maybeDesiredPts_.has_value()) { VLOG(9) << "maybeDesiredPts_=" << *maybeDesiredPts_; @@ -922,7 +922,7 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput( return output; } -VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestamp( +VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestampNoDemux( double seconds) { for (auto& [streamIndex, stream] : streams_) { double frameStartTime = ptsToSeconds(stream.currentPts, stream.timeBase); @@ -981,7 +981,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex( } int64_t pts = stream.allFrames[frameIndex].pts; setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase)); - return getNextDecodedOutput(); + return getNextDecodedOutputNoDemux(); } VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndexes( @@ -1134,7 +1134,7 @@ VideoDecoder::getFramesDisplayedByTimestampInRange( return output; } -VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutput() { +VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux() { return getDecodedOutputWithFilter( [this](int frameStreamIndex, AVFrame* frame) { StreamInfo& activeStream = streams_[frameStreamIndex]; diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index 0ef3757f1..202946840 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.h +++ b/src/torchcodec/decoders/_core/VideoDecoder.h @@ -153,8 +153,8 @@ class VideoDecoder { // ---- SINGLE FRAME SEEK AND DECODING API ---- // Places the cursor at the first frame on or after the position in seconds. - // Calling getNextFrameAsTensor() will return the first frame at or after this - // position. + // Calling getNextDecodedOutputNoDemux() will return the first frame at or + // after this position. void setCursorPtsInSeconds(double seconds); struct DecodedOutput { // The actual decoded output as a Tensor. @@ -180,13 +180,14 @@ class VideoDecoder { }; // Decodes the frame where the current cursor position is. It also advances // the cursor to the next frame. - DecodedOutput getNextDecodedOutput(); - // Decodes the frame that is visible at a given timestamp. Frames in the video - // have a presentation timestamp and a duration. For example, if a frame has - // presentation timestamp of 5.0s and a duration of 1.0s, it will be visible - // in the timestamp range [5.0, 6.0). i.e. it will be returned when this - // function is called with seconds=5.0 or seconds=5.999, etc. - DecodedOutput getFrameDisplayedAtTimestamp(double seconds); + DecodedOutput getNextDecodedOutputNoDemux(); + // Decodes the first frame in any added stream that is visible at a given + // timestamp. Frames in the video have a presentation timestamp and a + // duration. For example, if a frame has presentation timestamp of 5.0s and a + // duration of 1.0s, it will be visible in the timestamp range [5.0, 6.0). + // i.e. it will be returned when this function is called with seconds=5.0 or + // seconds=5.999, etc. + DecodedOutput getFrameDisplayedAtTimestampNoDemux(double seconds); DecodedOutput getFrameAtIndex(int streamIndex, int64_t frameIndex); struct BatchDecodedOutput { torch::Tensor frames; diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp index b7bcee23d..5856388db 100644 --- a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp @@ -145,7 +145,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) { auto videoDecoder = unwrapTensorToGetDecoder(decoder); VideoDecoder::DecodedOutput result; try { - result = videoDecoder->getNextDecodedOutput(); + result = videoDecoder->getNextDecodedOutputNoDemux(); } catch (const VideoDecoder::EndOfFileException& e) { throw pybind11::stop_iteration(e.what()); } @@ -159,7 +159,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) { OpsDecodedOutput get_frame_at_pts(at::Tensor& decoder, double seconds) { auto videoDecoder = unwrapTensorToGetDecoder(decoder); - auto result = videoDecoder->getFrameDisplayedAtTimestamp(seconds); + auto result = videoDecoder->getFrameDisplayedAtTimestampNoDemux(seconds); return makeOpsDecodedOutput(result); } diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp index 1fe19316a..04cbed0a6 100644 --- a/test/decoders/VideoDecoderTest.cpp +++ b/test/decoders/VideoDecoderTest.cpp @@ -152,7 +152,7 @@ TEST(VideoDecoderTest, RespectsWidthAndHeightFromOptions) { streamOptions.width = 100; streamOptions.height = 120; decoder->addVideoStreamDecoder(-1, streamOptions); - torch::Tensor tensor = decoder->getNextDecodedOutput().frame; + torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; EXPECT_EQ(tensor.sizes(), std::vector({3, 120, 100})); } @@ -163,7 +163,7 @@ TEST(VideoDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) { VideoDecoder::VideoStreamDecoderOptions streamOptions; streamOptions.dimensionOrder = "NHWC"; decoder->addVideoStreamDecoder(-1, streamOptions); - torch::Tensor tensor = decoder->getNextDecodedOutput().frame; + torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; EXPECT_EQ(tensor.sizes(), std::vector({270, 480, 3})); } @@ -172,12 +172,12 @@ TEST_P(VideoDecoderTest, ReturnsFirstTwoFramesOfVideo) { std::unique_ptr ourDecoder = createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); - auto output = ourDecoder->getNextDecodedOutput(); + auto output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor0FromOurDecoder = output.frame; EXPECT_EQ(tensor0FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 0.0); EXPECT_EQ(output.pts, 0); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor1FromOurDecoder = output.frame; EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000); @@ -219,12 +219,12 @@ TEST(GPUVideoDecoderTest, ReturnsFirstTwoFramesOfVideo) { ASSERT_TRUE(streamOptions.device.is_cuda()); ASSERT_EQ(streamOptions.device.type(), torch::DeviceType::CUDA); ourDecoder->addVideoStreamDecoder(-1, streamOptions); - auto output = ourDecoder->getNextDecodedOutput(); + auto output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor1FromOurDecoder = output.frame; EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 0.0); EXPECT_EQ(output.pts, 0); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor2FromOurDecoder = output.frame; EXPECT_EQ(tensor2FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000); @@ -306,11 +306,11 @@ TEST_P(VideoDecoderTest, SeeksCloseToEof) { createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); ourDecoder->setCursorPtsInSeconds(388388. / 30'000); - auto output = ourDecoder->getNextDecodedOutput(); + auto output = ourDecoder->getNextDecodedOutputNoDemux(); EXPECT_EQ(output.ptsSeconds, 388'388. / 30'000); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000); - EXPECT_THROW(ourDecoder->getNextDecodedOutput(), std::exception); + EXPECT_THROW(ourDecoder->getNextDecodedOutputNoDemux(), std::exception); } TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) { @@ -318,18 +318,19 @@ TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) { std::unique_ptr ourDecoder = createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); - auto output = ourDecoder->getFrameDisplayedAtTimestamp(6.006); + auto output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(6.006); EXPECT_EQ(output.ptsSeconds, 6.006); // The frame's duration is 0.033367 according to ffprobe, // so the next frame is displayed at timestamp=6.039367. const double kNextFramePts = 6.039366666666667; // The frame that is displayed a microsecond before the next frame is still // the previous frame. - output = ourDecoder->getFrameDisplayedAtTimestamp(kNextFramePts - 1e-6); + output = + ourDecoder->getFrameDisplayedAtTimestampNoDemux(kNextFramePts - 1e-6); EXPECT_EQ(output.ptsSeconds, 6.006); // The frame that is displayed at the exact pts of the frame is the next // frame. - output = ourDecoder->getFrameDisplayedAtTimestamp(kNextFramePts); + output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(kNextFramePts); EXPECT_EQ(output.ptsSeconds, kNextFramePts); // This is the timestamp of the last frame in this video. @@ -339,7 +340,7 @@ TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) { kPtsOfLastFrameInVideoStream + kDurationOfLastFrameInVideoStream; // Sanity check: make sure duration is strictly positive. EXPECT_GT(kPtsPlusDurationOfLastFrame, kPtsOfLastFrameInVideoStream); - output = ourDecoder->getFrameDisplayedAtTimestamp( + output = ourDecoder->getFrameDisplayedAtTimestampNoDemux( kPtsPlusDurationOfLastFrame - 1e-6); EXPECT_EQ(output.ptsSeconds, kPtsOfLastFrameInVideoStream); } @@ -350,7 +351,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); ourDecoder->setCursorPtsInSeconds(6.0); - auto output = ourDecoder->getNextDecodedOutput(); + auto output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor6FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000); torch::Tensor tensor6FromFFMPEG = @@ -366,7 +367,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 180); ourDecoder->setCursorPtsInSeconds(6.1); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor61FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 183'183. / 30'000); torch::Tensor tensor61FromFFMPEG = @@ -386,7 +387,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_LT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 10); ourDecoder->setCursorPtsInSeconds(10.0); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor10FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 300'300. / 30'000); torch::Tensor tensor10FromFFMPEG = @@ -403,7 +404,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 60); ourDecoder->setCursorPtsInSeconds(6.0); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); tensor6FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000); EXPECT_TRUE(torch::equal(tensor6FromOurDecoder, tensor6FromFFMPEG)); @@ -418,7 +419,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { constexpr double kPtsOfLastFrameInVideoStream = 389'389. / 30'000; // ~12.9 ourDecoder->setCursorPtsInSeconds(kPtsOfLastFrameInVideoStream); - output = ourDecoder->getNextDecodedOutput(); + output = ourDecoder->getNextDecodedOutputNoDemux(); torch::Tensor tensor7FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000); torch::Tensor tensor7FromFFMPEG =