From 6121df98e707a8d08b37f0ae6c6ce3693ca4018a Mon Sep 17 00:00:00 2001
From: Ahmad Sharif <ahmads@meta.com>
Date: Fri, 2 Aug 2024 15:09:14 -0700
Subject: [PATCH] [torchcodec] Add a NoDemux suffix to functions that do not
 demux (#147)

Summary:
Pull Request resolved: https://github.com/pytorch/torchcodec/pull/147

This more clearly indicates that the function does not perform demuxing (separating frames into separate streams) and returns outputs from any available stream.

Reviewed By: scotts

Differential Revision: D60687942
---
 benchmarks/decoders/BenchmarkDecodersMain.cpp |  4 +-
 .../decoders/_core/VideoDecoder.cpp           |  8 ++--
 src/torchcodec/decoders/_core/VideoDecoder.h  | 19 +++++-----
 .../decoders/_core/VideoDecoderOps.cpp        |  4 +-
 test/decoders/VideoDecoderTest.cpp            | 37 ++++++++++---------
 5 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/benchmarks/decoders/BenchmarkDecodersMain.cpp b/benchmarks/decoders/BenchmarkDecodersMain.cpp
index 5be64e6d6..c1b15bafb 100644
--- a/benchmarks/decoders/BenchmarkDecodersMain.cpp
+++ b/benchmarks/decoders/BenchmarkDecodersMain.cpp
@@ -63,7 +63,7 @@ void runNDecodeIterations(
     decoder->addVideoStreamDecoder(-1);
     for (double pts : ptsList) {
       decoder->setCursorPtsInSeconds(pts);
-      torch::Tensor tensor = decoder->getNextDecodedOutput().frame;
+      torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame;
     }
     if (i + 1 == warmupIterations) {
       start = std::chrono::high_resolution_clock::now();
@@ -95,7 +95,7 @@ void runNdecodeIterationsGrabbingConsecutiveFrames(
         VideoDecoder::createFromFilePath(videoPath);
     decoder->addVideoStreamDecoder(-1);
     for (int j = 0; j < consecutiveFrameCount; ++j) {
-      torch::Tensor tensor = decoder->getNextDecodedOutput().frame;
+      torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame;
     }
     if (i + 1 == warmupIterations) {
       start = std::chrono::high_resolution_clock::now();
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
index faccce3cb..0d0821648 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -761,7 +761,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter(
   if (activeStreamIndices_.size() == 0) {
     throw std::runtime_error("No active streams configured.");
   }
-  VLOG(9) << "Starting getNextDecodedOutput()";
+  VLOG(9) << "Starting getNextDecodedOutputNoDemux()";
   resetDecodeStats();
   if (maybeDesiredPts_.has_value()) {
     VLOG(9) << "maybeDesiredPts_=" << *maybeDesiredPts_;
@@ -922,7 +922,7 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
   return output;
 }
 
-VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestamp(
+VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestampNoDemux(
     double seconds) {
   for (auto& [streamIndex, stream] : streams_) {
     double frameStartTime = ptsToSeconds(stream.currentPts, stream.timeBase);
@@ -981,7 +981,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex(
   }
   int64_t pts = stream.allFrames[frameIndex].pts;
   setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase));
-  return getNextDecodedOutput();
+  return getNextDecodedOutputNoDemux();
 }
 
 VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndexes(
@@ -1134,7 +1134,7 @@ VideoDecoder::getFramesDisplayedByTimestampInRange(
   return output;
 }
 
-VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutput() {
+VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux() {
   return getDecodedOutputWithFilter(
       [this](int frameStreamIndex, AVFrame* frame) {
         StreamInfo& activeStream = streams_[frameStreamIndex];
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
index 0ef3757f1..202946840 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.h
+++ b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -153,8 +153,8 @@ class VideoDecoder {
 
   // ---- SINGLE FRAME SEEK AND DECODING API ----
   // Places the cursor at the first frame on or after the position in seconds.
-  // Calling getNextFrameAsTensor() will return the first frame at or after this
-  // position.
+  // Calling getNextDecodedOutputNoDemux() will return the first frame at or
+  // after this position.
   void setCursorPtsInSeconds(double seconds);
   struct DecodedOutput {
     // The actual decoded output as a Tensor.
@@ -180,13 +180,14 @@ class VideoDecoder {
   };
   // Decodes the frame where the current cursor position is. It also advances
   // the cursor to the next frame.
-  DecodedOutput getNextDecodedOutput();
-  // Decodes the frame that is visible at a given timestamp. Frames in the video
-  // have a presentation timestamp and a duration. For example, if a frame has
-  // presentation timestamp of 5.0s and a duration of 1.0s, it will be visible
-  // in the timestamp range [5.0, 6.0). i.e. it will be returned when this
-  // function is called with seconds=5.0 or seconds=5.999, etc.
-  DecodedOutput getFrameDisplayedAtTimestamp(double seconds);
+  DecodedOutput getNextDecodedOutputNoDemux();
+  // Decodes the first frame in any added stream that is visible at a given
+  // timestamp. Frames in the video have a presentation timestamp and a
+  // duration. For example, if a frame has presentation timestamp of 5.0s and a
+  // duration of 1.0s, it will be visible in the timestamp range [5.0, 6.0).
+  // i.e. it will be returned when this function is called with seconds=5.0 or
+  // seconds=5.999, etc.
+  DecodedOutput getFrameDisplayedAtTimestampNoDemux(double seconds);
   DecodedOutput getFrameAtIndex(int streamIndex, int64_t frameIndex);
   struct BatchDecodedOutput {
     torch::Tensor frames;
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
index b7bcee23d..5856388db 100644
--- a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -145,7 +145,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
   VideoDecoder::DecodedOutput result;
   try {
-    result = videoDecoder->getNextDecodedOutput();
+    result = videoDecoder->getNextDecodedOutputNoDemux();
   } catch (const VideoDecoder::EndOfFileException& e) {
     throw pybind11::stop_iteration(e.what());
   }
@@ -159,7 +159,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) {
 
 OpsDecodedOutput get_frame_at_pts(at::Tensor& decoder, double seconds) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-  auto result = videoDecoder->getFrameDisplayedAtTimestamp(seconds);
+  auto result = videoDecoder->getFrameDisplayedAtTimestampNoDemux(seconds);
   return makeOpsDecodedOutput(result);
 }
 
diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp
index 1fe19316a..04cbed0a6 100644
--- a/test/decoders/VideoDecoderTest.cpp
+++ b/test/decoders/VideoDecoderTest.cpp
@@ -152,7 +152,7 @@ TEST(VideoDecoderTest, RespectsWidthAndHeightFromOptions) {
   streamOptions.width = 100;
   streamOptions.height = 120;
   decoder->addVideoStreamDecoder(-1, streamOptions);
-  torch::Tensor tensor = decoder->getNextDecodedOutput().frame;
+  torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame;
   EXPECT_EQ(tensor.sizes(), std::vector<long>({3, 120, 100}));
 }
 
@@ -163,7 +163,7 @@ TEST(VideoDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) {
   VideoDecoder::VideoStreamDecoderOptions streamOptions;
   streamOptions.dimensionOrder = "NHWC";
   decoder->addVideoStreamDecoder(-1, streamOptions);
-  torch::Tensor tensor = decoder->getNextDecodedOutput().frame;
+  torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame;
   EXPECT_EQ(tensor.sizes(), std::vector<long>({270, 480, 3}));
 }
 
@@ -172,12 +172,12 @@ TEST_P(VideoDecoderTest, ReturnsFirstTwoFramesOfVideo) {
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
   ourDecoder->addVideoStreamDecoder(-1);
-  auto output = ourDecoder->getNextDecodedOutput();
+  auto output = ourDecoder->getNextDecodedOutputNoDemux();
   torch::Tensor tensor0FromOurDecoder = output.frame;
   EXPECT_EQ(tensor0FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
   EXPECT_EQ(output.ptsSeconds, 0.0);
   EXPECT_EQ(output.pts, 0);
-  output = ourDecoder->getNextDecodedOutput();
+  output = ourDecoder->getNextDecodedOutputNoDemux();
   torch::Tensor tensor1FromOurDecoder = output.frame;
   EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
   EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000);
@@ -219,12 +219,12 @@ TEST(GPUVideoDecoderTest, ReturnsFirstTwoFramesOfVideo) {
   ASSERT_TRUE(streamOptions.device.is_cuda());
   ASSERT_EQ(streamOptions.device.type(), torch::DeviceType::CUDA);
   ourDecoder->addVideoStreamDecoder(-1, streamOptions);
-  auto output = ourDecoder->getNextDecodedOutput();
+  auto output = ourDecoder->getNextDecodedOutputNoDemux();
   torch::Tensor tensor1FromOurDecoder = output.frame;
   EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
   EXPECT_EQ(output.ptsSeconds, 0.0);
   EXPECT_EQ(output.pts, 0);
-  output = ourDecoder->getNextDecodedOutput();
+  output = ourDecoder->getNextDecodedOutputNoDemux();
   torch::Tensor tensor2FromOurDecoder = output.frame;
   EXPECT_EQ(tensor2FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
   EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000);
@@ -306,11 +306,11 @@ TEST_P(VideoDecoderTest, SeeksCloseToEof) {
       createDecoderFromPath(path, GetParam());
   ourDecoder->addVideoStreamDecoder(-1);
   ourDecoder->setCursorPtsInSeconds(388388. / 30'000);
-  auto output = ourDecoder->getNextDecodedOutput();
+  auto output = ourDecoder->getNextDecodedOutputNoDemux();
   EXPECT_EQ(output.ptsSeconds, 388'388. / 30'000);
-  output = ourDecoder->getNextDecodedOutput();
+  output = ourDecoder->getNextDecodedOutputNoDemux();
   EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000);
-  EXPECT_THROW(ourDecoder->getNextDecodedOutput(), std::exception);
+  EXPECT_THROW(ourDecoder->getNextDecodedOutputNoDemux(), std::exception);
 }
 
 TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) {
@@ -318,18 +318,19 @@ TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) {
   std::unique_ptr<VideoDecoder> ourDecoder =
       createDecoderFromPath(path, GetParam());
   ourDecoder->addVideoStreamDecoder(-1);
-  auto output = ourDecoder->getFrameDisplayedAtTimestamp(6.006);
+  auto output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(6.006);
   EXPECT_EQ(output.ptsSeconds, 6.006);
   // The frame's duration is 0.033367 according to ffprobe,
   // so the next frame is displayed at timestamp=6.039367.
   const double kNextFramePts = 6.039366666666667;
   // The frame that is displayed a microsecond before the next frame is still
   // the previous frame.
-  output = ourDecoder->getFrameDisplayedAtTimestamp(kNextFramePts - 1e-6);
+  output =
+      ourDecoder->getFrameDisplayedAtTimestampNoDemux(kNextFramePts - 1e-6);
   EXPECT_EQ(output.ptsSeconds, 6.006);
   // The frame that is displayed at the exact pts of the frame is the next
   // frame.
-  output = ourDecoder->getFrameDisplayedAtTimestamp(kNextFramePts);
+  output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(kNextFramePts);
   EXPECT_EQ(output.ptsSeconds, kNextFramePts);
 
   // This is the timestamp of the last frame in this video.
@@ -339,7 +340,7 @@ TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) {
       kPtsOfLastFrameInVideoStream + kDurationOfLastFrameInVideoStream;
   // Sanity check: make sure duration is strictly positive.
   EXPECT_GT(kPtsPlusDurationOfLastFrame, kPtsOfLastFrameInVideoStream);
-  output = ourDecoder->getFrameDisplayedAtTimestamp(
+  output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(
       kPtsPlusDurationOfLastFrame - 1e-6);
   EXPECT_EQ(output.ptsSeconds, kPtsOfLastFrameInVideoStream);
 }
@@ -350,7 +351,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
       createDecoderFromPath(path, GetParam());
   ourDecoder->addVideoStreamDecoder(-1);
   ourDecoder->setCursorPtsInSeconds(6.0);
-  auto output = ourDecoder->getNextDecodedOutput();
+  auto output = ourDecoder->getNextDecodedOutputNoDemux();
   torch::Tensor tensor6FromOurDecoder = output.frame;
   EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000);
   torch::Tensor tensor6FromFFMPEG =
@@ -366,7 +367,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
   EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 180);
 
   ourDecoder->setCursorPtsInSeconds(6.1);
-  output = ourDecoder->getNextDecodedOutput();
+  output = ourDecoder->getNextDecodedOutputNoDemux();
   torch::Tensor tensor61FromOurDecoder = output.frame;
   EXPECT_EQ(output.ptsSeconds, 183'183. / 30'000);
   torch::Tensor tensor61FromFFMPEG =
@@ -386,7 +387,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
   EXPECT_LT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 10);
 
   ourDecoder->setCursorPtsInSeconds(10.0);
-  output = ourDecoder->getNextDecodedOutput();
+  output = ourDecoder->getNextDecodedOutputNoDemux();
   torch::Tensor tensor10FromOurDecoder = output.frame;
   EXPECT_EQ(output.ptsSeconds, 300'300. / 30'000);
   torch::Tensor tensor10FromFFMPEG =
@@ -403,7 +404,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
   EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 60);
 
   ourDecoder->setCursorPtsInSeconds(6.0);
-  output = ourDecoder->getNextDecodedOutput();
+  output = ourDecoder->getNextDecodedOutputNoDemux();
   tensor6FromOurDecoder = output.frame;
   EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000);
   EXPECT_TRUE(torch::equal(tensor6FromOurDecoder, tensor6FromFFMPEG));
@@ -418,7 +419,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
 
   constexpr double kPtsOfLastFrameInVideoStream = 389'389. / 30'000; // ~12.9
   ourDecoder->setCursorPtsInSeconds(kPtsOfLastFrameInVideoStream);
-  output = ourDecoder->getNextDecodedOutput();
+  output = ourDecoder->getNextDecodedOutputNoDemux();
   torch::Tensor tensor7FromOurDecoder = output.frame;
   EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000);
   torch::Tensor tensor7FromFFMPEG =