From 7c37e9a5c8f0c0a894640da46acc00fea2b00597 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 29 Oct 2024 13:10:36 +0000 Subject: [PATCH 1/8] Remove call from convertAVFrameToDecodedOutputOnCPU --- packaging/check_glibcxx.py | 4 +- .../decoders/_core/VideoDecoder.cpp | 77 +++++++++---------- src/torchcodec/decoders/_core/VideoDecoder.h | 2 + .../decoders/_core/VideoDecoderOps.cpp | 2 + 4 files changed, 45 insertions(+), 40 deletions(-) diff --git a/packaging/check_glibcxx.py b/packaging/check_glibcxx.py index 37ff654c7..b7efd9813 100644 --- a/packaging/check_glibcxx.py +++ b/packaging/check_glibcxx.py @@ -46,7 +46,9 @@ all_symbols.add(match.group(0)) if not all_symbols: - raise ValueError(f"No GLIBCXX symbols found in {symbol_matches}. Something is wrong.") + raise ValueError( + f"No GLIBCXX symbols found in {symbol_matches}. Something is wrong." + ) all_versions = (symbol.split("_")[1].split(".") for symbol in all_symbols) all_versions = (tuple(int(v) for v in version) for version in all_versions) diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index 76c744936..bee385e7f 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -34,31 +34,6 @@ double ptsToSeconds(int64_t pts, const AVRational& timeBase) { return ptsToSeconds(pts, timeBase.den); } -// Returns a [N]CHW *view* of a [N]HWC input tensor, if the options require so. -// The [N] leading batch-dimension is optional i.e. the input tensor can be 3D -// or 4D. -// Calling permute() is guaranteed to return a view as per the docs: -// https://pytorch.org/docs/stable/generated/torch.permute.html -torch::Tensor MaybePermuteHWC2CHW( - const VideoDecoder::VideoStreamDecoderOptions& options, - torch::Tensor& hwcTensor) { - if (options.dimensionOrder == "NHWC") { - return hwcTensor; - } - auto numDimensions = hwcTensor.dim(); - auto shape = hwcTensor.sizes(); - if (numDimensions == 3) { - TORCH_CHECK(shape[2] == 3, "Not a HWC tensor: ", shape); - return hwcTensor.permute({2, 0, 1}); - } else if (numDimensions == 4) { - TORCH_CHECK(shape[3] == 3, "Not a NHWC tensor: ", shape); - return hwcTensor.permute({0, 3, 1, 2}); - } else { - TORCH_CHECK( - false, "Expected tensor with 3 or 4 dimensions, got ", numDimensions); - } -} - struct AVInput { UniqueAVFormatContext formatContext; std::unique_ptr ioBytesContext; @@ -136,6 +111,31 @@ VideoDecoder::ColorConversionLibrary getDefaultColorConversionLibraryForWidth( } // namespace +// Returns a [N]CHW *view* of a [N]HWC input tensor, if the options require so. +// The [N] leading batch-dimension is optional i.e. the input tensor can be 3D +// or 4D. 
+// Calling permute() is guaranteed to return a view as per the docs: +// https://pytorch.org/docs/stable/generated/torch.permute.html +torch::Tensor VideoDecoder::MaybePermuteHWC2CHW( + int streamIndex, + torch::Tensor& hwcTensor) { + if (streams_[streamIndex].options.dimensionOrder == "NHWC") { + return hwcTensor; + } + auto numDimensions = hwcTensor.dim(); + auto shape = hwcTensor.sizes(); + if (numDimensions == 3) { + TORCH_CHECK(shape[2] == 3, "Not a HWC tensor: ", shape); + return hwcTensor.permute({2, 0, 1}); + } else if (numDimensions == 4) { + TORCH_CHECK(shape[3] == 3, "Not a NHWC tensor: ", shape); + return hwcTensor.permute({0, 3, 1, 2}); + } else { + TORCH_CHECK( + false, "Expected tensor with 3 or 4 dimensions, got ", numDimensions); + } +} + VideoDecoder::VideoStreamDecoderOptions::VideoStreamDecoderOptions( const std::string& optionsString) { std::vector tokens = @@ -929,14 +929,6 @@ void VideoDecoder::convertAVFrameToDecodedOutputOnCPU( "Invalid color conversion library: " + std::to_string(static_cast(streamInfo.colorConversionLibrary))); } - if (!preAllocatedOutputTensor.has_value()) { - // We only convert to CHW if a pre-allocated tensor wasn't passed. When a - // pre-allocated tensor is passed, it's up to the caller (typically a - // batch API) to do the conversion. This is more efficient as it allows - // batch NHWC tensors to be permuted only once, instead of permuting HWC - // tensors N times. - output.frame = MaybePermuteHWC2CHW(streamInfo.options, output.frame); - } } else if (output.streamType == AVMEDIA_TYPE_AUDIO) { // TODO: https://github.com/pytorch-labs/torchcodec/issues/85 implement @@ -978,7 +970,9 @@ VideoDecoder::DecodedOutput VideoDecoder::getFramePlayedAtTimestampNoDemux( return seconds >= frameStartTime && seconds < frameEndTime; }); // Convert the frame to tensor. - return convertAVFrameToDecodedOutput(rawOutput); + auto output = convertAVFrameToDecodedOutput(rawOutput); + output.frame = MaybePermuteHWC2CHW(output.streamIndex, output.frame); + return output; } void VideoDecoder::validateUserProvidedStreamIndex(uint64_t streamIndex) { @@ -1023,7 +1017,12 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex( int64_t pts = stream.allFrames[frameIndex].pts; setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase)); - return getNextDecodedOutputNoDemux(preAllocatedOutputTensor); + auto output = getNextDecodedOutputNoDemux(preAllocatedOutputTensor); + + if (!preAllocatedOutputTensor.has_value()) { + output.frame = MaybePermuteHWC2CHW(streamIndex, output.frame); + } + return output; } VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices( @@ -1080,7 +1079,7 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices( } previousIndexInVideo = indexInVideo; } - output.frames = MaybePermuteHWC2CHW(options, output.frames); + output.frames = MaybePermuteHWC2CHW(streamIndex, output.frames); return output; } @@ -1154,7 +1153,7 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesInRange( output.ptsSeconds[f] = singleOut.ptsSeconds; output.durationSeconds[f] = singleOut.durationSeconds; } - output.frames = MaybePermuteHWC2CHW(options, output.frames); + output.frames = MaybePermuteHWC2CHW(streamIndex, output.frames); return output; } @@ -1207,7 +1206,7 @@ VideoDecoder::getFramesPlayedByTimestampInRange( // need this special case below. 
if (startSeconds == stopSeconds) { BatchDecodedOutput output(0, options, streamMetadata); - output.frames = MaybePermuteHWC2CHW(options, output.frames); + output.frames = MaybePermuteHWC2CHW(streamIndex, output.frames); return output; } @@ -1247,7 +1246,7 @@ VideoDecoder::getFramesPlayedByTimestampInRange( output.ptsSeconds[f] = singleOut.ptsSeconds; output.durationSeconds[f] = singleOut.durationSeconds; } - output.frames = MaybePermuteHWC2CHW(options, output.frames); + output.frames = MaybePermuteHWC2CHW(streamIndex, output.frames); return output; } diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index c6b70c895..2d1a84d1c 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.h +++ b/src/torchcodec/decoders/_core/VideoDecoder.h @@ -157,6 +157,8 @@ class VideoDecoder { int streamIndex, const AudioStreamDecoderOptions& options = AudioStreamDecoderOptions()); + torch::Tensor MaybePermuteHWC2CHW(int streamIndex, torch::Tensor& hwcTensor); + // ---- SINGLE FRAME SEEK AND DECODING API ---- // Places the cursor at the first frame on or after the position in seconds. // Calling getNextDecodedOutputNoDemux() will return the first frame at or diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp index a769c8b53..755a63a62 100644 --- a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp @@ -202,6 +202,8 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) { "image_size is unexpected. Expected 3, got: " + std::to_string(result.frame.sizes().size())); } + result.frame = + videoDecoder->MaybePermuteHWC2CHW(result.streamIndex, result.frame); return makeOpsDecodedOutput(result); } From f273b524226c64192294c116c1a79535b81a52e3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 29 Oct 2024 13:19:21 +0000 Subject: [PATCH 2/8] Nit --- src/torchcodec/decoders/_core/VideoDecoder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index bee385e7f..dc5b33fb6 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -1282,7 +1282,7 @@ double VideoDecoder::getPtsSecondsForFrame( int streamIndex, int64_t frameIndex) { validateUserProvidedStreamIndex(streamIndex); - validateScannedAllStreams("getFrameAtIndex"); + validateScannedAllStreams("getPtsSecondsForFrame"); const auto& stream = streams_[streamIndex]; validateFrameIndex(stream, frameIndex); From 639952a8daae015f61846c2cad6171e7b69b8dc3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 29 Oct 2024 13:32:50 +0000 Subject: [PATCH 3/8] Slighlty better --- src/torchcodec/decoders/_core/VideoDecoder.cpp | 6 ++++++ src/torchcodec/decoders/_core/VideoDecoder.h | 1 + src/torchcodec/decoders/_core/VideoDecoderOps.cpp | 4 +--- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index dc5b33fb6..9a3308c9a 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -1260,6 +1260,12 @@ VideoDecoder::RawDecodedOutput VideoDecoder::getNextRawDecodedOutputNoDemux() { return rawOutput; } +VideoDecoder::DecodedOutput VideoDecoder::getNextFrame() { + auto output = getNextDecodedOutputNoDemux(); + output.frame = MaybePermuteHWC2CHW(output.streamIndex, output.frame); + 
return output; +} + VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux( std::optional preAllocatedOutputTensor) { auto rawOutput = getNextRawDecodedOutputNoDemux(); diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index 2d1a84d1c..87b61791a 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.h +++ b/src/torchcodec/decoders/_core/VideoDecoder.h @@ -216,6 +216,7 @@ class VideoDecoder { }; // Decodes the frame where the current cursor position is. It also advances // the cursor to the next frame. + DecodedOutput getNextFrame(); DecodedOutput getNextDecodedOutputNoDemux( std::optional preAllocatedOutputTensor = std::nullopt); // Decodes the first frame in any added stream that is visible at a given diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp index 755a63a62..b66f92681 100644 --- a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp @@ -193,7 +193,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) { auto videoDecoder = unwrapTensorToGetDecoder(decoder); VideoDecoder::DecodedOutput result; try { - result = videoDecoder->getNextDecodedOutputNoDemux(); + result = videoDecoder->getNextFrame(); } catch (const VideoDecoder::EndOfFileException& e) { C10_THROW_ERROR(IndexError, e.what()); } @@ -202,8 +202,6 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) { "image_size is unexpected. Expected 3, got: " + std::to_string(result.frame.sizes().size())); } - result.frame = - videoDecoder->MaybePermuteHWC2CHW(result.streamIndex, result.frame); return makeOpsDecodedOutput(result); } From 3ae34c79219ac7fa6ca48059b45d6421b71702f8 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 29 Oct 2024 13:48:18 +0000 Subject: [PATCH 4/8] No more if --- .../decoders/_core/VideoDecoder.cpp | 25 ++++++++++++------- src/torchcodec/decoders/_core/VideoDecoder.h | 5 ++++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index 9a3308c9a..2d5ad72f0 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -1009,6 +1009,16 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex( int streamIndex, int64_t frameIndex, std::optional preAllocatedOutputTensor) { + auto output = getFrameAtIndexInternal( + streamIndex, frameIndex, preAllocatedOutputTensor); + output.frame = MaybePermuteHWC2CHW(streamIndex, output.frame); + return output; +} + +VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndexInternal( + int streamIndex, + int64_t frameIndex, + std::optional preAllocatedOutputTensor) { validateUserProvidedStreamIndex(streamIndex); validateScannedAllStreams("getFrameAtIndex"); @@ -1017,12 +1027,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex( int64_t pts = stream.allFrames[frameIndex].pts; setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase)); - auto output = getNextDecodedOutputNoDemux(preAllocatedOutputTensor); - - if (!preAllocatedOutputTensor.has_value()) { - output.frame = MaybePermuteHWC2CHW(streamIndex, output.frame); - } - return output; + return getNextDecodedOutputNoDemux(preAllocatedOutputTensor); } VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices( @@ -1072,7 +1077,7 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices( output.durationSeconds[indexInOutput] = 
output.durationSeconds[previousIndexInOutput]; } else { - DecodedOutput singleOut = getFrameAtIndex( + DecodedOutput singleOut = getFrameAtIndexInternal( streamIndex, indexInVideo, output.frames[indexInOutput]); output.ptsSeconds[indexInOutput] = singleOut.ptsSeconds; output.durationSeconds[indexInOutput] = singleOut.durationSeconds; @@ -1149,7 +1154,8 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesInRange( BatchDecodedOutput output(numOutputFrames, options, streamMetadata); for (int64_t i = start, f = 0; i < stop; i += step, ++f) { - DecodedOutput singleOut = getFrameAtIndex(streamIndex, i, output.frames[f]); + DecodedOutput singleOut = + getFrameAtIndexInternal(streamIndex, i, output.frames[f]); output.ptsSeconds[f] = singleOut.ptsSeconds; output.durationSeconds[f] = singleOut.durationSeconds; } @@ -1242,7 +1248,8 @@ VideoDecoder::getFramesPlayedByTimestampInRange( int64_t numFrames = stopFrameIndex - startFrameIndex; BatchDecodedOutput output(numFrames, options, streamMetadata); for (int64_t i = startFrameIndex, f = 0; i < stopFrameIndex; ++i, ++f) { - DecodedOutput singleOut = getFrameAtIndex(streamIndex, i, output.frames[f]); + DecodedOutput singleOut = + getFrameAtIndexInternal(streamIndex, i, output.frames[f]); output.ptsSeconds[f] = singleOut.ptsSeconds; output.durationSeconds[f] = singleOut.durationSeconds; } diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index 87b61791a..b0d1b063e 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.h +++ b/src/torchcodec/decoders/_core/VideoDecoder.h @@ -389,6 +389,11 @@ class VideoDecoder { DecodedOutput& output, std::optional preAllocatedOutputTensor = std::nullopt); + DecodedOutput getFrameAtIndexInternal( + int streamIndex, + int64_t frameIndex, + std::optional preAllocatedOutputTensor = std::nullopt); + DecoderOptions options_; ContainerMetadata containerMetadata_; UniqueAVFormatContext formatContext_; From 80192190ceeeea9dbe29d97f2fefd26183f1ea88 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 29 Oct 2024 13:58:05 +0000 Subject: [PATCH 5/8] Make getNextDecodedOutputNoDemux private --- src/torchcodec/decoders/_core/VideoDecoder.h | 4 ++-- test/decoders/VideoDecoderTest.cpp | 24 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index b0d1b063e..fbf8177dc 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.h +++ b/src/torchcodec/decoders/_core/VideoDecoder.h @@ -217,8 +217,6 @@ class VideoDecoder { // Decodes the frame where the current cursor position is. It also advances // the cursor to the next frame. DecodedOutput getNextFrame(); - DecodedOutput getNextDecodedOutputNoDemux( - std::optional preAllocatedOutputTensor = std::nullopt); // Decodes the first frame in any added stream that is visible at a given // timestamp. Frames in the video have a presentation timestamp and a // duration. 
For example, if a frame has presentation timestamp of 5.0s and a @@ -393,6 +391,8 @@ class VideoDecoder { int streamIndex, int64_t frameIndex, std::optional preAllocatedOutputTensor = std::nullopt); + DecodedOutput getNextDecodedOutputNoDemux( + std::optional preAllocatedOutputTensor = std::nullopt); DecoderOptions options_; ContainerMetadata containerMetadata_; diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp index 389d47f4f..bba480650 100644 --- a/test/decoders/VideoDecoderTest.cpp +++ b/test/decoders/VideoDecoderTest.cpp @@ -148,7 +148,7 @@ TEST(VideoDecoderTest, RespectsWidthAndHeightFromOptions) { streamOptions.width = 100; streamOptions.height = 120; decoder->addVideoStreamDecoder(-1, streamOptions); - torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; + torch::Tensor tensor = decoder->getNextFrame().frame; EXPECT_EQ(tensor.sizes(), std::vector({3, 120, 100})); } @@ -159,7 +159,7 @@ TEST(VideoDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) { VideoDecoder::VideoStreamDecoderOptions streamOptions; streamOptions.dimensionOrder = "NHWC"; decoder->addVideoStreamDecoder(-1, streamOptions); - torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; + torch::Tensor tensor = decoder->getNextFrame().frame; EXPECT_EQ(tensor.sizes(), std::vector({270, 480, 3})); } @@ -168,12 +168,12 @@ TEST_P(VideoDecoderTest, ReturnsFirstTwoFramesOfVideo) { std::unique_ptr ourDecoder = createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); - auto output = ourDecoder->getNextDecodedOutputNoDemux(); + auto output = ourDecoder->getNextFrame(); torch::Tensor tensor0FromOurDecoder = output.frame; EXPECT_EQ(tensor0FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 0.0); EXPECT_EQ(output.pts, 0); - output = ourDecoder->getNextDecodedOutputNoDemux(); + output = ourDecoder->getNextFrame(); torch::Tensor tensor1FromOurDecoder = output.frame; EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000); @@ -254,11 +254,11 @@ TEST_P(VideoDecoderTest, SeeksCloseToEof) { createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); ourDecoder->setCursorPtsInSeconds(388388. / 30'000); - auto output = ourDecoder->getNextDecodedOutputNoDemux(); + auto output = ourDecoder->getNextFrame(); EXPECT_EQ(output.ptsSeconds, 388'388. / 30'000); - output = ourDecoder->getNextDecodedOutputNoDemux(); + output = ourDecoder->getNextFrame(); EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000); - EXPECT_THROW(ourDecoder->getNextDecodedOutputNoDemux(), std::exception); + EXPECT_THROW(ourDecoder->getNextFrame(), std::exception); } TEST_P(VideoDecoderTest, GetsFramePlayedAtTimestamp) { @@ -298,7 +298,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); ourDecoder->setCursorPtsInSeconds(6.0); - auto output = ourDecoder->getNextDecodedOutputNoDemux(); + auto output = ourDecoder->getNextFrame(); torch::Tensor tensor6FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 180'180. 
/ 30'000); torch::Tensor tensor6FromFFMPEG = @@ -314,7 +314,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 180); ourDecoder->setCursorPtsInSeconds(6.1); - output = ourDecoder->getNextDecodedOutputNoDemux(); + output = ourDecoder->getNextFrame(); torch::Tensor tensor61FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 183'183. / 30'000); torch::Tensor tensor61FromFFMPEG = @@ -334,7 +334,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_LT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 10); ourDecoder->setCursorPtsInSeconds(10.0); - output = ourDecoder->getNextDecodedOutputNoDemux(); + output = ourDecoder->getNextFrame(); torch::Tensor tensor10FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 300'300. / 30'000); torch::Tensor tensor10FromFFMPEG = @@ -351,7 +351,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 60); ourDecoder->setCursorPtsInSeconds(6.0); - output = ourDecoder->getNextDecodedOutputNoDemux(); + output = ourDecoder->getNextFrame(); tensor6FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000); EXPECT_TRUE(torch::equal(tensor6FromOurDecoder, tensor6FromFFMPEG)); @@ -366,7 +366,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { constexpr double kPtsOfLastFrameInVideoStream = 389'389. / 30'000; // ~12.9 ourDecoder->setCursorPtsInSeconds(kPtsOfLastFrameInVideoStream); - output = ourDecoder->getNextDecodedOutputNoDemux(); + output = ourDecoder->getNextFrame(); torch::Tensor tensor7FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000); torch::Tensor tensor7FromFFMPEG = From dc40ad6b4a5c0169d3a07a0209c31708a34bbfa3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 29 Oct 2024 14:27:23 +0000 Subject: [PATCH 6/8] Fix CUDA tests? 
--- src/torchcodec/decoders/_core/CudaDevice.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/torchcodec/decoders/_core/CudaDevice.cpp b/src/torchcodec/decoders/_core/CudaDevice.cpp index dea0e7293..5da3f4928 100644 --- a/src/torchcodec/decoders/_core/CudaDevice.cpp +++ b/src/torchcodec/decoders/_core/CudaDevice.cpp @@ -240,11 +240,6 @@ void convertAVFrameToDecodedOutputOnCuda( std::chrono::duration duration = end - start; VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width << " took: " << duration.count() << "us" << std::endl; - if (options.dimensionOrder == "NCHW") { - // The docs guaranty this to return a view: - // https://pytorch.org/docs/stable/generated/torch.permute.html - dst = dst.permute({2, 0, 1}); - } } } // namespace facebook::torchcodec From 6bd363d293a517636a098bb691a935cc686a5110 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 29 Oct 2024 16:00:55 +0000 Subject: [PATCH 7/8] getNextFrame -> getNextFrameNoDemux --- benchmarks/decoders/BenchmarkDecodersMain.cpp | 4 ++-- .../decoders/_core/VideoDecoder.cpp | 2 +- src/torchcodec/decoders/_core/VideoDecoder.h | 2 +- .../decoders/_core/VideoDecoderOps.cpp | 2 +- test/decoders/VideoDecoderTest.cpp | 24 +++++++++---------- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/benchmarks/decoders/BenchmarkDecodersMain.cpp b/benchmarks/decoders/BenchmarkDecodersMain.cpp index 25d518de4..fd6b137dc 100644 --- a/benchmarks/decoders/BenchmarkDecodersMain.cpp +++ b/benchmarks/decoders/BenchmarkDecodersMain.cpp @@ -63,7 +63,7 @@ void runNDecodeIterations( decoder->addVideoStreamDecoder(-1); for (double pts : ptsList) { decoder->setCursorPtsInSeconds(pts); - torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; + torch::Tensor tensor = decoder->getNextFrameNoDemux().frame; } if (i + 1 == warmupIterations) { start = std::chrono::high_resolution_clock::now(); @@ -95,7 +95,7 @@ void runNdecodeIterationsGrabbingConsecutiveFrames( VideoDecoder::createFromFilePath(videoPath); decoder->addVideoStreamDecoder(-1); for (int j = 0; j < consecutiveFrameCount; ++j) { - torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame; + torch::Tensor tensor = decoder->getNextFrameNoDemux().frame; } if (i + 1 == warmupIterations) { start = std::chrono::high_resolution_clock::now(); diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index 2d5ad72f0..54e35387a 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -1267,7 +1267,7 @@ VideoDecoder::RawDecodedOutput VideoDecoder::getNextRawDecodedOutputNoDemux() { return rawOutput; } -VideoDecoder::DecodedOutput VideoDecoder::getNextFrame() { +VideoDecoder::DecodedOutput VideoDecoder::getNextFrameNoDemux() { auto output = getNextDecodedOutputNoDemux(); output.frame = MaybePermuteHWC2CHW(output.streamIndex, output.frame); return output; diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index fbf8177dc..22af498ef 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.h +++ b/src/torchcodec/decoders/_core/VideoDecoder.h @@ -216,7 +216,7 @@ class VideoDecoder { }; // Decodes the frame where the current cursor position is. It also advances // the cursor to the next frame. - DecodedOutput getNextFrame(); + DecodedOutput getNextFrameNoDemux(); // Decodes the first frame in any added stream that is visible at a given // timestamp. 
Frames in the video have a presentation timestamp and a // duration. For example, if a frame has presentation timestamp of 5.0s and a diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp index b66f92681..7117ccab4 100644 --- a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp @@ -193,7 +193,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) { auto videoDecoder = unwrapTensorToGetDecoder(decoder); VideoDecoder::DecodedOutput result; try { - result = videoDecoder->getNextFrame(); + result = videoDecoder->getNextFrameNoDemux(); } catch (const VideoDecoder::EndOfFileException& e) { C10_THROW_ERROR(IndexError, e.what()); } diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp index bba480650..5f2e62203 100644 --- a/test/decoders/VideoDecoderTest.cpp +++ b/test/decoders/VideoDecoderTest.cpp @@ -148,7 +148,7 @@ TEST(VideoDecoderTest, RespectsWidthAndHeightFromOptions) { streamOptions.width = 100; streamOptions.height = 120; decoder->addVideoStreamDecoder(-1, streamOptions); - torch::Tensor tensor = decoder->getNextFrame().frame; + torch::Tensor tensor = decoder->getNextFrameNoDemux().frame; EXPECT_EQ(tensor.sizes(), std::vector({3, 120, 100})); } @@ -159,7 +159,7 @@ TEST(VideoDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) { VideoDecoder::VideoStreamDecoderOptions streamOptions; streamOptions.dimensionOrder = "NHWC"; decoder->addVideoStreamDecoder(-1, streamOptions); - torch::Tensor tensor = decoder->getNextFrame().frame; + torch::Tensor tensor = decoder->getNextFrameNoDemux().frame; EXPECT_EQ(tensor.sizes(), std::vector({270, 480, 3})); } @@ -168,12 +168,12 @@ TEST_P(VideoDecoderTest, ReturnsFirstTwoFramesOfVideo) { std::unique_ptr ourDecoder = createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); - auto output = ourDecoder->getNextFrame(); + auto output = ourDecoder->getNextFrameNoDemux(); torch::Tensor tensor0FromOurDecoder = output.frame; EXPECT_EQ(tensor0FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 0.0); EXPECT_EQ(output.pts, 0); - output = ourDecoder->getNextFrame(); + output = ourDecoder->getNextFrameNoDemux(); torch::Tensor tensor1FromOurDecoder = output.frame; EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector({3, 270, 480})); EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000); @@ -254,11 +254,11 @@ TEST_P(VideoDecoderTest, SeeksCloseToEof) { createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); ourDecoder->setCursorPtsInSeconds(388388. / 30'000); - auto output = ourDecoder->getNextFrame(); + auto output = ourDecoder->getNextFrameNoDemux(); EXPECT_EQ(output.ptsSeconds, 388'388. / 30'000); - output = ourDecoder->getNextFrame(); + output = ourDecoder->getNextFrameNoDemux(); EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000); - EXPECT_THROW(ourDecoder->getNextFrame(), std::exception); + EXPECT_THROW(ourDecoder->getNextFrameNoDemux(), std::exception); } TEST_P(VideoDecoderTest, GetsFramePlayedAtTimestamp) { @@ -298,7 +298,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { createDecoderFromPath(path, GetParam()); ourDecoder->addVideoStreamDecoder(-1); ourDecoder->setCursorPtsInSeconds(6.0); - auto output = ourDecoder->getNextFrame(); + auto output = ourDecoder->getNextFrameNoDemux(); torch::Tensor tensor6FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 180'180. 
/ 30'000); torch::Tensor tensor6FromFFMPEG = @@ -314,7 +314,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 180); ourDecoder->setCursorPtsInSeconds(6.1); - output = ourDecoder->getNextFrame(); + output = ourDecoder->getNextFrameNoDemux(); torch::Tensor tensor61FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 183'183. / 30'000); torch::Tensor tensor61FromFFMPEG = @@ -334,7 +334,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_LT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 10); ourDecoder->setCursorPtsInSeconds(10.0); - output = ourDecoder->getNextFrame(); + output = ourDecoder->getNextFrameNoDemux(); torch::Tensor tensor10FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 300'300. / 30'000); torch::Tensor tensor10FromFFMPEG = @@ -351,7 +351,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 60); ourDecoder->setCursorPtsInSeconds(6.0); - output = ourDecoder->getNextFrame(); + output = ourDecoder->getNextFrameNoDemux(); tensor6FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000); EXPECT_TRUE(torch::equal(tensor6FromOurDecoder, tensor6FromFFMPEG)); @@ -366,7 +366,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) { constexpr double kPtsOfLastFrameInVideoStream = 389'389. / 30'000; // ~12.9 ourDecoder->setCursorPtsInSeconds(kPtsOfLastFrameInVideoStream); - output = ourDecoder->getNextFrame(); + output = ourDecoder->getNextFrameNoDemux(); torch::Tensor tensor7FromOurDecoder = output.frame; EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000); torch::Tensor tensor7FromFFMPEG = From 0b6590da0346dfe6cd795136a04e6c3f3c0d64f4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 29 Oct 2024 16:03:08 +0000 Subject: [PATCH 8/8] getNextDecodedOutputNoDemux -> getNextFrameOutputNoDemuxInternal --- src/torchcodec/decoders/_core/VideoDecoder.cpp | 6 +++--- src/torchcodec/decoders/_core/VideoDecoder.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index 54e35387a..94d2b9538 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -1027,7 +1027,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndexInternal( int64_t pts = stream.allFrames[frameIndex].pts; setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase)); - return getNextDecodedOutputNoDemux(preAllocatedOutputTensor); + return getNextFrameOutputNoDemuxInternal(preAllocatedOutputTensor); } VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices( @@ -1268,12 +1268,12 @@ VideoDecoder::RawDecodedOutput VideoDecoder::getNextRawDecodedOutputNoDemux() { } VideoDecoder::DecodedOutput VideoDecoder::getNextFrameNoDemux() { - auto output = getNextDecodedOutputNoDemux(); + auto output = getNextFrameOutputNoDemuxInternal(); output.frame = MaybePermuteHWC2CHW(output.streamIndex, output.frame); return output; } -VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux( +VideoDecoder::DecodedOutput VideoDecoder::getNextFrameOutputNoDemuxInternal( std::optional preAllocatedOutputTensor) { auto rawOutput = getNextRawDecodedOutputNoDemux(); return convertAVFrameToDecodedOutput(rawOutput, preAllocatedOutputTensor); diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index 
22af498ef..7411af187 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.h
+++ b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -161,8 +161,8 @@ class VideoDecoder {
   // ---- SINGLE FRAME SEEK AND DECODING API ----
   // Places the cursor at the first frame on or after the position in seconds.
-  // Calling getNextDecodedOutputNoDemux() will return the first frame at or
-  // after this position.
+  // Calling getNextFrameOutputNoDemuxInternal() will return the first frame at
+  // or after this position.
   void setCursorPtsInSeconds(double seconds);
   // This is an internal structure that is used to store the decoded output
   // from decoding a frame through color conversion. Example usage is:
@@ -391,7 +391,7 @@ class VideoDecoder {
       int streamIndex,
       int64_t frameIndex,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
-  DecodedOutput getNextDecodedOutputNoDemux(
+  DecodedOutput getNextFrameOutputNoDemuxInternal(
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
   DecoderOptions options_;
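
For context, a minimal usage sketch of the public API this series converges on: after patches 7 and 8, getNextFrameNoDemux() (like the other frame-returning entry points) applies MaybePermuteHWC2CHW() itself, while the *Internal helpers keep frames in HWC so the batch APIs can permute a whole batch once. This is only an illustration, not part of the patches; the include path and the video path are assumptions.

#include <iostream>
#include <memory>
#include "src/torchcodec/decoders/_core/VideoDecoder.h"

using facebook::torchcodec::VideoDecoder;

int main() {
  // Same construction pattern as in the benchmark code touched by this series;
  // "video.mp4" is a placeholder path.
  std::unique_ptr<VideoDecoder> decoder =
      VideoDecoder::createFromFilePath("video.mp4");
  decoder->addVideoStreamDecoder(-1);

  // getNextFrameNoDemux() decodes the frame at the current cursor and applies
  // MaybePermuteHWC2CHW(), so the frame comes back CHW unless the stream
  // options asked for "NHWC".
  VideoDecoder::DecodedOutput out = decoder->getNextFrameNoDemux();
  std::cout << out.frame.sizes() << " pts=" << out.ptsSeconds << "\n";

  // Batch entry points (getFramesAtIndices, getFramesInRange, ...) instead go
  // through getFrameAtIndexInternal() with pre-allocated HWC tensors and
  // permute the whole batch to NCHW once at the end, rather than per frame.
  return 0;
}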