From 19bbb9ad974a9da7e770c5d034076e6df4b263da Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Date: Fri, 4 Apr 2025 20:20:11 +0000
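Subject: [PATCH 1/6] Properly name DeviceInterface var member in decoder class

Rename the SingleStreamDecoder::deviceInterface member to deviceInterface_
so that it carries the same trailing underscore as the neighbouring data
members (seekMode_, containerMetadata_, formatContext_, streamInfos_).
This is a pure rename; no functional change.

Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>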
---
 src/torchcodec/_core/SingleStreamDecoder.cpp | 16 ++++++++--------
 src/torchcodec/_core/SingleStreamDecoder.h   |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index c7c714da3..e80851988 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -406,7 +406,7 @@ void SingleStreamDecoder::addStream(
   streamInfo.stream = formatContext_->streams[activeStreamIndex_];
   streamInfo.avMediaType = mediaType;
 
-  deviceInterface = createDeviceInterface(device);
+  deviceInterface_ = createDeviceInterface(device);
 
   // This should never happen, checking just to be safe.
   TORCH_CHECK(
@@ -418,9 +418,9 @@ void SingleStreamDecoder::addStream(
   // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within
   // addStream() which is supposed to be generic
   if (mediaType == AVMEDIA_TYPE_VIDEO) {
-    if (deviceInterface) {
+    if (deviceInterface_) {
       avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
-          deviceInterface->findCodec(streamInfo.stream->codecpar->codec_id)
+          deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id)
               .value_or(avCodec));
     }
   }
@@ -438,8 +438,8 @@ void SingleStreamDecoder::addStream(
 
   // TODO_CODE_QUALITY same as above.
   if (mediaType == AVMEDIA_TYPE_VIDEO) {
-    if (deviceInterface) {
-      deviceInterface->initializeContext(codecContext);
+    if (deviceInterface_) {
+      deviceInterface_->initializeContext(codecContext);
     }
   }
 
@@ -1210,11 +1210,11 @@ SingleStreamDecoder::convertAVFrameToFrameOutput(
       formatContext_->streams[activeStreamIndex_]->time_base);
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
     convertAudioAVFrameToFrameOutputOnCPU(avFrame, frameOutput);
-  } else if (!deviceInterface) {
+  } else if (!deviceInterface_) {
     convertAVFrameToFrameOutputOnCPU(
         avFrame, frameOutput, preAllocatedOutputTensor);
-  } else if (deviceInterface) {
-    deviceInterface->convertAVFrameToFrameOutput(
+  } else if (deviceInterface_) {
+    deviceInterface_->convertAVFrameToFrameOutput(
         streamInfo.videoStreamOptions,
         avFrame,
         frameOutput,
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
index 4879a3b7d..b1dc4fa23 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.h
+++ b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -492,7 +492,7 @@ class SingleStreamDecoder {
   SeekMode seekMode_;
   ContainerMetadata containerMetadata_;
   UniqueDecodingAVFormatContext formatContext_;
-  std::unique_ptr<DeviceInterface> deviceInterface;
+  std::unique_ptr<DeviceInterface> deviceInterface_;
   std::map<int, StreamInfo> streamInfos_;
   const int NO_ACTIVE_STREAM = -2;
   int activeStreamIndex_ = NO_ACTIVE_STREAM;

From 94ba2b50e66e81c9560a561263676ea959703412 Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Date: Fri, 4 Apr 2025 21:13:10 +0000
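Subject: [PATCH 2/6] Move stream options to dedicated header

Move ColorConversionLibrary, VideoStreamOptions and AudioStreamOptions out
of the SingleStreamDecoder class into the new StreamOptions.h header, at
facebook::torchcodec namespace scope. The definitions are unchanged, but
callers now spell these types without the SingleStreamDecoder:: qualifier.

Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>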
---
 src/torchcodec/_core/CudaDevice.cpp          |  2 +-
 src/torchcodec/_core/CudaDevice.h            |  2 +-
 src/torchcodec/_core/DeviceInterface.h       |  3 +-
 src/torchcodec/_core/SingleStreamDecoder.cpp |  9 ++--
 src/torchcodec/_core/SingleStreamDecoder.h   | 39 ++--------------
 src/torchcodec/_core/StreamOptions.h         | 49 ++++++++++++++++++++
 src/torchcodec/_core/custom_ops.cpp          |  8 ++--
 test/VideoDecoderTest.cpp                    | 15 +++---
 8 files changed, 71 insertions(+), 56 deletions(-)
 create mode 100644 src/torchcodec/_core/StreamOptions.h

diff --git a/src/torchcodec/_core/CudaDevice.cpp b/src/torchcodec/_core/CudaDevice.cpp
index 5bde4106f..f41c529d5 100644
--- a/src/torchcodec/_core/CudaDevice.cpp
+++ b/src/torchcodec/_core/CudaDevice.cpp
@@ -190,7 +190,7 @@ void CudaDevice::initializeContext(AVCodecContext* codecContext) {
 }
 
 void CudaDevice::convertAVFrameToFrameOutput(
-    const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+    const VideoStreamOptions& videoStreamOptions,
     UniqueAVFrame& avFrame,
     SingleStreamDecoder::FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
diff --git a/src/torchcodec/_core/CudaDevice.h b/src/torchcodec/_core/CudaDevice.h
index 0ed538593..20b45026b 100644
--- a/src/torchcodec/_core/CudaDevice.h
+++ b/src/torchcodec/_core/CudaDevice.h
@@ -21,7 +21,7 @@ class CudaDevice : public DeviceInterface {
   void initializeContext(AVCodecContext* codecContext) override;
 
   void convertAVFrameToFrameOutput(
-      const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+      const VideoStreamOptions& videoStreamOptions,
       UniqueAVFrame& avFrame,
       SingleStreamDecoder::FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor =
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
index a5b0e3652..c33a8e37f 100644
--- a/src/torchcodec/_core/DeviceInterface.h
+++ b/src/torchcodec/_core/DeviceInterface.h
@@ -13,6 +13,7 @@
 #include <string>
 #include "FFMPEGCommon.h"
 #include "src/torchcodec/_core/SingleStreamDecoder.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace facebook::torchcodec {
 
@@ -41,7 +42,7 @@ class DeviceInterface {
   virtual void initializeContext(AVCodecContext* codecContext) = 0;
 
   virtual void convertAVFrameToFrameOutput(
-      const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+      const VideoStreamOptions& videoStreamOptions,
       UniqueAVFrame& avFrame,
       SingleStreamDecoder::FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt) = 0;
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index e80851988..5edbb7fd9 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -501,9 +501,8 @@ void SingleStreamDecoder::addVideoStream(
   // swscale requires widths to be multiples of 32:
   // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
   // so we fall back to filtergraph if the width is not a multiple of 32.
-  auto defaultLibrary = (width % 32 == 0)
-      ? SingleStreamDecoder::ColorConversionLibrary::SWSCALE
-      : SingleStreamDecoder::ColorConversionLibrary::FILTERGRAPH;
+  auto defaultLibrary = (width % 32 == 0) ? ColorConversionLibrary::SWSCALE
+                                          : ColorConversionLibrary::FILTERGRAPH;
 
   streamInfo.colorConversionLibrary =
       videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
@@ -2047,7 +2046,7 @@ FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) {
 }
 
 FrameDims getHeightAndWidthFromOptionsOrMetadata(
-    const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+    const VideoStreamOptions& videoStreamOptions,
     const SingleStreamDecoder::StreamMetadata& streamMetadata) {
   return FrameDims(
       videoStreamOptions.height.value_or(*streamMetadata.height),
@@ -2055,7 +2054,7 @@ FrameDims getHeightAndWidthFromOptionsOrMetadata(
 }
 
 FrameDims getHeightAndWidthFromOptionsOrAVFrame(
-    const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+    const VideoStreamOptions& videoStreamOptions,
     const UniqueAVFrame& avFrame) {
   return FrameDims(
       videoStreamOptions.height.value_or(avFrame->height),
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
index b1dc4fa23..6b54e72e7 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.h
+++ b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -14,6 +14,7 @@
 
 #include "src/torchcodec/_core/AVIOContextHolder.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace facebook::torchcodec {
 class DeviceInterface;
@@ -112,40 +113,6 @@ class SingleStreamDecoder {
   // ADDING STREAMS API
   // --------------------------------------------------------------------------
 
-  enum ColorConversionLibrary {
-    // TODO: Add an AUTO option later.
-    // Use the libavfilter library for color conversion.
-    FILTERGRAPH,
-    // Use the libswscale library for color conversion.
-    SWSCALE
-  };
-
-  struct VideoStreamOptions {
-    VideoStreamOptions() {}
-
-    // Number of threads we pass to FFMPEG for decoding.
-    // 0 means FFMPEG will choose the number of threads automatically to fully
-    // utilize all cores. If not set, it will be the default FFMPEG behavior for
-    // the given codec.
-    std::optional<int> ffmpegThreadCount;
-    // Currently the dimension order can be either NHWC or NCHW.
-    // H=height, W=width, C=channel.
-    std::string dimensionOrder = "NCHW";
-    // The output height and width of the frame. If not specified, the output
-    // is the same as the original video.
-    std::optional<int> width;
-    std::optional<int> height;
-    std::optional<ColorConversionLibrary> colorConversionLibrary;
-    // By default we use CPU for decoding for both C++ and python users.
-    torch::Device device = torch::kCPU;
-  };
-
-  struct AudioStreamOptions {
-    AudioStreamOptions() {}
-
-    std::optional<int> sampleRate;
-  };
-
   void addVideoStream(
       int streamIndex,
       const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
@@ -568,11 +535,11 @@ struct FrameDims {
 FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
 
 FrameDims getHeightAndWidthFromOptionsOrMetadata(
-    const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+    const VideoStreamOptions& videoStreamOptions,
     const SingleStreamDecoder::StreamMetadata& streamMetadata);
 
 FrameDims getHeightAndWidthFromOptionsOrAVFrame(
-    const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions,
+    const VideoStreamOptions& videoStreamOptions,
     const UniqueAVFrame& avFrame);
 
 torch::Tensor allocateEmptyHWCTensor(
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
new file mode 100644
index 000000000..38e51209c
--- /dev/null
+++ b/src/torchcodec/_core/StreamOptions.h
@@ -0,0 +1,49 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torch/types.h>
+#include <optional>
+#include <string>
+
+namespace facebook::torchcodec {
+
+enum ColorConversionLibrary {
+  // TODO: Add an AUTO option later.
+  // Use the libavfilter library for color conversion.
+  FILTERGRAPH,
+  // Use the libswscale library for color conversion.
+  SWSCALE
+};
+
+struct VideoStreamOptions {
+  VideoStreamOptions() {}
+
+  // Number of threads we pass to FFMPEG for decoding.
+  // 0 means FFMPEG will choose the number of threads automatically to fully
+  // utilize all cores. If not set, it will be the default FFMPEG behavior for
+  // the given codec.
+  std::optional<int> ffmpegThreadCount;
+  // Currently the dimension order can be either NHWC or NCHW.
+  // H=height, W=width, C=channel.
+  std::string dimensionOrder = "NCHW";
+  // The output height and width of the frame. If not specified, the output
+  // is the same as the original video.
+  std::optional<int> width;
+  std::optional<int> height;
+  std::optional<ColorConversionLibrary> colorConversionLibrary;
+  // By default we use CPU for decoding for both C++ and python users.
+  torch::Device device = torch::kCPU;
+};
+
+struct AudioStreamOptions {
+  AudioStreamOptions() {}
+
+  std::optional<int> sampleRate;
+};
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 05a6390d6..7ec2fb9a4 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -218,7 +218,7 @@ void _add_video_stream(
     std::optional<int64_t> stream_index = std::nullopt,
     std::optional<std::string_view> device = std::nullopt,
     std::optional<std::string_view> color_conversion_library = std::nullopt) {
-  SingleStreamDecoder::VideoStreamOptions videoStreamOptions;
+  VideoStreamOptions videoStreamOptions;
   videoStreamOptions.width = width;
   videoStreamOptions.height = height;
   videoStreamOptions.ffmpegThreadCount = num_threads;
@@ -232,10 +232,10 @@ void _add_video_stream(
     std::string stdColorConversionLibrary{color_conversion_library.value()};
     if (stdColorConversionLibrary == "filtergraph") {
       videoStreamOptions.colorConversionLibrary =
-          SingleStreamDecoder::ColorConversionLibrary::FILTERGRAPH;
+          ColorConversionLibrary::FILTERGRAPH;
     } else if (stdColorConversionLibrary == "swscale") {
       videoStreamOptions.colorConversionLibrary =
-          SingleStreamDecoder::ColorConversionLibrary::SWSCALE;
+          ColorConversionLibrary::SWSCALE;
     } else {
       throw std::runtime_error(
           "Invalid color_conversion_library=" + stdColorConversionLibrary +
@@ -273,7 +273,7 @@ void add_audio_stream(
     at::Tensor& decoder,
     std::optional<int64_t> stream_index = std::nullopt,
     std::optional<int64_t> sample_rate = std::nullopt) {
-  SingleStreamDecoder::AudioStreamOptions audioStreamOptions;
+  AudioStreamOptions audioStreamOptions;
   audioStreamOptions.sampleRate = sample_rate;
 
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp
index 1937ff97c..ef0a6468d 100644
--- a/test/VideoDecoderTest.cpp
+++ b/test/VideoDecoderTest.cpp
@@ -150,7 +150,7 @@ TEST(SingleStreamDecoderTest, RespectsWidthAndHeightFromOptions) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<SingleStreamDecoder> decoder =
       std::make_unique<SingleStreamDecoder>(path);
-  SingleStreamDecoder::VideoStreamOptions videoStreamOptions;
+  VideoStreamOptions videoStreamOptions;
   videoStreamOptions.width = 100;
   videoStreamOptions.height = 120;
   decoder->addVideoStream(-1, videoStreamOptions);
@@ -162,7 +162,7 @@ TEST(SingleStreamDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<SingleStreamDecoder> decoder =
       std::make_unique<SingleStreamDecoder>(path);
-  SingleStreamDecoder::VideoStreamOptions videoStreamOptions;
+  VideoStreamOptions videoStreamOptions;
   videoStreamOptions.dimensionOrder = "NHWC";
   decoder->addVideoStream(-1, videoStreamOptions);
   torch::Tensor tensor = decoder->getNextFrame().data;
@@ -234,7 +234,7 @@ TEST_P(SingleStreamDecoderTest, DecodesFramesInABatchInNHWC) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  SingleStreamDecoder::VideoStreamOptions videoStreamOptions;
+  VideoStreamOptions videoStreamOptions;
   videoStreamOptions.dimensionOrder = "NHWC";
   ourDecoder->addVideoStream(bestVideoStreamIndex, videoStreamOptions);
   // Frame with index 180 corresponds to timestamp 6.006.
@@ -399,9 +399,9 @@ TEST_P(SingleStreamDecoderTest, PreAllocatedTensorFilterGraph) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  SingleStreamDecoder::VideoStreamOptions videoStreamOptions;
+  VideoStreamOptions videoStreamOptions;
   videoStreamOptions.colorConversionLibrary =
-      SingleStreamDecoder::ColorConversionLibrary::FILTERGRAPH;
+      ColorConversionLibrary::FILTERGRAPH;
   ourDecoder->addVideoStream(bestVideoStreamIndex, videoStreamOptions);
   auto output =
       ourDecoder->getFrameAtIndexInternal(0, preAllocatedOutputTensor);
@@ -417,9 +417,8 @@ TEST_P(SingleStreamDecoderTest, PreAllocatedTensorSwscale) {
   ourDecoder->scanFileAndUpdateMetadataAndIndex();
   int bestVideoStreamIndex =
       *ourDecoder->getContainerMetadata().bestVideoStreamIndex;
-  SingleStreamDecoder::VideoStreamOptions videoStreamOptions;
-  videoStreamOptions.colorConversionLibrary =
-      SingleStreamDecoder::ColorConversionLibrary::SWSCALE;
+  VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.colorConversionLibrary = ColorConversionLibrary::SWSCALE;
   ourDecoder->addVideoStream(bestVideoStreamIndex, videoStreamOptions);
   auto output =
       ourDecoder->getFrameAtIndexInternal(0, preAllocatedOutputTensor);

From b42b2a6c9f086be552e7f7b1866c01ca9c7188d1 Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Date: Fri, 4 Apr 2025 21:34:03 +0000
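Subject: [PATCH 3/6] Move frame output structs to dedicated header

Move FrameOutput, FrameBatchOutput and AudioFramesOutput out of the
SingleStreamDecoder class into the new Frame.h header. StreamMetadata and
ContainerMetadata leave the class as well, and StreamOptions.h is replaced
by Stream.h. All of these types are now referred to without the
SingleStreamDecoder:: qualifier, and DeviceInterface.h no longer includes
SingleStreamDecoder.h.

Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>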
---
 src/torchcodec/_core/CudaDevice.cpp          |  2 +-
 src/torchcodec/_core/CudaDevice.h            |  2 +-
 src/torchcodec/_core/DeviceInterface.h       |  6 +-
 src/torchcodec/_core/Frame.h                 | 46 +++++++++
 src/torchcodec/_core/SingleStreamDecoder.cpp | 34 +++----
 src/torchcodec/_core/SingleStreamDecoder.h   | 87 +----------------
 src/torchcodec/_core/Stream.h                | 99 ++++++++++++++++++++
 src/torchcodec/_core/StreamOptions.h         | 49 ----------
 src/torchcodec/_core/custom_ops.cpp          | 15 ++-
 test/VideoDecoderTest.cpp                    |  6 +-
 10 files changed, 175 insertions(+), 171 deletions(-)
 create mode 100644 src/torchcodec/_core/Frame.h
 create mode 100644 src/torchcodec/_core/Stream.h
 delete mode 100644 src/torchcodec/_core/StreamOptions.h

diff --git a/src/torchcodec/_core/CudaDevice.cpp b/src/torchcodec/_core/CudaDevice.cpp
index f41c529d5..4f6c74073 100644
--- a/src/torchcodec/_core/CudaDevice.cpp
+++ b/src/torchcodec/_core/CudaDevice.cpp
@@ -192,7 +192,7 @@ void CudaDevice::initializeContext(AVCodecContext* codecContext) {
 void CudaDevice::convertAVFrameToFrameOutput(
     const VideoStreamOptions& videoStreamOptions,
     UniqueAVFrame& avFrame,
-    SingleStreamDecoder::FrameOutput& frameOutput,
+    FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   TORCH_CHECK(
       avFrame->format == AV_PIX_FMT_CUDA,
diff --git a/src/torchcodec/_core/CudaDevice.h b/src/torchcodec/_core/CudaDevice.h
index 20b45026b..3aee6e2b1 100644
--- a/src/torchcodec/_core/CudaDevice.h
+++ b/src/torchcodec/_core/CudaDevice.h
@@ -23,7 +23,7 @@ class CudaDevice : public DeviceInterface {
   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
       UniqueAVFrame& avFrame,
-      SingleStreamDecoder::FrameOutput& frameOutput,
+      FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor =
           std::nullopt) override;
 
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
index c33a8e37f..d91870ed8 100644
--- a/src/torchcodec/_core/DeviceInterface.h
+++ b/src/torchcodec/_core/DeviceInterface.h
@@ -12,8 +12,8 @@
 #include <stdexcept>
 #include <string>
 #include "FFMPEGCommon.h"
-#include "src/torchcodec/_core/SingleStreamDecoder.h"
-#include "src/torchcodec/_core/StreamOptions.h"
+#include "src/torchcodec/_core/Frame.h"
+#include "src/torchcodec/_core/Stream.h"
 
 namespace facebook::torchcodec {
 
@@ -44,7 +44,7 @@ class DeviceInterface {
   virtual void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
       UniqueAVFrame& avFrame,
-      SingleStreamDecoder::FrameOutput& frameOutput,
+      FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt) = 0;
 
  protected:
diff --git a/src/torchcodec/_core/Frame.h b/src/torchcodec/_core/Frame.h
new file mode 100644
index 000000000..d2214c3ce
--- /dev/null
+++ b/src/torchcodec/_core/Frame.h
@@ -0,0 +1,46 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torch/types.h>
+#include "src/torchcodec/_core/Stream.h"
+
+namespace facebook::torchcodec {
+
+// All public video decoding entry points return either a FrameOutput or a
+// FrameBatchOutput.
+// They are the equivalent of the user-facing Frame and FrameBatch classes in
+// Python. They contain RGB decoded frames along with some associated data
+// like PTS and duration.
+// FrameOutput is also relevant for audio decoding, typically as the output of
+// getNextFrame(), or as a temporary output variable.
+struct FrameOutput {
+  // data shape is:
+  // - 3D (C, H, W) or (H, W, C) for videos
+  // - 2D (numChannels, numSamples) for audio
+  torch::Tensor data;
+  double ptsSeconds;
+  double durationSeconds;
+};
+
+struct FrameBatchOutput {
+  torch::Tensor data; // 4D: of shape NCHW or NHWC.
+  torch::Tensor ptsSeconds; // 1D of shape (N,)
+  torch::Tensor durationSeconds; // 1D of shape (N,)
+
+  explicit FrameBatchOutput(
+      int64_t numFrames,
+      const VideoStreamOptions& videoStreamOptions,
+      const StreamMetadata& streamMetadata);
+};
+
+struct AudioFramesOutput {
+  torch::Tensor data; // shape is (numChannels, numSamples)
+  double ptsSeconds;
+};
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index 5edbb7fd9..ea7d341d2 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -350,8 +350,7 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
   scannedAllStreams_ = true;
 }
 
-SingleStreamDecoder::ContainerMetadata
-SingleStreamDecoder::getContainerMetadata() const {
+ContainerMetadata SingleStreamDecoder::getContainerMetadata() const {
   return containerMetadata_;
 }
 
@@ -538,7 +537,7 @@ void SingleStreamDecoder::addAudioStream(
 // HIGH-LEVEL DECODING ENTRY-POINTS
 // --------------------------------------------------------------------------
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrame() {
+FrameOutput SingleStreamDecoder::getNextFrame() {
   auto output = getNextFrameInternal();
   if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) {
     output.data = maybePermuteHWC2CHW(output.data);
@@ -546,7 +545,7 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrame() {
   return output;
 }
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrameInternal(
+FrameOutput SingleStreamDecoder::getNextFrameInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   validateActiveStream();
   UniqueAVFrame avFrame = decodeAVFrame(
@@ -554,14 +553,13 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrameInternal(
   return convertAVFrameToFrameOutput(avFrame, preAllocatedOutputTensor);
 }
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFrameAtIndex(
-    int64_t frameIndex) {
+FrameOutput SingleStreamDecoder::getFrameAtIndex(int64_t frameIndex) {
   auto frameOutput = getFrameAtIndexInternal(frameIndex);
   frameOutput.data = maybePermuteHWC2CHW(frameOutput.data);
   return frameOutput;
 }
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFrameAtIndexInternal(
+FrameOutput SingleStreamDecoder::getFrameAtIndexInternal(
     int64_t frameIndex,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
@@ -576,7 +574,7 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFrameAtIndexInternal(
   return getNextFrameInternal(preAllocatedOutputTensor);
 }
 
-SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesAtIndices(
+FrameBatchOutput SingleStreamDecoder::getFramesAtIndices(
     const std::vector<int64_t>& frameIndices) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
 
@@ -635,7 +633,7 @@ SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesAtIndices(
   return frameBatchOutput;
 }
 
-SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesInRange(
+FrameBatchOutput SingleStreamDecoder::getFramesInRange(
     int64_t start,
     int64_t stop,
     int64_t step) {
@@ -669,8 +667,7 @@ SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesInRange(
   return frameBatchOutput;
 }
 
-SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFramePlayedAt(
-    double seconds) {
+FrameOutput SingleStreamDecoder::getFramePlayedAt(double seconds) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
   double frameStartTime =
@@ -710,7 +707,7 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFramePlayedAt(
   return frameOutput;
 }
 
-SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
+FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
     const std::vector<double>& timestamps) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
 
@@ -740,8 +737,7 @@ SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
   return getFramesAtIndices(frameIndices);
 }
 
-SingleStreamDecoder::FrameBatchOutput
-SingleStreamDecoder::getFramesPlayedInRange(
+FrameBatchOutput SingleStreamDecoder::getFramesPlayedInRange(
     double startSeconds,
     double stopSeconds) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
@@ -874,8 +870,7 @@ SingleStreamDecoder::getFramesPlayedInRange(
 // [2] If you're brave and curious, you can read the long "Seek offset for
 // audio" note in https://github.com/pytorch/torchcodec/pull/507/files, which
 // sums up past (and failed) attemps at working around this issue.
-SingleStreamDecoder::AudioFramesOutput
-SingleStreamDecoder::getFramesPlayedInRangeAudio(
+AudioFramesOutput SingleStreamDecoder::getFramesPlayedInRangeAudio(
     double startSeconds,
     std::optional<double> stopSecondsOptional) {
   validateActiveStream(AVMEDIA_TYPE_AUDIO);
@@ -1195,8 +1190,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
 // AVFRAME <-> FRAME OUTPUT CONVERSION
 // --------------------------------------------------------------------------
 
-SingleStreamDecoder::FrameOutput
-SingleStreamDecoder::convertAVFrameToFrameOutput(
+FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput(
     UniqueAVFrame& avFrame,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   // Convert the frame to tensor.
@@ -1546,7 +1540,7 @@ std::optional<torch::Tensor> SingleStreamDecoder::maybeFlushSwrBuffers() {
 // OUTPUT ALLOCATION AND SHAPE CONVERSION
 // --------------------------------------------------------------------------
 
-SingleStreamDecoder::FrameBatchOutput::FrameBatchOutput(
+FrameBatchOutput::FrameBatchOutput(
     int64_t numFrames,
     const VideoStreamOptions& videoStreamOptions,
     const StreamMetadata& streamMetadata)
@@ -2047,7 +2041,7 @@ FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) {
 
 FrameDims getHeightAndWidthFromOptionsOrMetadata(
     const VideoStreamOptions& videoStreamOptions,
-    const SingleStreamDecoder::StreamMetadata& streamMetadata) {
+    const StreamMetadata& streamMetadata) {
   return FrameDims(
       videoStreamOptions.height.value_or(*streamMetadata.height),
       videoStreamOptions.width.value_or(*streamMetadata.width));
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
index 6b54e72e7..3e9506ce3 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.h
+++ b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -14,7 +14,8 @@
 
 #include "src/torchcodec/_core/AVIOContextHolder.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
-#include "src/torchcodec/_core/StreamOptions.h"
+#include "src/torchcodec/_core/Frame.h"
+#include "src/torchcodec/_core/Stream.h"
 
 namespace facebook::torchcodec {
 class DeviceInterface;
@@ -52,56 +53,6 @@ class SingleStreamDecoder {
   // the allFrames and keyFrames vectors.
   void scanFileAndUpdateMetadataAndIndex();
 
-  struct StreamMetadata {
-    // Common (video and audio) fields derived from the AVStream.
-    int streamIndex;
-    // See this link for what various values are available:
-    // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
-    AVMediaType mediaType;
-    std::optional<AVCodecID> codecId;
-    std::optional<std::string> codecName;
-    std::optional<double> durationSeconds;
-    std::optional<double> beginStreamFromHeader;
-    std::optional<int64_t> numFrames;
-    std::optional<int64_t> numKeyFrames;
-    std::optional<double> averageFps;
-    std::optional<double> bitRate;
-
-    // More accurate duration, obtained by scanning the file.
-    // These presentation timestamps are in time base.
-    std::optional<int64_t> minPtsFromScan;
-    std::optional<int64_t> maxPtsFromScan;
-    // These presentation timestamps are in seconds.
-    std::optional<double> minPtsSecondsFromScan;
-    std::optional<double> maxPtsSecondsFromScan;
-    // This can be useful for index-based seeking.
-    std::optional<int64_t> numFramesFromScan;
-
-    // Video-only fields derived from the AVCodecContext.
-    std::optional<int64_t> width;
-    std::optional<int64_t> height;
-
-    // Audio-only fields
-    std::optional<int64_t> sampleRate;
-    std::optional<int64_t> numChannels;
-    std::optional<std::string> sampleFormat;
-  };
-
-  struct ContainerMetadata {
-    std::vector<StreamMetadata> allStreamMetadata;
-    int numAudioStreams = 0;
-    int numVideoStreams = 0;
-    // Note that this is the container-level duration, which is usually the max
-    // of all stream durations available in the container.
-    std::optional<double> durationSeconds;
-    // Total BitRate level information at the container level in bit/s
-    std::optional<double> bitRate;
-    // If set, this is the index to the default audio stream.
-    std::optional<int> bestAudioStreamIndex;
-    // If set, this is the index to the default video stream.
-    std::optional<int> bestVideoStreamIndex;
-  };
-
   // Returns the metadata for the container.
   ContainerMetadata getContainerMetadata() const;
 
@@ -124,38 +75,6 @@ class SingleStreamDecoder {
   // DECODING AND SEEKING APIs
   // --------------------------------------------------------------------------
 
-  // All public video decoding entry points return either a FrameOutput or a
-  // FrameBatchOutput.
-  // They are the equivalent of the user-facing Frame and FrameBatch classes in
-  // Python. They contain RGB decoded frames along with some associated data
-  // like PTS and duration.
-  // FrameOutput is also relevant for audio decoding, typically as the output of
-  // getNextFrame(), or as a temporary output variable.
-  struct FrameOutput {
-    // data shape is:
-    // - 3D (C, H, W) or (H, W, C) for videos
-    // - 2D (numChannels, numSamples) for audio
-    torch::Tensor data;
-    double ptsSeconds;
-    double durationSeconds;
-  };
-
-  struct FrameBatchOutput {
-    torch::Tensor data; // 4D: of shape NCHW or NHWC.
-    torch::Tensor ptsSeconds; // 1D of shape (N,)
-    torch::Tensor durationSeconds; // 1D of shape (N,)
-
-    explicit FrameBatchOutput(
-        int64_t numFrames,
-        const VideoStreamOptions& videoStreamOptions,
-        const StreamMetadata& streamMetadata);
-  };
-
-  struct AudioFramesOutput {
-    torch::Tensor data; // shape is (numChannels, numSamples)
-    double ptsSeconds;
-  };
-
   // Places the cursor at the first frame on or after the position in seconds.
   // Calling getNextFrame() will return the first frame at
   // or after this position.
@@ -536,7 +455,7 @@ FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
 
 FrameDims getHeightAndWidthFromOptionsOrMetadata(
     const VideoStreamOptions& videoStreamOptions,
-    const SingleStreamDecoder::StreamMetadata& streamMetadata);
+    const StreamMetadata& streamMetadata);
 
 FrameDims getHeightAndWidthFromOptionsOrAVFrame(
     const VideoStreamOptions& videoStreamOptions,
diff --git a/src/torchcodec/_core/Stream.h b/src/torchcodec/_core/Stream.h
new file mode 100644
index 000000000..a9ff1fe2c
--- /dev/null
+++ b/src/torchcodec/_core/Stream.h
@@ -0,0 +1,99 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torch/types.h>
+#include <optional>
+#include <string>
+
+namespace facebook::torchcodec {
+
+enum ColorConversionLibrary {
+  // TODO: Add an AUTO option later.
+  // Use the libavfilter library for color conversion.
+  FILTERGRAPH,
+  // Use the libswscale library for color conversion.
+  SWSCALE
+};
+
+struct VideoStreamOptions {
+  VideoStreamOptions() {}
+
+  // Number of threads we pass to FFMPEG for decoding.
+  // 0 means FFMPEG will choose the number of threads automatically to fully
+  // utilize all cores. If not set, it will be the default FFMPEG behavior for
+  // the given codec.
+  std::optional<int> ffmpegThreadCount;
+  // Currently the dimension order can be either NHWC or NCHW.
+  // H=height, W=width, C=channel.
+  std::string dimensionOrder = "NCHW";
+  // The output height and width of the frame. If not specified, the output
+  // is the same as the original video.
+  std::optional<int> width;
+  std::optional<int> height;
+  std::optional<ColorConversionLibrary> colorConversionLibrary;
+  // By default we use CPU for decoding for both C++ and Python users.
+  torch::Device device = torch::kCPU;
+};
+
+struct AudioStreamOptions {
+  AudioStreamOptions() {}
+
+  std::optional<int> sampleRate;
+};
+
+struct StreamMetadata {
+  // Common (video and audio) fields derived from the AVStream.
+  int streamIndex;
+  // See this link for what various values are available:
+  // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
+  AVMediaType mediaType;
+  std::optional<AVCodecID> codecId;
+  std::optional<std::string> codecName;
+  std::optional<double> durationSeconds;
+  std::optional<double> beginStreamFromHeader;
+  std::optional<int64_t> numFrames;
+  std::optional<int64_t> numKeyFrames;
+  std::optional<double> averageFps;
+  std::optional<double> bitRate;
+
+  // More accurate duration, obtained by scanning the file.
+  // These presentation timestamps are in time base.
+  std::optional<int64_t> minPtsFromScan;
+  std::optional<int64_t> maxPtsFromScan;
+  // These presentation timestamps are in seconds.
+  std::optional<double> minPtsSecondsFromScan;
+  std::optional<double> maxPtsSecondsFromScan;
+  // This can be useful for index-based seeking.
+  std::optional<int64_t> numFramesFromScan;
+
+  // Video-only fields derived from the AVCodecContext.
+  std::optional<int64_t> width;
+  std::optional<int64_t> height;
+
+  // Audio-only fields
+  std::optional<int64_t> sampleRate;
+  std::optional<int64_t> numChannels;
+  std::optional<std::string> sampleFormat;
+};
+
+struct ContainerMetadata {
+  std::vector<StreamMetadata> allStreamMetadata;
+  int numAudioStreams = 0;
+  int numVideoStreams = 0;
+  // Note that this is the container-level duration, which is usually the max
+  // of all stream durations available in the container.
+  std::optional<double> durationSeconds;
+  // Total BitRate level information at the container level in bit/s
+  std::optional<double> bitRate;
+  // If set, this is the index to the default audio stream.
+  std::optional<int> bestAudioStreamIndex;
+  // If set, this is the index to the default video stream.
+  std::optional<int> bestVideoStreamIndex;
+};
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
deleted file mode 100644
index 38e51209c..000000000
--- a/src/torchcodec/_core/StreamOptions.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) Meta Platforms, Inc. and affiliates.
-// All rights reserved.
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#pragma once
-
-#include <torch/types.h>
-#include <optional>
-#include <string>
-
-namespace facebook::torchcodec {
-
-enum ColorConversionLibrary {
-  // TODO: Add an AUTO option later.
-  // Use the libavfilter library for color conversion.
-  FILTERGRAPH,
-  // Use the libswscale library for color conversion.
-  SWSCALE
-};
-
-struct VideoStreamOptions {
-  VideoStreamOptions() {}
-
-  // Number of threads we pass to FFMPEG for decoding.
-  // 0 means FFMPEG will choose the number of threads automatically to fully
-  // utilize all cores. If not set, it will be the default FFMPEG behavior for
-  // the given codec.
-  std::optional<int> ffmpegThreadCount;
-  // Currently the dimension order can be either NHWC or NCHW.
-  // H=height, W=width, C=channel.
-  std::string dimensionOrder = "NCHW";
-  // The output height and width of the frame. If not specified, the output
-  // is the same as the original video.
-  std::optional<int> width;
-  std::optional<int> height;
-  std::optional<ColorConversionLibrary> colorConversionLibrary;
-  // By default we use CPU for decoding for both C++ and python users.
-  torch::Device device = torch::kCPU;
-};
-
-struct AudioStreamOptions {
-  AudioStreamOptions() {}
-
-  std::optional<int> sampleRate;
-};
-
-} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 7ec2fb9a4..3f107a5a7 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -98,7 +98,7 @@ SingleStreamDecoder* unwrapTensorToGetDecoder(at::Tensor& tensor) {
 // under torch.compile().
 using OpsFrameOutput = std::tuple<at::Tensor, at::Tensor, at::Tensor>;
 
-OpsFrameOutput makeOpsFrameOutput(SingleStreamDecoder::FrameOutput& frame) {
+OpsFrameOutput makeOpsFrameOutput(FrameOutput& frame) {
   return std::make_tuple(
       frame.data,
       torch::tensor(frame.ptsSeconds, torch::dtype(torch::kFloat64)),
@@ -116,8 +116,7 @@ OpsFrameOutput makeOpsFrameOutput(SingleStreamDecoder::FrameOutput& frame) {
 //   single float.
 using OpsFrameBatchOutput = std::tuple<at::Tensor, at::Tensor, at::Tensor>;
 
-OpsFrameBatchOutput makeOpsFrameBatchOutput(
-    SingleStreamDecoder::FrameBatchOutput& batch) {
+OpsFrameBatchOutput makeOpsFrameBatchOutput(FrameBatchOutput& batch) {
   return std::make_tuple(batch.data, batch.ptsSeconds, batch.durationSeconds);
 }
 
@@ -127,8 +126,7 @@ OpsFrameBatchOutput makeOpsFrameBatchOutput(
 //   2. A single float value for the pts of the first frame, in seconds.
 using OpsAudioFramesOutput = std::tuple<at::Tensor, at::Tensor>;
 
-OpsAudioFramesOutput makeOpsAudioFramesOutput(
-    SingleStreamDecoder::AudioFramesOutput& audioFrames) {
+OpsAudioFramesOutput makeOpsAudioFramesOutput(AudioFramesOutput& audioFrames) {
   return std::make_tuple(
       audioFrames.data,
       torch::tensor(audioFrames.ptsSeconds, torch::dtype(torch::kFloat64)));
@@ -291,7 +289,7 @@ void seek_to_pts(at::Tensor& decoder, double seconds) {
 // duration as tensors.
 OpsFrameOutput get_next_frame(at::Tensor& decoder) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-  SingleStreamDecoder::FrameOutput result;
+  FrameOutput result;
   try {
     result = videoDecoder->getNextFrame();
   } catch (const SingleStreamDecoder::EndOfFileException& e) {
@@ -305,7 +303,7 @@ OpsFrameOutput get_next_frame(at::Tensor& decoder) {
 // given timestamp T has T >= PTS and T < PTS + Duration.
 OpsFrameOutput get_frame_at_pts(at::Tensor& decoder, double seconds) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-  SingleStreamDecoder::FrameOutput result;
+  FrameOutput result;
   try {
     result = videoDecoder->getFramePlayedAt(seconds);
   } catch (const SingleStreamDecoder::EndOfFileException& e) {
@@ -443,8 +441,7 @@ torch::Tensor _get_key_frame_indices(at::Tensor& decoder) {
 std::string get_json_metadata(at::Tensor& decoder) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
 
-  SingleStreamDecoder::ContainerMetadata videoMetadata =
-      videoDecoder->getContainerMetadata();
+  ContainerMetadata videoMetadata = videoDecoder->getContainerMetadata();
   auto maybeBestVideoStreamIndex = videoMetadata.bestVideoStreamIndex;
 
   std::map<std::string, std::string> metadataMap;
diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp
index ef0a6468d..6cbeef293 100644
--- a/test/VideoDecoderTest.cpp
+++ b/test/VideoDecoderTest.cpp
@@ -69,8 +69,7 @@ TEST_P(SingleStreamDecoderTest, ReturnsFpsAndDurationForVideoInMetadata) {
   std::string path = getResourcePath("nasa_13013.mp4");
   std::unique_ptr<SingleStreamDecoder> decoder =
       createDecoderFromPath(path, GetParam());
-  SingleStreamDecoder::ContainerMetadata metadata =
-      decoder->getContainerMetadata();
+  ContainerMetadata metadata = decoder->getContainerMetadata();
   EXPECT_EQ(metadata.numAudioStreams, 2);
   EXPECT_EQ(metadata.numVideoStreams, 2);
 #if LIBAVFORMAT_VERSION_MAJOR >= 60
@@ -429,8 +428,7 @@ TEST_P(SingleStreamDecoderTest, GetAudioMetadata) {
   std::string path = getResourcePath("nasa_13013.mp4.audio.mp3");
   std::unique_ptr<SingleStreamDecoder> decoder =
       createDecoderFromPath(path, GetParam());
-  SingleStreamDecoder::ContainerMetadata metadata =
-      decoder->getContainerMetadata();
+  ContainerMetadata metadata = decoder->getContainerMetadata();
   EXPECT_EQ(metadata.numAudioStreams, 1);
   EXPECT_EQ(metadata.numVideoStreams, 0);
   EXPECT_EQ(metadata.allStreamMetadata.size(), 1);

From 19ad198a77b23905531357e7293140aef7b4bdec Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Date: Fri, 4 Apr 2025 22:07:16 +0000
Subject: [PATCH 4/6] Cleanup DeviceInterface inheritance

Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
---
 src/torchcodec/_core/SingleStreamDecoder.cpp | 1 -
 src/torchcodec/_core/SingleStreamDecoder.h   | 2 +-
 src/torchcodec/_core/custom_ops.cpp          | 1 -
 test/VideoDecoderTest.cpp                    | 1 -
 4 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index ea7d341d2..c389242cd 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -13,7 +13,6 @@
 #include <sstream>
 #include <stdexcept>
 #include <string_view>
-#include "src/torchcodec/_core/DeviceInterface.h"
 #include "torch/types.h"
 
 extern "C" {
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
index 3e9506ce3..453b974f5 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.h
+++ b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -13,12 +13,12 @@
 #include <string_view>
 
 #include "src/torchcodec/_core/AVIOContextHolder.h"
+#include "src/torchcodec/_core/DeviceInterface.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/Frame.h"
 #include "src/torchcodec/_core/Stream.h"
 
 namespace facebook::torchcodec {
-class DeviceInterface;
 
 // The SingleStreamDecoder class can be used to decode video frames to Tensors.
 // Note that SingleStreamDecoder is not thread-safe.
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 3f107a5a7..9a9b87767 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -11,7 +11,6 @@
 #include "c10/core/SymIntArrayRef.h"
 #include "c10/util/Exception.h"
 #include "src/torchcodec/_core/AVIOBytesContext.h"
-#include "src/torchcodec/_core/DeviceInterface.h"
 #include "src/torchcodec/_core/Encoder.h"
 #include "src/torchcodec/_core/SingleStreamDecoder.h"
 
diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp
index 6cbeef293..a30609c2a 100644
--- a/test/VideoDecoderTest.cpp
+++ b/test/VideoDecoderTest.cpp
@@ -5,7 +5,6 @@
 // LICENSE file in the root directory of this source tree.
 
 #include "src/torchcodec/_core/AVIOBytesContext.h"
-#include "src/torchcodec/_core/DeviceInterface.h"
 #include "src/torchcodec/_core/SingleStreamDecoder.h"
 
 #include <c10/util/Flags.h>

From bb63f09fea02085e173d953a8131d7b3c405ae95 Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Date: Mon, 7 Apr 2025 16:10:17 +0000
Subject: [PATCH 5/6] Move metadata structs to dedicated header

Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
---
 src/torchcodec/_core/Frame.h    |  1 +
 src/torchcodec/_core/Metadata.h | 70 +++++++++++++++++++++++++++++++++
 src/torchcodec/_core/Stream.h   | 50 -----------------------
 3 files changed, 71 insertions(+), 50 deletions(-)
 create mode 100644 src/torchcodec/_core/Metadata.h

diff --git a/src/torchcodec/_core/Frame.h b/src/torchcodec/_core/Frame.h
index d2214c3ce..728d8d8ac 100644
--- a/src/torchcodec/_core/Frame.h
+++ b/src/torchcodec/_core/Frame.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <torch/types.h>
+#include "src/torchcodec/_core/Metadata.h"
 #include "src/torchcodec/_core/Stream.h"
 
 namespace facebook::torchcodec {
diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h
new file mode 100644
index 000000000..a8f300f49
--- /dev/null
+++ b/src/torchcodec/_core/Metadata.h
@@ -0,0 +1,70 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <optional>
+#include <string>
+#include <vector>
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavutil/avutil.h>
+}
+
+namespace facebook::torchcodec {
+
+struct StreamMetadata {
+  // Common (video and audio) fields derived from the AVStream.
+  int streamIndex;
+  // See this link for the available media type values:
+  // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
+  AVMediaType mediaType;
+  std::optional<AVCodecID> codecId;
+  std::optional<std::string> codecName;
+  std::optional<double> durationSeconds;
+  std::optional<double> beginStreamFromHeader;
+  std::optional<int64_t> numFrames;
+  std::optional<int64_t> numKeyFrames;
+  std::optional<double> averageFps;
+  std::optional<double> bitRate;
+
+  // More accurate duration, obtained by scanning the file.
+  // These presentation timestamps are in time base.
+  std::optional<int64_t> minPtsFromScan;
+  std::optional<int64_t> maxPtsFromScan;
+  // These presentation timestamps are in seconds.
+  std::optional<double> minPtsSecondsFromScan;
+  std::optional<double> maxPtsSecondsFromScan;
+  // This can be useful for index-based seeking.
+  std::optional<int64_t> numFramesFromScan;
+
+  // Video-only fields derived from the AVCodecContext.
+  std::optional<int64_t> width;
+  std::optional<int64_t> height;
+
+  // Audio-only fields
+  std::optional<int64_t> sampleRate;
+  std::optional<int64_t> numChannels;
+  std::optional<std::string> sampleFormat;
+};
+
+struct ContainerMetadata {
+  std::vector<StreamMetadata> allStreamMetadata;
+  int numAudioStreams = 0;
+  int numVideoStreams = 0;
+  // Note that this is the container-level duration, which is usually the max
+  // of all stream durations available in the container.
+  std::optional<double> durationSeconds;
+  // Total bit rate at the container level, in bit/s.
+  std::optional<double> bitRate;
+  // If set, this is the index to the default audio stream.
+  std::optional<int> bestAudioStreamIndex;
+  // If set, this is the index to the default video stream.
+  std::optional<int> bestVideoStreamIndex;
+};
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/Stream.h b/src/torchcodec/_core/Stream.h
index a9ff1fe2c..38e51209c 100644
--- a/src/torchcodec/_core/Stream.h
+++ b/src/torchcodec/_core/Stream.h
@@ -46,54 +46,4 @@ struct AudioStreamOptions {
   std::optional<int> sampleRate;
 };
 
-struct StreamMetadata {
-  // Common (video and audio) fields derived from the AVStream.
-  int streamIndex;
-  // See this link for what various values are available:
-  // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
-  AVMediaType mediaType;
-  std::optional<AVCodecID> codecId;
-  std::optional<std::string> codecName;
-  std::optional<double> durationSeconds;
-  std::optional<double> beginStreamFromHeader;
-  std::optional<int64_t> numFrames;
-  std::optional<int64_t> numKeyFrames;
-  std::optional<double> averageFps;
-  std::optional<double> bitRate;
-
-  // More accurate duration, obtained by scanning the file.
-  // These presentation timestamps are in time base.
-  std::optional<int64_t> minPtsFromScan;
-  std::optional<int64_t> maxPtsFromScan;
-  // These presentation timestamps are in seconds.
-  std::optional<double> minPtsSecondsFromScan;
-  std::optional<double> maxPtsSecondsFromScan;
-  // This can be useful for index-based seeking.
-  std::optional<int64_t> numFramesFromScan;
-
-  // Video-only fields derived from the AVCodecContext.
-  std::optional<int64_t> width;
-  std::optional<int64_t> height;
-
-  // Audio-only fields
-  std::optional<int64_t> sampleRate;
-  std::optional<int64_t> numChannels;
-  std::optional<std::string> sampleFormat;
-};
-
-struct ContainerMetadata {
-  std::vector<StreamMetadata> allStreamMetadata;
-  int numAudioStreams = 0;
-  int numVideoStreams = 0;
-  // Note that this is the container-level duration, which is usually the max
-  // of all stream durations available in the container.
-  std::optional<double> durationSeconds;
-  // Total BitRate level information at the container level in bit/s
-  std::optional<double> bitRate;
-  // If set, this is the index to the default audio stream.
-  std::optional<int> bestAudioStreamIndex;
-  // If set, this is the index to the default video stream.
-  std::optional<int> bestVideoStreamIndex;
-};
-
 } // namespace facebook::torchcodec

From 6053a5514270eec15031ffc74d859ff6b9f6d2df Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
Date: Mon, 7 Apr 2025 16:16:08 +0000
Subject: [PATCH 6/6] Rename Stream.h back to StreamOptions.h

Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
---
 src/torchcodec/_core/DeviceInterface.h             | 2 +-
 src/torchcodec/_core/Frame.h                       | 2 +-
 src/torchcodec/_core/SingleStreamDecoder.h         | 2 +-
 src/torchcodec/_core/{Stream.h => StreamOptions.h} | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename src/torchcodec/_core/{Stream.h => StreamOptions.h} (100%)

diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
index d91870ed8..b4197d7d5 100644
--- a/src/torchcodec/_core/DeviceInterface.h
+++ b/src/torchcodec/_core/DeviceInterface.h
@@ -13,7 +13,7 @@
 #include <string>
 #include "FFMPEGCommon.h"
 #include "src/torchcodec/_core/Frame.h"
-#include "src/torchcodec/_core/Stream.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace facebook::torchcodec {
 
diff --git a/src/torchcodec/_core/Frame.h b/src/torchcodec/_core/Frame.h
index 728d8d8ac..aa6897349 100644
--- a/src/torchcodec/_core/Frame.h
+++ b/src/torchcodec/_core/Frame.h
@@ -8,7 +8,7 @@
 
 #include <torch/types.h>
 #include "src/torchcodec/_core/Metadata.h"
-#include "src/torchcodec/_core/Stream.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace facebook::torchcodec {
 
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
index 453b974f5..7b275a209 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.h
+++ b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -16,7 +16,7 @@
 #include "src/torchcodec/_core/DeviceInterface.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/Frame.h"
-#include "src/torchcodec/_core/Stream.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace facebook::torchcodec {
 
diff --git a/src/torchcodec/_core/Stream.h b/src/torchcodec/_core/StreamOptions.h
similarity index 100%
rename from src/torchcodec/_core/Stream.h
rename to src/torchcodec/_core/StreamOptions.h