From 19bbb9ad974a9da7e770c5d034076e6df4b263da Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Fri, 4 Apr 2025 20:20:11 +0000 Subject: [PATCH 1/6] Properly name DeviceInterface var member in decoder class Signed-off-by: Dmitry Rogozhkin --- src/torchcodec/_core/SingleStreamDecoder.cpp | 16 ++++++++-------- src/torchcodec/_core/SingleStreamDecoder.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index c7c714da3..e80851988 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -406,7 +406,7 @@ void SingleStreamDecoder::addStream( streamInfo.stream = formatContext_->streams[activeStreamIndex_]; streamInfo.avMediaType = mediaType; - deviceInterface = createDeviceInterface(device); + deviceInterface_ = createDeviceInterface(device); // This should never happen, checking just to be safe. TORCH_CHECK( @@ -418,9 +418,9 @@ void SingleStreamDecoder::addStream( // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within // addStream() which is supposed to be generic if (mediaType == AVMEDIA_TYPE_VIDEO) { - if (deviceInterface) { + if (deviceInterface_) { avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream( - deviceInterface->findCodec(streamInfo.stream->codecpar->codec_id) + deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id) .value_or(avCodec)); } } @@ -438,8 +438,8 @@ void SingleStreamDecoder::addStream( // TODO_CODE_QUALITY same as above. if (mediaType == AVMEDIA_TYPE_VIDEO) { - if (deviceInterface) { - deviceInterface->initializeContext(codecContext); + if (deviceInterface_) { + deviceInterface_->initializeContext(codecContext); } } @@ -1210,11 +1210,11 @@ SingleStreamDecoder::convertAVFrameToFrameOutput( formatContext_->streams[activeStreamIndex_]->time_base); if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) { convertAudioAVFrameToFrameOutputOnCPU(avFrame, frameOutput); - } else if (!deviceInterface) { + } else if (!deviceInterface_) { convertAVFrameToFrameOutputOnCPU( avFrame, frameOutput, preAllocatedOutputTensor); - } else if (deviceInterface) { - deviceInterface->convertAVFrameToFrameOutput( + } else if (deviceInterface_) { + deviceInterface_->convertAVFrameToFrameOutput( streamInfo.videoStreamOptions, avFrame, frameOutput, diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 4879a3b7d..b1dc4fa23 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -492,7 +492,7 @@ class SingleStreamDecoder { SeekMode seekMode_; ContainerMetadata containerMetadata_; UniqueDecodingAVFormatContext formatContext_; - std::unique_ptr deviceInterface; + std::unique_ptr deviceInterface_; std::map streamInfos_; const int NO_ACTIVE_STREAM = -2; int activeStreamIndex_ = NO_ACTIVE_STREAM; From 94ba2b50e66e81c9560a561263676ea959703412 Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Fri, 4 Apr 2025 21:13:10 +0000 Subject: [PATCH 2/6] Move stream options to dedicated header Signed-off-by: Dmitry Rogozhkin --- src/torchcodec/_core/CudaDevice.cpp | 2 +- src/torchcodec/_core/CudaDevice.h | 2 +- src/torchcodec/_core/DeviceInterface.h | 3 +- src/torchcodec/_core/SingleStreamDecoder.cpp | 9 ++-- src/torchcodec/_core/SingleStreamDecoder.h | 39 ++-------------- src/torchcodec/_core/StreamOptions.h | 49 ++++++++++++++++++++ src/torchcodec/_core/custom_ops.cpp | 8 ++-- test/VideoDecoderTest.cpp | 15 +++--- 8 files changed, 71 insertions(+), 56 deletions(-) create mode 100644 src/torchcodec/_core/StreamOptions.h diff --git a/src/torchcodec/_core/CudaDevice.cpp b/src/torchcodec/_core/CudaDevice.cpp index 5bde4106f..f41c529d5 100644 --- a/src/torchcodec/_core/CudaDevice.cpp +++ b/src/torchcodec/_core/CudaDevice.cpp @@ -190,7 +190,7 @@ void CudaDevice::initializeContext(AVCodecContext* codecContext) { } void CudaDevice::convertAVFrameToFrameOutput( - const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions, + const VideoStreamOptions& videoStreamOptions, UniqueAVFrame& avFrame, SingleStreamDecoder::FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { diff --git a/src/torchcodec/_core/CudaDevice.h b/src/torchcodec/_core/CudaDevice.h index 0ed538593..20b45026b 100644 --- a/src/torchcodec/_core/CudaDevice.h +++ b/src/torchcodec/_core/CudaDevice.h @@ -21,7 +21,7 @@ class CudaDevice : public DeviceInterface { void initializeContext(AVCodecContext* codecContext) override; void convertAVFrameToFrameOutput( - const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions, + const VideoStreamOptions& videoStreamOptions, UniqueAVFrame& avFrame, SingleStreamDecoder::FrameOutput& frameOutput, std::optional preAllocatedOutputTensor = diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index a5b0e3652..c33a8e37f 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -13,6 +13,7 @@ #include #include "FFMPEGCommon.h" #include "src/torchcodec/_core/SingleStreamDecoder.h" +#include "src/torchcodec/_core/StreamOptions.h" namespace facebook::torchcodec { @@ -41,7 +42,7 @@ class DeviceInterface { virtual void initializeContext(AVCodecContext* codecContext) = 0; virtual void convertAVFrameToFrameOutput( - const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions, + const VideoStreamOptions& videoStreamOptions, UniqueAVFrame& avFrame, SingleStreamDecoder::FrameOutput& frameOutput, std::optional preAllocatedOutputTensor = std::nullopt) = 0; diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index e80851988..5edbb7fd9 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -501,9 +501,8 @@ void SingleStreamDecoder::addVideoStream( // swscale requires widths to be multiples of 32: // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements // so we fall back to filtergraph if the width is not a multiple of 32. - auto defaultLibrary = (width % 32 == 0) - ? SingleStreamDecoder::ColorConversionLibrary::SWSCALE - : SingleStreamDecoder::ColorConversionLibrary::FILTERGRAPH; + auto defaultLibrary = (width % 32 == 0) ? ColorConversionLibrary::SWSCALE + : ColorConversionLibrary::FILTERGRAPH; streamInfo.colorConversionLibrary = videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary); @@ -2047,7 +2046,7 @@ FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) { } FrameDims getHeightAndWidthFromOptionsOrMetadata( - const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions, + const VideoStreamOptions& videoStreamOptions, const SingleStreamDecoder::StreamMetadata& streamMetadata) { return FrameDims( videoStreamOptions.height.value_or(*streamMetadata.height), @@ -2055,7 +2054,7 @@ FrameDims getHeightAndWidthFromOptionsOrMetadata( } FrameDims getHeightAndWidthFromOptionsOrAVFrame( - const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions, + const VideoStreamOptions& videoStreamOptions, const UniqueAVFrame& avFrame) { return FrameDims( videoStreamOptions.height.value_or(avFrame->height), diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index b1dc4fa23..6b54e72e7 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -14,6 +14,7 @@ #include "src/torchcodec/_core/AVIOContextHolder.h" #include "src/torchcodec/_core/FFMPEGCommon.h" +#include "src/torchcodec/_core/StreamOptions.h" namespace facebook::torchcodec { class DeviceInterface; @@ -112,40 +113,6 @@ class SingleStreamDecoder { // ADDING STREAMS API // -------------------------------------------------------------------------- - enum ColorConversionLibrary { - // TODO: Add an AUTO option later. - // Use the libavfilter library for color conversion. - FILTERGRAPH, - // Use the libswscale library for color conversion. - SWSCALE - }; - - struct VideoStreamOptions { - VideoStreamOptions() {} - - // Number of threads we pass to FFMPEG for decoding. - // 0 means FFMPEG will choose the number of threads automatically to fully - // utilize all cores. If not set, it will be the default FFMPEG behavior for - // the given codec. - std::optional ffmpegThreadCount; - // Currently the dimension order can be either NHWC or NCHW. - // H=height, W=width, C=channel. - std::string dimensionOrder = "NCHW"; - // The output height and width of the frame. If not specified, the output - // is the same as the original video. - std::optional width; - std::optional height; - std::optional colorConversionLibrary; - // By default we use CPU for decoding for both C++ and python users. - torch::Device device = torch::kCPU; - }; - - struct AudioStreamOptions { - AudioStreamOptions() {} - - std::optional sampleRate; - }; - void addVideoStream( int streamIndex, const VideoStreamOptions& videoStreamOptions = VideoStreamOptions()); @@ -568,11 +535,11 @@ struct FrameDims { FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame); FrameDims getHeightAndWidthFromOptionsOrMetadata( - const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions, + const VideoStreamOptions& videoStreamOptions, const SingleStreamDecoder::StreamMetadata& streamMetadata); FrameDims getHeightAndWidthFromOptionsOrAVFrame( - const SingleStreamDecoder::VideoStreamOptions& videoStreamOptions, + const VideoStreamOptions& videoStreamOptions, const UniqueAVFrame& avFrame); torch::Tensor allocateEmptyHWCTensor( diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h new file mode 100644 index 000000000..38e51209c --- /dev/null +++ b/src/torchcodec/_core/StreamOptions.h @@ -0,0 +1,49 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include + +namespace facebook::torchcodec { + +enum ColorConversionLibrary { + // TODO: Add an AUTO option later. + // Use the libavfilter library for color conversion. + FILTERGRAPH, + // Use the libswscale library for color conversion. + SWSCALE +}; + +struct VideoStreamOptions { + VideoStreamOptions() {} + + // Number of threads we pass to FFMPEG for decoding. + // 0 means FFMPEG will choose the number of threads automatically to fully + // utilize all cores. If not set, it will be the default FFMPEG behavior for + // the given codec. + std::optional ffmpegThreadCount; + // Currently the dimension order can be either NHWC or NCHW. + // H=height, W=width, C=channel. + std::string dimensionOrder = "NCHW"; + // The output height and width of the frame. If not specified, the output + // is the same as the original video. + std::optional width; + std::optional height; + std::optional colorConversionLibrary; + // By default we use CPU for decoding for both C++ and python users. + torch::Device device = torch::kCPU; +}; + +struct AudioStreamOptions { + AudioStreamOptions() {} + + std::optional sampleRate; +}; + +} // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 05a6390d6..7ec2fb9a4 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -218,7 +218,7 @@ void _add_video_stream( std::optional stream_index = std::nullopt, std::optional device = std::nullopt, std::optional color_conversion_library = std::nullopt) { - SingleStreamDecoder::VideoStreamOptions videoStreamOptions; + VideoStreamOptions videoStreamOptions; videoStreamOptions.width = width; videoStreamOptions.height = height; videoStreamOptions.ffmpegThreadCount = num_threads; @@ -232,10 +232,10 @@ void _add_video_stream( std::string stdColorConversionLibrary{color_conversion_library.value()}; if (stdColorConversionLibrary == "filtergraph") { videoStreamOptions.colorConversionLibrary = - SingleStreamDecoder::ColorConversionLibrary::FILTERGRAPH; + ColorConversionLibrary::FILTERGRAPH; } else if (stdColorConversionLibrary == "swscale") { videoStreamOptions.colorConversionLibrary = - SingleStreamDecoder::ColorConversionLibrary::SWSCALE; + ColorConversionLibrary::SWSCALE; } else { throw std::runtime_error( "Invalid color_conversion_library=" + stdColorConversionLibrary + @@ -273,7 +273,7 @@ void add_audio_stream( at::Tensor& decoder, std::optional stream_index = std::nullopt, std::optional sample_rate = std::nullopt) { - SingleStreamDecoder::AudioStreamOptions audioStreamOptions; + AudioStreamOptions audioStreamOptions; audioStreamOptions.sampleRate = sample_rate; auto videoDecoder = unwrapTensorToGetDecoder(decoder); diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp index 1937ff97c..ef0a6468d 100644 --- a/test/VideoDecoderTest.cpp +++ b/test/VideoDecoderTest.cpp @@ -150,7 +150,7 @@ TEST(SingleStreamDecoderTest, RespectsWidthAndHeightFromOptions) { std::string path = getResourcePath("nasa_13013.mp4"); std::unique_ptr decoder = std::make_unique(path); - SingleStreamDecoder::VideoStreamOptions videoStreamOptions; + VideoStreamOptions videoStreamOptions; videoStreamOptions.width = 100; videoStreamOptions.height = 120; decoder->addVideoStream(-1, videoStreamOptions); @@ -162,7 +162,7 @@ TEST(SingleStreamDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) { std::string path = getResourcePath("nasa_13013.mp4"); std::unique_ptr decoder = std::make_unique(path); - SingleStreamDecoder::VideoStreamOptions videoStreamOptions; + VideoStreamOptions videoStreamOptions; videoStreamOptions.dimensionOrder = "NHWC"; decoder->addVideoStream(-1, videoStreamOptions); torch::Tensor tensor = decoder->getNextFrame().data; @@ -234,7 +234,7 @@ TEST_P(SingleStreamDecoderTest, DecodesFramesInABatchInNHWC) { ourDecoder->scanFileAndUpdateMetadataAndIndex(); int bestVideoStreamIndex = *ourDecoder->getContainerMetadata().bestVideoStreamIndex; - SingleStreamDecoder::VideoStreamOptions videoStreamOptions; + VideoStreamOptions videoStreamOptions; videoStreamOptions.dimensionOrder = "NHWC"; ourDecoder->addVideoStream(bestVideoStreamIndex, videoStreamOptions); // Frame with index 180 corresponds to timestamp 6.006. @@ -399,9 +399,9 @@ TEST_P(SingleStreamDecoderTest, PreAllocatedTensorFilterGraph) { ourDecoder->scanFileAndUpdateMetadataAndIndex(); int bestVideoStreamIndex = *ourDecoder->getContainerMetadata().bestVideoStreamIndex; - SingleStreamDecoder::VideoStreamOptions videoStreamOptions; + VideoStreamOptions videoStreamOptions; videoStreamOptions.colorConversionLibrary = - SingleStreamDecoder::ColorConversionLibrary::FILTERGRAPH; + ColorConversionLibrary::FILTERGRAPH; ourDecoder->addVideoStream(bestVideoStreamIndex, videoStreamOptions); auto output = ourDecoder->getFrameAtIndexInternal(0, preAllocatedOutputTensor); @@ -417,9 +417,8 @@ TEST_P(SingleStreamDecoderTest, PreAllocatedTensorSwscale) { ourDecoder->scanFileAndUpdateMetadataAndIndex(); int bestVideoStreamIndex = *ourDecoder->getContainerMetadata().bestVideoStreamIndex; - SingleStreamDecoder::VideoStreamOptions videoStreamOptions; - videoStreamOptions.colorConversionLibrary = - SingleStreamDecoder::ColorConversionLibrary::SWSCALE; + VideoStreamOptions videoStreamOptions; + videoStreamOptions.colorConversionLibrary = ColorConversionLibrary::SWSCALE; ourDecoder->addVideoStream(bestVideoStreamIndex, videoStreamOptions); auto output = ourDecoder->getFrameAtIndexInternal(0, preAllocatedOutputTensor); From b42b2a6c9f086be552e7f7b1866c01ca9c7188d1 Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Fri, 4 Apr 2025 21:34:03 +0000 Subject: [PATCH 3/6] Move frame output structs to dedicated header Signed-off-by: Dmitry Rogozhkin --- src/torchcodec/_core/CudaDevice.cpp | 2 +- src/torchcodec/_core/CudaDevice.h | 2 +- src/torchcodec/_core/DeviceInterface.h | 6 +- src/torchcodec/_core/Frame.h | 46 +++++++++ src/torchcodec/_core/SingleStreamDecoder.cpp | 34 +++---- src/torchcodec/_core/SingleStreamDecoder.h | 87 +---------------- src/torchcodec/_core/Stream.h | 99 ++++++++++++++++++++ src/torchcodec/_core/StreamOptions.h | 49 ---------- src/torchcodec/_core/custom_ops.cpp | 15 ++- test/VideoDecoderTest.cpp | 6 +- 10 files changed, 175 insertions(+), 171 deletions(-) create mode 100644 src/torchcodec/_core/Frame.h create mode 100644 src/torchcodec/_core/Stream.h delete mode 100644 src/torchcodec/_core/StreamOptions.h diff --git a/src/torchcodec/_core/CudaDevice.cpp b/src/torchcodec/_core/CudaDevice.cpp index f41c529d5..4f6c74073 100644 --- a/src/torchcodec/_core/CudaDevice.cpp +++ b/src/torchcodec/_core/CudaDevice.cpp @@ -192,7 +192,7 @@ void CudaDevice::initializeContext(AVCodecContext* codecContext) { void CudaDevice::convertAVFrameToFrameOutput( const VideoStreamOptions& videoStreamOptions, UniqueAVFrame& avFrame, - SingleStreamDecoder::FrameOutput& frameOutput, + FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) { TORCH_CHECK( avFrame->format == AV_PIX_FMT_CUDA, diff --git a/src/torchcodec/_core/CudaDevice.h b/src/torchcodec/_core/CudaDevice.h index 20b45026b..3aee6e2b1 100644 --- a/src/torchcodec/_core/CudaDevice.h +++ b/src/torchcodec/_core/CudaDevice.h @@ -23,7 +23,7 @@ class CudaDevice : public DeviceInterface { void convertAVFrameToFrameOutput( const VideoStreamOptions& videoStreamOptions, UniqueAVFrame& avFrame, - SingleStreamDecoder::FrameOutput& frameOutput, + FrameOutput& frameOutput, std::optional preAllocatedOutputTensor = std::nullopt) override; diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index c33a8e37f..d91870ed8 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -12,8 +12,8 @@ #include #include #include "FFMPEGCommon.h" -#include "src/torchcodec/_core/SingleStreamDecoder.h" -#include "src/torchcodec/_core/StreamOptions.h" +#include "src/torchcodec/_core/Frame.h" +#include "src/torchcodec/_core/Stream.h" namespace facebook::torchcodec { @@ -44,7 +44,7 @@ class DeviceInterface { virtual void convertAVFrameToFrameOutput( const VideoStreamOptions& videoStreamOptions, UniqueAVFrame& avFrame, - SingleStreamDecoder::FrameOutput& frameOutput, + FrameOutput& frameOutput, std::optional preAllocatedOutputTensor = std::nullopt) = 0; protected: diff --git a/src/torchcodec/_core/Frame.h b/src/torchcodec/_core/Frame.h new file mode 100644 index 000000000..d2214c3ce --- /dev/null +++ b/src/torchcodec/_core/Frame.h @@ -0,0 +1,46 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include "src/torchcodec/_core/Stream.h" + +namespace facebook::torchcodec { + +// All public video decoding entry points return either a FrameOutput or a +// FrameBatchOutput. +// They are the equivalent of the user-facing Frame and FrameBatch classes in +// Python. They contain RGB decoded frames along with some associated data +// like PTS and duration. +// FrameOutput is also relevant for audio decoding, typically as the output of +// getNextFrame(), or as a temporary output variable. +struct FrameOutput { + // data shape is: + // - 3D (C, H, W) or (H, W, C) for videos + // - 2D (numChannels, numSamples) for audio + torch::Tensor data; + double ptsSeconds; + double durationSeconds; +}; + +struct FrameBatchOutput { + torch::Tensor data; // 4D: of shape NCHW or NHWC. + torch::Tensor ptsSeconds; // 1D of shape (N,) + torch::Tensor durationSeconds; // 1D of shape (N,) + + explicit FrameBatchOutput( + int64_t numFrames, + const VideoStreamOptions& videoStreamOptions, + const StreamMetadata& streamMetadata); +}; + +struct AudioFramesOutput { + torch::Tensor data; // shape is (numChannels, numSamples) + double ptsSeconds; +}; + +} // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 5edbb7fd9..ea7d341d2 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -350,8 +350,7 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() { scannedAllStreams_ = true; } -SingleStreamDecoder::ContainerMetadata -SingleStreamDecoder::getContainerMetadata() const { +ContainerMetadata SingleStreamDecoder::getContainerMetadata() const { return containerMetadata_; } @@ -538,7 +537,7 @@ void SingleStreamDecoder::addAudioStream( // HIGH-LEVEL DECODING ENTRY-POINTS // -------------------------------------------------------------------------- -SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrame() { +FrameOutput SingleStreamDecoder::getNextFrame() { auto output = getNextFrameInternal(); if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) { output.data = maybePermuteHWC2CHW(output.data); @@ -546,7 +545,7 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrame() { return output; } -SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrameInternal( +FrameOutput SingleStreamDecoder::getNextFrameInternal( std::optional preAllocatedOutputTensor) { validateActiveStream(); UniqueAVFrame avFrame = decodeAVFrame( @@ -554,14 +553,13 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getNextFrameInternal( return convertAVFrameToFrameOutput(avFrame, preAllocatedOutputTensor); } -SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFrameAtIndex( - int64_t frameIndex) { +FrameOutput SingleStreamDecoder::getFrameAtIndex(int64_t frameIndex) { auto frameOutput = getFrameAtIndexInternal(frameIndex); frameOutput.data = maybePermuteHWC2CHW(frameOutput.data); return frameOutput; } -SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFrameAtIndexInternal( +FrameOutput SingleStreamDecoder::getFrameAtIndexInternal( int64_t frameIndex, std::optional preAllocatedOutputTensor) { validateActiveStream(AVMEDIA_TYPE_VIDEO); @@ -576,7 +574,7 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFrameAtIndexInternal( return getNextFrameInternal(preAllocatedOutputTensor); } -SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesAtIndices( +FrameBatchOutput SingleStreamDecoder::getFramesAtIndices( const std::vector& frameIndices) { validateActiveStream(AVMEDIA_TYPE_VIDEO); @@ -635,7 +633,7 @@ SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesAtIndices( return frameBatchOutput; } -SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesInRange( +FrameBatchOutput SingleStreamDecoder::getFramesInRange( int64_t start, int64_t stop, int64_t step) { @@ -669,8 +667,7 @@ SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesInRange( return frameBatchOutput; } -SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFramePlayedAt( - double seconds) { +FrameOutput SingleStreamDecoder::getFramePlayedAt(double seconds) { validateActiveStream(AVMEDIA_TYPE_VIDEO); StreamInfo& streamInfo = streamInfos_[activeStreamIndex_]; double frameStartTime = @@ -710,7 +707,7 @@ SingleStreamDecoder::FrameOutput SingleStreamDecoder::getFramePlayedAt( return frameOutput; } -SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt( +FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt( const std::vector& timestamps) { validateActiveStream(AVMEDIA_TYPE_VIDEO); @@ -740,8 +737,7 @@ SingleStreamDecoder::FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt( return getFramesAtIndices(frameIndices); } -SingleStreamDecoder::FrameBatchOutput -SingleStreamDecoder::getFramesPlayedInRange( +FrameBatchOutput SingleStreamDecoder::getFramesPlayedInRange( double startSeconds, double stopSeconds) { validateActiveStream(AVMEDIA_TYPE_VIDEO); @@ -874,8 +870,7 @@ SingleStreamDecoder::getFramesPlayedInRange( // [2] If you're brave and curious, you can read the long "Seek offset for // audio" note in https://github.com/pytorch/torchcodec/pull/507/files, which // sums up past (and failed) attemps at working around this issue. -SingleStreamDecoder::AudioFramesOutput -SingleStreamDecoder::getFramesPlayedInRangeAudio( +AudioFramesOutput SingleStreamDecoder::getFramesPlayedInRangeAudio( double startSeconds, std::optional stopSecondsOptional) { validateActiveStream(AVMEDIA_TYPE_AUDIO); @@ -1195,8 +1190,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame( // AVFRAME <-> FRAME OUTPUT CONVERSION // -------------------------------------------------------------------------- -SingleStreamDecoder::FrameOutput -SingleStreamDecoder::convertAVFrameToFrameOutput( +FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, std::optional preAllocatedOutputTensor) { // Convert the frame to tensor. @@ -1546,7 +1540,7 @@ std::optional SingleStreamDecoder::maybeFlushSwrBuffers() { // OUTPUT ALLOCATION AND SHAPE CONVERSION // -------------------------------------------------------------------------- -SingleStreamDecoder::FrameBatchOutput::FrameBatchOutput( +FrameBatchOutput::FrameBatchOutput( int64_t numFrames, const VideoStreamOptions& videoStreamOptions, const StreamMetadata& streamMetadata) @@ -2047,7 +2041,7 @@ FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) { FrameDims getHeightAndWidthFromOptionsOrMetadata( const VideoStreamOptions& videoStreamOptions, - const SingleStreamDecoder::StreamMetadata& streamMetadata) { + const StreamMetadata& streamMetadata) { return FrameDims( videoStreamOptions.height.value_or(*streamMetadata.height), videoStreamOptions.width.value_or(*streamMetadata.width)); diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 6b54e72e7..3e9506ce3 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -14,7 +14,8 @@ #include "src/torchcodec/_core/AVIOContextHolder.h" #include "src/torchcodec/_core/FFMPEGCommon.h" -#include "src/torchcodec/_core/StreamOptions.h" +#include "src/torchcodec/_core/Frame.h" +#include "src/torchcodec/_core/Stream.h" namespace facebook::torchcodec { class DeviceInterface; @@ -52,56 +53,6 @@ class SingleStreamDecoder { // the allFrames and keyFrames vectors. void scanFileAndUpdateMetadataAndIndex(); - struct StreamMetadata { - // Common (video and audio) fields derived from the AVStream. - int streamIndex; - // See this link for what various values are available: - // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48 - AVMediaType mediaType; - std::optional codecId; - std::optional codecName; - std::optional durationSeconds; - std::optional beginStreamFromHeader; - std::optional numFrames; - std::optional numKeyFrames; - std::optional averageFps; - std::optional bitRate; - - // More accurate duration, obtained by scanning the file. - // These presentation timestamps are in time base. - std::optional minPtsFromScan; - std::optional maxPtsFromScan; - // These presentation timestamps are in seconds. - std::optional minPtsSecondsFromScan; - std::optional maxPtsSecondsFromScan; - // This can be useful for index-based seeking. - std::optional numFramesFromScan; - - // Video-only fields derived from the AVCodecContext. - std::optional width; - std::optional height; - - // Audio-only fields - std::optional sampleRate; - std::optional numChannels; - std::optional sampleFormat; - }; - - struct ContainerMetadata { - std::vector allStreamMetadata; - int numAudioStreams = 0; - int numVideoStreams = 0; - // Note that this is the container-level duration, which is usually the max - // of all stream durations available in the container. - std::optional durationSeconds; - // Total BitRate level information at the container level in bit/s - std::optional bitRate; - // If set, this is the index to the default audio stream. - std::optional bestAudioStreamIndex; - // If set, this is the index to the default video stream. - std::optional bestVideoStreamIndex; - }; - // Returns the metadata for the container. ContainerMetadata getContainerMetadata() const; @@ -124,38 +75,6 @@ class SingleStreamDecoder { // DECODING AND SEEKING APIs // -------------------------------------------------------------------------- - // All public video decoding entry points return either a FrameOutput or a - // FrameBatchOutput. - // They are the equivalent of the user-facing Frame and FrameBatch classes in - // Python. They contain RGB decoded frames along with some associated data - // like PTS and duration. - // FrameOutput is also relevant for audio decoding, typically as the output of - // getNextFrame(), or as a temporary output variable. - struct FrameOutput { - // data shape is: - // - 3D (C, H, W) or (H, W, C) for videos - // - 2D (numChannels, numSamples) for audio - torch::Tensor data; - double ptsSeconds; - double durationSeconds; - }; - - struct FrameBatchOutput { - torch::Tensor data; // 4D: of shape NCHW or NHWC. - torch::Tensor ptsSeconds; // 1D of shape (N,) - torch::Tensor durationSeconds; // 1D of shape (N,) - - explicit FrameBatchOutput( - int64_t numFrames, - const VideoStreamOptions& videoStreamOptions, - const StreamMetadata& streamMetadata); - }; - - struct AudioFramesOutput { - torch::Tensor data; // shape is (numChannels, numSamples) - double ptsSeconds; - }; - // Places the cursor at the first frame on or after the position in seconds. // Calling getNextFrame() will return the first frame at // or after this position. @@ -536,7 +455,7 @@ FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame); FrameDims getHeightAndWidthFromOptionsOrMetadata( const VideoStreamOptions& videoStreamOptions, - const SingleStreamDecoder::StreamMetadata& streamMetadata); + const StreamMetadata& streamMetadata); FrameDims getHeightAndWidthFromOptionsOrAVFrame( const VideoStreamOptions& videoStreamOptions, diff --git a/src/torchcodec/_core/Stream.h b/src/torchcodec/_core/Stream.h new file mode 100644 index 000000000..a9ff1fe2c --- /dev/null +++ b/src/torchcodec/_core/Stream.h @@ -0,0 +1,99 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include + +namespace facebook::torchcodec { + +enum ColorConversionLibrary { + // TODO: Add an AUTO option later. + // Use the libavfilter library for color conversion. + FILTERGRAPH, + // Use the libswscale library for color conversion. + SWSCALE +}; + +struct VideoStreamOptions { + VideoStreamOptions() {} + + // Number of threads we pass to FFMPEG for decoding. + // 0 means FFMPEG will choose the number of threads automatically to fully + // utilize all cores. If not set, it will be the default FFMPEG behavior for + // the given codec. + std::optional ffmpegThreadCount; + // Currently the dimension order can be either NHWC or NCHW. + // H=height, W=width, C=channel. + std::string dimensionOrder = "NCHW"; + // The output height and width of the frame. If not specified, the output + // is the same as the original video. + std::optional width; + std::optional height; + std::optional colorConversionLibrary; + // By default we use CPU for decoding for both C++ and python users. + torch::Device device = torch::kCPU; +}; + +struct AudioStreamOptions { + AudioStreamOptions() {} + + std::optional sampleRate; +}; + +struct StreamMetadata { + // Common (video and audio) fields derived from the AVStream. + int streamIndex; + // See this link for what various values are available: + // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48 + AVMediaType mediaType; + std::optional codecId; + std::optional codecName; + std::optional durationSeconds; + std::optional beginStreamFromHeader; + std::optional numFrames; + std::optional numKeyFrames; + std::optional averageFps; + std::optional bitRate; + + // More accurate duration, obtained by scanning the file. + // These presentation timestamps are in time base. + std::optional minPtsFromScan; + std::optional maxPtsFromScan; + // These presentation timestamps are in seconds. + std::optional minPtsSecondsFromScan; + std::optional maxPtsSecondsFromScan; + // This can be useful for index-based seeking. + std::optional numFramesFromScan; + + // Video-only fields derived from the AVCodecContext. + std::optional width; + std::optional height; + + // Audio-only fields + std::optional sampleRate; + std::optional numChannels; + std::optional sampleFormat; +}; + +struct ContainerMetadata { + std::vector allStreamMetadata; + int numAudioStreams = 0; + int numVideoStreams = 0; + // Note that this is the container-level duration, which is usually the max + // of all stream durations available in the container. + std::optional durationSeconds; + // Total BitRate level information at the container level in bit/s + std::optional bitRate; + // If set, this is the index to the default audio stream. + std::optional bestAudioStreamIndex; + // If set, this is the index to the default video stream. + std::optional bestVideoStreamIndex; +}; + +} // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h deleted file mode 100644 index 38e51209c..000000000 --- a/src/torchcodec/_core/StreamOptions.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include - -namespace facebook::torchcodec { - -enum ColorConversionLibrary { - // TODO: Add an AUTO option later. - // Use the libavfilter library for color conversion. - FILTERGRAPH, - // Use the libswscale library for color conversion. - SWSCALE -}; - -struct VideoStreamOptions { - VideoStreamOptions() {} - - // Number of threads we pass to FFMPEG for decoding. - // 0 means FFMPEG will choose the number of threads automatically to fully - // utilize all cores. If not set, it will be the default FFMPEG behavior for - // the given codec. - std::optional ffmpegThreadCount; - // Currently the dimension order can be either NHWC or NCHW. - // H=height, W=width, C=channel. - std::string dimensionOrder = "NCHW"; - // The output height and width of the frame. If not specified, the output - // is the same as the original video. - std::optional width; - std::optional height; - std::optional colorConversionLibrary; - // By default we use CPU for decoding for both C++ and python users. - torch::Device device = torch::kCPU; -}; - -struct AudioStreamOptions { - AudioStreamOptions() {} - - std::optional sampleRate; -}; - -} // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 7ec2fb9a4..3f107a5a7 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -98,7 +98,7 @@ SingleStreamDecoder* unwrapTensorToGetDecoder(at::Tensor& tensor) { // under torch.compile(). using OpsFrameOutput = std::tuple; -OpsFrameOutput makeOpsFrameOutput(SingleStreamDecoder::FrameOutput& frame) { +OpsFrameOutput makeOpsFrameOutput(FrameOutput& frame) { return std::make_tuple( frame.data, torch::tensor(frame.ptsSeconds, torch::dtype(torch::kFloat64)), @@ -116,8 +116,7 @@ OpsFrameOutput makeOpsFrameOutput(SingleStreamDecoder::FrameOutput& frame) { // single float. using OpsFrameBatchOutput = std::tuple; -OpsFrameBatchOutput makeOpsFrameBatchOutput( - SingleStreamDecoder::FrameBatchOutput& batch) { +OpsFrameBatchOutput makeOpsFrameBatchOutput(FrameBatchOutput& batch) { return std::make_tuple(batch.data, batch.ptsSeconds, batch.durationSeconds); } @@ -127,8 +126,7 @@ OpsFrameBatchOutput makeOpsFrameBatchOutput( // 2. A single float value for the pts of the first frame, in seconds. using OpsAudioFramesOutput = std::tuple; -OpsAudioFramesOutput makeOpsAudioFramesOutput( - SingleStreamDecoder::AudioFramesOutput& audioFrames) { +OpsAudioFramesOutput makeOpsAudioFramesOutput(AudioFramesOutput& audioFrames) { return std::make_tuple( audioFrames.data, torch::tensor(audioFrames.ptsSeconds, torch::dtype(torch::kFloat64))); @@ -291,7 +289,7 @@ void seek_to_pts(at::Tensor& decoder, double seconds) { // duration as tensors. OpsFrameOutput get_next_frame(at::Tensor& decoder) { auto videoDecoder = unwrapTensorToGetDecoder(decoder); - SingleStreamDecoder::FrameOutput result; + FrameOutput result; try { result = videoDecoder->getNextFrame(); } catch (const SingleStreamDecoder::EndOfFileException& e) { @@ -305,7 +303,7 @@ OpsFrameOutput get_next_frame(at::Tensor& decoder) { // given timestamp T has T >= PTS and T < PTS + Duration. OpsFrameOutput get_frame_at_pts(at::Tensor& decoder, double seconds) { auto videoDecoder = unwrapTensorToGetDecoder(decoder); - SingleStreamDecoder::FrameOutput result; + FrameOutput result; try { result = videoDecoder->getFramePlayedAt(seconds); } catch (const SingleStreamDecoder::EndOfFileException& e) { @@ -443,8 +441,7 @@ torch::Tensor _get_key_frame_indices(at::Tensor& decoder) { std::string get_json_metadata(at::Tensor& decoder) { auto videoDecoder = unwrapTensorToGetDecoder(decoder); - SingleStreamDecoder::ContainerMetadata videoMetadata = - videoDecoder->getContainerMetadata(); + ContainerMetadata videoMetadata = videoDecoder->getContainerMetadata(); auto maybeBestVideoStreamIndex = videoMetadata.bestVideoStreamIndex; std::map metadataMap; diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp index ef0a6468d..6cbeef293 100644 --- a/test/VideoDecoderTest.cpp +++ b/test/VideoDecoderTest.cpp @@ -69,8 +69,7 @@ TEST_P(SingleStreamDecoderTest, ReturnsFpsAndDurationForVideoInMetadata) { std::string path = getResourcePath("nasa_13013.mp4"); std::unique_ptr decoder = createDecoderFromPath(path, GetParam()); - SingleStreamDecoder::ContainerMetadata metadata = - decoder->getContainerMetadata(); + ContainerMetadata metadata = decoder->getContainerMetadata(); EXPECT_EQ(metadata.numAudioStreams, 2); EXPECT_EQ(metadata.numVideoStreams, 2); #if LIBAVFORMAT_VERSION_MAJOR >= 60 @@ -429,8 +428,7 @@ TEST_P(SingleStreamDecoderTest, GetAudioMetadata) { std::string path = getResourcePath("nasa_13013.mp4.audio.mp3"); std::unique_ptr decoder = createDecoderFromPath(path, GetParam()); - SingleStreamDecoder::ContainerMetadata metadata = - decoder->getContainerMetadata(); + ContainerMetadata metadata = decoder->getContainerMetadata(); EXPECT_EQ(metadata.numAudioStreams, 1); EXPECT_EQ(metadata.numVideoStreams, 0); EXPECT_EQ(metadata.allStreamMetadata.size(), 1); From 19ad198a77b23905531357e7293140aef7b4bdec Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Fri, 4 Apr 2025 22:07:16 +0000 Subject: [PATCH 4/6] Cleanup DeviceInterface inheritance Signed-off-by: Dmitry Rogozhkin --- src/torchcodec/_core/SingleStreamDecoder.cpp | 1 - src/torchcodec/_core/SingleStreamDecoder.h | 2 +- src/torchcodec/_core/custom_ops.cpp | 1 - test/VideoDecoderTest.cpp | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index ea7d341d2..c389242cd 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -13,7 +13,6 @@ #include #include #include -#include "src/torchcodec/_core/DeviceInterface.h" #include "torch/types.h" extern "C" { diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 3e9506ce3..453b974f5 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -13,12 +13,12 @@ #include #include "src/torchcodec/_core/AVIOContextHolder.h" +#include "src/torchcodec/_core/DeviceInterface.h" #include "src/torchcodec/_core/FFMPEGCommon.h" #include "src/torchcodec/_core/Frame.h" #include "src/torchcodec/_core/Stream.h" namespace facebook::torchcodec { -class DeviceInterface; // The SingleStreamDecoder class can be used to decode video frames to Tensors. // Note that SingleStreamDecoder is not thread-safe. diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 3f107a5a7..9a9b87767 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -11,7 +11,6 @@ #include "c10/core/SymIntArrayRef.h" #include "c10/util/Exception.h" #include "src/torchcodec/_core/AVIOBytesContext.h" -#include "src/torchcodec/_core/DeviceInterface.h" #include "src/torchcodec/_core/Encoder.h" #include "src/torchcodec/_core/SingleStreamDecoder.h" diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp index 6cbeef293..a30609c2a 100644 --- a/test/VideoDecoderTest.cpp +++ b/test/VideoDecoderTest.cpp @@ -5,7 +5,6 @@ // LICENSE file in the root directory of this source tree. #include "src/torchcodec/_core/AVIOBytesContext.h" -#include "src/torchcodec/_core/DeviceInterface.h" #include "src/torchcodec/_core/SingleStreamDecoder.h" #include From bb63f09fea02085e173d953a8131d7b3c405ae95 Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Mon, 7 Apr 2025 16:10:17 +0000 Subject: [PATCH 5/6] Move metadata structs to dedicated header Signed-off-by: Dmitry Rogozhkin --- src/torchcodec/_core/Frame.h | 1 + src/torchcodec/_core/Metadata.h | 70 +++++++++++++++++++++++++++++++++ src/torchcodec/_core/Stream.h | 50 ----------------------- 3 files changed, 71 insertions(+), 50 deletions(-) create mode 100644 src/torchcodec/_core/Metadata.h diff --git a/src/torchcodec/_core/Frame.h b/src/torchcodec/_core/Frame.h index d2214c3ce..728d8d8ac 100644 --- a/src/torchcodec/_core/Frame.h +++ b/src/torchcodec/_core/Frame.h @@ -7,6 +7,7 @@ #pragma once #include +#include "src/torchcodec/_core/Metadata.h" #include "src/torchcodec/_core/Stream.h" namespace facebook::torchcodec { diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h new file mode 100644 index 000000000..a8f300f49 --- /dev/null +++ b/src/torchcodec/_core/Metadata.h @@ -0,0 +1,70 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include + +extern "C" { +#include +#include +} + +namespace facebook::torchcodec { + +struct StreamMetadata { + // Common (video and audio) fields derived from the AVStream. + int streamIndex; + // See this link for what various values are available: + // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48 + AVMediaType mediaType; + std::optional codecId; + std::optional codecName; + std::optional durationSeconds; + std::optional beginStreamFromHeader; + std::optional numFrames; + std::optional numKeyFrames; + std::optional averageFps; + std::optional bitRate; + + // More accurate duration, obtained by scanning the file. + // These presentation timestamps are in time base. + std::optional minPtsFromScan; + std::optional maxPtsFromScan; + // These presentation timestamps are in seconds. + std::optional minPtsSecondsFromScan; + std::optional maxPtsSecondsFromScan; + // This can be useful for index-based seeking. + std::optional numFramesFromScan; + + // Video-only fields derived from the AVCodecContext. + std::optional width; + std::optional height; + + // Audio-only fields + std::optional sampleRate; + std::optional numChannels; + std::optional sampleFormat; +}; + +struct ContainerMetadata { + std::vector allStreamMetadata; + int numAudioStreams = 0; + int numVideoStreams = 0; + // Note that this is the container-level duration, which is usually the max + // of all stream durations available in the container. + std::optional durationSeconds; + // Total BitRate level information at the container level in bit/s + std::optional bitRate; + // If set, this is the index to the default audio stream. + std::optional bestAudioStreamIndex; + // If set, this is the index to the default video stream. + std::optional bestVideoStreamIndex; +}; + +} // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/Stream.h b/src/torchcodec/_core/Stream.h index a9ff1fe2c..38e51209c 100644 --- a/src/torchcodec/_core/Stream.h +++ b/src/torchcodec/_core/Stream.h @@ -46,54 +46,4 @@ struct AudioStreamOptions { std::optional sampleRate; }; -struct StreamMetadata { - // Common (video and audio) fields derived from the AVStream. - int streamIndex; - // See this link for what various values are available: - // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48 - AVMediaType mediaType; - std::optional codecId; - std::optional codecName; - std::optional durationSeconds; - std::optional beginStreamFromHeader; - std::optional numFrames; - std::optional numKeyFrames; - std::optional averageFps; - std::optional bitRate; - - // More accurate duration, obtained by scanning the file. - // These presentation timestamps are in time base. - std::optional minPtsFromScan; - std::optional maxPtsFromScan; - // These presentation timestamps are in seconds. - std::optional minPtsSecondsFromScan; - std::optional maxPtsSecondsFromScan; - // This can be useful for index-based seeking. - std::optional numFramesFromScan; - - // Video-only fields derived from the AVCodecContext. - std::optional width; - std::optional height; - - // Audio-only fields - std::optional sampleRate; - std::optional numChannels; - std::optional sampleFormat; -}; - -struct ContainerMetadata { - std::vector allStreamMetadata; - int numAudioStreams = 0; - int numVideoStreams = 0; - // Note that this is the container-level duration, which is usually the max - // of all stream durations available in the container. - std::optional durationSeconds; - // Total BitRate level information at the container level in bit/s - std::optional bitRate; - // If set, this is the index to the default audio stream. - std::optional bestAudioStreamIndex; - // If set, this is the index to the default video stream. - std::optional bestVideoStreamIndex; -}; - } // namespace facebook::torchcodec From 6053a5514270eec15031ffc74d859ff6b9f6d2df Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Mon, 7 Apr 2025 16:16:08 +0000 Subject: [PATCH 6/6] Rename Stream.h back to StreamOptions.h Signed-off-by: Dmitry Rogozhkin --- src/torchcodec/_core/DeviceInterface.h | 2 +- src/torchcodec/_core/Frame.h | 2 +- src/torchcodec/_core/SingleStreamDecoder.h | 2 +- src/torchcodec/_core/{Stream.h => StreamOptions.h} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename src/torchcodec/_core/{Stream.h => StreamOptions.h} (100%) diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index d91870ed8..b4197d7d5 100644 --- a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -13,7 +13,7 @@ #include #include "FFMPEGCommon.h" #include "src/torchcodec/_core/Frame.h" -#include "src/torchcodec/_core/Stream.h" +#include "src/torchcodec/_core/StreamOptions.h" namespace facebook::torchcodec { diff --git a/src/torchcodec/_core/Frame.h b/src/torchcodec/_core/Frame.h index 728d8d8ac..aa6897349 100644 --- a/src/torchcodec/_core/Frame.h +++ b/src/torchcodec/_core/Frame.h @@ -8,7 +8,7 @@ #include #include "src/torchcodec/_core/Metadata.h" -#include "src/torchcodec/_core/Stream.h" +#include "src/torchcodec/_core/StreamOptions.h" namespace facebook::torchcodec { diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 453b974f5..7b275a209 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -16,7 +16,7 @@ #include "src/torchcodec/_core/DeviceInterface.h" #include "src/torchcodec/_core/FFMPEGCommon.h" #include "src/torchcodec/_core/Frame.h" -#include "src/torchcodec/_core/Stream.h" +#include "src/torchcodec/_core/StreamOptions.h" namespace facebook::torchcodec { diff --git a/src/torchcodec/_core/Stream.h b/src/torchcodec/_core/StreamOptions.h similarity index 100% rename from src/torchcodec/_core/Stream.h rename to src/torchcodec/_core/StreamOptions.h