Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 73 additions & 26 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -570,10 +570,10 @@ AVPixelFormat validatePixelFormat(
TORCH_CHECK(false, errorMsg.str());
}

void validateDoubleOption(
void tryToValidateCodecOption(
const AVCodec& avCodec,
const char* optionName,
double value) {
const std::string& value) {
if (!avCodec.priv_class) {
return;
}
Expand All @@ -586,24 +586,60 @@ void validateDoubleOption(
0,
AV_OPT_SEARCH_FAKE_OBJ,
nullptr);
// If the option was not found, let FFmpeg handle it later
// If option is not found we cannot validate it, let FFmpeg handle it
if (!option) {
return;
}
// Validate if option is defined as a numeric type
if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
TORCH_CHECK(
value >= option->min && value <= option->max,
optionName,
"=",
value,
" is out of valid range [",
option->min,
", ",
option->max,
"] for this codec. For more details, run 'ffmpeg -h encoder=",
avCodec.name,
"'");
try {
double numericValue = std::stod(value);
TORCH_CHECK(
numericValue >= option->min && numericValue <= option->max,
optionName,
"=",
numericValue,
" is out of valid range [",
option->min,
", ",
option->max,
"] for this codec. For more details, run 'ffmpeg -h encoder=",
avCodec.name,
"'");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is OK for now. I suspect min and max may not always be set for all parameters, in which case we may error out when we shouldn't. We'll know if / when we get user reports about that. Let's keep it as-is for now and see if we need to revisit in the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only access min/max on numeric parameters; my expectation (hope) is that they are populated for those.

} catch (const std::invalid_argument& e) {
TORCH_CHECK(
false,
"Option ",
optionName,
" expects a numeric value but got '",
value,
"'");
}
}
}

void sortCodecOptions(
const std::map<std::string, std::string>& extraOptions,
AVDictionary** codecDict,
AVDictionary** formatDict) {
// Accepts a map of options as input, then sorts them into codec options and
// format options. The sorted options are returned into two separate dicts.
const AVClass* formatClass = avformat_get_class();
for (const auto& [key, value] : extraOptions) {
const AVOption* fmtOpt = av_opt_find2(
&formatClass,
key.c_str(),
nullptr,
0,
AV_OPT_SEARCH_CHILDREN | AV_OPT_SEARCH_FAKE_OBJ,
nullptr);
if (fmtOpt) {
av_dict_set(formatDict, key.c_str(), value.c_str(), 0);
} else {
// Default to codec option (includes AVCodecContext + encoder-private)
av_dict_set(codecDict, key.c_str(), value.c_str(), 0);
}
}
}
} // namespace
Expand All @@ -621,6 +657,7 @@ VideoEncoder::~VideoEncoder() {
avFormatContext_->pb = nullptr;
}
}
av_dict_free(&avFormatOptions_);
}

VideoEncoder::VideoEncoder(
Expand Down Expand Up @@ -760,21 +797,31 @@ void VideoEncoder::initializeEncoder(
}

// Apply videoStreamOptions
AVDictionary* options = nullptr;
AVDictionary* avCodecOptions = nullptr;
if (videoStreamOptions.extraOptions.has_value()) {
for (const auto& [key, value] : videoStreamOptions.extraOptions.value()) {
tryToValidateCodecOption(*avCodec, key.c_str(), value);
}
sortCodecOptions(
videoStreamOptions.extraOptions.value(),
&avCodecOptions,
&avFormatOptions_);
}

if (videoStreamOptions.crf.has_value()) {
validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
av_dict_set(
&options,
"crf",
std::to_string(videoStreamOptions.crf.value()).c_str(),
0);
std::string crfValue = std::to_string(videoStreamOptions.crf.value());
tryToValidateCodecOption(*avCodec, "crf", crfValue);
av_dict_set(&avCodecOptions, "crf", crfValue.c_str(), 0);
}
if (videoStreamOptions.preset.has_value()) {
av_dict_set(
&options, "preset", videoStreamOptions.preset.value().c_str(), 0);
&avCodecOptions,
"preset",
videoStreamOptions.preset.value().c_str(),
0);
}
int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
av_dict_free(&options);
int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
av_dict_free(&avCodecOptions);

TORCH_CHECK(
status == AVSUCCESS,
Expand All @@ -799,7 +846,7 @@ void VideoEncoder::encode() {
TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice.");
encodeWasCalled_ = true;

int status = avformat_write_header(avFormatContext_.get(), nullptr);
int status = avformat_write_header(avFormatContext_.get(), &avFormatOptions_);
TORCH_CHECK(
status == AVSUCCESS,
"Error in avformat_write_header: ",
Expand Down
7 changes: 7 additions & 0 deletions src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
#pragma once
#include <torch/types.h>
#include <map>
#include <string>
#include "AVIOContextHolder.h"
#include "FFMPEGCommon.h"
#include "StreamOptions.h"

extern "C" {
#include <libavutil/dict.h>
}

namespace facebook::torchcodec {
class AudioEncoder {
public:
Expand Down Expand Up @@ -179,6 +185,7 @@ class VideoEncoder {
std::unique_ptr<AVIOContextHolder> avioContextHolder_;

bool encodeWasCalled_ = false;
AVDictionary* avFormatOptions_ = nullptr;
};

} // namespace facebook::torchcodec
2 changes: 2 additions & 0 deletions src/torchcodec/_core/StreamOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#pragma once

#include <torch/types.h>
#include <map>
#include <optional>
#include <string>
#include <string_view>
Expand Down Expand Up @@ -51,6 +52,7 @@ struct VideoStreamOptions {
std::optional<std::string> pixelFormat;
std::optional<double> crf;
std::optional<std::string> preset;
std::optional<std::map<std::string, std::string>> extraOptions;
};

struct AudioStreamOptions {
Expand Down
42 changes: 36 additions & 6 deletions src/torchcodec/_core/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
m.def(
"_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
m.def(
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()");
m.def(
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None) -> Tensor");
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> Tensor");
m.def(
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()");
m.def(
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
m.def(
Expand Down Expand Up @@ -158,6 +158,16 @@ std::string quoteValue(const std::string& value) {
return "\"" + value + "\"";
}

// Rebuilds the key/value map from a flattened `extra_options` list laid out as
// [key0, value0, key1, value1, ...].
//
// - If a key appears more than once, the value from its last occurrence wins.
// - The list must hold an even number of entries. A trailing key without a
//   value makes the bounds-checked `at()` below throw std::out_of_range
//   instead of reading past the end of the vector.
std::map<std::string, std::string> unflattenExtraOptions(
    const std::vector<std::string>& opts) {
  std::map<std::string, std::string> optionsMap;
  for (size_t i = 0; i < opts.size(); i += 2) {
    // `i` is always in range here; only the paired value index can overflow
    // the vector when the input length is odd, so that access is checked.
    optionsMap[opts[i]] = opts.at(i + 1);
  }
  return optionsMap;
}

std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
std::stringstream ss;
ss << "{\n";
Expand Down Expand Up @@ -606,12 +616,19 @@ void encode_video_to_file(
std::optional<std::string> codec = std::nullopt,
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
std::optional<std::string_view> preset = std::nullopt,
std::optional<std::vector<std::string>> extra_options = std::nullopt) {
VideoStreamOptions videoStreamOptions;
videoStreamOptions.codec = codec;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;

if (extra_options.has_value()) {
videoStreamOptions.extraOptions =
unflattenExtraOptions(extra_options.value());
}

VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -627,13 +644,20 @@ at::Tensor encode_video_to_tensor(
std::optional<std::string> codec = std::nullopt,
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
std::optional<std::string_view> preset = std::nullopt,
std::optional<std::vector<std::string>> extra_options = std::nullopt) {
auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
VideoStreamOptions videoStreamOptions;
videoStreamOptions.codec = codec;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;

if (extra_options.has_value()) {
videoStreamOptions.extraOptions =
unflattenExtraOptions(extra_options.value());
}

return VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -651,7 +675,8 @@ void _encode_video_to_file_like(
std::optional<std::string> codec = std::nullopt,
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
std::optional<std::string_view> preset = std::nullopt,
std::optional<std::vector<std::string>> extra_options = std::nullopt) {
auto fileLikeContext =
reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
TORCH_CHECK(
Expand All @@ -664,6 +689,11 @@ void _encode_video_to_file_like(
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;

if (extra_options.has_value()) {
videoStreamOptions.extraOptions =
unflattenExtraOptions(extra_options.value());
}

VideoEncoder encoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand Down
12 changes: 9 additions & 3 deletions src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ def encode_video_to_file_like(
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
extra_options: Optional[list[str]] = None,
) -> None:
"""Encode video frames to a file-like object.

Expand All @@ -229,6 +230,7 @@ def encode_video_to_file_like(
pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
crf: Optional constant rate factor for encoding quality
preset: Optional encoder preset as string (e.g., "ultrafast", "medium")
extra_options: Optional list of extra options as flattened key-value pairs
"""
assert _pybind_ops is not None

Expand All @@ -241,6 +243,7 @@ def encode_video_to_file_like(
pixel_format,
crf,
preset,
extra_options,
)


Expand Down Expand Up @@ -330,8 +333,9 @@ def encode_video_to_file_abstract(
filename: str,
codec: Optional[str],
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
extra_options: Optional[list[str]] = None,
) -> None:
return

Expand All @@ -343,8 +347,9 @@ def encode_video_to_tensor_abstract(
format: str,
codec: Optional[str],
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
extra_options: Optional[list[str]] = None,
) -> torch.Tensor:
return torch.empty([], dtype=torch.long)

Expand All @@ -357,8 +362,9 @@ def _encode_video_to_file_like_abstract(
file_like_context: int,
codec: Optional[str],
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
extra_options: Optional[list[str]] = None,
) -> None:
return

Expand Down
Loading
Loading