Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "torch/types.h"

extern "C" {
#include <libavutil/opt.h>
#include <libavutil/pixdesc.h>
}

Expand Down Expand Up @@ -568,6 +569,43 @@ AVPixelFormat validatePixelFormat(
}
TORCH_CHECK(false, errorMsg.str());
}

void validateDoubleOption(
const AVCodec& avCodec,
const char* optionName,
double value) {
if (!avCodec.priv_class) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we OK to use priv_class? I.e. is it meant to be "private"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the avcodec.h, priv_class is defined in the section for public fields: https://www.ffmpeg.org/doxygen/2.0/libavcodec_2avcodec_8h_source.html

return;
}
const AVOption* option = av_opt_find2(
// Convert obj arg from const AVClass* const* to non-const void*
// First cast to remove const, then cast to void*
const_cast<void*>(static_cast<const void*>(&avCodec.priv_class)),
optionName,
nullptr,
0,
AV_OPT_SEARCH_FAKE_OBJ,
nullptr);
// If the option was not found, let FFmpeg handle it later
if (!option) {
return;
}
if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
TORCH_CHECK(
value >= option->min && value <= option->max,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is comparing an int (value) to a double (min and max), we should cast.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment led me to realize that codecs can implement 'crf' as a double or an int.
I'll update the PR to accept either type, and treat it as a double in the C++, so this casting will not be necessary.

optionName,
"=",
value,
" is out of valid range [",
option->min,
", ",
option->max,
"] for this codec. For more details, run 'ffmpeg -h encoder=",
avCodec.name,
"'");
}
}
} // namespace

VideoEncoder::~VideoEncoder() {
Expand Down Expand Up @@ -700,6 +738,7 @@ void VideoEncoder::initializeEncoder(
// Apply videoStreamOptions
AVDictionary* options = nullptr;
if (videoStreamOptions.crf.has_value()) {
validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
av_dict_set(
&options,
"crf",
Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/_core/StreamOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ struct VideoStreamOptions {
// Encoding options
// TODO-VideoEncoder: Consider adding other optional fields here
// (bit rate, gop size, max b frames, preset)
std::optional<int> crf;
std::optional<double> crf;

// Optional pixel format for video encoding (e.g., "yuv420p", "yuv444p")
// If not specified, uses codec's default format.
Expand Down
12 changes: 6 additions & 6 deletions src/torchcodec/_core/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
m.def(
"_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
m.def(
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, int? crf=None) -> ()");
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None) -> ()");
m.def(
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, int? crf=None) -> Tensor");
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None) -> Tensor");
m.def(
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, int? crf=None) -> ()");
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None) -> ()");
m.def(
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
m.def(
Expand Down Expand Up @@ -604,7 +604,7 @@ void encode_video_to_file(
int64_t frame_rate,
std::string_view file_name,
std::optional<std::string> pixel_format = std::nullopt,
std::optional<int64_t> crf = std::nullopt) {
std::optional<double> crf = std::nullopt) {
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
Expand All @@ -621,7 +621,7 @@ at::Tensor encode_video_to_tensor(
int64_t frame_rate,
std::string_view format,
std::optional<std::string> pixel_format = std::nullopt,
std::optional<int64_t> crf = std::nullopt) {
std::optional<double> crf = std::nullopt) {
auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
Expand All @@ -641,7 +641,7 @@ void _encode_video_to_file_like(
std::string_view format,
int64_t file_like_context,
std::optional<std::string> pixel_format = std::nullopt,
std::optional<int64_t> crf = std::nullopt) {
std::optional<double> crf = std::nullopt) {
auto fileLikeContext =
reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
TORCH_CHECK(
Expand Down
8 changes: 4 additions & 4 deletions src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def encode_video_to_file_like(
frame_rate: int,
format: str,
file_like: Union[io.RawIOBase, io.BufferedIOBase],
crf: Optional[int] = None,
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
) -> None:
"""Encode video frames to a file-like object.
Expand Down Expand Up @@ -322,7 +322,7 @@ def encode_video_to_file_abstract(
frames: torch.Tensor,
frame_rate: int,
filename: str,
crf: Optional[int] = None,
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
) -> None:
return
Expand All @@ -333,7 +333,7 @@ def encode_video_to_tensor_abstract(
frames: torch.Tensor,
frame_rate: int,
format: str,
crf: Optional[int] = None,
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
) -> torch.Tensor:
return torch.empty([], dtype=torch.long)
Expand All @@ -345,7 +345,7 @@ def _encode_video_to_file_like_abstract(
frame_rate: int,
format: str,
file_like_context: int,
crf: Optional[int] = None,
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
) -> None:
return
Expand Down
19 changes: 17 additions & 2 deletions src/torchcodec/encoders/_video_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def to_file(
dest: Union[str, Path],
*,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
) -> None:
"""Encode frames into a file.

Expand All @@ -46,27 +47,35 @@ def to_file(
container format.
pixel_format (str, optional): The pixel format for encoding (e.g.,
"yuv420p", "yuv444p"). If not specified, uses codec's default format.
crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
mean better quality. Valid range depends on the encoder (commonly 0-51).
Defaults to None (which will use encoder's default).
"""
_core.encode_video_to_file(
frames=self._frames,
frame_rate=self._frame_rate,
filename=str(dest),
pixel_format=pixel_format,
crf=crf,
)

def to_tensor(
self,
format: str,
*,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
) -> Tensor:
"""Encode frames into raw bytes, as a 1D uint8 Tensor.

Args:
format (str): The container format of the encoded frames, e.g. "mp4", "mov",
"mkv", "avi", "webm", "flv", or "gif"
"mkv", "avi", "webm", "flv", etc.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q - why remove "gif"? Do we not support it anymore?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not test explicitly for it anymore, but it still works. I mostly wanted to amend the docstring to make it seem less like a finalized, exhaustive list of supported formats.

pixel_format (str, optional): The pixel format to encode frames into (e.g.,
"yuv420p", "yuv444p"). If not specified, uses codec's default format.
crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
mean better quality. Valid range depends on the encoder (commonly 0-51).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it ever valid to be less than 0?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe -1 is valid and is equivalent to leaving crf unset. Otherwise, no negative values are valid.

Defaults to None (which will use encoder's default).

Returns:
Tensor: The raw encoded bytes as 4D uint8 Tensor.
Expand All @@ -76,6 +85,7 @@ def to_tensor(
frame_rate=self._frame_rate,
format=format,
pixel_format=pixel_format,
crf=crf,
)

def to_file_like(
Expand All @@ -84,6 +94,7 @@ def to_file_like(
format: str,
*,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
) -> None:
"""Encode frames into a file-like object.

Expand All @@ -94,14 +105,18 @@ def to_file_like(
``write(data: bytes) -> int`` and ``seek(offset: int, whence:
int = 0) -> int``.
format (str): The container format of the encoded frames, e.g. "mp4", "mov",
"mkv", "avi", "webm", "flv", or "gif".
"mkv", "avi", "webm", "flv", etc.
pixel_format (str, optional): The pixel format for encoding (e.g.,
"yuv420p", "yuv444p"). If not specified, uses codec's default format.
crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
mean better quality. Valid range depends on the encoder (commonly 0-51).
Defaults to None (which will use encoder's default).
"""
_core.encode_video_to_file_like(
frames=self._frames,
frame_rate=self._frame_rate,
format=format,
file_like=file_like,
pixel_format=pixel_format,
crf=crf,
)
Loading
Loading