Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,10 @@ void VideoEncoder::initializeEncoder(
std::to_string(videoStreamOptions.crf.value()).c_str(),
0);
}
if (videoStreamOptions.preset.has_value()) {
av_dict_set(
&options, "preset", videoStreamOptions.preset.value().c_str(), 0);
}
int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
av_dict_free(&options);

Expand Down
6 changes: 2 additions & 4 deletions src/torchcodec/_core/StreamOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,11 @@ struct VideoStreamOptions {
std::string_view deviceVariant = "ffmpeg";

// Encoding options
// TODO-VideoEncoder: Consider adding other optional fields here
// (bit rate, gop size, max b frames, preset)
std::optional<double> crf;

// Optional pixel format for video encoding (e.g., "yuv420p", "yuv444p")
// If not specified, uses codec's default format.
std::optional<std::string> pixelFormat;
std::optional<double> crf;
std::optional<std::string> preset;
};

struct AudioStreamOptions {
Expand Down
24 changes: 15 additions & 9 deletions src/torchcodec/_core/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
m.def(
"_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
m.def(
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None) -> ()");
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
m.def(
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None) -> Tensor");
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None) -> Tensor");
m.def(
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None) -> ()");
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
m.def(
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
m.def(
Expand Down Expand Up @@ -603,11 +603,13 @@ void encode_video_to_file(
const at::Tensor& frames,
int64_t frame_rate,
std::string_view file_name,
std::optional<std::string> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt) {
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;
VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -620,12 +622,14 @@ at::Tensor encode_video_to_tensor(
const at::Tensor& frames,
int64_t frame_rate,
std::string_view format,
std::optional<std::string> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt) {
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;
return VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -640,8 +644,9 @@ void _encode_video_to_file_like(
int64_t frame_rate,
std::string_view format,
int64_t file_like_context,
std::optional<std::string> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt) {
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
auto fileLikeContext =
reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
TORCH_CHECK(
Expand All @@ -651,6 +656,7 @@ void _encode_video_to_file_like(
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;

VideoEncoder encoder(
frames,
Expand Down
12 changes: 9 additions & 3 deletions src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ def encode_video_to_file_like(
file_like: Union[io.RawIOBase, io.BufferedIOBase],
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
preset: Optional[str] = None,
) -> None:
"""Encode video frames to a file-like object.

Expand All @@ -225,6 +226,7 @@ def encode_video_to_file_like(
file_like: File-like object that supports write() and seek() methods
crf: Optional constant rate factor for encoding quality
pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
preset: Optional encoder preset as string (e.g., "ultrafast", "medium")
"""
assert _pybind_ops is not None

Expand All @@ -235,6 +237,7 @@ def encode_video_to_file_like(
_pybind_ops.create_file_like_context(file_like, True), # True means for writing
pixel_format,
crf,
preset,
)


Expand Down Expand Up @@ -322,8 +325,9 @@ def encode_video_to_file_abstract(
frames: torch.Tensor,
frame_rate: int,
filename: str,
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
) -> None:
return

Expand All @@ -333,8 +337,9 @@ def encode_video_to_tensor_abstract(
frames: torch.Tensor,
frame_rate: int,
format: str,
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
) -> torch.Tensor:
return torch.empty([], dtype=torch.long)

Expand All @@ -345,8 +350,9 @@ def _encode_video_to_file_like_abstract(
frame_rate: int,
format: str,
file_like_context: int,
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
) -> None:
return

Expand Down
21 changes: 21 additions & 0 deletions src/torchcodec/encoders/_video_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def to_file(
*,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[Union[str, int]] = None,
) -> None:
"""Encode frames into a file.

Expand All @@ -50,13 +51,19 @@ def to_file(
crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
mean better quality. Valid range depends on the encoder (commonly 0-51).
Defaults to None (which will use encoder's default).
preset (str or int, optional): Encoder option that controls the tradeoff between
encoding speed and compression. Valid values depend on the encoder (commonly
a string: "fast", "medium", "slow"). Defaults to None
(which will use encoder's default).
"""
preset = str(preset) if isinstance(preset, int) else preset
_core.encode_video_to_file(
frames=self._frames,
frame_rate=self._frame_rate,
filename=str(dest),
pixel_format=pixel_format,
crf=crf,
preset=preset,
)

def to_tensor(
Expand All @@ -65,6 +72,7 @@ def to_tensor(
*,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[Union[str, int]] = None,
) -> Tensor:
"""Encode frames into raw bytes, as a 1D uint8 Tensor.

Expand All @@ -76,16 +84,22 @@ def to_tensor(
crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
mean better quality. Valid range depends on the encoder (commonly 0-51).
Defaults to None (which will use encoder's default).
preset (str or int, optional): Encoder option that controls the tradeoff between
encoding speed and compression. Valid values depend on the encoder (commonly
a string: "fast", "medium", "slow"). Defaults to None
(which will use encoder's default).

Returns:
Tensor: The raw encoded bytes as a 1D uint8 Tensor.
"""
preset_value = str(preset) if isinstance(preset, int) else preset
return _core.encode_video_to_tensor(
frames=self._frames,
frame_rate=self._frame_rate,
format=format,
pixel_format=pixel_format,
crf=crf,
preset=preset_value,
)

def to_file_like(
Expand All @@ -95,6 +109,7 @@ def to_file_like(
*,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[Union[str, int]] = None,
) -> None:
"""Encode frames into a file-like object.

Expand All @@ -111,12 +126,18 @@ def to_file_like(
crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
mean better quality. Valid range depends on the encoder (commonly 0-51).
Defaults to None (which will use encoder's default).
preset (str or int, optional): Encoder option that controls the tradeoff between
encoding speed and compression. Valid values depend on the encoder (commonly
a string: "fast", "medium", "slow"). Defaults to None
(which will use encoder's default).
"""
preset = str(preset) if isinstance(preset, int) else preset
_core.encode_video_to_file_like(
frames=self._frames,
frame_rate=self._frame_rate,
format=format,
file_like=file_like,
pixel_format=pixel_format,
crf=crf,
preset=preset,
)
46 changes: 36 additions & 10 deletions test/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,12 @@ def test_bad_input_parameterized(self, tmp_path, method):
)
getattr(encoder, method)(**valid_params, crf=-10)

with pytest.raises(
RuntimeError,
match=r"avcodec_open2 failed: Invalid argument",
):
encoder.to_tensor(format="mp4", preset="fake_preset")

@pytest.mark.parametrize("method", ["to_file", "to_tensor", "to_file_like"])
@pytest.mark.parametrize("crf", [23, 23.5, -0.9])
def test_crf_valid_values(self, method, crf, tmp_path):
Expand Down Expand Up @@ -826,13 +832,26 @@ def test_against_to_file(self, tmp_path, format, method):
pytest.param("webm", marks=pytest.mark.slow),
),
)
@pytest.mark.parametrize("pixel_format", ("yuv444p", "yuv420p"))
def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, pixel_format):
@pytest.mark.parametrize(
"encode_params",
[
{"pixel_format": "yuv444p", "crf": 0, "preset": None},
{"pixel_format": "yuv420p", "crf": 30, "preset": None},
{"pixel_format": "yuv420p", "crf": None, "preset": "ultrafast"},
{"pixel_format": "yuv420p", "crf": None, "preset": None},
],
)
def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, encode_params):
ffmpeg_version = get_ffmpeg_major_version()
if format == "webm" and (
ffmpeg_version == 4 or (IS_WINDOWS and ffmpeg_version in (6, 7))
):
pytest.skip("Codec for webm is not available in this FFmpeg installation.")

pixel_format = encode_params["pixel_format"]
crf = encode_params["crf"]
preset = encode_params["preset"]

if format in ("avi", "flv") and pixel_format == "yuv444p":
pytest.skip(f"Default codec for {format} does not support {pixel_format}")

Expand All @@ -845,8 +864,7 @@ def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, pixel_format):

ffmpeg_encoded_path = str(tmp_path / f"ffmpeg_output.{format}")
frame_rate = 30
crf = 0
# Some codecs (ex. MPEG4) do not support CRF.
# Some codecs (e.g., MPEG4) do not support CRF or preset.
# Flags not supported by the selected codec will be ignored.
ffmpeg_cmd = [
"ffmpeg",
Expand All @@ -861,18 +879,26 @@ def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, pixel_format):
str(frame_rate),
"-i",
temp_raw_path,
"-pix_fmt",
pixel_format, # Output format
"-crf",
str(crf),
ffmpeg_encoded_path,
]
if pixel_format is not None: # Output format
ffmpeg_cmd.extend(["-pix_fmt", pixel_format])
if preset is not None:
ffmpeg_cmd.extend(["-preset", preset])
if crf is not None:
ffmpeg_cmd.extend(["-crf", str(crf)])
# Output path must be last
ffmpeg_cmd.append(ffmpeg_encoded_path)
subprocess.run(ffmpeg_cmd, check=True)

# Encode with our video encoder
encoder_output_path = str(tmp_path / f"encoder_output.{format}")
encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate)
encoder.to_file(dest=encoder_output_path, pixel_format=pixel_format, crf=crf)
encoder.to_file(
dest=encoder_output_path,
pixel_format=pixel_format,
crf=crf,
preset=preset,
)

ffmpeg_frames = self.decode(ffmpeg_encoded_path).data
encoder_frames = self.decode(encoder_output_path).data
Expand Down
Loading