diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp index ad44e28b7..90e326087 100644 --- a/src/torchcodec/_core/Encoder.cpp +++ b/src/torchcodec/_core/Encoder.cpp @@ -745,6 +745,10 @@ void VideoEncoder::initializeEncoder( std::to_string(videoStreamOptions.crf.value()).c_str(), 0); } + if (videoStreamOptions.preset.has_value()) { + av_dict_set( + &options, "preset", videoStreamOptions.preset.value().c_str(), 0); + } int status = avcodec_open2(avCodecContext_.get(), avCodec, &options); av_dict_free(&options); diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h index c0de64f2e..01af6846c 100644 --- a/src/torchcodec/_core/StreamOptions.h +++ b/src/torchcodec/_core/StreamOptions.h @@ -45,13 +45,11 @@ struct VideoStreamOptions { std::string_view deviceVariant = "ffmpeg"; // Encoding options - // TODO-VideoEncoder: Consider adding other optional fields here - // (bit rate, gop size, max b frames, preset) - std::optional crf; - // Optional pixel format for video encoding (e.g., "yuv420p", "yuv444p") // If not specified, uses codec's default format. std::optional pixelFormat; + std::optional crf; + std::optional preset; }; struct AudioStreamOptions { diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 1ebd136c7..2ef5d0f49 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) { m.def( "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()"); m.def( - "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None) -> ()"); + "encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None) -> ()"); m.def( - "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None) -> Tensor"); + "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None) -> Tensor"); m.def( - "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None) -> ()"); + "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None) -> ()"); m.def( "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor"); m.def( @@ -603,11 +603,13 @@ void encode_video_to_file( const at::Tensor& frames, int64_t frame_rate, std::string_view file_name, - std::optional pixel_format = std::nullopt, - std::optional crf = std::nullopt) { + std::optional pixel_format = std::nullopt, + std::optional crf = std::nullopt, + std::optional preset = std::nullopt) { VideoStreamOptions videoStreamOptions; videoStreamOptions.pixelFormat = pixel_format; videoStreamOptions.crf = crf; + videoStreamOptions.preset = preset; VideoEncoder( frames, validateInt64ToInt(frame_rate, "frame_rate"), @@ -620,12 +622,14 @@ at::Tensor encode_video_to_tensor( const at::Tensor& frames, int64_t frame_rate, std::string_view format, - std::optional pixel_format = std::nullopt, - std::optional crf = std::nullopt) { + std::optional pixel_format = std::nullopt, + std::optional crf = std::nullopt, + std::optional preset = std::nullopt) { auto avioContextHolder = std::make_unique(); VideoStreamOptions videoStreamOptions; videoStreamOptions.pixelFormat = pixel_format; videoStreamOptions.crf = crf; + videoStreamOptions.preset = preset; return VideoEncoder( frames, validateInt64ToInt(frame_rate, "frame_rate"), @@ -640,8 +644,9 @@ void _encode_video_to_file_like( int64_t frame_rate, std::string_view format, int64_t file_like_context, - std::optional pixel_format = std::nullopt, - std::optional crf = std::nullopt) { + std::optional pixel_format = std::nullopt, + std::optional crf = std::nullopt, + std::optional preset = std::nullopt) { auto fileLikeContext = reinterpret_cast(file_like_context); TORCH_CHECK( @@ -651,6 +656,7 @@ void _encode_video_to_file_like( VideoStreamOptions videoStreamOptions; videoStreamOptions.pixelFormat = pixel_format; videoStreamOptions.crf = crf; + videoStreamOptions.preset = preset; VideoEncoder encoder( frames, diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py index ebad670d1..a2f1fa0a3 100644 --- a/src/torchcodec/_core/ops.py +++ b/src/torchcodec/_core/ops.py @@ -215,6 +215,7 @@ def encode_video_to_file_like( file_like: Union[io.RawIOBase, io.BufferedIOBase], crf: Optional[Union[int, float]] = None, pixel_format: Optional[str] = None, + preset: Optional[str] = None, ) -> None: """Encode video frames to a file-like object. @@ -225,6 +226,7 @@ def encode_video_to_file_like( file_like: File-like object that supports write() and seek() methods crf: Optional constant rate factor for encoding quality pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p") + preset: Optional encoder preset as string (e.g., "ultrafast", "medium") """ assert _pybind_ops is not None @@ -235,6 +237,7 @@ def encode_video_to_file_like( _pybind_ops.create_file_like_context(file_like, True), # True means for writing pixel_format, crf, + preset, ) @@ -322,8 +325,9 @@ def encode_video_to_file_abstract( frames: torch.Tensor, frame_rate: int, filename: str, - crf: Optional[Union[int, float]] = None, pixel_format: Optional[str] = None, + crf: Optional[Union[int, float]] = None, + preset: Optional[str] = None, ) -> None: return @@ -333,8 +337,9 @@ def encode_video_to_tensor_abstract( frames: torch.Tensor, frame_rate: int, format: str, - crf: Optional[Union[int, float]] = None, pixel_format: Optional[str] = None, + crf: Optional[Union[int, float]] = None, + preset: Optional[str] = None, ) -> torch.Tensor: return torch.empty([], dtype=torch.long) @@ -345,8 +350,9 @@ def _encode_video_to_file_like_abstract( frame_rate: int, format: str, file_like_context: int, - crf: Optional[Union[int, float]] = None, pixel_format: Optional[str] = None, + crf: Optional[Union[int, float]] = None, + preset: Optional[str] = None, ) -> None: return diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py index 318aac76e..d812d4a11 100644 --- a/src/torchcodec/encoders/_video_encoder.py +++ b/src/torchcodec/encoders/_video_encoder.py @@ -38,6 +38,7 @@ def to_file( *, pixel_format: Optional[str] = None, crf: Optional[Union[int, float]] = None, + preset: Optional[Union[str, int]] = None, ) -> None: """Encode frames into a file. @@ -50,13 +51,19 @@ def to_file( crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values mean better quality. Valid range depends on the encoder (commonly 0-51). Defaults to None (which will use encoder's default). + preset (str or int, optional): Encoder option that controls the tradeoff between + encoding speed and compression. Valid values depend on the encoder (commonly + a string: "fast", "medium", "slow"). Defaults to None + (which will use encoder's default). """ + preset = str(preset) if isinstance(preset, int) else preset _core.encode_video_to_file( frames=self._frames, frame_rate=self._frame_rate, filename=str(dest), pixel_format=pixel_format, crf=crf, + preset=preset, ) def to_tensor( @@ -65,6 +72,7 @@ def to_tensor( *, pixel_format: Optional[str] = None, crf: Optional[Union[int, float]] = None, + preset: Optional[Union[str, int]] = None, ) -> Tensor: """Encode frames into raw bytes, as a 1D uint8 Tensor. @@ -76,16 +84,22 @@ def to_tensor( crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values mean better quality. Valid range depends on the encoder (commonly 0-51). Defaults to None (which will use encoder's default). + preset (str or int, optional): Encoder option that controls the tradeoff between + encoding speed and compression. Valid values depend on the encoder (commonly + a string: "fast", "medium", "slow"). Defaults to None + (which will use encoder's default). Returns: Tensor: The raw encoded bytes as 4D uint8 Tensor. """ + preset_value = str(preset) if isinstance(preset, int) else preset return _core.encode_video_to_tensor( frames=self._frames, frame_rate=self._frame_rate, format=format, pixel_format=pixel_format, crf=crf, + preset=preset_value, ) def to_file_like( @@ -95,6 +109,7 @@ def to_file_like( *, pixel_format: Optional[str] = None, crf: Optional[Union[int, float]] = None, + preset: Optional[Union[str, int]] = None, ) -> None: """Encode frames into a file-like object. @@ -111,7 +126,12 @@ def to_file_like( crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values mean better quality. Valid range depends on the encoder (commonly 0-51). Defaults to None (which will use encoder's default). + preset (str or int, optional): Encoder option that controls the tradeoff between + encoding speed and compression. Valid values depend on the encoder (commonly + a string: "fast", "medium", "slow"). Defaults to None + (which will use encoder's default). """ + preset = str(preset) if isinstance(preset, int) else preset _core.encode_video_to_file_like( frames=self._frames, frame_rate=self._frame_rate, @@ -119,4 +139,5 @@ def to_file_like( file_like=file_like, pixel_format=pixel_format, crf=crf, + preset=preset, ) diff --git a/test/test_encoders.py b/test/test_encoders.py index 0a360ccf9..4c737967f 100644 --- a/test/test_encoders.py +++ b/test/test_encoders.py @@ -617,6 +617,12 @@ def test_bad_input_parameterized(self, tmp_path, method): ) getattr(encoder, method)(**valid_params, crf=-10) + with pytest.raises( + RuntimeError, + match=r"avcodec_open2 failed: Invalid argument", + ): + encoder.to_tensor(format="mp4", preset="fake_preset") + @pytest.mark.parametrize("method", ["to_file", "to_tensor", "to_file_like"]) @pytest.mark.parametrize("crf", [23, 23.5, -0.9]) def test_crf_valid_values(self, method, crf, tmp_path): @@ -826,13 +832,26 @@ def test_against_to_file(self, tmp_path, format, method): pytest.param("webm", marks=pytest.mark.slow), ), ) - @pytest.mark.parametrize("pixel_format", ("yuv444p", "yuv420p")) - def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, pixel_format): + @pytest.mark.parametrize( + "encode_params", + [ + {"pixel_format": "yuv444p", "crf": 0, "preset": None}, + {"pixel_format": "yuv420p", "crf": 30, "preset": None}, + {"pixel_format": "yuv420p", "crf": None, "preset": "ultrafast"}, + {"pixel_format": "yuv420p", "crf": None, "preset": None}, + ], + ) + def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, encode_params): ffmpeg_version = get_ffmpeg_major_version() if format == "webm" and ( ffmpeg_version == 4 or (IS_WINDOWS and ffmpeg_version in (6, 7)) ): pytest.skip("Codec for webm is not available in this FFmpeg installation.") + + pixel_format = encode_params["pixel_format"] + crf = encode_params["crf"] + preset = encode_params["preset"] + if format in ("avi", "flv") and pixel_format == "yuv444p": pytest.skip(f"Default codec for {format} does not support {pixel_format}") @@ -845,8 +864,7 @@ def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, pixel_format): ffmpeg_encoded_path = str(tmp_path / f"ffmpeg_output.{format}") frame_rate = 30 - crf = 0 - # Some codecs (ex. MPEG4) do not support CRF. + # Some codecs (ex. MPEG4) do not support CRF or preset. # Flags not supported by the selected codec will be ignored. ffmpeg_cmd = [ "ffmpeg", @@ -861,18 +879,26 @@ def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format, pixel_format): str(frame_rate), "-i", temp_raw_path, - "-pix_fmt", - pixel_format, # Output format - "-crf", - str(crf), - ffmpeg_encoded_path, ] + if pixel_format is not None: # Output format + ffmpeg_cmd.extend(["-pix_fmt", pixel_format]) + if preset is not None: + ffmpeg_cmd.extend(["-preset", preset]) + if crf is not None: + ffmpeg_cmd.extend(["-crf", str(crf)]) + # Output path must be last + ffmpeg_cmd.append(ffmpeg_encoded_path) subprocess.run(ffmpeg_cmd, check=True) # Encode with our video encoder encoder_output_path = str(tmp_path / f"encoder_output.{format}") encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate) - encoder.to_file(dest=encoder_output_path, pixel_format=pixel_format, crf=crf) + encoder.to_file( + dest=encoder_output_path, + pixel_format=pixel_format, + crf=crf, + preset=preset, + ) ffmpeg_frames = self.decode(ffmpeg_encoded_path).data encoder_frames = self.decode(encoder_output_path).data