Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,14 @@ void sortCodecOptions(
}
}
}

void validateFrameRate(int frameRate) {
TORCH_CHECK(
frameRate > 0,
"Invalid frame_rate: ",
frameRate,
". Frame rate must be a positive integer.");
}
} // namespace

VideoEncoder::~VideoEncoder() {
Expand All @@ -667,6 +675,7 @@ VideoEncoder::VideoEncoder(
const VideoStreamOptions& videoStreamOptions)
: frames_(validateFrames(frames)), inFrameRate_(frameRate) {
setFFmpegLogLevel();
validateFrameRate(frameRate);

// Allocate output format context
AVFormatContext* avFormatContext = nullptr;
Expand Down Expand Up @@ -724,6 +733,10 @@ VideoEncoder::VideoEncoder(

void VideoEncoder::initializeEncoder(
const VideoStreamOptions& videoStreamOptions) {
// Set output frame rate and validate
outFrameRate_ = videoStreamOptions.frameRate.value_or(inFrameRate_);
validateFrameRate(outFrameRate_);

const AVCodec* avCodec = nullptr;
// If codec arg is provided, find codec using logic similar to FFmpeg:
// https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
Expand Down Expand Up @@ -787,9 +800,8 @@ void VideoEncoder::initializeEncoder(
avCodecContext_->width = outWidth_;
avCodecContext_->height = outHeight_;
avCodecContext_->pix_fmt = outPixelFormat_;
// TODO-VideoEncoder: Verify that frame_rate and time_base are correct
avCodecContext_->time_base = {1, inFrameRate_};
avCodecContext_->framerate = {inFrameRate_, 1};
avCodecContext_->time_base = {1, outFrameRate_};
avCodecContext_->framerate = {outFrameRate_, 1};

// Set flag for containers that require extradata to be in the codec context
if (avFormatContext_->oformat->flags & AVFMT_GLOBALHEADER) {
Expand Down
1 change: 1 addition & 0 deletions src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ class VideoEncoder {

const torch::Tensor frames_;
int inFrameRate_;
int outFrameRate_ = -1;

int inWidth_ = -1;
int inHeight_ = -1;
Expand Down
1 change: 1 addition & 0 deletions src/torchcodec/_core/StreamOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ struct VideoStreamOptions {
std::optional<double> crf;
std::optional<std::string> preset;
std::optional<std::map<std::string, std::string>> extraOptions;
std::optional<int> frameRate;
};

struct AudioStreamOptions {
Expand Down
23 changes: 15 additions & 8 deletions src/torchcodec/_core/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
m.def(
"_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
m.def(
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()");
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None, int? desired_frame_rate=None) -> ()");
m.def(
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> Tensor");
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None, int? desired_frame_rate=None) -> Tensor");
m.def(
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()");
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None, int? desired_frame_rate=None) -> ()");
m.def(
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
m.def(
Expand Down Expand Up @@ -617,7 +617,8 @@ void encode_video_to_file(
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt,
std::optional<std::vector<std::string>> extra_options = std::nullopt) {
std::optional<std::vector<std::string>> extra_options = std::nullopt,
std::optional<int64_t> desired_frame_rate = std::nullopt) {
VideoStreamOptions videoStreamOptions;
videoStreamOptions.codec = codec;
videoStreamOptions.pixelFormat = pixel_format;
Expand All @@ -628,7 +629,8 @@ void encode_video_to_file(
videoStreamOptions.extraOptions =
unflattenExtraOptions(extra_options.value());
}

videoStreamOptions.frameRate =
validateOptionalInt64ToInt(desired_frame_rate, "desired_frame_rate");
VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -645,7 +647,8 @@ at::Tensor encode_video_to_tensor(
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt,
std::optional<std::vector<std::string>> extra_options = std::nullopt) {
std::optional<std::vector<std::string>> extra_options = std::nullopt,
std::optional<int64_t> desired_frame_rate = std::nullopt) {
auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
VideoStreamOptions videoStreamOptions;
videoStreamOptions.codec = codec;
Expand All @@ -657,7 +660,8 @@ at::Tensor encode_video_to_tensor(
videoStreamOptions.extraOptions =
unflattenExtraOptions(extra_options.value());
}

videoStreamOptions.frameRate =
validateOptionalInt64ToInt(desired_frame_rate, "desired_frame_rate");
return VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -676,7 +680,8 @@ void _encode_video_to_file_like(
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt,
std::optional<std::vector<std::string>> extra_options = std::nullopt) {
std::optional<std::vector<std::string>> extra_options = std::nullopt,
std::optional<int64_t> desired_frame_rate = std::nullopt) {
auto fileLikeContext =
reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
TORCH_CHECK(
Expand All @@ -693,6 +698,8 @@ void _encode_video_to_file_like(
videoStreamOptions.extraOptions =
unflattenExtraOptions(extra_options.value());
}
videoStreamOptions.frameRate =
validateOptionalInt64ToInt(desired_frame_rate, "desired_frame_rate");

VideoEncoder encoder(
frames,
Expand Down
9 changes: 8 additions & 1 deletion src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,19 +218,22 @@ def encode_video_to_file_like(
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
extra_options: Optional[list[str]] = None,
desired_frame_rate: Optional[int] = None,
) -> None:
"""Encode video frames to a file-like object.

Args:
frames: Video frames tensor
frame_rate: Frame rate in frames per second
frame_rate: Frame rate in frames per second (input frame rate)
format: Video format (e.g., "mp4", "mov", "mkv")
file_like: File-like object that supports write() and seek() methods
codec: Optional codec name (e.g., "libx264", "h264")
pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
crf: Optional constant rate factor for encoding quality
preset: Optional encoder preset as string (e.g., "ultrafast", "medium")
extra_options: Optional list of extra options as flattened key-value pairs
desired_frame_rate: Optional desired output frame rate. If not specified,
uses the input frame_rate.
"""
assert _pybind_ops is not None

Expand All @@ -244,6 +247,7 @@ def encode_video_to_file_like(
crf,
preset,
extra_options,
desired_frame_rate,
)


Expand Down Expand Up @@ -336,6 +340,7 @@ def encode_video_to_file_abstract(
preset: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
extra_options: Optional[list[str]] = None,
desired_frame_rate: Optional[int] = None,
) -> None:
return

Expand All @@ -350,6 +355,7 @@ def encode_video_to_tensor_abstract(
preset: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
extra_options: Optional[list[str]] = None,
desired_frame_rate: Optional[int] = None,
) -> torch.Tensor:
return torch.empty([], dtype=torch.long)

Expand All @@ -365,6 +371,7 @@ def _encode_video_to_file_like_abstract(
preset: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
extra_options: Optional[list[str]] = None,
desired_frame_rate: Optional[int] = None,
) -> None:
return

Expand Down
16 changes: 15 additions & 1 deletion src/torchcodec/encoders/_video_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ class VideoEncoder:
tensor of shape ``(N, C, H, W)`` where N is the number of frames,
C is 3 channels (RGB), H is height, and W is width.
Values must be uint8 in the range ``[0, 255]``.
frame_rate (int): The frame rate of the **input** ``frames``. Also defines the encoded **output** frame rate.
frame_rate (int): The frame rate of the **input** ``frames``. The
frame rate of the encoded output can be specified using the
encoding methods (``to_file``, etc.).
"""

def __init__(self, frames: Tensor, *, frame_rate: int):
Expand All @@ -41,6 +43,7 @@ def to_file(
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[Union[str, int]] = None,
frame_rate: Optional[int] = None,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm happy to bikeshed on parameter order. sample_rate is the last param in AudioEncoder, but that constructor is much smaller. Here, having it last might make frame_rate less discoverable?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My views:

  1. Optional parameters should always be keyword-only.
  2. codec should be the first keyword-only parameter, as it's likely to be the most used.
  3. extra_options should be the last keyword-only parameter, as "catch-all" parameters tend to be last.

I don't have a preference for the order of the rest. 🚲 🛖 !

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't realize extra_options wasn't keyword-only, sorry for not catching that earlier. Let's make it keyword-only and keep it as the last parameter (it's probably going to be the least used param). frame_rate can go anywhere between codec and extra_options

) -> None:
"""Encode frames into a file.

Expand All @@ -63,6 +66,8 @@ def to_file(
extra_options (dict[str, Any], optional): A dictionary of additional
encoder options to pass, e.g. ``{"qp": 5, "tune": "film"}``.
Values will be converted to strings before passing to the encoder.
frame_rate (int, optional): The frame rate of the output video. If not specified,
uses the frame rate of the input ``frames``.
"""
preset = str(preset) if isinstance(preset, int) else preset
_core.encode_video_to_file(
Expand All @@ -76,6 +81,7 @@ def to_file(
extra_options=[
str(x) for k, v in (extra_options or {}).items() for x in (k, v)
],
desired_frame_rate=frame_rate,
)

def to_tensor(
Expand All @@ -87,6 +93,7 @@ def to_tensor(
crf: Optional[Union[int, float]] = None,
preset: Optional[Union[str, int]] = None,
extra_options: Optional[Dict[str, Any]] = None,
frame_rate: Optional[int] = None,
) -> Tensor:
"""Encode frames into raw bytes, as a 1D uint8 Tensor.

Expand All @@ -108,6 +115,8 @@ def to_tensor(
extra_options (dict[str, Any], optional): A dictionary of additional
encoder options to pass, e.g. ``{"qp": 5, "tune": "film"}``.
Values will be converted to strings before passing to the encoder.
frame_rate (int, optional): The frame rate of the output video. If not specified,
uses the frame rate of the input ``frames``.

Returns:
Tensor: The raw encoded bytes as 1D uint8 Tensor.
Expand All @@ -124,6 +133,7 @@ def to_tensor(
extra_options=[
str(x) for k, v in (extra_options or {}).items() for x in (k, v)
],
desired_frame_rate=frame_rate,
)

def to_file_like(
Expand All @@ -136,6 +146,7 @@ def to_file_like(
crf: Optional[Union[int, float]] = None,
preset: Optional[Union[str, int]] = None,
extra_options: Optional[Dict[str, Any]] = None,
frame_rate: Optional[int] = None,
) -> None:
"""Encode frames into a file-like object.

Expand All @@ -162,6 +173,8 @@ def to_file_like(
extra_options (dict[str, Any], optional): A dictionary of additional
encoder options to pass, e.g. ``{"qp": 5, "tune": "film"}``.
Values will be converted to strings before passing to the encoder.
frame_rate (int, optional): The frame rate of the output video. If not specified,
uses the frame rate of the input ``frames``.
"""
preset = str(preset) if isinstance(preset, int) else preset
_core.encode_video_to_file_like(
Expand All @@ -176,4 +189,5 @@ def to_file_like(
extra_options=[
str(x) for k, v in (extra_options or {}).items() for x in (k, v)
],
desired_frame_rate=frame_rate,
)
55 changes: 55 additions & 0 deletions test/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,13 @@ def test_bad_input_parameterized(self, tmp_path, method):
):
encoder.to_tensor(format="mp4", preset="fake_preset")

with pytest.raises(RuntimeError, match="Invalid frame_rate: "):
encoder = VideoEncoder(
frames=torch.zeros((5, 3, 64, 64), dtype=torch.uint8),
frame_rate=30,
)
getattr(encoder, method)(**valid_params, frame_rate=0)

@pytest.mark.parametrize("method", ["to_file", "to_tensor", "to_file_like"])
@pytest.mark.parametrize("crf", [23, 23.5, -0.9])
def test_crf_valid_values(self, method, crf, tmp_path):
Expand Down Expand Up @@ -1175,3 +1182,51 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
assert metadata["profile"].lower() == expected_profile
assert metadata["color_space"] == colorspace
assert metadata["color_range"] == color_range

@pytest.mark.skipif(
in_fbcode(),
reason="ffprobe not available internally",
)
@pytest.mark.parametrize("method", ["to_file", "to_tensor", "to_file_like"])
@pytest.mark.parametrize("output_frame_rate", [10, 60, None])
def test_frame_rate_parameter(self, tmp_path, method, output_frame_rate):

frames = (
VideoDecoder(TEST_SRC_2_720P.path)
.get_frames_in_range(start=0, stop=60)
.data
)
input_frame_rate = 30
encoder = VideoEncoder(frames=frames, frame_rate=input_frame_rate)

if method == "to_file":
output_path = str(tmp_path / "output.mp4")
encoder.to_file(dest=output_path, frame_rate=output_frame_rate)
elif method == "to_tensor":
encoded_tensor = encoder.to_tensor(
format="mp4", frame_rate=output_frame_rate
)
# Write tensor to file to check with ffprobe
output_path = str(tmp_path / "output_from_tensor.mp4")
with open(output_path, "wb") as f:
f.write(encoded_tensor.numpy().tobytes())
elif method == "to_file_like":
file_like = io.BytesIO()
encoder.to_file_like(
file_like=file_like, format="mp4", frame_rate=output_frame_rate
)
# Write file_like to file to check with ffprobe
output_path = str(tmp_path / "output_from_file_like.mp4")
with open(output_path, "wb") as f:
f.write(file_like.getvalue())
else:
raise ValueError(f"Unknown method: {method}")

metadata = self._get_video_metadata(output_path, ["r_frame_rate"])
# Frame rate is returned as a fraction like "60/1"
numerator, denominator = metadata["r_frame_rate"].split("/")
actual_frame_rate = int(numerator) / int(denominator)
# Ensure frame_rate=None uses the input_frame_rate
if output_frame_rate is None:
output_frame_rate = input_frame_rate
assert actual_frame_rate == output_frame_rate
Loading