Merged

26 commits
a8a8cea
Committing to move on
scotts Nov 17, 2025
af2e1ab
Merge branch 'main' of github.com:pytorch/torchcodec into random_crop
scotts Nov 19, 2025
aa15765
It... works?
scotts Nov 21, 2025
fd8f7a5
Lint
scotts Nov 21, 2025
7e43313
Docstrings, better error checking, better testing
scotts Nov 21, 2025
8e6a8f2
Way more defensive programming
scotts Nov 22, 2025
d8b7ed0
Refactor all the things
scotts Dec 1, 2025
705d1ef
Comment formatting pedantry
scotts Dec 1, 2025
62eb585
Type checking
scotts Dec 1, 2025
817b1f8
Merge branch 'main' of github.com:pytorch/torchcodec into random_crop
scotts Dec 1, 2025
a92d5f0
Refactor DecoderTransforms to normal classes
scotts Dec 1, 2025
f8844f4
Simplify; handle pipelines
scotts Dec 3, 2025
2fed9c3
Moare refactor pleasze
scotts Dec 3, 2025
e736742
Better doc strings
scotts Dec 3, 2025
6432727
Comment
scotts Dec 3, 2025
be8ed26
Transformer -> Transform
scotts Dec 3, 2025
5017760
Merge branch 'random_crop' into refactor_decoder_transforms
scotts Dec 3, 2025
a9664c5
Merge branch 'main' of github.com:pytorch/torchcodec into refactor_de…
scotts Dec 3, 2025
2d012f7
Double define
scotts Dec 3, 2025
4907cde
Implement CenterCrop
scotts Dec 4, 2025
7754ae7
Add test
scotts Dec 4, 2025
47dbc5b
Comment
scotts Dec 4, 2025
e889c59
Merge branch 'main' of github.com:pytorch/torchcodec into center_crop
scotts Dec 4, 2025
f04bbc1
Better comments, docs
scotts Dec 4, 2025
4bccf4e
Refactor location of _make_transform_specs
scotts Dec 4, 2025
5e9a48b
Merge branch 'main' of github.com:pytorch/torchcodec into refactor_ma…
scotts Dec 4, 2025
101 changes: 2 additions & 99 deletions src/torchcodec/decoders/_video_decoder.py
@@ -19,7 +19,8 @@
create_decoder,
ERROR_REPORTING_INSTRUCTIONS,
)
from torchcodec.transforms import CenterCrop, DecoderTransform, RandomCrop, Resize
from torchcodec.transforms import DecoderTransform
from torchcodec.transforms._decoder_transforms import _make_transform_specs


class VideoDecoder:
@@ -451,104 +452,6 @@ def _get_and_validate_stream_metadata(
)


def _make_transform_specs(
transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
input_dims: Tuple[Optional[int], Optional[int]],
) -> str:
"""Given a sequence of transforms, turn those into the specification string
the core API expects.

Args:
transforms: Optional sequence of transform objects. The objects can be
one of two types:
1. torchcodec.transforms.DecoderTransform
2. torchvision.transforms.v2.Transform, but our type annotation
only mentions its base, nn.Module. We don't want to take a
hard dependency on TorchVision.
input_dims: Optional (height, width) pair. Note that only some
transforms need to know the dimensions. If the user provides
transforms that don't need to know the dimensions, and that metadata
is missing, everything should still work. That means we assert their
existence as late as possible.

Returns:
String of transforms in the format the core API expects: transform
specifications separated by semicolons.
"""
if transforms is None:
return ""

try:
from torchvision.transforms import v2

tv_available = True
except ImportError:
tv_available = False

# The following loop accomplishes two tasks:
#
# 1. Converts the transform to a DecoderTransform, if necessary. We
# accept TorchVision transform objects and they must be converted
# to their matching DecoderTransform.
# 2. Calculates what the input dimensions are to each transform.
#
# The order in our transforms list is semantically meaningful, as we
# actually have a pipeline where the output of one transform is the input to
# the next. For example, if we have the transforms list [A, B, C, D], then
# we should understand that as:
#
# A -> B -> C -> D
#
# Where the frame produced by A is the input to B, the frame produced by B
# is the input to C, etc. This particularly matters for frame dimensions.
# Transforms can both:
#
# 1. Produce frames with arbitrary dimensions.
# 2. Rely on their input frame's dimensions to calculate ahead-of-time
# what their runtime behavior will be.
#
# The consequence of the above facts is that we need to statically track
# frame dimensions in the pipeline while we pre-process it. The input
# frame's dimensions to A, our first transform, is always what we know from
# our metadata. For each transform, we always calculate its output
# dimensions from its input dimensions. We store these with the converted
# transform, to be all used together when we generate the specs.
converted_transforms: list[
Tuple[
DecoderTransform,
# A (height, width) pair where the values may be missing.
Tuple[Optional[int], Optional[int]],
]
] = []
curr_input_dims = input_dims
for transform in transforms:
if not isinstance(transform, DecoderTransform):
if not tv_available:
raise ValueError(
f"The supplied transform, {transform}, is not a TorchCodec "
" DecoderTransform. TorchCodec also accepts TorchVision "
"v2 transforms, but TorchVision is not installed."
)
elif isinstance(transform, v2.Resize):
transform = Resize._from_torchvision(transform)
elif isinstance(transform, v2.CenterCrop):
transform = CenterCrop._from_torchvision(transform)
elif isinstance(transform, v2.RandomCrop):
transform = RandomCrop._from_torchvision(transform)
else:
raise ValueError(
f"Unsupported transform: {transform}. Transforms must be "
"either a TorchCodec DecoderTransform or a TorchVision "
"v2 transform."
)

converted_transforms.append((transform, curr_input_dims))
output_dims = transform._get_output_dims()
curr_input_dims = output_dims if output_dims is not None else curr_input_dims

return ";".join([t._make_transform_spec(dims) for t, dims in converted_transforms])


def _read_custom_frame_mappings(
custom_frame_mappings: Union[str, bytes, io.RawIOBase, io.BufferedReader]
) -> tuple[Tensor, Tensor, Tensor]:
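The net effect of this first file's diff is that the decoder module no longer imports the concrete transform classes (Resize, CenterCrop, RandomCrop); it keeps only DecoderTransform for type annotations and pulls in the relocated _make_transform_specs helper. A minimal sketch of the resulting call pattern on the decoder side is below; only the helper's signature and import path come from this diff, while the Resize constructor arguments and the idea that the (height, width) pair comes from stream metadata are assumptions.

# Sketch only: assumes torchcodec is installed; constructor arguments and the
# source of the (height, width) pair are assumptions, not lines from this PR.
from torchcodec.transforms import Resize
from torchcodec.transforms._decoder_transforms import _make_transform_specs

user_transforms = [Resize(size=(224, 224))]   # supplied by the caller
input_dims = (1080, 1920)                     # (height, width), e.g. from stream metadata
transform_specs = _make_transform_specs(user_transforms, input_dims)
# transform_specs is the semicolon-separated string handed to the core API.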
100 changes: 99 additions & 1 deletion src/torchcodec/transforms/_decoder_transforms.py
@@ -6,7 +6,7 @@

from abc import ABC, abstractmethod
from types import ModuleType
from typing import Optional, Sequence, Tuple
from typing import Optional, Sequence, Tuple, Union

import torch
from torch import nn
@@ -282,3 +282,101 @@ def _from_torchvision(
)

return cls(size=tv_random_crop.size)


def _make_transform_specs(
transforms: Optional[Sequence[Union[DecoderTransform, nn.Module]]],
input_dims: Tuple[Optional[int], Optional[int]],
) -> str:
"""Given a sequence of transforms, turn those into the specification string
the core API expects.

Args:
transforms: Optional sequence of transform objects. The objects can be
one of two types:
1. torchcodec.transforms.DecoderTransform
2. torchvision.transforms.v2.Transform, but our type annotation
only mentions its base, nn.Module. We don't want to take a
hard dependency on TorchVision.
input_dims: Optional (height, width) pair. Note that only some
transforms need to know the dimensions. If the user provides
transforms that don't need to know the dimensions, and that metadata
is missing, everything should still work. That means we assert their
existence as late as possible.

Returns:
String of transforms in the format the core API expects: transform
specifications separated by semicolons.
"""
if transforms is None:
return ""

try:
from torchvision.transforms import v2

tv_available = True
except ImportError:
tv_available = False

# The following loop accomplishes two tasks:
#
# 1. Converts the transform to a DecoderTransform, if necessary. We
# accept TorchVision transform objects and they must be converted
# to their matching DecoderTransform.
# 2. Calculates what the input dimensions are to each transform.
#
# The order in our transforms list is semantically meaningful, as we
# actually have a pipeline where the output of one transform is the input to
# the next. For example, if we have the transforms list [A, B, C, D], then
# we should understand that as:
#
# A -> B -> C -> D
#
# Where the frame produced by A is the input to B, the frame produced by B
# is the input to C, etc. This particularly matters for frame dimensions.
# Transforms can both:
#
# 1. Produce frames with arbitrary dimensions.
# 2. Rely on their input frame's dimensions to calculate ahead-of-time
# what their runtime behavior will be.
#
# The consequence of the above facts is that we need to statically track
# frame dimensions in the pipeline while we pre-process it. The input
# frame's dimensions to A, our first transform, is always what we know from
# our metadata. For each transform, we always calculate its output
# dimensions from its input dimensions. We store these with the converted
# transform, to be all used together when we generate the specs.
converted_transforms: list[
Tuple[
DecoderTransform,
# A (height, width) pair where the values may be missing.
Tuple[Optional[int], Optional[int]],
]
] = []
curr_input_dims = input_dims
for transform in transforms:
if not isinstance(transform, DecoderTransform):
if not tv_available:
raise ValueError(
f"The supplied transform, {transform}, is not a TorchCodec "
" DecoderTransform. TorchCodec also accepts TorchVision "
"v2 transforms, but TorchVision is not installed."
)
elif isinstance(transform, v2.Resize):
transform = Resize._from_torchvision(transform)
elif isinstance(transform, v2.CenterCrop):
transform = CenterCrop._from_torchvision(transform)
elif isinstance(transform, v2.RandomCrop):
transform = RandomCrop._from_torchvision(transform)
else:
raise ValueError(
f"Unsupported transform: {transform}. Transforms must be "
"either a TorchCodec DecoderTransform or a TorchVision "
"v2 transform."
)

converted_transforms.append((transform, curr_input_dims))
output_dims = transform._get_output_dims()
curr_input_dims = output_dims if output_dims is not None else curr_input_dims

return ";".join([t._make_transform_spec(dims) for t, dims in converted_transforms])