Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
f6a7f4e
WIP
NicolasHug Mar 12, 2025
179a01c
Merge branch 'main' of github.com:pytorch/torchcodec into sample_rate
NicolasHug Mar 18, 2025
2d97555
Remove old code
NicolasHug Mar 18, 2025
9af4bc8
WI:P
NicolasHug Mar 18, 2025
2adf496
WIP
NicolasHug Mar 18, 2025
ef93be4
Fix clipping
NicolasHug Mar 18, 2025
db740a6
Merge branch 'main' of github.com:pytorch/torchcodec into sample_rate
NicolasHug Mar 19, 2025
ca15232
Driveby, remove preAllocatedOutputTensor
NicolasHug Mar 19, 2025
6aa7b09
Rename avFrame into srcAVFrame
NicolasHug Mar 19, 2025
f858d0c
Add flushing
NicolasHug Mar 19, 2025
70ac31e
Put back normal compilation flags
NicolasHug Mar 19, 2025
8deb079
Add tests
NicolasHug Mar 19, 2025
7b09315
Add tests
NicolasHug Mar 19, 2025
af4e88a
Nit
NicolasHug Mar 19, 2025
975b0fb
Fix test assets
NicolasHug Mar 19, 2025
7cb2271
Merge branch 'main' of github.com:pytorch/torchcodec into sample_rate
NicolasHug Mar 20, 2025
ee1c7b7
Nit
NicolasHug Mar 20, 2025
bf9aed2
NULL -> nullptr
NicolasHug Mar 20, 2025
f0e2cdd
Use optional
NicolasHug Mar 20, 2025
6f0694e
Merge branch 'main' of github.com:pytorch/torchcodec into audio_tutorial
NicolasHug Mar 20, 2025
7f9d3b0
Add AudioDecoder docs and tutorial
NicolasHug Mar 20, 2025
85d4fc1
Debug
NicolasHug Mar 20, 2025
cfb190b
Fix sample rate conversion bug with multi-channel data
NicolasHug Mar 20, 2025
dc1f6d7
WIP
NicolasHug Mar 21, 2025
b3f37c7
Add test
NicolasHug Mar 21, 2025
8ed45a7
Merge branch 'downsample' into audio_tutorial
NicolasHug Mar 21, 2025
1acd939
Docs
NicolasHug Mar 21, 2025
0b5db95
Add more
NicolasHug Mar 21, 2025
54ad867
Merge branch 'main' of github.com:pytorch/torchcodec into audio_tutorial
NicolasHug Mar 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ sphinx_copybutton
sphinx-tabs
matplotlib
torchvision
ipython
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
5 changes: 4 additions & 1 deletion docs/source/api_ref_decoders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ torchcodec.decoders
.. currentmodule:: torchcodec.decoders


For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
For a video decoder tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_audio_decoding.py`.


.. autosummary::
Expand All @@ -16,6 +17,7 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
:template: class.rst

VideoDecoder
AudioDecoder


.. autosummary::
Expand All @@ -24,3 +26,4 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
:template: dataclass.rst

VideoStreamMetadata
AudioStreamMetadata
1 change: 1 addition & 0 deletions docs/source/api_ref_torchcodec.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ torchcodec

Frame
FrameBatch
AudioSamples
2 changes: 1 addition & 1 deletion docs/source/glossary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Glossary
.. glossary::

pts
Presentation Time Stamp. The time at which a frame should be played.
Presentation Time Stamp. The time at which a frame or audio sample should be played.
In TorchCodec, pts are expressed in seconds.

best stream
Expand Down
111 changes: 111 additions & 0 deletions examples/audio_decoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
========================================
Decoding audio streams with AudioDecoder
========================================

In this example, we'll learn how to decode an audio file using the
:class:`~torchcodec.decoders.AudioDecoder` class.
"""

# %%
# First, a bit of boilerplate: we'll download an audio file from the web and
# define an audio playing utility. You can ignore that part and jump right
# below to :ref:`creating_decoder_audio`.
import requests
from IPython.display import Audio


def play_audio(samples):
    """Wrap decoded samples in an IPython ``Audio`` object for playback.

    Args:
        samples: Object exposing a ``data`` attribute (the waveform) and a
            ``sample_rate`` attribute, both forwarded to ``Audio``.

    Returns:
        The ``IPython.display.Audio`` object built from the samples.
    """
    waveform = samples.data
    rate_hz = samples.sample_rate
    return Audio(waveform, rate=rate_hz)


# Audio source is CC0: https://opengameart.org/content/town-theme-rpg
# Attribution: cynicmusic.com pixelsphere.org
url = "https://opengameart.org/sites/default/files/TownTheme.mp3"
# Download the encoded mp3 file; the request is sent with an explicitly empty
# User-Agent header.
response = requests.get(url, headers={"User-Agent": ""})
if response.status_code != 200:
    # Fail loudly instead of handing a non-audio error payload to the decoder.
    raise RuntimeError(f"Failed to download audio. {response.status_code = }.")

# Raw (still encoded) bytes of the audio file, used below to build the decoder.
raw_audio_bytes = response.content

# %%
# .. _creating_decoder_audio:
#
# Creating a decoder
# ------------------
#
# We can now create a decoder from the raw (encoded) audio bytes. You can of
# course use a local audio file and pass the path as input. You can also decode
# audio streams from videos!

from torchcodec.decoders import AudioDecoder

decoder = AudioDecoder(raw_audio_bytes)

# %%
# The audio has not yet been decoded by the decoder, but we already have access to
# some metadata via the ``metadata`` attribute which is an
# :class:`~torchcodec.decoders.AudioStreamMetadata` object.
print(decoder.metadata)

# %%
# Decoding samples
# ----------------
#
# To get decoded samples, we just need to call the
# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` method,
# which returns an :class:`~torchcodec.AudioSamples` object:

samples = decoder.get_samples_played_in_range(start_seconds=0)

print(samples)
play_audio(samples)

# %%
# The ``.data`` field is a tensor of shape ``(num_channels, num_samples)`` and
# of float dtype with values in [-1, 1].
#
# The ``.pts_seconds`` field indicates the starting time of the output samples.
# Here it's 0.025 seconds, even though we asked for samples starting from 0. Not
# all streams start exactly at 0! This is not a bug in TorchCodec, this is a
# property of the file that was defined when it was encoded.
#
# We only output the *start* of the samples, not the end or the duration. Those can
# be easily derived from the number of samples and the sample rate:

duration_seconds = samples.data.shape[1] / samples.sample_rate
print(f"Duration = {int(duration_seconds // 60)}m{int(duration_seconds % 60)}s.")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I'm writing this, I feel like we should just set the duration_seconds field ourselves and make it part of AudioSamples? It's not like memory is a problem, and it would be consistent with video frames.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think it would be useful. I was wondering the same thing when reading the tutorial.

# %%
# Specifying a range
# ------------------
#
# By default,
# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` decodes
# the entire audio stream, but we can specify a custom range:

samples = decoder.get_samples_played_in_range(start_seconds=10, stop_seconds=70)

print(samples)
play_audio(samples)

# %%
# Custom sample rate
# ------------------
#
# We can also decode the samples into a desired sample rate using the
# ``sample_rate`` parameter of :class:`~torchcodec.decoders.AudioDecoder`. The
# output will sound the same, but note that the number of samples changes along
# with the sample rate:

decoder = AudioDecoder(raw_audio_bytes, sample_rate=16_000)
samples = decoder.get_samples_played_in_range(start_seconds=0)

print(samples)
play_audio(samples)
4 changes: 3 additions & 1 deletion src/torchcodec/_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,12 @@ def __repr__(self):
class AudioSamples(Iterable):
"""Audio samples with associated metadata."""

# TODO-AUDIO: docs
data: Tensor
"""The sample data (``torch.Tensor`` of float in [-1, 1], shape is ``(num_channels, num_samples)``)."""
pts_seconds: float
"""The :term:`pts` of the first sample, in seconds."""
sample_rate: int
"""The sample rate of the samples, in Hz."""

def __post_init__(self):
# This is called after __init__() when an AudioSamples is created. We can run
Expand Down
3 changes: 2 additions & 1 deletion src/torchcodec/decoders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from ._core import VideoStreamMetadata
from ._audio_decoder import AudioDecoder # noqa
from ._core import AudioStreamMetadata, VideoStreamMetadata
from ._video_decoder import VideoDecoder # noqa

SimpleVideoDecoder = VideoDecoder
41 changes: 39 additions & 2 deletions src/torchcodec/decoders/_audio_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,31 @@


class AudioDecoder:
"""TODO-AUDIO docs"""
"""A single-stream audio decoder.

This can be used to decode audio from pure audio files (e.g. mp3, wav,
etc.), or from videos that contain audio streams (e.g. mp4 videos).

Returned samples are float samples normalized in [-1, 1].

Args:
source (str, ``pathlib.Path``, ``torch.Tensor``, or bytes): The source of the audio:

- If ``str``: a local path or a URL to a video or audio file.
- If ``pathlib.Path``: a path to a local video or audio file.
- If ``bytes`` object or ``torch.Tensor``: the raw encoded audio data.
stream_index (int, optional): Specifies which stream in the file to decode samples from.
Note that this index is absolute across all media types. If left unspecified, then
the :term:`best stream` is used.
sample_rate (int, optional): The desired output sample rate of the decoded samples.
By default, the samples are returned in their original sample rate.

Attributes:
metadata (AudioStreamMetadata): Metadata of the audio stream.
stream_index (int): The stream index that this decoder is retrieving samples from. If a
stream index was provided at initialization, this is the same value. If it was left
unspecified, this is the :term:`best stream`.
"""

def __init__(
self,
Expand Down Expand Up @@ -46,10 +70,23 @@ def __init__(
sample_rate if sample_rate is not None else self.metadata.sample_rate
)

# TODO-AUDIO: start_seconds should be 0 by default
def get_samples_played_in_range(
self, start_seconds: float, stop_seconds: Optional[float] = None
) -> AudioSamples:
"""TODO-AUDIO docs"""
"""Returns audio samples in the given range.

Samples are in the half open range [start_seconds, stop_seconds).

Args:
start_seconds (float): Time, in seconds, of the start of the
range.
stop_seconds (float or None, optional): Time, in seconds, of the end of the
range. As a half open range, the end is excluded. Default: None, in
which case samples are decoded until the end of the stream.

Returns:
AudioSamples: The samples within the specified range.
"""
if stop_seconds is not None and not start_seconds <= stop_seconds:
raise ValueError(
f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
Expand Down
5 changes: 5 additions & 0 deletions src/torchcodec/decoders/_core/_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
# TODO-AUDIO: docs below are mostly for video streams, we should edit them and /
# or make sure they're OK for audio streams as well. Not sure how to best handle
# docs for such class hierarchy.
# TODO very related, none of these common fields in this base class show up in
# the docs right now.
@dataclass
class StreamMetadata:
duration_seconds_from_header: Optional[float]
Expand Down Expand Up @@ -162,8 +164,11 @@ class AudioStreamMetadata(StreamMetadata):
"""Metadata of a single audio stream."""

sample_rate: Optional[int]
"""The original sample rate."""
num_channels: Optional[int]
"""The number of channels (1 for mono, 2 for stereo, etc.)"""
sample_format: Optional[str]
"""The original sample format, as described by FFmpeg. E.g. 'fltp', 's32', etc."""

def __repr__(self):
return super().__repr__()
Expand Down
Loading