diff --git a/docs/requirements.txt b/docs/requirements.txt index 4cb316c84..64fa264e1 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,4 +5,5 @@ sphinx_copybutton sphinx-tabs matplotlib torchvision +ipython -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme diff --git a/docs/source/api_ref_decoders.rst b/docs/source/api_ref_decoders.rst index b969557e1..bb55cfae3 100644 --- a/docs/source/api_ref_decoders.rst +++ b/docs/source/api_ref_decoders.rst @@ -7,7 +7,8 @@ torchcodec.decoders .. currentmodule:: torchcodec.decoders -For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`. +For a video decoder tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`. +For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_audio_decoding.py`. .. autosummary:: @@ -16,6 +17,7 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`. :template: class.rst VideoDecoder + AudioDecoder .. autosummary:: @@ -24,3 +26,4 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`. :template: dataclass.rst VideoStreamMetadata + AudioStreamMetadata diff --git a/docs/source/api_ref_torchcodec.rst b/docs/source/api_ref_torchcodec.rst index 36def114f..f6d3fef36 100644 --- a/docs/source/api_ref_torchcodec.rst +++ b/docs/source/api_ref_torchcodec.rst @@ -14,3 +14,4 @@ torchcodec Frame FrameBatch + AudioSamples diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst index d067fcd19..7aafbdcd1 100644 --- a/docs/source/glossary.rst +++ b/docs/source/glossary.rst @@ -4,7 +4,7 @@ Glossary .. glossary:: pts - Presentation Time Stamp. The time at which a frame should be played. + Presentation Time Stamp. The time at which a frame or audio sample should be played. In TorchCodec, pts are expressed in seconds. 
best stream diff --git a/examples/audio_decoding.py b/examples/audio_decoding.py new file mode 100644 index 000000000..89c5f34ed --- /dev/null +++ b/examples/audio_decoding.py @@ -0,0 +1,111 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +======================================== +Decoding audio streams with AudioDecoder +======================================== + +In this example, we'll learn how to decode an audio file using the +:class:`~torchcodec.decoders.AudioDecoder` class. +""" + +# %% +# First, a bit of boilerplate: we'll download an audio file from the web and +# define an audio playing utility. You can ignore that part and jump right +# below to :ref:`creating_decoder_audio`. +import requests +from IPython.display import Audio + + +def play_audio(samples): + return Audio(samples.data, rate=samples.sample_rate) + + +# Audio source is CC0: https://opengameart.org/content/town-theme-rpg +# Attribution: cynicmusic.com pixelsphere.org +url = "https://opengameart.org/sites/default/files/TownTheme.mp3" +response = requests.get(url, headers={"User-Agent": ""}) +if response.status_code != 200: + raise RuntimeError(f"Failed to download video. {response.status_code = }.") + +raw_audio_bytes = response.content + +# %% +# .. _creating_decoder_audio: +# +# Creating a decoder +# ------------------ +# +# We can now create a decoder from the raw (encoded) audio bytes. You can of +# course use a local audio file and pass the path as input. You can also decode +# audio streams from videos! + +from torchcodec.decoders import AudioDecoder + +decoder = AudioDecoder(raw_audio_bytes) + +# %% +# The audio has not yet been decoded by the decoder, but we already have access to +# some metadata via the ``metadata`` attribute which is an +# :class:`~torchcodec.decoders.AudioStreamMetadata` object. 
+print(decoder.metadata) + +# %% +# Decoding samples +# ---------------- +# +# To get decoded samples, we just need to call the +# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` method, +# which returns an :class:`~torchcodec.AudioSamples` object: + +samples = decoder.get_samples_played_in_range(start_seconds=0) + +print(samples) +play_audio(samples) + +# %% +# The ``.data`` field is a tensor of shape ``(num_channels, num_samples)`` and +# of float dtype with values in [-1, 1]. +# +# The ``.pts_seconds`` field indicates the starting time of the output samples. +# Here it's 0.025 seconds, even though we asked for samples starting from 0. Not +# all streams start exactly at 0! This is not a bug in TorchCodec, this is a +# property of the file that was defined when it was encoded. +# +# We only output the *start* of the samples, not the end or the duration. Those can +# be easily derived from the number of samples and the sample rate: + +duration_seconds = samples.data.shape[1] / samples.sample_rate +print(f"Duration = {int(duration_seconds // 60)}m{int(duration_seconds % 60)}s.") + +# %% +# Specifying a range +# ------------------ +# +# By default, +# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` decodes +# the entire audio stream, but we can specify a custom range: + +samples = decoder.get_samples_played_in_range(start_seconds=10, stop_seconds=70) + +print(samples) +play_audio(samples) + +# %% +# Custom sample rate +# ------------------ +# +# We can also decode the samples into a desired sample rate using the +# ``sample_rate`` parameter of :class:`~torchcodec.decoders.AudioDecoder`. 
The +# output will sound the same, but note that the number of samples greatly +# increased: + +decoder = AudioDecoder(raw_audio_bytes, sample_rate=16_000) +samples = decoder.get_samples_played_in_range(start_seconds=0) + +print(samples) +play_audio(samples) diff --git a/src/torchcodec/_frame.py b/src/torchcodec/_frame.py index 31a9b6669..958db82fa 100644 --- a/src/torchcodec/_frame.py +++ b/src/torchcodec/_frame.py @@ -120,10 +120,12 @@ def __repr__(self): class AudioSamples(Iterable): """Audio samples with associated metadata.""" - # TODO-AUDIO: docs data: Tensor + """The sample data (``torch.Tensor`` of float in [-1, 1], shape is ``(num_channels, num_samples)``).""" pts_seconds: float + """The :term:`pts` of the first sample, in seconds.""" sample_rate: int + """The sample rate of the samples, in Hz.""" def __post_init__(self): # This is called after __init__() when a Frame is created. We can run diff --git a/src/torchcodec/decoders/__init__.py b/src/torchcodec/decoders/__init__.py index 307f18f43..4bb09f4dc 100644 --- a/src/torchcodec/decoders/__init__.py +++ b/src/torchcodec/decoders/__init__.py @@ -4,7 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from ._core import VideoStreamMetadata +from ._audio_decoder import AudioDecoder # noqa +from ._core import AudioStreamMetadata, VideoStreamMetadata from ._video_decoder import VideoDecoder # noqa SimpleVideoDecoder = VideoDecoder diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py index 46cee62b4..cdf16dc82 100644 --- a/src/torchcodec/decoders/_audio_decoder.py +++ b/src/torchcodec/decoders/_audio_decoder.py @@ -18,7 +18,31 @@ class AudioDecoder: - """TODO-AUDIO docs""" + """A single-stream audio decoder. + + This can be used to decode audio from pure audio files (e.g. mp3, wav, + etc.), or from videos that contain audio streams (e.g. mp4 videos). 
+ + Returned samples are float samples normalized in [-1, 1]. + + Args: + source (str, ``pathlib.Path``, ``torch.Tensor``, or bytes): The source of the audio: + + - If ``str``: a local path or a URL to a video or audio file. + - If ``pathlib.Path``: a path to a local video or audio file. + - If ``bytes`` object or ``torch.Tensor``: the raw encoded audio data. + stream_index (int, optional): Specifies which stream in the file to decode samples from. + Note that this index is absolute across all media types. If left unspecified, then + the :term:`best stream` is used. + sample_rate (int, optional): The desired output sample rate of the decoded samples. + By default, the samples are returned in their original sample rate. + + Attributes: + metadata (AudioStreamMetadata): Metadata of the audio stream. + stream_index (int): The stream index that this decoder is retrieving samples from. If a + stream index was provided at initialization, this is the same value. If it was left + unspecified, this is the :term:`best stream`. + """ def __init__( self, @@ -46,10 +70,23 @@ def __init__( sample_rate if sample_rate is not None else self.metadata.sample_rate ) + # TODO-AUDIO: start_seconds should be 0 by default def get_samples_played_in_range( self, start_seconds: float, stop_seconds: Optional[float] = None ) -> AudioSamples: - """TODO-AUDIO docs""" + """Returns audio samples in the given range. + + Samples are in the half open range [start_seconds, stop_seconds). + + Args: + start_seconds (float): Time, in seconds, of the start of the + range. + stop_seconds (float, optional): Time, in seconds, of the end of the + range. As a half open range, the end is excluded. + + Returns: + AudioSamples: The samples within the specified range. + """ if stop_seconds is not None and not start_seconds <= stop_seconds: raise ValueError( f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})." 
diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py index bf2e0256e..806526370 100644 --- a/src/torchcodec/decoders/_core/_metadata.py +++ b/src/torchcodec/decoders/_core/_metadata.py @@ -25,6 +25,8 @@ # TODO-AUDIO: docs below are mostly for video streams, we should edit them and / # or make sure they're OK for audio streams as well. Not sure how to best handle # docs for such class hierarchy. +# TODO very related, none of these common fields in this base class show up in +# the docs right now. @dataclass class StreamMetadata: duration_seconds_from_header: Optional[float] @@ -162,8 +164,11 @@ class AudioStreamMetadata(StreamMetadata): """Metadata of a single audio stream.""" sample_rate: Optional[int] + """The original sample rate.""" num_channels: Optional[int] + """The number of channels (1 for mono, 2 for stereo, etc.)""" sample_format: Optional[str] + """The original sample format, as described by FFmpeg. E.g. 'fltp', 's32', etc.""" def __repr__(self): return super().__repr__()