From 304fdf9d7a2b81a49040c0dde2c40128e1f696b9 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Wed, 19 Nov 2025 21:28:36 -0800 Subject: [PATCH 1/5] first draft of performance tips tutorial --- docs/source/conf.py | 1 + examples/decoding/performance_tips.py | 159 ++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 examples/decoding/performance_tips.py diff --git a/docs/source/conf.py b/docs/source/conf.py index 133bccf2e..87f14f75d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -81,6 +81,7 @@ def __call__(self, filename): "approximate_mode.py", "sampling.py", "parallel_decoding.py", + "performance_tips.py", "custom_frame_mappings.py", ] else: diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py new file mode 100644 index 000000000..e04d4fb89 --- /dev/null +++ b/examples/decoding/performance_tips.py @@ -0,0 +1,159 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +==================================== +Performance Tips and Best Practices +==================================== + +This tutorial consolidates performance optimization techniques for video +decoding with TorchCodec. Learn when and how to apply various strategies +to increase performance. +""" + + +# %% +# Overview +# -------- +# +# When decoding videos with TorchCodec, several techniques can significantly +# improve performance depending on your use case. This guide covers: +# +# 1. **Batch APIs** - Decode multiple frames at once +# 2. **Approximate Mode & Keyframe Mappings** - Trade accuracy for speed +# 3. **Multi-threading** - Parallelize decoding across videos or chunks +# 4. **CUDA Acceleration (BETA)** - Use GPU decoding for supported formats +# +# We'll explore each technique and when to use it. + +# %% +# 1. 
Use Batch APIs When Possible +# -------------------------------- +# +# If you need to decode multiple frames at once, it is faster when using the batch methods. TorchCodec's batch APIs reduce overhead and can leverage +# internal optimizations. +# +# **Key Methods:** +# +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` for specific indices +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range` for ranges +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at` for timestamps +# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range` for time ranges +# +# **When to use:** +# +# - Decoding multiple frames + +# %% +# .. note:: +# +# For complete examples with runnable code demonstrating batch decoding, +# iteration, and frame retrieval, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_basic_example.py` + +# %% +# 2. Approximate Mode & Keyframe Mappings +# ---------------------------------------- +# +# By default, TorchCodec uses ``seek_mode="exact"``, which performs a scan when +# the decoder is created to build an accurate internal index of frames. This +# ensures frame-accurate seeking but takes longer for decoder initialization, +# especially on long videos. + +# %% +# **Approximate Mode** +# ~~~~~~~~~~~~~~~~~~~~ +# +# Setting ``seek_mode="approximate"`` skips the initial scan and relies on the +# video file's metadata headers. This dramatically speeds up +# :class:`~torchcodec.decoders.VideoDecoder` creation, particularly for long +# videos, but may result in slightly less accurate seeking in some cases. +# +# +# **Which mode should you use:** +# +# - If you care about exactness of frame seeking, use “exact”. +# - If you can sacrifice exactness of seeking for speed, which is usually the case when doing clip sampling, use “approximate”. 
+# - If your videos don’t have variable framerate and their metadata is correct, then “approximate” mode is a net win: it will be just as accurate as the “exact” mode while still being significantly faster. +# - If your size is small enough and we’re decoding a lot of frames, there’s a chance exact mode is actually faster. + +# %% +# **Custom Frame Mappings** +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# For advanced use cases, you can pre-compute a custom mapping between desired +# frame indices and actual keyframe locations. This allows you to speed up :class:`~torchcodec.decoders.VideoDecoder` +# instantiation while maintaining the frame seeking accuracy of ``seek_mode="exact"``. +# +# **When to use:** +# +# - Frame accuracy is critical, so approximate mode cannot be used +# - Videos can be preprocessed once and then decoded many times +# +# **Performance impact:** Enables consistent, predictable performance for repeated +# random access without the overhead of exact mode's scanning. + +# %% +# .. note:: +# +# For complete benchmarks showing actual speedup numbers, accuracy comparisons, +# and implementation examples, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py` +# +# - :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py` + +# %% +# 3. Multi-threading for Parallel Decoding +# ----------------------------------------- +# +# For video decoding of a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process: +# +# - FFmpeg-based parallelism: Using FFmpeg's internal threading capabilities +# - Multiprocessing: Distributing work across multiple processes +# - Multithreading: Using multiple threads within a single process + +# %% +# .. note:: +# +# For complete examples comparing +# sequential, ffmpeg-based parallelism, multi-process, and multi-threaded approaches, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` + +# %% +# 4. 
BETA: CUDA Acceleration +# --------------------------- +# +# TorchCodec supports GPU-accelerated decoding using NVIDIA's hardware decoder +# (NVDEC) on supported hardware. This keeps decoded tensors in GPU memory, +# avoiding expensive CPU-GPU transfers for downstream GPU operations. +# +# **When to use:** +# +# - Decoding large resolution videos +# - Large batch of videos saturating the CPU +# - GPU-intensive pipelines with transforms like scaling and cropping +# - CPU is saturated and you want to free it up for other work +# +# **When NOT to use:** +# +# - You need bit-exact results +# - Small resolution videos where the PCI-e transfer latency is large +# - GPU is already busy and CPU is idle +# +# **Performance impact:** CUDA decoding can significantly outperform CPU decoding, +# especially for high-resolution videos and when combined with GPU-based transforms. +# Actual speedup varies by hardware, resolution, and codec. + +# %% +# .. note:: +# +# For installation instructions, detailed examples, and visual comparisons +# between CPU and CUDA decoding, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` From 5693776db99e44088be6692e01f6e414b5bd53c4 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Thu, 20 Nov 2025 07:52:46 -0800 Subject: [PATCH 2/5] modify format --- examples/decoding/performance_tips.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py index e04d4fb89..e36598e30 100644 --- a/examples/decoding/performance_tips.py +++ b/examples/decoding/performance_tips.py @@ -111,11 +111,11 @@ # 3. 
Multi-threading for Parallel Decoding # ----------------------------------------- # -# For video decoding of a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process: +# When decoding multiple videos or decoding a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process: # -# - FFmpeg-based parallelism: Using FFmpeg's internal threading capabilities -# - Multiprocessing: Distributing work across multiple processes -# - Multithreading: Using multiple threads within a single process +# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities +# - **Multiprocessing** - Distributing work across multiple processes +# - **Multithreading** - Using multiple threads within a single process # %% # .. note:: From a74f653b477547c33f3bd95f747e4978aee1c96b Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Fri, 21 Nov 2025 08:38:38 -0800 Subject: [PATCH 3/5] address feedback --- examples/decoding/performance_tips.py | 33 ++++++++++++++++++--------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py index e36598e30..4e1705623 100644 --- a/examples/decoding/performance_tips.py +++ b/examples/decoding/performance_tips.py @@ -25,7 +25,7 @@ # 1. **Batch APIs** - Decode multiple frames at once # 2. **Approximate Mode & Keyframe Mappings** - Trade accuracy for speed # 3. **Multi-threading** - Parallelize decoding across videos or chunks -# 4. **CUDA Acceleration (BETA)** - Use GPU decoding for supported formats +# 4. **CUDA Acceleration** - Use GPU decoding for supported formats # # We'll explore each technique and when to use it. @@ -33,8 +33,9 @@ # 1. Use Batch APIs When Possible # -------------------------------- # -# If you need to decode multiple frames at once, it is faster when using the batch methods. 
TorchCodec's batch APIs reduce overhead and can leverage -# internal optimizations. +# If you need to decode multiple frames at once, the batch methods are faster than calling single-frame decoding methods multiple times. +# For example, :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` is faster than calling :meth:`~torchcodec.decoders.VideoDecoder.get_frame_at` multiple times. +# TorchCodec's batch APIs reduce overhead and can leverage internal optimizations. # # **Key Methods:** # @@ -59,7 +60,7 @@ # 2. Approximate Mode & Keyframe Mappings # ---------------------------------------- # -# By default, TorchCodec uses ``seek_mode="exact"``, which performs a scan when +# By default, TorchCodec uses ``seek_mode="exact"``, which performs a :term:`scan` when # the decoder is created to build an accurate internal index of frames. This # ensures frame-accurate seeking but takes longer for decoder initialization, # especially on long videos. @@ -68,7 +69,7 @@ # **Approximate Mode** # ~~~~~~~~~~~~~~~~~~~~ # -# Setting ``seek_mode="approximate"`` skips the initial scan and relies on the +# Setting ``seek_mode="approximate"`` skips the initial :term:`scan` and relies on the # video file's metadata headers. This dramatically speeds up # :class:`~torchcodec.decoders.VideoDecoder` creation, particularly for long # videos, but may result in slightly less accurate seeking in some cases. @@ -77,9 +78,7 @@ # **Which mode should you use:** # # - If you care about exactness of frame seeking, use “exact”. -# - If you can sacrifice exactness of seeking for speed, which is usually the case when doing clip sampling, use “approximate”. -# - If your videos don’t have variable framerate and their metadata is correct, then “approximate” mode is a net win: it will be just as accurate as the “exact” mode while still being significantly faster. -# - If your size is small enough and we’re decoding a lot of frames, there’s a chance exact mode is actually faster. 
+# - If the video is long and you're only decoding a small number of frames, approximate mode should be faster. # %% # **Custom Frame Mappings** @@ -113,9 +112,11 @@ # # When decoding multiple videos or decoding a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process: # -# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities +# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities for intra-frame parallelism, where parallelization happens within individual frames rather than across frames # - **Multiprocessing** - Distributing work across multiple processes # - **Multithreading** - Using multiple threads within a single process +# +# Both multiprocessing and multithreading can be used to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks. # %% # .. note:: @@ -126,8 +127,8 @@ # - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` # %% -# 4. BETA: CUDA Acceleration -# --------------------------- +# 4. CUDA Acceleration +# -------------------- # # TorchCodec supports GPU-accelerated decoding using NVIDIA's hardware decoder # (NVDEC) on supported hardware. This keeps decoded tensors in GPU memory, @@ -150,6 +151,16 @@ # especially for high-resolution videos and when combined with GPU-based transforms. # Actual speedup varies by hardware, resolution, and codec. +# %% +# **Recommended Usage for Beta Interface** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. code-block:: python +# +# with set_cuda_backend("beta"): +# decoder = VideoDecoder("file.mp4", device="cuda") +# + # %% # .. 
note:: # From 547d8e5310c8754556c178c5aabdf1af52d206e5 Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Mon, 24 Nov 2025 12:58:01 -0800 Subject: [PATCH 4/5] address feedback --- docs/source/index.rst | 8 ++++++++ examples/decoding/performance_tips.py | 13 +++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 74e8d1298..e25a79827 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -84,6 +84,14 @@ Decoding How to sample regular and random clips from a video + .. grid-item-card:: :octicon:`file-code;1em` + Performance Tips + :img-top: _static/img/card-background.svg + :link: generated_examples/decoding/performance_tips.html + :link-type: url + + Tips for optimizing video decoding performance + Encoding ^^^^^^^^ diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py index 4e1705623..17781f451 100644 --- a/examples/decoding/performance_tips.py +++ b/examples/decoding/performance_tips.py @@ -39,8 +39,13 @@ # # **Key Methods:** # +# For index-based frame retrieval: +# # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` for specific indices # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range` for ranges +# +# For timestamp-based frame retrieval: +# # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at` for timestamps # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range` for time ranges # @@ -61,7 +66,7 @@ # ---------------------------------------- # # By default, TorchCodec uses ``seek_mode="exact"``, which performs a :term:`scan` when -# the decoder is created to build an accurate internal index of frames. This +# you create the decoder to build an accurate internal index of frames. This # ensures frame-accurate seeking but takes longer for decoder initialization, # especially on long videos. 
@@ -90,8 +95,8 @@ # # **When to use:** # -# - Frame accuracy is critical, so approximate mode cannot be used -# - Videos can be preprocessed once and then decoded many times +# - Frame accuracy is critical, so you cannot use approximate mode +# - You can preprocess videos once and then decode them many times # # **Performance impact:** Enables consistent, predictable performance for repeated # random access without the overhead of exact mode's scanning. @@ -116,7 +121,7 @@ # - **Multiprocessing** - Distributing work across multiple processes # - **Multithreading** - Using multiple threads within a single process # -# Both multiprocessing and multithreading can be used to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks. +# You can use both multiprocessing and multithreading to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks. # %% # .. note:: From 9e0f33ad8688bb9dab5be491b55d975584e4347e Mon Sep 17 00:00:00 2001 From: Molly Xu Date: Mon, 1 Dec 2025 21:12:51 -0800 Subject: [PATCH 5/5] address feedback --- examples/decoding/performance_tips.py | 31 ++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py index 17781f451..ac247fd64 100644 --- a/examples/decoding/performance_tips.py +++ b/examples/decoding/performance_tips.py @@ -5,9 +5,12 @@ # LICENSE file in the root directory of this source tree. """ -==================================== -Performance Tips and Best Practices -==================================== +.. meta:: + :description: Learn how to optimize TorchCodec video decoding performance with batch APIs, approximate seeking, multi-threading, and CUDA acceleration. 
+ +============================================== +TorchCodec Performance Tips and Best Practices +============================================== This tutorial consolidates performance optimization techniques for video decoding with TorchCodec. Learn when and how to apply various strategies @@ -173,3 +176,25 @@ # between CPU and CUDA decoding, see: # # - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` + +# %% +# Conclusion +# ---------- +# +# TorchCodec offers multiple performance optimization strategies, each suited to +# different scenarios. Use batch APIs for multi-frame decoding, approximate mode +# for faster initialization, parallel processing for high throughput, and CUDA +# acceleration for GPU-intensive workflows. +# +# The best results often come from combining techniques. Profile your specific +# use case and apply optimizations incrementally, using the benchmarks in the +# linked examples as a guide. +# +# For more information, see: +# +# - :ref:`sphx_glr_generated_examples_decoding_basic_example.py` - Basic decoding examples +# - :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py` - Approximate mode benchmarks +# - :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py` - Custom frame mappings +# - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` - Parallel decoding strategies +# - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` - CUDA acceleration guide +# - :class:`torchcodec.decoders.VideoDecoder` - Full API reference