From 304fdf9d7a2b81a49040c0dde2c40128e1f696b9 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Wed, 19 Nov 2025 21:28:36 -0800
Subject: [PATCH 1/5] first draft of performance tips tutorial

---
 docs/source/conf.py                   |   1 +
 examples/decoding/performance_tips.py | 159 ++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 examples/decoding/performance_tips.py

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 133bccf2e..87f14f75d 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -81,6 +81,7 @@ def __call__(self, filename):
                 "approximate_mode.py",
                 "sampling.py",
                 "parallel_decoding.py",
+                "performance_tips.py",
                 "custom_frame_mappings.py",
             ]
         else:
diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
new file mode 100644
index 000000000..e04d4fb89
--- /dev/null
+++ b/examples/decoding/performance_tips.py
@@ -0,0 +1,159 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+====================================
+Performance Tips and Best Practices
+====================================
+
+This tutorial consolidates performance optimization techniques for video
+decoding with TorchCodec. Learn when and how to apply various strategies
+to increase performance.
+"""
+
+
+# %%
+# Overview
+# --------
+#
+# When decoding videos with TorchCodec, several techniques can significantly
+# improve performance depending on your use case. This guide covers:
+#
+# 1. **Batch APIs** - Decode multiple frames at once
+# 2. **Approximate Mode & Keyframe Mappings** - Trade accuracy for speed
+# 3. **Multi-threading** - Parallelize decoding across videos or chunks
+# 4. **CUDA Acceleration (BETA)** - Use GPU decoding for supported formats
+#
+# We'll explore each technique and when to use it.
+
+# %%
+# 1. Use Batch APIs When Possible
+# --------------------------------
+#
+# If you need to decode multiple frames at once, it is faster when using the batch methods. TorchCodec's batch APIs reduce overhead and can leverage
+# internal optimizations.
+#
+# **Key Methods:**
+#
+# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` for specific indices
+# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range` for ranges
+# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at` for timestamps
+# - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range` for time ranges
+#
+# **When to use:**
+#
+# - Decoding multiple frames
+
+# %%
+# .. note::
+#
+#     For complete examples with runnable code demonstrating batch decoding,
+#     iteration, and frame retrieval, see:
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_basic_example.py`
+
+# %%
+# 2. Approximate Mode & Keyframe Mappings
+# ----------------------------------------
+#
+# By default, TorchCodec uses ``seek_mode="exact"``, which performs a scan when
+# the decoder is created to build an accurate internal index of frames. This
+# ensures frame-accurate seeking but takes longer for decoder initialization,
+# especially on long videos.
+
+# %%
+# **Approximate Mode**
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# Setting ``seek_mode="approximate"`` skips the initial scan and relies on the
+# video file's metadata headers. This dramatically speeds up
+# :class:`~torchcodec.decoders.VideoDecoder` creation, particularly for long
+# videos, but may result in slightly less accurate seeking in some cases.
+#
+#
+# **Which mode should you use:**
+#
+# - If you care about exactness of frame seeking, use “exact”.
+# - If you can sacrifice exactness of seeking for speed, which is usually the case when doing clip sampling, use “approximate”.
+# - If your videos don’t have variable framerate and their metadata is correct, then “approximate” mode is a net win: it will be just as accurate as the “exact” mode while still being significantly faster.
+# - If your size is small enough and we’re decoding a lot of frames, there’s a chance exact mode is actually faster.
+
+# %%
+# **Custom Frame Mappings**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# For advanced use cases, you can pre-compute a custom mapping between desired
+# frame indices and actual keyframe locations. This allows you to speed up :class:`~torchcodec.decoders.VideoDecoder`
+# instantiation while maintaining the frame seeking accuracy of ``seek_mode="exact"``.
+#
+# **When to use:**
+#
+# - Frame accuracy is critical, so approximate mode cannot be used
+# - Videos can be preprocessed once and then decoded many times
+#
+# **Performance impact:** Enables consistent, predictable performance for repeated
+# random access without the overhead of exact mode's scanning.
+
+# %%
+# .. note::
+#
+#     For complete benchmarks showing actual speedup numbers, accuracy comparisons,
+#     and implementation examples, see:
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py`
+
+# %%
+# 3. Multi-threading for Parallel Decoding
+# -----------------------------------------
+#
+# For video decoding of a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process:
+#
+# - FFmpeg-based parallelism: Using FFmpeg's internal threading capabilities
+# - Multiprocessing: Distributing work across multiple processes
+# - Multithreading: Using multiple threads within a single process
+
+# %%
+# .. note::
+#
+#     For complete examples comparing
+#     sequential, FFmpeg-based parallelism, multi-process, and multi-threaded approaches, see:
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py`
+
+# %%
+# 4. BETA: CUDA Acceleration
+# ---------------------------
+#
+# TorchCodec supports GPU-accelerated decoding using NVIDIA's hardware decoder
+# (NVDEC) on supported hardware. This keeps decoded tensors in GPU memory,
+# avoiding expensive CPU-GPU transfers for downstream GPU operations.
+#
+# **When to use:**
+#
+# - Decoding high-resolution videos
+# - Large batch of videos saturating the CPU
+# - GPU-intensive pipelines with transforms like scaling and cropping
+# - CPU is saturated and you want to free it up for other work
+#
+# **When NOT to use:**
+#
+# - You need bit-exact results
+# - Videos are low resolution and the PCIe transfer latency is significant
+# - GPU is already busy and CPU is idle
+#
+# **Performance impact:** CUDA decoding can significantly outperform CPU decoding,
+# especially for high-resolution videos and when combined with GPU-based transforms.
+# Actual speedup varies by hardware, resolution, and codec.
+
+# %%
+# .. note::
+#
+#     For installation instructions, detailed examples, and visual comparisons
+#     between CPU and CUDA decoding, see:
+#
+#     - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py`

From 5693776db99e44088be6692e01f6e414b5bd53c4 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Thu, 20 Nov 2025 07:52:46 -0800
Subject: [PATCH 2/5] modify format

---
 examples/decoding/performance_tips.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
index e04d4fb89..e36598e30 100644
--- a/examples/decoding/performance_tips.py
+++ b/examples/decoding/performance_tips.py
@@ -111,11 +111,11 @@
 # 3. Multi-threading for Parallel Decoding
 # -----------------------------------------
 #
-# For video decoding of a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process:
+# When decoding multiple videos or decoding a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process:
 #
-# - FFmpeg-based parallelism: Using FFmpeg's internal threading capabilities
-# - Multiprocessing: Distributing work across multiple processes
-# - Multithreading: Using multiple threads within a single process
+# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities
+# - **Multiprocessing** - Distributing work across multiple processes
+# - **Multithreading** - Using multiple threads within a single process
 
 # %%
 # .. note::

From a74f653b477547c33f3bd95f747e4978aee1c96b Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Fri, 21 Nov 2025 08:38:38 -0800
Subject: [PATCH 3/5] address feedback

---
 examples/decoding/performance_tips.py | 33 ++++++++++++++++++---------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
index e36598e30..4e1705623 100644
--- a/examples/decoding/performance_tips.py
+++ b/examples/decoding/performance_tips.py
@@ -25,7 +25,7 @@
 # 1. **Batch APIs** - Decode multiple frames at once
 # 2. **Approximate Mode & Keyframe Mappings** - Trade accuracy for speed
 # 3. **Multi-threading** - Parallelize decoding across videos or chunks
-# 4. **CUDA Acceleration (BETA)** - Use GPU decoding for supported formats
+# 4. **CUDA Acceleration** - Use GPU decoding for supported formats
 #
 # We'll explore each technique and when to use it.
 
@@ -33,8 +33,9 @@
 # 1. Use Batch APIs When Possible
 # --------------------------------
 #
-# If you need to decode multiple frames at once, it is faster when using the batch methods. TorchCodec's batch APIs reduce overhead and can leverage
-# internal optimizations.
+# If you need to decode multiple frames at once, the batch methods are faster than calling single-frame decoding methods multiple times.
+# For example, :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` is faster than calling :meth:`~torchcodec.decoders.VideoDecoder.get_frame_at` multiple times.
+# TorchCodec's batch APIs reduce overhead and can leverage internal optimizations.
 #
 # **Key Methods:**
 #
@@ -59,7 +60,7 @@
 # 2. Approximate Mode & Keyframe Mappings
 # ----------------------------------------
 #
-# By default, TorchCodec uses ``seek_mode="exact"``, which performs a scan when
+# By default, TorchCodec uses ``seek_mode="exact"``, which performs a :term:`scan` when
 # the decoder is created to build an accurate internal index of frames. This
 # ensures frame-accurate seeking but takes longer for decoder initialization,
 # especially on long videos.
@@ -68,7 +69,7 @@
 # **Approximate Mode**
 # ~~~~~~~~~~~~~~~~~~~~
 #
-# Setting ``seek_mode="approximate"`` skips the initial scan and relies on the
+# Setting ``seek_mode="approximate"`` skips the initial :term:`scan` and relies on the
 # video file's metadata headers. This dramatically speeds up
 # :class:`~torchcodec.decoders.VideoDecoder` creation, particularly for long
 # videos, but may result in slightly less accurate seeking in some cases.
@@ -77,9 +78,7 @@
 # **Which mode should you use:**
 #
 # - If you care about exactness of frame seeking, use “exact”.
-# - If you can sacrifice exactness of seeking for speed, which is usually the case when doing clip sampling, use “approximate”.
-# - If your videos don’t have variable framerate and their metadata is correct, then “approximate” mode is a net win: it will be just as accurate as the “exact” mode while still being significantly faster.
-# - If your size is small enough and we’re decoding a lot of frames, there’s a chance exact mode is actually faster.
+# - If the video is long and you're only decoding a small number of frames, approximate mode should be faster.
 
 # %%
 # **Custom Frame Mappings**
@@ -113,9 +112,11 @@
 #
 # When decoding multiple videos or decoding a large number of frames from a single video, there are a few parallelization strategies to speed up the decoding process:
 #
-# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities
+# - **FFmpeg-based parallelism** - Using FFmpeg's internal threading capabilities for intra-frame parallelism, where parallelization happens within individual frames rather than across frames
 # - **Multiprocessing** - Distributing work across multiple processes
 # - **Multithreading** - Using multiple threads within a single process
+#
+# Both multiprocessing and multithreading can be used to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks.
 
 # %%
 # .. note::
@@ -126,8 +127,8 @@
 #     - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py`
 
 # %%
-# 4. BETA: CUDA Acceleration
-# ---------------------------
+# 4. CUDA Acceleration
+# --------------------
 #
 # TorchCodec supports GPU-accelerated decoding using NVIDIA's hardware decoder
 # (NVDEC) on supported hardware. This keeps decoded tensors in GPU memory,
@@ -150,6 +151,16 @@
 # especially for high-resolution videos and when combined with GPU-based transforms.
 # Actual speedup varies by hardware, resolution, and codec.
 
+# %%
+# **Recommended Usage for Beta Interface**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# .. code-block:: python
+#
+#     with set_cuda_backend("beta"):
+#         decoder = VideoDecoder("file.mp4", device="cuda")
+#
+
 # %%
 # .. note::
 #

From 547d8e5310c8754556c178c5aabdf1af52d206e5 Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Mon, 24 Nov 2025 12:58:01 -0800
Subject: [PATCH 4/5] address feedback

---
 docs/source/index.rst                 |  8 ++++++++
 examples/decoding/performance_tips.py | 13 +++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 74e8d1298..e25a79827 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -84,6 +84,14 @@ Decoding
 
         How to sample regular and random clips from a video
 
+     .. grid-item-card:: :octicon:`file-code;1em`
+        Performance Tips
+        :img-top: _static/img/card-background.svg
+        :link: generated_examples/decoding/performance_tips.html
+        :link-type: url
+
+        Tips for optimizing video decoding performance
+
 
 Encoding
 ^^^^^^^^
diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
index 4e1705623..17781f451 100644
--- a/examples/decoding/performance_tips.py
+++ b/examples/decoding/performance_tips.py
@@ -39,8 +39,13 @@
 #
 # **Key Methods:**
 #
+# For index-based frame retrieval:
+#
 # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at` for specific indices
 # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range` for ranges
+#
+# For timestamp-based frame retrieval:
+#
 # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at` for timestamps
 # - :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range` for time ranges
 #
@@ -61,7 +66,7 @@
 # ----------------------------------------
 #
 # By default, TorchCodec uses ``seek_mode="exact"``, which performs a :term:`scan` when
-# the decoder is created to build an accurate internal index of frames. This
+# you create the decoder to build an accurate internal index of frames. This
 # ensures frame-accurate seeking but takes longer for decoder initialization,
 # especially on long videos.
 
@@ -90,8 +95,8 @@
 #
 # **When to use:**
 #
-# - Frame accuracy is critical, so approximate mode cannot be used
-# - Videos can be preprocessed once and then decoded many times
+# - Frame accuracy is critical, so you cannot use approximate mode
+# - You can preprocess videos once and then decode them many times
 #
 # **Performance impact:** Enables consistent, predictable performance for repeated
 # random access without the overhead of exact mode's scanning.
@@ -116,7 +121,7 @@
 # - **Multiprocessing** - Distributing work across multiple processes
 # - **Multithreading** - Using multiple threads within a single process
 #
-# Both multiprocessing and multithreading can be used to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks.
+# You can use either multiprocessing or multithreading to decode multiple videos in parallel, or to decode a single long video in parallel by splitting it into chunks.
 
 # %%
 # .. note::

From 9e0f33ad8688bb9dab5be491b55d975584e4347e Mon Sep 17 00:00:00 2001
From: Molly Xu <mollyxu@fb.com>
Date: Mon, 1 Dec 2025 21:12:51 -0800
Subject: [PATCH 5/5] address feedback

---
 examples/decoding/performance_tips.py | 31 ++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/examples/decoding/performance_tips.py b/examples/decoding/performance_tips.py
index 17781f451..ac247fd64 100644
--- a/examples/decoding/performance_tips.py
+++ b/examples/decoding/performance_tips.py
@@ -5,9 +5,12 @@
 # LICENSE file in the root directory of this source tree.
 
 """
-====================================
-Performance Tips and Best Practices
-====================================
+.. meta::
+   :description: Learn how to optimize TorchCodec video decoding performance with batch APIs, approximate seeking, multi-threading, and CUDA acceleration.
+
+==============================================
+TorchCodec Performance Tips and Best Practices
+==============================================
 
 This tutorial consolidates performance optimization techniques for video
 decoding with TorchCodec. Learn when and how to apply various strategies
@@ -173,3 +176,25 @@
 #     between CPU and CUDA decoding, see:
 #
 #     - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py`
+
+# %%
+# Conclusion
+# ----------
+#
+# TorchCodec offers multiple performance optimization strategies, each suited to
+# different scenarios. Use batch APIs for multi-frame decoding, approximate mode
+# for faster initialization, parallel processing for high throughput, and CUDA
+# acceleration for GPU-intensive workflows.
+#
+# The best results often come from combining techniques. Profile your specific
+# use case and apply optimizations incrementally, using the benchmarks in the
+# linked examples as a guide.
+#
+# For more information, see:
+#
+# - :ref:`sphx_glr_generated_examples_decoding_basic_example.py` - Basic decoding examples
+# - :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py` - Approximate mode benchmarks
+# - :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py` - Custom frame mappings
+# - :ref:`sphx_glr_generated_examples_decoding_parallel_decoding.py` - Parallel decoding strategies
+# - :ref:`sphx_glr_generated_examples_decoding_basic_cuda_example.py` - CUDA acceleration guide
+# - :class:`torchcodec.decoders.VideoDecoder` - Full API reference