From 5b9eb09040255c9e125a968ba148895457c9f9b8 Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Mon, 26 Aug 2024 19:28:26 -0700
Subject: [PATCH 1/2] refactor CUDA code into its own set of files

Summary:
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
---
 src/torchcodec/decoders/_core/CMakeLists.txt |   2 +
 src/torchcodec/decoders/_core/CUDACommon.cpp | 138 ++++++++++++++++++
 src/torchcodec/decoders/_core/CUDACommon.h   |  33 +++++
 .../decoders/_core/VideoDecoder.cpp          | 130 +----------------
 4 files changed, 179 insertions(+), 124 deletions(-)
 create mode 100644 src/torchcodec/decoders/_core/CUDACommon.cpp
 create mode 100644 src/torchcodec/decoders/_core/CUDACommon.h

diff --git a/src/torchcodec/decoders/_core/CMakeLists.txt b/src/torchcodec/decoders/_core/CMakeLists.txt
index ed8e8ef36..a36b94089 100644
--- a/src/torchcodec/decoders/_core/CMakeLists.txt
+++ b/src/torchcodec/decoders/_core/CMakeLists.txt
@@ -32,6 +32,8 @@ find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
 function(make_torchcodec_library library_name ffmpeg_target)
   set(
     sources
+    CUDACommon.h
+    CUDACommon.cpp
     FFMPEGCommon.h
     FFMPEGCommon.cpp
     VideoDecoder.h
diff --git a/src/torchcodec/decoders/_core/CUDACommon.cpp b/src/torchcodec/decoders/_core/CUDACommon.cpp
new file mode 100644
index 000000000..bb6594e50
--- /dev/null
+++ b/src/torchcodec/decoders/_core/CUDACommon.cpp
@@ -0,0 +1,138 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "src/torchcodec/decoders/_core/CUDACommon.h"
+
+#ifdef ENABLE_CUDA
+
+#include <c10/cuda/CUDAStream.h>
+#include <npp.h>
+#include <torch/types.h>
+
+extern "C" {
+#include <libavutil/hwcontext_cuda.h>
+}
+
+namespace facebook::torchcodec {
+namespace {
+
+AVBufferRef* getCudaContext() {
+  enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
+  TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
+  int err = 0;
+  AVBufferRef* hw_device_ctx;
+  err = av_hwdevice_ctx_create(
+      &hw_device_ctx,
+      type,
+      nullptr,
+      nullptr,
+      // Introduced in 58.26.100:
+      // https://github.com/FFmpeg/FFmpeg/blob/4acb9b7d1046944345ae506165fb55883d04d8a6/doc/APIchanges#L265
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
+      AV_CUDA_USE_CURRENT_CONTEXT
+#else
+      0
+#endif
+  );
+  if (err < 0) {
+    TORCH_CHECK(
+        false,
+        "Failed to create specified HW device",
+        getFFMPEGErrorStringFromErrorCode(err));
+  }
+  return hw_device_ctx;
+}
+
+torch::Tensor allocateDeviceTensor(
+    at::IntArrayRef shape,
+    torch::Device device,
+    const torch::Dtype dtype = torch::kUInt8) {
+  return torch::empty(
+      shape,
+      torch::TensorOptions()
+          .dtype(dtype)
+          .layout(torch::kStrided)
+          .device(device));
+}
+
+} // namespace
+
+} // facebook::torchcodec
+
+#endif // ENABLE_CUDA
+
+/*
+ * Entry points from non-CUDA code.
+ */
+
+namespace facebook::torchcodec {
+
+AVBufferRef* initializeCudaContext(const torch::Device& device) {
+#ifdef ENABLE_CUDA
+  TORCH_CHECK(device.type() == torch::DeviceType::CUDA, "Invalid device type.");
+
+  // We create a small tensor using pytorch to initialize the cuda context.
+  torch::Tensor dummyTensorForCudaInitialization = torch::zeros(
+      {1},
+      torch::TensorOptions().dtype(torch::kUInt8).device(device));
+  AVBufferRef* hwDeviceCtx = av_buffer_ref(getCudaContext());
+
+  TORCH_INTERNAL_ASSERT(
+      hwDeviceCtx,
+      "Failed to create/reference the CUDA HW device context for index=" +
+          std::to_string(device.index()) + ".");
+  return hwDeviceCtx;
+#else
+  throw std::runtime_error(
+      "CUDA support is not enabled in this build of TorchCodec.");
+#endif
+}
+
+torch::Tensor convertFrameToTensorUsingCuda(
+    const AVCodecContext* codecContext,
+    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const AVFrame* src) {
+#ifdef ENABLE_CUDA
+  NVTX_SCOPED_RANGE("convertFrameUsingCuda");
+  TORCH_CHECK(
+      src->format == AV_PIX_FMT_CUDA,
+      "Expected format to be AV_PIX_FMT_CUDA, got " +
+          std::string(av_get_pix_fmt_name((AVPixelFormat)src->format)));
+  int width = options.width.value_or(codecContext->width);
+  int height = options.height.value_or(codecContext->height);
+  NppStatus status;
+  NppiSize oSizeROI;
+  oSizeROI.width = width;
+  oSizeROI.height = height;
+  Npp8u* input[2];
+  input[0] = (Npp8u*)src->data[0];
+  input[1] = (Npp8u*)src->data[1];
+  torch::Tensor dst = allocateDeviceTensor({height, width, 3}, options.device);
+  auto start = std::chrono::high_resolution_clock::now();
+  status = nppiNV12ToRGB_8u_P2C3R(
+      input,
+      src->linesize[0],
+      static_cast<Npp8u*>(dst.data_ptr()),
+      dst.stride(0),
+      oSizeROI);
+  TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double, std::micro> duration = end - start;
+  VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width
+          << " took: " << duration.count() << "us" << std::endl;
+  if (options.dimensionOrder == "NCHW") {
+    // The docs guarantee this to return a view:
+    // https://pytorch.org/docs/stable/generated/torch.permute.html
+    dst = dst.permute({2, 0, 1});
+  }
+  return dst;
+#else
+  throw std::runtime_error(
+      "CUDA support is not enabled in this build of TorchCodec.");
+#endif
+}
+
+} // facebook::torchcodec
+
diff --git a/src/torchcodec/decoders/_core/CUDACommon.h b/src/torchcodec/decoders/_core/CUDACommon.h
new file mode 100644
index 000000000..31a9a35c8
--- /dev/null
+++ b/src/torchcodec/decoders/_core/CUDACommon.h
@@ -0,0 +1,33 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include "src/torchcodec/decoders/_core/VideoDecoder.h"
+#include
+#include
+
+#ifdef ENABLE_NVTX
+#include <nvtx3/nvtx3.hpp>
+#endif
+
+namespace facebook::torchcodec {
+
+#ifdef ENABLE_NVTX
+#define NVTX_SCOPED_RANGE(Annotation) nvtx3::scoped_range loop{Annotation}
+#else
+#define NVTX_SCOPED_RANGE(Annotation) do {} while (0)
+#endif
+
+AVBufferRef* initializeCudaContext(const torch::Device& device);
+
+torch::Tensor convertFrameToTensorUsingCuda(
+    const AVCodecContext* codecContext,
+    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const AVFrame* src);
+
+} // facebook::torchcodec
+
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
index a5c0fddfb..5edcf6aed 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -5,6 +5,7 @@
 // LICENSE file in the root directory of this source tree.
#include "src/torchcodec/decoders/_core/VideoDecoder.h" +#include "src/torchcodec/decoders/_core/CUDACommon.h" #include #include #include @@ -13,15 +14,6 @@ #include #include -#ifdef ENABLE_CUDA -#include -#include -#include -#ifdef ENABLE_NVTX -#include -#endif -#endif - extern "C" { #include #include @@ -29,9 +21,6 @@ extern "C" { #include #include #include -#ifdef ENABLE_CUDA -#include -#endif } namespace facebook::torchcodec { @@ -107,87 +96,6 @@ std::vector splitStringWithDelimiters( return result; } -#ifdef ENABLE_CUDA - -AVBufferRef* getCudaContext() { - enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda"); - TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device"); - int err = 0; - AVBufferRef* hw_device_ctx; - err = av_hwdevice_ctx_create( - &hw_device_ctx, - type, - nullptr, - nullptr, - // Introduced in 58.26.100: - // https://github.com/FFmpeg/FFmpeg/blob/4acb9b7d1046944345ae506165fb55883d04d8a6/doc/APIchanges#L265 -#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100) - AV_CUDA_USE_CURRENT_CONTEXT -#else - 0 -#endif - ); - if (err < 0) { - TORCH_CHECK( - false, - "Failed to create specified HW device", - getFFMPEGErrorStringFromErrorCode(err)); - } - return hw_device_ctx; -} - -torch::Tensor allocateDeviceTensor( - at::IntArrayRef shape, - torch::Device device, - const torch::Dtype dtype = torch::kUInt8) { - return torch::empty( - shape, - torch::TensorOptions() - .dtype(dtype) - .layout(torch::kStrided) - .device(device)); -} - -torch::Tensor convertFrameToTensorUsingCUDA( - const AVCodecContext* codecContext, - const VideoDecoder::VideoStreamDecoderOptions& options, - const AVFrame* src) { - TORCH_CHECK( - src->format == AV_PIX_FMT_CUDA, - "Expected format to be AV_PIX_FMT_CUDA, got " + - std::string(av_get_pix_fmt_name((AVPixelFormat)src->format))); - int width = options.width.value_or(codecContext->width); - int height = options.height.value_or(codecContext->height); - NppStatus status; - NppiSize oSizeROI; - oSizeROI.width = width; - oSizeROI.height = height; - Npp8u* input[2]; - input[0] = (Npp8u*)src->data[0]; - input[1] = (Npp8u*)src->data[1]; - torch::Tensor dst = allocateDeviceTensor({height, width, 3}, options.device); - auto start = std::chrono::high_resolution_clock::now(); - status = nppiNV12ToRGB_8u_P2C3R( - input, - src->linesize[0], - static_cast(dst.data_ptr()), - dst.stride(0), - oSizeROI); - TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame."); - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = end - start; - VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width - << " took: " << duration.count() << "us" << std::endl; - if (options.dimensionOrder == "NCHW") { - // The docs guaranty this to return a view: - // https://pytorch.org/docs/stable/generated/torch.permute.html - dst = dst.permute({2, 0, 1}); - } - return dst; -} - -#endif - } // namespace VideoDecoder::VideoStreamDecoderOptions::VideoStreamDecoderOptions( @@ -490,21 +398,7 @@ void VideoDecoder::addVideoStreamDecoder( TORCH_CHECK_EQ(retVal, AVSUCCESS); if (options.device.type() == torch::DeviceType::CUDA) { -#ifdef ENABLE_CUDA - // We create a small tensor using pytorch to initialize the cuda context. 
-    torch::Tensor dummyTensorForCudaInitialization = torch::zeros(
-        {1},
-        torch::TensorOptions().dtype(torch::kUInt8).device(options.device));
-    codecContext->hw_device_ctx = av_buffer_ref(getCudaContext());
-
-    TORCH_INTERNAL_ASSERT(
-        codecContext->hw_device_ctx,
-        "Failed to create/reference the CUDA HW device context for index=" +
-            std::to_string(options.device.index()) + ".");
-#else
-    throw std::runtime_error(
-        "CUDA support is not enabled in this build of TorchCodec.");
-#endif
+    codecContext->hw_device_ctx = initializeCudaContext(options.device);
   }
 
   retVal = avcodec_open2(streamInfo.codecContext.get(), codec, nullptr);
@@ -765,9 +659,8 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
 
 VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter(
     std::function<bool(int, AVFrame*)> filterFunction) {
-#ifdef ENABLE_NVTX
-  nvtx3::scoped_range loop{"decodeOneFrame"};
-#endif
+  NVTX_SCOPED_RANGE("decodeOneFrame");
+
   if (activeStreamIndices_.size() == 0) {
     throw std::runtime_error("No active streams configured.");
   }
@@ -856,9 +749,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter(
       continue;
     }
     {
-#ifdef ENABLE_NVTX
-      nvtx3::scoped_range loop{"avcodec_send_packet"};
-#endif
+      NVTX_SCOPED_RANGE("avcodec_send_packet");
       ffmpegStatus = avcodec_send_packet(
           streams_[packet->stream_index].codecContext.get(), packet.get());
     }
@@ -912,17 +803,8 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
     output.frame =
         convertFrameToTensorUsingFilterGraph(streamIndex, frame.get());
   } else if (streamInfo.options.device.is_cuda()) {
-#ifdef ENABLE_CUDA
-    {
-#ifdef ENABLE_NVTX
-      nvtx3::scoped_range loop{"convertFrameUsingCuda"};
-#endif
-      output.frame = convertFrameToTensorUsingCUDA(
+    output.frame = convertFrameToTensorUsingCuda(
         streamInfo.codecContext.get(), streamInfo.options, frame.get());
-    }
-#else
-    throw std::runtime_error("CUDA is not enabled in this build.");
-#endif // ENABLE_CUDA
   }
 } else if (output.streamType == AVMEDIA_TYPE_AUDIO) {
   // TODO: https://github.com/pytorch-labs/torchcodec/issues/85 implement

From afc2d53cb68e543dab3946dc725409d67a6bbd62 Mon Sep 17 00:00:00 2001
From: Scott Schneider
Date: Mon, 26 Aug 2024 19:28:26 -0700
Subject: [PATCH 2/2] refactor CUDA code into its own set of files

Summary:
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
---
 src/torchcodec/decoders/_core/CMakeLists.txt |   2 +
 src/torchcodec/decoders/_core/CUDACommon.cpp | 137 ++++++++++++++++++
 src/torchcodec/decoders/_core/CUDACommon.h   |  32 ++++
 .../decoders/_core/VideoDecoder.cpp          | 130 +----------------
 4 files changed, 177 insertions(+), 124 deletions(-)
 create mode 100644 src/torchcodec/decoders/_core/CUDACommon.cpp
 create mode 100644 src/torchcodec/decoders/_core/CUDACommon.h

diff --git a/src/torchcodec/decoders/_core/CMakeLists.txt b/src/torchcodec/decoders/_core/CMakeLists.txt
index ed8e8ef36..a36b94089 100644
--- a/src/torchcodec/decoders/_core/CMakeLists.txt
+++ b/src/torchcodec/decoders/_core/CMakeLists.txt
@@ -32,6 +32,8 @@ find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
 function(make_torchcodec_library library_name ffmpeg_target)
   set(
     sources
+    CUDACommon.h
+    CUDACommon.cpp
     FFMPEGCommon.h
     FFMPEGCommon.cpp
     VideoDecoder.h
diff --git a/src/torchcodec/decoders/_core/CUDACommon.cpp b/src/torchcodec/decoders/_core/CUDACommon.cpp
new file mode 100644
index 000000000..188fd9b44
--- /dev/null
+++ b/src/torchcodec/decoders/_core/CUDACommon.cpp
@@ -0,0 +1,137 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "src/torchcodec/decoders/_core/CUDACommon.h"
+
+#ifdef ENABLE_CUDA
+
+#include <c10/cuda/CUDAStream.h>
+#include <npp.h>
+#include <torch/types.h>
+
+extern "C" {
+#include <libavutil/hwcontext_cuda.h>
+}
+
+namespace facebook::torchcodec {
+namespace {
+
+AVBufferRef* getCudaContext() {
+  enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
+  TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
+  int err = 0;
+  AVBufferRef* hw_device_ctx;
+  err = av_hwdevice_ctx_create(
+      &hw_device_ctx,
+      type,
+      nullptr,
+      nullptr,
+      // Introduced in 58.26.100:
+      // https://github.com/FFmpeg/FFmpeg/blob/4acb9b7d1046944345ae506165fb55883d04d8a6/doc/APIchanges#L265
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
+      AV_CUDA_USE_CURRENT_CONTEXT
+#else
+      0
+#endif
+  );
+  if (err < 0) {
+    TORCH_CHECK(
+        false,
+        "Failed to create specified HW device",
+        getFFMPEGErrorStringFromErrorCode(err));
+  }
+  return hw_device_ctx;
+}
+
+torch::Tensor allocateDeviceTensor(
+    at::IntArrayRef shape,
+    torch::Device device,
+    const torch::Dtype dtype = torch::kUInt8) {
+  return torch::empty(
+      shape,
+      torch::TensorOptions()
+          .dtype(dtype)
+          .layout(torch::kStrided)
+          .device(device));
+}
+
+} // namespace
+
+} // facebook::torchcodec
+
+#endif // ENABLE_CUDA
+
+/*
+ * Entry points from non-CUDA code.
+ */
+
+namespace facebook::torchcodec {
+
+AVBufferRef* initializeCudaContext(const torch::Device& device) {
+#ifdef ENABLE_CUDA
+  TORCH_CHECK(device.type() == torch::DeviceType::CUDA, "Invalid device type.");
+
+  // We create a small tensor using pytorch to initialize the cuda context.
+  torch::Tensor dummyTensorForCudaInitialization = torch::zeros(
+      {1},
+      torch::TensorOptions().dtype(torch::kUInt8).device(device));
+  AVBufferRef* hwDeviceCtx = av_buffer_ref(getCudaContext());
+
+  TORCH_INTERNAL_ASSERT(
+      hwDeviceCtx,
+      "Failed to create/reference the CUDA HW device context for index=" +
+          std::to_string(device.index()) + ".");
+  return hwDeviceCtx;
+#else
+  throw std::runtime_error(
+      "CUDA support is not enabled in this build of TorchCodec.");
+#endif
+}
+
+torch::Tensor convertFrameToTensorUsingCuda(
+    const AVCodecContext* codecContext,
+    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const AVFrame* src) {
+#ifdef ENABLE_CUDA
+  NVTX_SCOPED_RANGE("convertFrameUsingCuda");
+  TORCH_CHECK(
+      src->format == AV_PIX_FMT_CUDA,
+      "Expected format to be AV_PIX_FMT_CUDA, got " +
+          std::string(av_get_pix_fmt_name((AVPixelFormat)src->format)));
+  int width = options.width.value_or(codecContext->width);
+  int height = options.height.value_or(codecContext->height);
+  NppStatus status;
+  NppiSize oSizeROI;
+  oSizeROI.width = width;
+  oSizeROI.height = height;
+  Npp8u* input[2];
+  input[0] = (Npp8u*)src->data[0];
+  input[1] = (Npp8u*)src->data[1];
+  torch::Tensor dst = allocateDeviceTensor({height, width, 3}, options.device);
+  auto start = std::chrono::high_resolution_clock::now();
+  status = nppiNV12ToRGB_8u_P2C3R(
+      input,
+      src->linesize[0],
+      static_cast<Npp8u*>(dst.data_ptr()),
+      dst.stride(0),
+      oSizeROI);
+  TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double, std::micro> duration = end - start;
+  VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width
+          << " took: " << duration.count() << "us" << std::endl;
+  if (options.dimensionOrder == "NCHW") {
+    // The docs guarantee this to return a view:
+    // https://pytorch.org/docs/stable/generated/torch.permute.html
+    dst = dst.permute({2, 0, 1});
+  }
+  return dst;
+#else
+  throw std::runtime_error(
+      "CUDA support is not enabled in this build of TorchCodec.");
+#endif
+}
+
+} // facebook::torchcodec
diff --git a/src/torchcodec/decoders/_core/CUDACommon.h b/src/torchcodec/decoders/_core/CUDACommon.h
new file mode 100644
index 000000000..e545c8b2c
--- /dev/null
+++ b/src/torchcodec/decoders/_core/CUDACommon.h
@@ -0,0 +1,32 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include "src/torchcodec/decoders/_core/VideoDecoder.h"
+#include
+#include
+
+#ifdef ENABLE_NVTX
+#include <nvtx3/nvtx3.hpp>
+#endif
+
+namespace facebook::torchcodec {
+
+#ifdef ENABLE_NVTX
+#define NVTX_SCOPED_RANGE(Annotation) nvtx3::scoped_range loop{Annotation}
+#else
+#define NVTX_SCOPED_RANGE(Annotation) do {} while (0)
+#endif
+
+AVBufferRef* initializeCudaContext(const torch::Device& device);
+
+torch::Tensor convertFrameToTensorUsingCuda(
+    const AVCodecContext* codecContext,
+    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const AVFrame* src);
+
+} // facebook::torchcodec
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
index a5c0fddfb..5edcf6aed 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -5,6 +5,7 @@
 // LICENSE file in the root directory of this source tree.
 
 #include "src/torchcodec/decoders/_core/VideoDecoder.h"
+#include "src/torchcodec/decoders/_core/CUDACommon.h"
 #include
 #include
 #include
@@ -13,15 +14,6 @@
 #include
 #include
 
-#ifdef ENABLE_CUDA
-#include <c10/cuda/CUDAStream.h>
-#include <npp.h>
-#include <torch/types.h>
-#ifdef ENABLE_NVTX
-#include <nvtx3/nvtx3.hpp>
-#endif
-#endif
-
 extern "C" {
 #include
 #include
@@ -29,9 +21,6 @@ extern "C" {
 #include
 #include
 #include
-#ifdef ENABLE_CUDA
-#include <libavutil/hwcontext_cuda.h>
-#endif
 }
 
 namespace facebook::torchcodec {
@@ -107,87 +96,6 @@ std::vector<std::string> splitStringWithDelimiters(
   return result;
 }
 
-#ifdef ENABLE_CUDA
-
-AVBufferRef* getCudaContext() {
-  enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
-  TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
-  int err = 0;
-  AVBufferRef* hw_device_ctx;
-  err = av_hwdevice_ctx_create(
-      &hw_device_ctx,
-      type,
-      nullptr,
-      nullptr,
-      // Introduced in 58.26.100:
-      // https://github.com/FFmpeg/FFmpeg/blob/4acb9b7d1046944345ae506165fb55883d04d8a6/doc/APIchanges#L265
-#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
-      AV_CUDA_USE_CURRENT_CONTEXT
-#else
-      0
-#endif
-  );
-  if (err < 0) {
-    TORCH_CHECK(
-        false,
-        "Failed to create specified HW device",
-        getFFMPEGErrorStringFromErrorCode(err));
-  }
-  return hw_device_ctx;
-}
-
-torch::Tensor allocateDeviceTensor(
-    at::IntArrayRef shape,
-    torch::Device device,
-    const torch::Dtype dtype = torch::kUInt8) {
-  return torch::empty(
-      shape,
-      torch::TensorOptions()
-          .dtype(dtype)
-          .layout(torch::kStrided)
-          .device(device));
-}
-
-torch::Tensor convertFrameToTensorUsingCUDA(
-    const AVCodecContext* codecContext,
-    const VideoDecoder::VideoStreamDecoderOptions& options,
-    const AVFrame* src) {
-  TORCH_CHECK(
-      src->format == AV_PIX_FMT_CUDA,
-      "Expected format to be AV_PIX_FMT_CUDA, got " +
-          std::string(av_get_pix_fmt_name((AVPixelFormat)src->format)));
-  int width = options.width.value_or(codecContext->width);
-  int height = options.height.value_or(codecContext->height);
-  NppStatus status;
-  NppiSize oSizeROI;
-  oSizeROI.width = width;
-  oSizeROI.height = height;
-  Npp8u* input[2];
-  input[0] = (Npp8u*)src->data[0];
-  input[1] = (Npp8u*)src->data[1];
-  torch::Tensor dst = allocateDeviceTensor({height, width, 3}, options.device);
-  auto start = std::chrono::high_resolution_clock::now();
-  status = nppiNV12ToRGB_8u_P2C3R(
-      input,
-      src->linesize[0],
-      static_cast<Npp8u*>(dst.data_ptr()),
-      dst.stride(0),
-      oSizeROI);
-  TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
-  auto end = std::chrono::high_resolution_clock::now();
-  std::chrono::duration<double, std::micro> duration = end - start;
-  VLOG(9) << "NPP Conversion of frame height=" << height << " width=" << width
-          << " took: " << duration.count() << "us" << std::endl;
-  if (options.dimensionOrder == "NCHW") {
-    // The docs guaranty this to return a view:
-    // https://pytorch.org/docs/stable/generated/torch.permute.html
-    dst = dst.permute({2, 0, 1});
-  }
-  return dst;
-}
-
-#endif
-
 } // namespace
 
 VideoDecoder::VideoStreamDecoderOptions::VideoStreamDecoderOptions(
@@ -490,21 +398,7 @@ void VideoDecoder::addVideoStreamDecoder(
   TORCH_CHECK_EQ(retVal, AVSUCCESS);
 
   if (options.device.type() == torch::DeviceType::CUDA) {
-#ifdef ENABLE_CUDA
-    // We create a small tensor using pytorch to initialize the cuda context.
-    torch::Tensor dummyTensorForCudaInitialization = torch::zeros(
-        {1},
-        torch::TensorOptions().dtype(torch::kUInt8).device(options.device));
-    codecContext->hw_device_ctx = av_buffer_ref(getCudaContext());
-
-    TORCH_INTERNAL_ASSERT(
-        codecContext->hw_device_ctx,
-        "Failed to create/reference the CUDA HW device context for index=" +
-            std::to_string(options.device.index()) + ".");
-#else
-    throw std::runtime_error(
-        "CUDA support is not enabled in this build of TorchCodec.");
-#endif
+    codecContext->hw_device_ctx = initializeCudaContext(options.device);
   }
 
   retVal = avcodec_open2(streamInfo.codecContext.get(), codec, nullptr);
@@ -765,9 +659,8 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
 
 VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter(
     std::function<bool(int, AVFrame*)> filterFunction) {
-#ifdef ENABLE_NVTX
-  nvtx3::scoped_range loop{"decodeOneFrame"};
-#endif
+  NVTX_SCOPED_RANGE("decodeOneFrame");
+
   if (activeStreamIndices_.size() == 0) {
     throw std::runtime_error("No active streams configured.");
   }
@@ -856,9 +749,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter(
       continue;
     }
     {
-#ifdef ENABLE_NVTX
-      nvtx3::scoped_range loop{"avcodec_send_packet"};
-#endif
+      NVTX_SCOPED_RANGE("avcodec_send_packet");
       ffmpegStatus = avcodec_send_packet(
          streams_[packet->stream_index].codecContext.get(), packet.get());
     }
@@ -912,17 +803,8 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
     output.frame =
         convertFrameToTensorUsingFilterGraph(streamIndex, frame.get());
   } else if (streamInfo.options.device.is_cuda()) {
-#ifdef ENABLE_CUDA
-    {
-#ifdef ENABLE_NVTX
-      nvtx3::scoped_range loop{"convertFrameUsingCuda"};
-#endif
-      output.frame = convertFrameToTensorUsingCUDA(
+    output.frame = convertFrameToTensorUsingCuda(
         streamInfo.codecContext.get(), streamInfo.options, frame.get());
-    }
-#else
-    throw std::runtime_error("CUDA is not enabled in this build.");
-#endif // ENABLE_CUDA
   }
 } else if (output.streamType == AVMEDIA_TYPE_AUDIO) {
   // TODO: https://github.com/pytorch-labs/torchcodec/issues/85 implement