diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
new file mode 100644
index 000000000..7e88efbd3
--- /dev/null
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -0,0 +1,576 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <torch/types.h>
+#include <mutex>
+#include <vector>
+
+#include "src/torchcodec/_core/BetaCudaDeviceInterface.h"
+
+#include "src/torchcodec/_core/DeviceInterface.h"
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/NVDECCache.h"
+
+// #include <cuda_runtime.h> // For cudaStreamSynchronize
+#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
+#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
+
+extern "C" {
+#include <libavutil/hwcontext_cuda.h>
+#include <libavutil/pixdesc.h>
+}
+
+namespace facebook::torchcodec {
+
+namespace {
+
+// Registers the factory for the (CUDA, "beta") interface key during static
+// initialization; the constructor below checks the resulting flag.
+static bool g_cuda_beta = registerDeviceInterface(
+    DeviceInterfaceKey(torch::kCUDA, /*variant=*/"beta"),
+    [](const torch::Device& dev) { return new BetaCudaDeviceInterface(dev); });
+
+// NVCUVID sequence callback: pUserData is the interface that owns the parser.
+static int CUDAAPI
+pfnSequenceCallback(void* pUserData, CUVIDEOFORMAT* videoFormat) {
+  auto* self = static_cast<BetaCudaDeviceInterface*>(pUserData);
+  return self->streamPropertyChange(videoFormat);
+}
+
+// NVCUVID decode callback: pUserData is the interface that owns the parser.
+static int CUDAAPI
+pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* pPicParams) {
+  auto* self = static_cast<BetaCudaDeviceInterface*>(pUserData);
+  return self->frameReadyForDecoding(pPicParams);
+}
+
+// Creates an NVDEC decoder for `videoFormat`; throws via TORCH_CHECK if the
+// format is unsupported, outside the GPU's size limits, or creation fails.
+static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
+  const auto codedWidth = videoFormat->coded_width;
+  const auto codedHeight = videoFormat->coded_height;
+  // Check decoder capabilities - same checks as DALI
+  auto caps = CUVIDDECODECAPS{};
+  caps.eCodecType = videoFormat->codec;
+  caps.eChromaFormat = videoFormat->chroma_format;
+  caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
+  CUresult result = cuvidGetDecoderCaps(&caps);
+  TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result);
+
+  TORCH_CHECK(
+      caps.bIsSupported,
+      "Codec configuration not supported on this GPU. "
+      "Codec: ",
+      static_cast<int>(videoFormat->codec),
+      ", chroma format: ",
+      static_cast<int>(videoFormat->chroma_format),
+      ", bit depth: ",
+      videoFormat->bit_depth_luma_minus8 + 8);
+
+  TORCH_CHECK(
+      codedWidth >= caps.nMinWidth && codedHeight >= caps.nMinHeight,
+      "Video is too small in at least one dimension. Provided: ",
+      codedWidth,
+      "x",
+      codedHeight,
+      " vs supported:",
+      caps.nMinWidth,
+      "x",
+      caps.nMinHeight);
+  TORCH_CHECK(
+      codedWidth <= caps.nMaxWidth && codedHeight <= caps.nMaxHeight,
+      "Video is too large in at least one dimension. Provided: ",
+      codedWidth,
+      "x",
+      codedHeight,
+      " vs supported:",
+      caps.nMaxWidth,
+      "x",
+      caps.nMaxHeight);
+
+  // See nMaxMBCount in cuviddec.h
+  constexpr unsigned int macroblockConstant = 256;
+  TORCH_CHECK(
+      codedWidth * codedHeight / macroblockConstant <= caps.nMaxMBCount,
+      "Video is too large (too many macroblocks). "
+      "Provided (width * height / ",
+      macroblockConstant,
+      "): ",
+      codedWidth * codedHeight / macroblockConstant,
+      " vs supported:",
+      caps.nMaxMBCount);
+
+  // Decoder creation parameters, taken from DALI
+  CUVIDDECODECREATEINFO decoder_info = {};
+  decoder_info.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
+  decoder_info.ChromaFormat = videoFormat->chroma_format;
+  decoder_info.CodecType = videoFormat->codec;
+  decoder_info.ulHeight = codedHeight;
+  decoder_info.ulWidth = codedWidth;
+  decoder_info.ulMaxHeight = codedHeight;
+  decoder_info.ulMaxWidth = codedWidth;
+  decoder_info.ulTargetHeight =
+      videoFormat->display_area.bottom - videoFormat->display_area.top;
+  decoder_info.ulTargetWidth =
+      videoFormat->display_area.right - videoFormat->display_area.left;
+  decoder_info.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
+  decoder_info.ulNumOutputSurfaces = 2;
+  decoder_info.display_area.left = videoFormat->display_area.left;
+  decoder_info.display_area.right = videoFormat->display_area.right;
+  decoder_info.display_area.top = videoFormat->display_area.top;
+  decoder_info.display_area.bottom = videoFormat->display_area.bottom;
+
+  // Own the handle storage before any check can throw, so it cannot leak.
+  auto decoder = std::make_unique<CUvideodecoder>();
+  result = cuvidCreateDecoder(decoder.get(), &decoder_info);
+  TORCH_CHECK(
+      result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
+  return UniqueCUvideodecoder(decoder.release(), CUvideoDecoderDeleter{});
+}
+
+} // namespace
+
+BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
+    : DeviceInterface(device) {
+  // Both checks throw: the static registration flag first, then device type.
+  TORCH_CHECK(g_cuda_beta, "BetaCudaDeviceInterface was not registered!");
+  TORCH_CHECK(device_.is_cuda(), "Unsupported device: ", device_.str());
+}
+
+BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
+  // TODONVDEC P0: we probably need to free the frames that have been decoded by
+  // NVDEC but not yet "mapped" - i.e. those that are still in frameBuffer_?
+
+  // Return the decoder to the NVDECCache obtained for this device index.
+  if (decoder_ != nullptr) {
+    auto&& cache = NVDECCache::getCache(device_.index());
+    cache.returnDecoder(&videoFormat_, std::move(decoder_));
+  }
+  if (videoParser_ != nullptr) {
+    // TODONVDEC P2: consider caching this? Does DALI do that?
+    cuvidDestroyVideoParser(videoParser_);
+    videoParser_ = nullptr;
+  }
+}
+
+void BetaCudaDeviceInterface::initializeInterface(AVStream* avStream) {
+  // NOTE(review): presumably forces CUDA initialization on device_ - confirm.
+  torch::Tensor dummyTensorForCudaInitialization = torch::empty(
+      {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
+
+  TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
+  timeBase_ = avStream->time_base;
+
+  const AVCodecParameters* codecpar = avStream->codecpar;
+  TORCH_CHECK(codecpar != nullptr, "CodecParameters cannot be null");
+  TORCH_CHECK(
+      // TODONVDEC P0 support more
+      codecpar->codec_id == AV_CODEC_ID_H264,
+      "Can only do H264 for now");
+
+  // Setup bit stream filters (BSF):
+  // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
+  // This is only needed for some formats, like H264 or HEVC.  TODONVDEC P1: For
+  // now we apply BSF unconditionally, but it should be optional  and dependent
+  // on codec and container.
+  const AVBitStreamFilter* filter = av_bsf_get_by_name("h264_mp4toannexb");
+  TORCH_CHECK(
+      filter != nullptr, "Failed to find h264_mp4toannexb bitstream filter");
+
+  AVBSFContext* rawContext = nullptr;
+  int status = av_bsf_alloc(filter, &rawContext);
+  TORCH_CHECK(
+      status >= AVSUCCESS,
+      "Failed to allocate bitstream filter: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  // From here on bitstreamFilter_ holds the allocated context.
+  bitstreamFilter_.reset(rawContext);
+
+  status = avcodec_parameters_copy(bitstreamFilter_->par_in, codecpar);
+  TORCH_CHECK(
+      status >= AVSUCCESS,
+      "Failed to copy codec parameters: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  status = av_bsf_init(bitstreamFilter_.get());
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Failed to initialize bitstream filter: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  // Create parser. Default values that aren't obvious are taken from DALI.
+  CUVIDPARSERPARAMS parserParams = {};
+  parserParams.CodecType = cudaVideoCodec_H264;
+  parserParams.ulMaxNumDecodeSurfaces = 8;
+  parserParams.ulMaxDisplayDelay = 0;
+  // Callback setup, all are triggered by the parser within a call
+  // to cuvidParseVideoData
+  parserParams.pUserData = this;
+  parserParams.pfnSequenceCallback = pfnSequenceCallback;
+  parserParams.pfnDecodePicture = pfnDecodePictureCallback;
+  parserParams.pfnDisplayPicture = nullptr;
+
+  const CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
+  TORCH_CHECK(
+      result == CUDA_SUCCESS, "Failed to create video parser: ", result);
+}
+
+// This callback is called by the parser within cuvidParseVideoData when there
+// is a change in the stream's properties (like resolution change), as specified
+// by CUVIDEOFORMAT. Particularly (but not just!), this is called at the very
+// start of the stream. Throws (TORCH_CHECK) on a null format or if no decoder
+// could be obtained; otherwise returns the decode-surface count (see below).
+// TODONVDEC P1: Code below mostly assume this is called only once at the start,
+// we should handle the case of multiple calls. Probably need to flush buffers,
+// etc.
+int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
+  TORCH_CHECK(videoFormat != nullptr, "Invalid video format");
+
+  videoFormat_ = *videoFormat;
+  if (videoFormat_.min_num_decode_surfaces == 0) {
+    // Same as DALI's fallback
+    videoFormat_.min_num_decode_surfaces = 20;
+  }
+
+  if (!decoder_) {
+    // Use our patched copy, not the parser's struct: the fallback above must
+    // reach decoder creation, and the destructor returns with &videoFormat_ too.
+    decoder_ = NVDECCache::getCache(device_.index()).getDecoder(&videoFormat_);
+    if (!decoder_) {
+      // TODONVDEC P0: consider re-configuring an existing decoder instead of
+      // re-creating one. See docs, see DALI.
+      decoder_ = createDecoder(&videoFormat_);
+    }
+    TORCH_CHECK(decoder_, "Failed to get or create decoder");
+  }
+
+  // DALI also returns min_num_decode_surfaces from this function. This
+  // instructs the parser to reset its ulMaxNumDecodeSurfaces field to this
+  // value.
+  return static_cast<int>(videoFormat_.min_num_decode_surfaces);
+}
+
+// Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to
+// the NVCUVID parser, after running it through the bitstream filter. A packet
+// without data is forwarded as an end-of-stream marker instead.
+// Returns AVSUCCESS, or AVERROR_EXTERNAL when cuvidParseVideoData fails.
+int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
+  CUVIDSOURCEDATAPACKET cuvidPacket = {};
+
+  const bool hasPayload = packet.get() && packet->data && packet->size > 0;
+  if (!hasPayload) {
+    // End of stream packet
+    cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
+    eofSent_ = true;
+  } else {
+    // Regular packet with data: filter it, then describe the filtered payload.
+    applyBSF(packet);
+
+    cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
+    cuvidPacket.payload = packet->data;
+    cuvidPacket.payload_size = packet->size;
+    cuvidPacket.timestamp = packet->pts;
+
+    // Like DALI: store packet PTS in queue to later assign to frames as they
+    // come out
+    packetsPtsQueue.push(packet->pts);
+  }
+
+  // The parser triggers our callbacks from within this call.
+  const CUresult parseResult = cuvidParseVideoData(videoParser_, &cuvidPacket);
+  return parseResult == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
+}
+
+void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
+  if (bitstreamFilter_ == nullptr) {
+    return;
+  }
+  auto* bsf = bitstreamFilter_.get();
+  int status = av_bsf_send_packet(bsf, packet.get());
+  TORCH_CHECK(
+      status >= AVSUCCESS,
+      "Failed to send packet to bitstream filter: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  // Receive the filtered data into a temporary packet.
+  // TODO P1: the docs mention there can theoretically be multiple output
+  // packets for a single input, i.e. we may need to call av_bsf_receive_packet
+  // more than once. We should figure out whether that applies to the BSF we're
+  // using.
+  AutoAVPacket filteredAutoPacket;
+  ReferenceAVPacket filteredPacket(filteredAutoPacket);
+  status = av_bsf_receive_packet(bsf, filteredPacket.get());
+  TORCH_CHECK(
+      status >= AVSUCCESS,
+      "Failed to receive packet from bitstream filter: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  // Free the original packet's data which isn't needed anymore, and move the
+  // fields of the filtered packet into the original packet. The filtered packet
+  // fields are re-set by av_packet_move_ref, so when it goes out of scope and
+  // gets destructed, it's not going to affect the original packet.
+  packet.reset(filteredPacket);
+  // TODONVDEC P0: consider cleaner ways to do this. Maybe we should let
+  // applyBSF return a new packet, and maybe that new packet needs to be a field
+  // on the interface to avoid complex lifetime issues.
+}
+
+// Parser triggers this callback within cuvidParseVideoData when a frame is
+// ready to be decoded, i.e. the parser received all the necessary packets for a
+// given frame. It means we can send that frame to be decoded by the hardware
+// NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
+// Returns 1 on success, 0 otherwise (decode failure, or while flushing).
+int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* pPicParams) {
+  if (isFlushing_) {
+    return 0;
+  }
+
+  TORCH_CHECK(pPicParams != nullptr, "Invalid picture parameters");
+  TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
+
+  // Send frame to be decoded by NVDEC - non-blocking call.
+  if (cuvidDecodePicture(*decoder_.get(), pPicParams) != CUDA_SUCCESS) {
+    return 0; // Yes, you're reading that right, 0 means error.
+  }
+
+  // The frame was sent to be decoded on the NVDEC hardware. Now we store some
+  // relevant info into our frame buffer so that we can retrieve the decoded
+  // frame later when receiveFrame() is called.
+  // Importantly we need to 'guess' the PTS of that frame. The heuristic we use
+  // (like in DALI) is that the frames are ready to be decoded in the same order
+  // as the packets were sent to the parser. So we assign the PTS of the frame
+  // by popping the PTS of the oldest packet in our packetsPtsQueue (note:
+  // oldest doesn't necessarily mean lowest PTS!).
+
+  TORCH_CHECK(
+      // TODONVDEC P0 the queue may be empty, handle that.
+      !packetsPtsQueue.empty(),
+      "PTS queue is empty when decoding a frame");
+  const int64_t guessedPts = packetsPtsQueue.front();
+  packetsPtsQueue.pop();
+
+  // Field values taken from DALI
+  FrameBuffer::Slot* slot = frameBuffer_.findEmptySlot();
+  CUVIDPARSERDISPINFO& info = slot->dispInfo;
+  info = {};
+  info.picture_index = pPicParams->CurrPicIdx;
+  info.progressive_frame = !pPicParams->field_pic_flag;
+  info.top_field_first = pPicParams->bottom_field_flag ^ 1;
+  info.repeat_first_field = 0;
+  info.timestamp = guessedPts;
+
+  slot->guessedPts = guessedPts;
+  slot->occupied = true;
+
+  return 1;
+}
+
+// Moral equivalent of avcodec_receive_frame(): finds the buffered frame whose
+// guessed PTS equals desiredPts (only valid in exact seek_mode, for now).
+// Returns AVSUCCESS, AVERROR(EAGAIN) if absent, AVERROR_EXTERNAL if map fails.
+int BetaCudaDeviceInterface::receiveFrame(
+    UniqueAVFrame& avFrame,
+    int64_t desiredPts) {
+  FrameBuffer::Slot* slot = frameBuffer_.findFrameWithExactPts(desiredPts);
+  if (slot == nullptr) {
+    // No frame found, instruct caller to try again later after sending more
+    // packets.
+    return AVERROR(EAGAIN);
+  }
+
+  slot->occupied = false; // freed before mapping, even if mapping then fails
+  slot->guessedPts = -1;
+
+  CUVIDPROCPARAMS procParams = {};
+  CUVIDPARSERDISPINFO dispInfo = slot->dispInfo;
+  procParams.progressive_frame = dispInfo.progressive_frame;
+  procParams.top_field_first = dispInfo.top_field_first;
+  procParams.unpaired_field = dispInfo.repeat_first_field < 0;
+  CUdeviceptr framePtr = 0;
+  unsigned int pitch = 0;
+
+  // We know the frame we want was sent to the hardware decoder, but now we need
+  // to "map" it to an "output surface" before we can use its data. This is a
+  // blocking call that waits until the frame is fully decoded and ready to be
+  // used.
+  CUresult result = cuvidMapVideoFrame(
+      *decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
+
+  if (result != CUDA_SUCCESS) {
+    return AVERROR_EXTERNAL;
+  }
+
+  avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);
+
+  // Unmap the frame so that the decoder can reuse its corresponding output
+  // surface (result unchecked). Whether this is blocking is unclear?
+  cuvidUnmapVideoFrame(*decoder_.get(), framePtr);
+  // TODONVDEC P0: Get clarity on this:
+  // We assume that the framePtr is still valid after unmapping. That framePtr
+  // is now part of the avFrame, which we'll return to the caller, and the
+  // caller will immediately use it for color-conversion, at which point a copy
+  // happens. After the copy, it doesn't matter whether framePtr is still valid.
+  // And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
+  // the copy is made, so there should be no risk of overwriting the data before
+  // the copy.
+  // Buuuut yeah, we need to get more clarity on what actually happens, and on
+  // what's needed. IIUC DALI makes the color-conversion copy immediately after
+  // cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
+  // in between. So maybe we should do the same.
+
+  return AVSUCCESS;
+}
+
+// Wraps a mapped NVDEC surface in a non-owning AV_PIX_FMT_CUDA AVFrame (no copy
+// of pixel data). Throws on a null pointer, bad dimensions or pitch < width.
+UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
+    CUdeviceptr framePtr,
+    unsigned int pitch,
+    const CUVIDPARSERDISPINFO& dispInfo) {
+  TORCH_CHECK(framePtr != 0, "Invalid CUDA frame pointer");
+  // Get frame dimensions from video format display area (not coded dimensions)
+  // This matches DALI's approach and avoids padding issues
+  int width = videoFormat_.display_area.right - videoFormat_.display_area.left;
+  int height = videoFormat_.display_area.bottom - videoFormat_.display_area.top;
+
+  TORCH_CHECK(width > 0 && height > 0, "Invalid frame dimensions");
+  TORCH_CHECK(
+      pitch >= static_cast<unsigned int>(width), "Pitch must be >= width");
+
+  UniqueAVFrame avFrame(av_frame_alloc());
+  TORCH_CHECK(avFrame.get() != nullptr, "Failed to allocate AVFrame");
+
+  avFrame->width = width;
+  avFrame->height = height;
+  avFrame->format = AV_PIX_FMT_CUDA;
+  avFrame->pts = dispInfo.timestamp; // == guessedPts
+
+  // Duration in time-base ticks, (1 / frame_rate) / timeBase_, in 64-bit math.
+  // Left at 0 (unknown) rather than dividing by zero when a term is missing.
+  // TODONVDEC P0: Move AVRational arithmetic to FFMPEGCommon, and put the
+  // similar SingleStreamDecoder stuff there too.
+  const int64_t ticksNum =
+      static_cast<int64_t>(videoFormat_.frame_rate.denominator) * timeBase_.den;
+  const int64_t ticksDen =
+      static_cast<int64_t>(videoFormat_.frame_rate.numerator) * timeBase_.num;
+  setDuration(avFrame, ticksDen != 0 ? ticksNum / ticksDen : 0);
+
+  // We need to assign the frame colorspace. This is crucial for proper color
+  // conversion. NVCUVID stores that in the matrix_coefficients field, but
+  // doesn't document the values. The mapping below is unverified, but seems
+  // to match the corresponding indices in the FFmpeg table used for colorspace
+  // conversion (ff_yuv2rgb_coeffs):
+  // https://ffmpeg.org/doxygen/trunk/yuv2rgb_8c_source.html#l00047
+  switch (videoFormat_.video_signal_description.matrix_coefficients) {
+    case 1:
+      avFrame->colorspace = AVCOL_SPC_BT709;
+      break;
+    case 6: // BT.601
+    default:
+      // Default to BT.601
+      avFrame->colorspace = AVCOL_SPC_SMPTE170M;
+      break;
+  }
+
+  avFrame->color_range =
+      videoFormat_.video_signal_description.video_full_range_flag
+      ? AVCOL_RANGE_JPEG
+      : AVCOL_RANGE_MPEG;
+
+  // Two-plane layout: data[0] is the first plane, data[1] starts pitch * height
+  // bytes later; both use `pitch`. NOTE(review): presumably NV12 - confirm.
+  avFrame->data[0] = reinterpret_cast<uint8_t*>(framePtr);
+  avFrame->data[1] = reinterpret_cast<uint8_t*>(framePtr + (pitch * height));
+  avFrame->data[2] = nullptr;
+  avFrame->data[3] = nullptr;
+  avFrame->linesize[0] = pitch;
+  avFrame->linesize[1] = pitch;
+  avFrame->linesize[2] = 0;
+  avFrame->linesize[3] = 0;
+
+  return avFrame;
+}
+
+void BetaCudaDeviceInterface::flush() {
+  // While this flag is set, frameReadyForDecoding() returns early for any
+  // picture the parser reports during the end-of-stream parse below.
+  isFlushing_ = true;
+
+  // TODONVDEC P0: simplify flushing and "eofSent_" logic. We should just have a
+  // "sendEofPacket()" function that does the right thing, instead of setting
+  // CUVID_PKT_ENDOFSTREAM in different places.
+  if (!eofSent_) {
+    CUVIDSOURCEDATAPACKET eosPacket = {};
+    eosPacket.flags = CUVID_PKT_ENDOFSTREAM;
+    if (cuvidParseVideoData(videoParser_, &eosPacket) == CUDA_SUCCESS) {
+      eofSent_ = true;
+    }
+  }
+
+  isFlushing_ = false;
+
+  // Drop every buffered frame and pending PTS, then clear the EOF flag.
+  for (FrameBuffer::Slot& slot : frameBuffer_) {
+    slot.guessedPts = -1;
+    slot.occupied = false;
+  }
+
+  std::queue<int64_t>().swap(packetsPtsQueue);
+  eofSent_ = false;
+}
+
+// Fills frameOutput from avFrame by delegating to a lazily-created default
+// CUDA interface. Throws (TORCH_CHECK) if avFrame is not AV_PIX_FMT_CUDA.
+void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
+    const VideoStreamOptions& videoStreamOptions,
+    const AVRational& timeBase,
+    UniqueAVFrame& avFrame,
+    FrameOutput& frameOutput,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
+  TORCH_CHECK(
+      avFrame->format == AV_PIX_FMT_CUDA,
+      "Expected CUDA format frame from BETA CUDA interface");
+  // TODONVDEC P1: using the 'default' cuda interface for color conversion is a
+  // temporary hack. We should abstract the color conversion stuff separately.
+  if (!defaultCudaInterface_) {
+    // Use device_, not an index-less CUDA device: the frame lives on that GPU.
+    defaultCudaInterface_ =
+        std::unique_ptr<DeviceInterface>(createDeviceInterface(device_));
+    AVCodecContext dummyCodecContext = {};
+    defaultCudaInterface_->initializeContext(&dummyCodecContext);
+  }
+
+  defaultCudaInterface_->convertAVFrameToFrameOutput(
+      videoStreamOptions,
+      timeBase,
+      avFrame,
+      frameOutput,
+      preAllocatedOutputTensor);
+}
+
+BetaCudaDeviceInterface::FrameBuffer::Slot*
+BetaCudaDeviceInterface::FrameBuffer::findEmptySlot() {
+  // Reuse the first free slot; grow the buffer only when all are occupied.
+  for (size_t i = 0; i < frameBuffer_.size(); ++i) {
+    if (!frameBuffer_[i].occupied) {
+      return &frameBuffer_[i];
+    }
+  }
+  return &frameBuffer_.emplace_back();
+}
+
+BetaCudaDeviceInterface::FrameBuffer::Slot*
+BetaCudaDeviceInterface::FrameBuffer::findFrameWithExactPts(
+    int64_t desiredPts) {
+  // Linear scan; only occupied slots can match. Returns nullptr if none does.
+  for (auto it = frameBuffer_.begin(); it != frameBuffer_.end(); ++it) {
+    const bool matches = it->occupied && it->guessedPts == desiredPts;
+    if (matches) return &*it;
+  }
+  return nullptr;
+}
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
new file mode 100644
index 000000000..d42885c75
--- /dev/null
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -0,0 +1,129 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+// BETA CUDA device interface that provides direct control over NVDEC
+// while keeping FFmpeg for demuxing. A lot of the logic, particularly the use
+// of a cache for the decoders, is inspired by DALI's implementation which is
+// APACHE 2.0:
+// https://github.com/NVIDIA/DALI/blob/c7539676a24a8e9e99a6e8665e277363c5445259/dali/operators/video/frames_decoder_gpu.cc#L1
+//
+// NVDEC / NVCUVID docs:
+// https://docs.nvidia.com/video-technologies/video-codec-sdk/13.0/nvdec-video-decoder-api-prog-guide/index.html#using-nvidia-video-decoder-nvdecode-api
+
+#pragma once
+
+#include "src/torchcodec/_core/Cache.h"
+#include "src/torchcodec/_core/DeviceInterface.h"
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/NVDECCache.h"
+
+#include <map>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <unordered_map>
+#include <vector>
+
+#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
+#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
+
+namespace facebook::torchcodec {
+
+// DeviceInterface that drives the NVCUVID parser and NVDEC decoder directly:
+// packets go in through sendPacket(), frames come out of receiveFrame().
+class BetaCudaDeviceInterface : public DeviceInterface {
+ public:
+  explicit BetaCudaDeviceInterface(const torch::Device& device);
+  virtual ~BetaCudaDeviceInterface();
+
+  void initializeInterface(AVStream* stream) override;
+
+  void convertAVFrameToFrameOutput(
+      const VideoStreamOptions& videoStreamOptions,
+      const AVRational& timeBase,
+      UniqueAVFrame& avFrame,
+      FrameOutput& frameOutput,
+      std::optional<torch::Tensor> preAllocatedOutputTensor =
+          std::nullopt) override;
+
+  bool canDecodePacketDirectly() const override {
+    return true;
+  }
+
+  int sendPacket(ReferenceAVPacket& packet) override;
+  int receiveFrame(UniqueAVFrame& avFrame, int64_t desiredPts) override;
+  void flush() override;
+
+  // NVDEC callback functions (must be public for C callbacks)
+  int streamPropertyChange(CUVIDEOFORMAT* videoFormat);
+  int frameReadyForDecoding(CUVIDPICPARAMS* pPicParams);
+
+ private:
+  // Apply bitstream filter, modifies packet in-place
+  void applyBSF(ReferenceAVPacket& packet);
+
+  class FrameBuffer {
+   public:
+    // Bookkeeping for one buffered frame. Default member initializers give an
+    // empty, zeroed slot without needing std::memset (and thus <cstring>).
+    struct Slot {
+      CUVIDPARSERDISPINFO dispInfo = {};
+      int64_t guessedPts = -1;
+      bool occupied = false;
+    };
+
+    // TODONVDEC P1: init size should probably be min_num_decode_surfaces from
+    // video format
+    FrameBuffer() : frameBuffer_(4) {}
+
+    ~FrameBuffer() = default;
+
+    Slot* findEmptySlot();
+    Slot* findFrameWithExactPts(int64_t desiredPts);
+
+    // Iterator support for range-based for loops
+    auto begin() {
+      return frameBuffer_.begin();
+    }
+
+    auto end() {
+      return frameBuffer_.end();
+    }
+
+   private:
+    std::vector<Slot> frameBuffer_;
+  };
+
+  UniqueAVFrame convertCudaFrameToAVFrame(
+      CUdeviceptr framePtr,
+      unsigned int pitch,
+      const CUVIDPARSERDISPINFO& dispInfo);
+
+  CUvideoparser videoParser_ = nullptr;
+  UniqueCUvideodecoder decoder_;
+  CUVIDEOFORMAT videoFormat_ = {};
+
+  FrameBuffer frameBuffer_;
+
+  std::queue<int64_t> packetsPtsQueue;
+
+  bool eofSent_ = false;
+
+  // Flush flag to prevent decode operations during flush (like DALI's
+  // isFlushing_)
+  bool isFlushing_ = false;
+
+  AVRational timeBase_ = {0, 0};
+
+  UniqueAVBSFContext bitstreamFilter_;
+
+  // Default CUDA interface for color conversion.
+  // TODONVDEC P2: we shouldn't need to keep a separate instance of the default.
+  // See other TODO there about how interfaces should be completely independent.
+  std::unique_ptr<DeviceInterface> defaultCudaInterface_;
+};
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
index e3f9102e2..7c04d79d4 100644
--- a/src/torchcodec/_core/CMakeLists.txt
+++ b/src/torchcodec/_core/CMakeLists.txt
@@ -98,7 +98,7 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
-	    list(APPEND core_sources CudaDeviceInterface.cpp)
+	    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp)
     endif()
 
     set(core_library_dependencies
@@ -107,9 +107,27 @@ function(make_torchcodec_libraries
     )
 
     if(ENABLE_CUDA)
+        # Try to find NVCUVID. Try the normal way first. This should work locally.
+        find_library(NVCUVID_LIBRARY NAMES nvcuvid)
+        # If not found, try with version suffix, then the hardcoded path when that
+        # file exists (keeps the FATAL_ERROR below reachable). Needed on the CI.
+        if(NOT NVCUVID_LIBRARY)
+            find_library(NVCUVID_LIBRARY NAMES nvcuvid.1 PATHS /usr/lib64 /usr/lib)
+        endif()
+        if(NOT NVCUVID_LIBRARY AND EXISTS "/usr/lib64/libnvcuvid.so.1")
+            set(NVCUVID_LIBRARY "/usr/lib64/libnvcuvid.so.1")
+        endif()
+
+        if(NVCUVID_LIBRARY)
+            message(STATUS "Found NVCUVID: ${NVCUVID_LIBRARY}")
+        else()
+            message(FATAL_ERROR "Could not find NVCUVID library")
+        endif()
+
         list(APPEND core_library_dependencies
             ${CUDA_nppi_LIBRARY}
             ${CUDA_nppicc_LIBRARY}
+            ${NVCUVID_LIBRARY}
         )
     endif()
 
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
index cf0da47b9..692a4aa31 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.cpp
+++ b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -10,7 +10,7 @@ namespace facebook::torchcodec {
 namespace {
 
 static bool g_cpu = registerDeviceInterface(
-    torch::kCPU,
+    DeviceInterfaceKey(torch::kCPU),
     [](const torch::Device& device) { return new CpuDeviceInterface(device); });
 
 } // namespace
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 6a69d4fc3..5629686b4 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -13,11 +13,21 @@ extern "C" {
 #include <libavutil/pixdesc.h>
 }
 
+// TODONVDEC P1 Changes were made to this file to accommodate the BETA CUDA
+// interface (see other TODONVDEC below). That's because the BETA CUDA interface
+// relies on this default CUDA interface to do the color conversion. That's
+// hacky, ugly, and leads to complicated code. We should refactor all this so
+// that an interface doesn't need to know anything about any other interface.
+// Note - this is more than just about the BETA CUDA interface: this default
+// interface already relies on the CPU interface to do software decoding when
+// needed, and that's already leading to similar complications.
+
 namespace facebook::torchcodec {
 namespace {
 
-static bool g_cuda =
-    registerDeviceInterface(torch::kCUDA, [](const torch::Device& device) {
+static bool g_cuda = registerDeviceInterface(
+    DeviceInterfaceKey(torch::kCUDA),
+    [](const torch::Device& device) {
       return new CudaDeviceInterface(device);
     });
 
@@ -216,10 +226,11 @@ std::unique_ptr<FiltersContext> CudaDeviceInterface::initializeFiltersContext(
     return nullptr;
   }
 
-  TORCH_CHECK(
-      avFrame->hw_frames_ctx != nullptr,
-      "The AVFrame does not have a hw_frames_ctx. "
-      "That's unexpected, please report this to the TorchCodec repo.");
+  if (avFrame->hw_frames_ctx == nullptr) {
+    // TODONVDEC P2 return early for the beta interface where avFrames don't
+    // have a hw_frames_ctx. We should get rid of this or improve the logic.
+    return nullptr;
+  }
 
   auto hwFramesCtx =
       reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
@@ -347,22 +358,23 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   // Above we checked that the AVFrame was on GPU, but that's not enough, we
   // also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
   // because this is what the NPP color conversion routines expect.
-  TORCH_CHECK(
-      avFrame->hw_frames_ctx != nullptr,
-      "The AVFrame does not have a hw_frames_ctx. "
-      "That's unexpected, please report this to the TorchCodec repo.");
-
-  auto hwFramesCtx =
-      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
-  AVPixelFormat actualFormat = hwFramesCtx->sw_format;
+  // TODONVDEC P2 this can be hit from the beta interface, but there's no
+  // hw_frames_ctx in this case. We should try to understand how that affects
+  // this validation.
+  AVHWFramesContext* hwFramesCtx = nullptr;
+  if (avFrame->hw_frames_ctx != nullptr) {
+    hwFramesCtx =
+        reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
+    AVPixelFormat actualFormat = hwFramesCtx->sw_format;
 
-  TORCH_CHECK(
-      actualFormat == AV_PIX_FMT_NV12,
-      "The AVFrame is ",
-      (av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
-                                         : "unknown"),
-      ", but we expected AV_PIX_FMT_NV12. "
-      "That's unexpected, please report this to the TorchCodec repo.");
+    TORCH_CHECK(
+        actualFormat == AV_PIX_FMT_NV12,
+        "The AVFrame is ",
+        (av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
+                                           : "unknown"),
+        ", but we expected AV_PIX_FMT_NV12. "
+        "That's unexpected, please report this to the TorchCodec repo.");
+  }
 
   auto frameDims =
       getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
@@ -396,19 +408,23 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   // arbitrary, but unfortunately we know it's hardcoded to be the default
   // stream by FFmpeg:
   // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
-  TORCH_CHECK(
-      hwFramesCtx->device_ctx != nullptr,
-      "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
-  auto cudaDeviceCtx =
-      static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
-  at::cuda::CUDAEvent nvdecDoneEvent;
-  at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
-      c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
-  nvdecDoneEvent.record(nvdecStream);
-
-  // Don't start NPP work before NVDEC is done decoding the frame!
   at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
-  nvdecDoneEvent.block(nppStream);
+  if (hwFramesCtx) {
+    // TODONVDEC P2 this block won't be hit from the beta interface because
+    // there is no hwFramesCtx, but we should still make sure there's no CUDA
+    // stream sync issue in the beta interface.
+    TORCH_CHECK(
+        hwFramesCtx->device_ctx != nullptr,
+        "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
+    auto cudaDeviceCtx =
+        static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
+    at::cuda::CUDAEvent nvdecDoneEvent;
+    at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
+        c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
+    nvdecDoneEvent.record(nvdecStream);
+    // Don't start NPP work before NVDEC is done decoding the frame!
+    nvdecDoneEvent.block(nppStream);
+  }
 
   // Create the NPP context if we haven't yet.
   nppCtx_->hStream = nppStream.stream();
diff --git a/src/torchcodec/_core/DeviceInterface.cpp b/src/torchcodec/_core/DeviceInterface.cpp
index 70b00fb62..f6c17f7b2 100644
--- a/src/torchcodec/_core/DeviceInterface.cpp
+++ b/src/torchcodec/_core/DeviceInterface.cpp
@@ -11,7 +11,8 @@
 namespace facebook::torchcodec {
 
 namespace {
-using DeviceInterfaceMap = std::map<torch::DeviceType, CreateDeviceInterfaceFn>;
+using DeviceInterfaceMap =
+    std::map<DeviceInterfaceKey, CreateDeviceInterfaceFn>;
 static std::mutex g_interface_mutex;
 
 DeviceInterfaceMap& getDeviceMap() {
@@ -30,50 +31,72 @@ std::string getDeviceType(const std::string& device) {
 } // namespace
 
 bool registerDeviceInterface(
-    torch::DeviceType deviceType,
+    const DeviceInterfaceKey& key,
     CreateDeviceInterfaceFn createInterface) {
   std::scoped_lock lock(g_interface_mutex);
   DeviceInterfaceMap& deviceMap = getDeviceMap();
 
   TORCH_CHECK(
-      deviceMap.find(deviceType) == deviceMap.end(),
-      "Device interface already registered for ",
-      deviceType);
-  deviceMap.insert({deviceType, createInterface});
+      deviceMap.find(key) == deviceMap.end(),
+      "Device interface already registered for device type ",
+      key.deviceType,
+      " variant '",
+      key.variant,
+      "'");
+  deviceMap.insert({key, createInterface});
 
   return true;
 }
 
-torch::Device createTorchDevice(const std::string device) {
+void validateDeviceInterface(
+    const std::string device,
+    const std::string variant) {
   std::scoped_lock lock(g_interface_mutex);
   std::string deviceType = getDeviceType(device);
+
   DeviceInterfaceMap& deviceMap = getDeviceMap();
 
+  // Find device interface that matches device type and variant
+  torch::DeviceType deviceTypeEnum = torch::Device(deviceType).type();
+
   auto deviceInterface = std::find_if(
       deviceMap.begin(),
       deviceMap.end(),
-      [&](const std::pair<torch::DeviceType, CreateDeviceInterfaceFn>& arg) {
-        return device.rfind(
-                   torch::DeviceTypeName(arg.first, /*lcase*/ true), 0) == 0;
+      [&](const std::pair<DeviceInterfaceKey, CreateDeviceInterfaceFn>& arg) {
+        return arg.first.deviceType == deviceTypeEnum &&
+            arg.first.variant == variant;
       });
-  TORCH_CHECK(
-      deviceInterface != deviceMap.end(), "Unsupported device: ", device);
 
-  return torch::Device(device);
+  TORCH_CHECK(
+      deviceInterface != deviceMap.end(),
+      "Unsupported device: ",
+      device,
+      " (device type: ",
+      deviceType,
+      ", variant: ",
+      variant,
+      ")");
 }
 
 std::unique_ptr<DeviceInterface> createDeviceInterface(
-    const torch::Device& device) {
-  auto deviceType = device.type();
+    const torch::Device& device,
+    const std::string_view variant) {
+  DeviceInterfaceKey key(device.type(), variant);
   std::scoped_lock lock(g_interface_mutex);
   DeviceInterfaceMap& deviceMap = getDeviceMap();
 
-  TORCH_CHECK(
-      deviceMap.find(deviceType) != deviceMap.end(),
-      "Unsupported device: ",
-      device);
+  auto it = deviceMap.find(key);
+  if (it != deviceMap.end()) {
+    return std::unique_ptr<DeviceInterface>(it->second(device));
+  }
 
-  return std::unique_ptr<DeviceInterface>(deviceMap[deviceType](device));
+  TORCH_CHECK(
+      false,
+      "No device interface found for device type: ",
+      device.type(),
+      " variant: '",
+      variant,
+      "'");
 }
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
index 9a7288eb0..b5701f8ba 100644
--- a/src/torchcodec/_core/DeviceInterface.h
+++ b/src/torchcodec/_core/DeviceInterface.h
@@ -17,6 +17,24 @@
 
 namespace facebook::torchcodec {
 
+// Key for device interface registration with device type + variant support.
+struct DeviceInterfaceKey {
+  torch::DeviceType deviceType;
+  // Non-owning view: the viewed characters must outlive the key.
+  std::string_view variant = "default"; // e.g., "default", "beta", etc.
+
+  explicit DeviceInterfaceKey(torch::DeviceType type) : deviceType(type) {}
+
+  DeviceInterfaceKey(torch::DeviceType type, std::string_view var)
+      : deviceType(type), variant(var) {}
+
+  // Ordering used by std::map lookups: device type first, then variant.
+  bool operator<(const DeviceInterfaceKey& other) const {
+    return deviceType != other.deviceType ? deviceType < other.deviceType
+                                          : variant < other.variant;
+  }
+};
+
 class DeviceInterface {
  public:
   DeviceInterface(const torch::Device& device) : device_(device) {}
@@ -27,11 +45,17 @@ class DeviceInterface {
     return device_;
   };
 
-  virtual std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) = 0;
+  virtual std::optional<const AVCodec*> findCodec(
+      [[maybe_unused]] const AVCodecID& codecId) {
+    return std::nullopt;
+  };
 
   // Initialize the hardware device that is specified in `device`. Some builds
   // support CUDA and others only support CPU.
-  virtual void initializeContext(AVCodecContext* codecContext) = 0;
+  virtual void initializeContext(
+      [[maybe_unused]] AVCodecContext* codecContext) {}
+
+  virtual void initializeInterface([[maybe_unused]] AVStream* stream) {}
 
   virtual void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
@@ -40,6 +64,44 @@ class DeviceInterface {
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt) = 0;
 
+  // ------------------------------------------
+  // Extension points for custom decoding paths
+  // ------------------------------------------
+
+  // Override to return true if this device interface can decode packets
+  // directly
+  virtual bool canDecodePacketDirectly() const {
+    return false;
+  }
+
+  // Moral equivalent of avcodec_send_packet()
+  // Returns AVSUCCESS on success, AVERROR(EAGAIN) if decoder queue full, or
+  // other AVERROR on failure
+  virtual int sendPacket([[maybe_unused]] ReferenceAVPacket& avPacket) {
+    TORCH_CHECK(
+        false,
+        "Send/receive packet decoding not implemented for this device interface");
+    return AVERROR(ENOSYS);
+  }
+
+  // Moral equivalent of avcodec_receive_frame()
+  // Returns AVSUCCESS on success, AVERROR(EAGAIN) if no frame ready,
+  // AVERROR_EOF if end of stream, or other AVERROR on failure
+  virtual int receiveFrame(
+      [[maybe_unused]] UniqueAVFrame& avFrame,
+      [[maybe_unused]] int64_t desiredPts) {
+    TORCH_CHECK(
+        false,
+        "Send/receive packet decoding not implemented for this device interface");
+    return AVERROR(ENOSYS);
+  }
+
+  // Flush remaining frames from decoder
+  virtual void flush() {
+    // Default implementation is no-op for standard decoders
+    // Custom decoders can override this method
+  }
+
  protected:
   torch::Device device_;
 };
@@ -48,12 +110,15 @@ using CreateDeviceInterfaceFn =
     std::function<DeviceInterface*(const torch::Device& device)>;
 
 bool registerDeviceInterface(
-    torch::DeviceType deviceType,
+    const DeviceInterfaceKey& key,
     const CreateDeviceInterfaceFn createInterface);
 
-torch::Device createTorchDevice(const std::string device);
+void validateDeviceInterface(
+    const std::string device,
+    const std::string variant);
 
 std::unique_ptr<DeviceInterface> createDeviceInterface(
-    const torch::Device& device);
+    const torch::Device& device,
+    const std::string_view variant = "default");
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
index 9ce7a4deb..200fc9359 100644
--- a/src/torchcodec/_core/FFMPEGCommon.cpp
+++ b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -33,6 +33,13 @@ AVPacket* ReferenceAVPacket::operator->() {
   return avPacket_;
 }
 
+void ReferenceAVPacket::reset(ReferenceAVPacket& other) {
+  if (this != &other) { // resetting from ourselves must leave the packet as is
+    av_packet_unref(avPacket_); // release what this wrapper currently holds
+    av_packet_move_ref(avPacket_, other.avPacket_); // then take over other's
+  }
+}
+
 AVCodecOnlyUseForCallingAVFindBestStream
 makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
 #if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
@@ -56,6 +63,14 @@ int64_t getDuration(const UniqueAVFrame& avFrame) {
 #endif
 }
 
+void setDuration(const UniqueAVFrame& avFrame, int64_t duration) {
+#if LIBAVUTIL_VERSION_MAJOR < 58 // older libavutil: the field is pkt_duration
+  avFrame->pkt_duration = duration;
+#else // libavutil major >= 58: the field is duration
+  avFrame->duration = duration;
+#endif
+}
+
 const int* getSupportedSampleRates(const AVCodec& avCodec) {
   const int* supportedSampleRates = nullptr;
 #if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
index 179c7464b..ac40f079a 100644
--- a/src/torchcodec/_core/FFMPEGCommon.h
+++ b/src/torchcodec/_core/FFMPEGCommon.h
@@ -12,6 +12,7 @@
 
 extern "C" {
 #include <libavcodec/avcodec.h>
+#include <libavcodec/bsf.h>
 #include <libavfilter/avfilter.h>
 #include <libavfilter/buffersrc.h>
 #include <libavformat/avformat.h>
@@ -86,6 +87,8 @@ using UniqueSwrContext =
     std::unique_ptr<SwrContext, Deleterp<SwrContext, void, swr_free>>;
 using UniqueAVAudioFifo = std::
     unique_ptr<AVAudioFifo, Deleter<AVAudioFifo, void, av_audio_fifo_free>>;
+using UniqueAVBSFContext =
+    std::unique_ptr<AVBSFContext, Deleterp<AVBSFContext, void, av_bsf_free>>;
 using UniqueAVBufferRef =
     std::unique_ptr<AVBufferRef, Deleterp<AVBufferRef, void, av_buffer_unref>>;
 using UniqueAVBufferSrcParameters = std::unique_ptr<
@@ -132,6 +135,7 @@ class ReferenceAVPacket {
   ~ReferenceAVPacket();
   AVPacket* get();
   AVPacket* operator->();
+  void reset(ReferenceAVPacket& other);
 };
 
 // av_find_best_stream is not const-correct before commit:
@@ -161,6 +165,7 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
 // struct member representing duration has changed across the versions we
 // support.
 int64_t getDuration(const UniqueAVFrame& frame);
+void setDuration(const UniqueAVFrame& frame, int64_t duration);
 
 const int* getSupportedSampleRates(const AVCodec& avCodec);
 const AVSampleFormat* getSupportedOutputSampleFormats(const AVCodec& avCodec);
diff --git a/src/torchcodec/_core/NVDECCache.cpp b/src/torchcodec/_core/NVDECCache.cpp
new file mode 100644
index 000000000..87ab5b0dc
--- /dev/null
+++ b/src/torchcodec/_core/NVDECCache.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <torch/types.h>
+#include <mutex>
+
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/NVDECCache.h"
+
+#include <cuda_runtime.h> // For cudaGetDevice
+
+extern "C" {
+#include <libavutil/hwcontext_cuda.h>
+#include <libavutil/pixdesc.h>
+}
+
+namespace facebook::torchcodec {
+
+NVDECCache& NVDECCache::getCache(int deviceIndex) {
+  constexpr int MAX_CUDA_GPUS = 128;
+  static NVDECCache cacheInstances[MAX_CUDA_GPUS];
+  if (deviceIndex == -1) { // -1 selects the current CUDA device
+    // TODO NVDEC P3: Unify with existing getNonNegativeDeviceIndex()
+    TORCH_CHECK(
+        cudaGetDevice(&deviceIndex) == cudaSuccess,
+        "Failed to get current CUDA device.");
+  }
+  TORCH_CHECK( // checked after resolving -1 so the index below is in range
+      deviceIndex >= 0 && deviceIndex < MAX_CUDA_GPUS,
+      "Invalid device index = ",
+      deviceIndex);
+  return cacheInstances[deviceIndex];
+}
+
+UniqueCUvideodecoder NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
+  // Hands out a cached decoder matching videoFormat, or nullptr on a miss.
+  const CacheKey wantedKey(videoFormat);
+  std::lock_guard<std::mutex> guard(cacheLock_);
+
+  auto entry = cache_.find(wantedKey);
+  if (entry == cache_.end()) {
+    return nullptr;
+  }
+  UniqueCUvideodecoder cached = std::move(entry->second);
+  cache_.erase(entry); // ownership moved out above; drop the empty slot
+  return cached;
+}
+
+bool NVDECCache::returnDecoder(
+    CUVIDEOFORMAT* videoFormat,
+    UniqueCUvideodecoder decoder) {
+  // Offers `decoder` back to the cache; returns whether the cache kept it.
+  if (decoder == nullptr) {
+    return false;
+  }
+
+  const CacheKey slotKey(videoFormat);
+  std::lock_guard<std::mutex> guard(cacheLock_);
+  // A full cache rejects the decoder, even when slotKey is already present.
+  const bool hasRoom = cache_.size() < MAX_CACHE_SIZE;
+  if (hasRoom) {
+    cache_[slotKey] = std::move(decoder); // replaces any entry under slotKey
+  }
+  return hasRoom;
+}
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/NVDECCache.h b/src/torchcodec/_core/NVDECCache.h
new file mode 100644
index 000000000..17fc99902
--- /dev/null
+++ b/src/torchcodec/_core/NVDECCache.h
@@ -0,0 +1,104 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <mutex>
+
+#include <cuda.h>
+#include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
+#include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
+
+namespace facebook::torchcodec {
+
+// This file implements a cache for NVDEC decoders.
+// TODONVDEC P3: Consider merging this with Cache.h. The main difference is that
+// this NVDEC Cache involves a cache key (the decoder parameters).
+
+struct CUvideoDecoderDeleter {
+  void operator()(CUvideodecoder* decoderPtr) const {
+    if (decoderPtr && *decoderPtr) {
+      cuvidDestroyDecoder(*decoderPtr); // only a non-null handle is destroyed
+    }
+    delete decoderPtr; // always free the slot, even when the handle is null
+  }
+};
+
+using UniqueCUvideodecoder =
+    std::unique_ptr<CUvideodecoder, CUvideoDecoderDeleter>;
+
+// A per-device cache for NVDEC decoders. There is one instance of this class
+// per GPU device, and it is accessed through the static getCache() method.
+class NVDECCache {
+ public:
+  static NVDECCache& getCache(int deviceIndex); // -1: current CUDA device
+
+  // Get (and remove) a decoder from cache - returns nullptr if none available
+  UniqueCUvideodecoder getDecoder(CUVIDEOFORMAT* videoFormat);
+
+  // Return decoder to cache - returns true if added, false if null or full
+  bool returnDecoder(CUVIDEOFORMAT* videoFormat, UniqueCUvideodecoder decoder);
+
+ private:
+  // Cache key struct: a decoder can be reused and taken from the cache only if
+  // all these parameters match.
+  struct CacheKey {
+    cudaVideoCodec codecType;
+    uint32_t width; // taken from coded_width
+    uint32_t height; // taken from coded_height
+    cudaVideoChromaFormat chromaFormat;
+    uint32_t bitDepthLumaMinus8;
+    uint8_t numDecodeSurfaces; // taken from min_num_decode_surfaces
+
+    CacheKey() = delete; // a key is always built from a CUVIDEOFORMAT
+
+    explicit CacheKey(CUVIDEOFORMAT* videoFormat)
+        : codecType(videoFormat->codec),
+          width(videoFormat->coded_width),
+          height(videoFormat->coded_height),
+          chromaFormat(videoFormat->chroma_format),
+          bitDepthLumaMinus8(videoFormat->bit_depth_luma_minus8),
+          numDecodeSurfaces(videoFormat->min_num_decode_surfaces) {}
+
+    CacheKey(const CacheKey&) = default;
+    CacheKey& operator=(const CacheKey&) = default;
+
+    // TODONVDEC P2: we only implement operator< which is enough for std::map,
+    // but:
+    // - we should consider using std::unordered_map
+    // - we should consider a more sophisticated and potentially less strict
+    // cache key comparison logic
+    bool operator<(const CacheKey& other) const {
+      return std::tie(
+                 codecType,
+                 width,
+                 height,
+                 chromaFormat,
+                 bitDepthLumaMinus8,
+                 numDecodeSurfaces) <
+          std::tie(
+                 other.codecType,
+                 other.width,
+                 other.height,
+                 other.chromaFormat,
+                 other.bitDepthLumaMinus8,
+                 other.numDecodeSurfaces);
+    }
+  };
+
+  NVDECCache() = default; // private: instances are created inside getCache()
+  ~NVDECCache() = default;
+
+  std::map<CacheKey, UniqueCUvideodecoder> cache_; // one idle decoder per key
+  std::mutex cacheLock_; // held by getDecoder/returnDecoder around cache_
+
+  // Max number of cached decoders, per device
+  static constexpr int MAX_CACHE_SIZE = 20;
+};
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index 81e3e4474..0a4bc32e0 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -399,6 +399,7 @@ void SingleStreamDecoder::addStream(
     int streamIndex,
     AVMediaType mediaType,
     const torch::Device& device,
+    const std::string_view deviceVariant,
     std::optional<int> ffmpegThreadCount) {
   TORCH_CHECK(
       activeStreamIndex_ == NO_ACTIVE_STREAM,
@@ -427,7 +428,7 @@ void SingleStreamDecoder::addStream(
   streamInfo.stream = formatContext_->streams[activeStreamIndex_];
   streamInfo.avMediaType = mediaType;
 
-  deviceInterface_ = createDeviceInterface(device);
+  deviceInterface_ = createDeviceInterface(device, deviceVariant);
 
   // This should never happen, checking just to be safe.
   TORCH_CHECK(
@@ -461,6 +462,7 @@ void SingleStreamDecoder::addStream(
   if (mediaType == AVMEDIA_TYPE_VIDEO) {
     if (deviceInterface_) {
       deviceInterface_->initializeContext(codecContext);
+      deviceInterface_->initializeInterface(streamInfo.stream);
     }
   }
 
@@ -468,6 +470,7 @@ void SingleStreamDecoder::addStream(
   TORCH_CHECK(retVal >= AVSUCCESS, getFFMPEGErrorStringFromErrorCode(retVal));
 
   codecContext->time_base = streamInfo.stream->time_base;
+
   containerMetadata_.allStreamMetadata[activeStreamIndex_].codecName =
       std::string(avcodec_get_name(codecContext->codec_id));
 
@@ -490,6 +493,7 @@ void SingleStreamDecoder::addVideoStream(
       streamIndex,
       AVMEDIA_TYPE_VIDEO,
       videoStreamOptions.device,
+      videoStreamOptions.deviceVariant,
       videoStreamOptions.ffmpegThreadCount);
 
   auto& streamMetadata =
@@ -1120,6 +1124,10 @@ void SingleStreamDecoder::maybeSeekToBeforeDesiredPts() {
 
   decodeStats_.numFlushes++;
   avcodec_flush_buffers(streamInfo.codecContext.get());
+
+  if (deviceInterface_) {
+    deviceInterface_->flush();
+  }
 }
 
 // --------------------------------------------------------------------------
@@ -1138,15 +1146,26 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
   }
 
   StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
-
-  // Need to get the next frame or error from PopFrame.
   UniqueAVFrame avFrame(av_frame_alloc());
   AutoAVPacket autoAVPacket;
   int status = AVSUCCESS;
   bool reachedEOF = false;
+
+  // TODONVDEC P2: Instead of defining useCustomInterface and rely on if/else
+  // blocks to dispatch to the interface or to FFmpeg, consider *always*
+  // dispatching to the interface. The default implementation of the interface's
+  // receiveFrame and sendPacket could just be calling avcodec_receive_frame and
+  // avcodec_send_packet. This would make the decoding loop even more generic.
+  bool useCustomInterface =
+      deviceInterface_ && deviceInterface_->canDecodePacketDirectly();
+
   while (true) {
-    status =
-        avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
+    if (useCustomInterface) {
+      status = deviceInterface_->receiveFrame(avFrame, cursor_);
+    } else {
+      status =
+          avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
+    }
 
     if (status != AVSUCCESS && status != AVERROR(EAGAIN)) {
       // Non-retriable error
@@ -1169,7 +1188,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
 
     if (reachedEOF) {
       // We don't have any more packets to receive. So keep on pulling frames
-      // from its internal buffers.
+      // from decoder's internal buffers.
       continue;
     }
 
@@ -1181,11 +1200,19 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
       decodeStats_.numPacketsRead++;
 
       if (status == AVERROR_EOF) {
-        // End of file reached. We must drain the codec by sending a nullptr
-        // packet.
-        status = avcodec_send_packet(
-            streamInfo.codecContext.get(),
-            /*avpkt=*/nullptr);
+        // End of file reached. We must drain the decoder
+        if (useCustomInterface) {
+          // TODONVDEC P0: Re-think this. This should be simpler.
+          AutoAVPacket eofAutoPacket;
+          ReferenceAVPacket eofPacket(eofAutoPacket);
+          eofPacket->data = nullptr;
+          eofPacket->size = 0;
+          status = deviceInterface_->sendPacket(eofPacket);
+        } else {
+          status = avcodec_send_packet(
+              streamInfo.codecContext.get(),
+              /*avpkt=*/nullptr);
+        }
         TORCH_CHECK(
             status >= AVSUCCESS,
             "Could not flush decoder: ",
@@ -1210,7 +1237,11 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
 
     // We got a valid packet. Send it to the decoder, and we'll receive it in
     // the next iteration.
-    status = avcodec_send_packet(streamInfo.codecContext.get(), packet.get());
+    if (useCustomInterface) {
+      status = deviceInterface_->sendPacket(packet);
+    } else {
+      status = avcodec_send_packet(streamInfo.codecContext.get(), packet.get());
+    }
     TORCH_CHECK(
         status >= AVSUCCESS,
         "Could not push packet to decoder: ",
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
index 56bb8bb58..779acd273 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.h
+++ b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -318,6 +318,7 @@ class SingleStreamDecoder {
       int streamIndex,
       AVMediaType mediaType,
       const torch::Device& device = torch::kCPU,
+      const std::string_view deviceVariant = "default",
       std::optional<int> ffmpegThreadCount = std::nullopt);
 
   // Returns the "best" stream index for a given media type. The "best" is
diff --git a/src/torchcodec/_core/StreamOptions.h b/src/torchcodec/_core/StreamOptions.h
index 19cc5126c..65f2782a8 100644
--- a/src/torchcodec/_core/StreamOptions.h
+++ b/src/torchcodec/_core/StreamOptions.h
@@ -9,6 +9,7 @@
 #include <torch/types.h>
 #include <optional>
 #include <string>
+#include <string_view>
 
 namespace facebook::torchcodec {
 
@@ -38,6 +39,8 @@ struct VideoStreamOptions {
   std::optional<ColorConversionLibrary> colorConversionLibrary;
   // By default we use CPU for decoding for both C++ and python users.
   torch::Device device = torch::kCPU;
+  // Device variant (e.g., "default", "beta"). Owned copy, not a borrowed view.
+  std::string deviceVariant = "default";
 
   // Encoding options
   std::optional<int> bitRate;
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index a865bdaed..b9693d1ff 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -43,9 +43,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
   m.def(
-      "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
+      "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
   m.def(
-      "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
+      "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
   m.def(
       "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
   m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -257,7 +257,8 @@ void _add_video_stream(
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
-    std::optional<std::string_view> device = std::nullopt,
+    std::string_view device = "cpu",
+    std::string_view device_variant = "default",
     std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
         custom_frame_mappings = std::nullopt,
     std::optional<std::string_view> color_conversion_library = std::nullopt) {
@@ -287,9 +288,12 @@ void _add_video_stream(
           ". color_conversion_library must be either filtergraph or swscale.");
     }
   }
-  if (device.has_value()) {
-    videoStreamOptions.device = createTorchDevice(std::string(device.value()));
-  }
+
+  validateDeviceInterface(std::string(device), std::string(device_variant));
+
+  videoStreamOptions.device = torch::Device(std::string(device));
+  videoStreamOptions.deviceVariant = device_variant;
+
   std::optional<SingleStreamDecoder::FrameMappings> converted_mappings =
       custom_frame_mappings.has_value()
       ? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
@@ -307,7 +311,8 @@ void add_video_stream(
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
-    std::optional<std::string_view> device = std::nullopt,
+    std::string_view device = "cpu",
+    std::string_view device_variant = "default",
     const std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>&
         custom_frame_mappings = std::nullopt) {
   _add_video_stream(
@@ -318,6 +323,7 @@ void add_video_stream(
       dimension_order,
       stream_index,
       device,
+      device_variant,
       custom_frame_mappings);
 }
 
diff --git a/src/torchcodec/_core/nvcuvid_include/cuviddec.h b/src/torchcodec/_core/nvcuvid_include/cuviddec.h
new file mode 100644
index 000000000..4e70fe5a4
--- /dev/null
+++ b/src/torchcodec/_core/nvcuvid_include/cuviddec.h
@@ -0,0 +1,1374 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2024 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*****************************************************************************************************/
+//! \file cuviddec.h
+//! NVDECODE API provides video decoding interface to NVIDIA GPU devices.
+//! This file contains constants, structure definitions and function prototypes
+//! used for decoding.
+/*****************************************************************************************************/
+
+#if !defined(__CUDA_VIDEO_H__)
+#define __CUDA_VIDEO_H__
+
+#ifndef __cuda_cuda_h__
+#include <cuda.h>
+#endif // __cuda_cuda_h__
+
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || \
+    defined(AMD64) || defined(_M_AMD64)
+#if (CUDA_VERSION >= 3020) && \
+    (!defined(CUDA_FORCE_API_VERSION) || (CUDA_FORCE_API_VERSION >= 3020))
+#define __CUVID_DEVPTR64
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+typedef void* CUvideodecoder; // Opaque decoder handle
+typedef struct _CUcontextlock_st* CUvideoctxlock; // Opaque context-lock handle
+
+/*********************************************************************************/
+//! \enum cudaVideoCodec
+//! Video codec enums
+//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
+/*********************************************************************************/
+typedef enum cudaVideoCodec_enum {
+  cudaVideoCodec_MPEG1 = 0, /**<  MPEG1             */
+  cudaVideoCodec_MPEG2, /**<  MPEG2             */
+  cudaVideoCodec_MPEG4, /**<  MPEG4             */
+  cudaVideoCodec_VC1, /**<  VC1               */
+  cudaVideoCodec_H264, /**<  H264              */
+  cudaVideoCodec_JPEG, /**<  JPEG              */
+  cudaVideoCodec_H264_SVC, /**<  H264-SVC          */
+  cudaVideoCodec_H264_MVC, /**<  H264-MVC          */
+  cudaVideoCodec_HEVC, /**<  HEVC              */
+  cudaVideoCodec_VP8, /**<  VP8               */
+  cudaVideoCodec_VP9, /**<  VP9               */
+  cudaVideoCodec_AV1, /**<  AV1               */
+  cudaVideoCodec_NumCodecs, /**<  Count of codecs above (not a codec) */
+  // Uncompressed YUV formats; each value packs four ASCII chars (FourCC-like)
+  cudaVideoCodec_YUV420 =
+      (('I' << 24) | ('Y' << 16) | ('U' << 8) | ('V')), /**< Y,U,V (4:2:0) */
+  cudaVideoCodec_YV12 =
+      (('Y' << 24) | ('V' << 16) | ('1' << 8) | ('2')), /**< Y,V,U (4:2:0) */
+  cudaVideoCodec_NV12 =
+      (('N' << 24) | ('V' << 16) | ('1' << 8) | ('2')), /**< Y,UV  (4:2:0) */
+  cudaVideoCodec_YUYV =
+      (('Y' << 24) | ('U' << 16) | ('Y' << 8) |
+       ('V')), /**< YUYV/YUY2 (4:2:2)  */
+  cudaVideoCodec_UYVY =
+      (('U' << 24) | ('Y' << 16) | ('V' << 8) | ('Y')) /**< UYVY (4:2:2) */
+} cudaVideoCodec;
+
+/*********************************************************************************/
+//! \enum cudaVideoSurfaceFormat
+//! Video surface format enums used for output format of decoded output
+//! These enums are used in CUVIDDECODECREATEINFO structure
+/*********************************************************************************/
+typedef enum cudaVideoSurfaceFormat_enum {
+  cudaVideoSurfaceFormat_NV12 =
+      0, /**< Semi-Planar YUV [Y plane followed by interleaved UV plane]     */
+  cudaVideoSurfaceFormat_P016 =
+      1, /**< 16 bit Semi-Planar YUV [Y plane followed by interleaved UV plane].
+              Can be used for 10 bit (6 LSBs zero), 12 bit (4 LSBs zero)     */
+  cudaVideoSurfaceFormat_YUV444 =
+      2, /**< Planar YUV [Y plane followed by U and V planes]                */
+  cudaVideoSurfaceFormat_YUV444_16Bit =
+      3, /**< 16 bit Planar YUV [Y plane followed by U and V planes].
+              Can be used for 10 bit (6 LSBs zero), 12 bit (4 LSBs zero)     */
+} cudaVideoSurfaceFormat;
+
+/******************************************************************************************************************/
+//! \enum cudaVideoDeinterlaceMode
+//! Deinterlacing mode enums
+//! These enums are used in CUVIDDECODECREATEINFO structure
+//! Use cudaVideoDeinterlaceMode_Weave for progressive content and for content
+//! that doesn't need deinterlacing. cudaVideoDeinterlaceMode_Adaptive needs
+//! more video memory than other DI modes.
+/******************************************************************************************************************/
+typedef enum cudaVideoDeinterlaceMode_enum {
+  cudaVideoDeinterlaceMode_Weave =
+      0, /**< Weave both fields (no deinterlacing) */
+  cudaVideoDeinterlaceMode_Bob, /**< Drop one field                       */
+  cudaVideoDeinterlaceMode_Adaptive /**< Adaptive deinterlacing               */
+} cudaVideoDeinterlaceMode;
+
+/**************************************************************************************************************/
+//! \enum cudaVideoChromaFormat
+//! Chroma format enums
+//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
+/**************************************************************************************************************/
+typedef enum cudaVideoChromaFormat_enum {
+  cudaVideoChromaFormat_Monochrome = 0, /**< Monochrome */
+  cudaVideoChromaFormat_420, /**< YUV 4:2:0 (implicit value 1) */
+  cudaVideoChromaFormat_422, /**< YUV 4:2:2 (implicit value 2) */
+  cudaVideoChromaFormat_444 /**< YUV 4:4:4 (implicit value 3) */
+} cudaVideoChromaFormat;
+
+/*************************************************************************************************************/
+//! \enum cudaVideoCreateFlags
+//! Decoder flag enums to select preferred decode path
+//! cudaVideoCreate_Default and cudaVideoCreate_PreferCUVID are most optimized;
+//! use these whenever possible.
+/*************************************************************************************************************/
+typedef enum cudaVideoCreateFlags_enum {
+  cudaVideoCreate_Default =
+      0x00, /**< Default operation mode: use dedicated video engines */
+  cudaVideoCreate_PreferCUDA = 0x01, /**< Use CUDA-based decoder (requires valid
+                                        vidLock object for multi-threading) */
+  cudaVideoCreate_PreferDXVA = 0x02, /**< Go through DXVA internally if possible
+                                        (requires D3D9 interop)             */
+  cudaVideoCreate_PreferCUVID =
+      0x04 /**< Use dedicated video engines directly */
+} cudaVideoCreateFlags;
+
+/*************************************************************************/
+//! \enum cuvidDecodeStatus
+//! Decode status enums
+//! These enums are used in CUVIDGETDECODESTATUS structure
+/*************************************************************************/
+typedef enum cuvidDecodeStatus_enum {
+  cuvidDecodeStatus_Invalid = 0, // Decode status is not valid
+  cuvidDecodeStatus_InProgress = 1, // Decode is in progress
+  cuvidDecodeStatus_Success = 2, // Decode is completed without any errors
+  // Values 3 to 7 are reserved for future use
+  cuvidDecodeStatus_Error =
+      8, // Decode is completed with an error (error is not concealed)
+  cuvidDecodeStatus_Error_Concealed =
+      9, // Decode is completed with an error and error is concealed
+} cuvidDecodeStatus;
+
+/**************************************************************************************************************/
+//! \struct CUVIDDECODECAPS
+//! This structure is used in cuvidGetDecoderCaps API
+/**************************************************************************************************************/
+typedef struct _CUVIDDECODECAPS {
+  cudaVideoCodec eCodecType; /**< IN: cudaVideoCodec_XXX */
+  cudaVideoChromaFormat eChromaFormat; /**< IN: cudaVideoChromaFormat_XXX */
+  unsigned int nBitDepthMinus8; /**< IN: The Value "BitDepth minus 8" */
+  unsigned int reserved1[3]; /**< Reserved for future use - set to zero */
+
+  unsigned char
+      bIsSupported; /**< OUT: 1 if codec supported, 0 if not supported */
+  unsigned char
+      nNumNVDECs; /**< OUT: Number of NVDECs that can support IN params */
+  unsigned short nOutputFormatMask; /**< OUT: each bit represents corresponding
+                                       cudaVideoSurfaceFormat enum */
+  unsigned int nMaxWidth; /**< OUT: Max supported coded width in pixels */
+  unsigned int nMaxHeight; /**< OUT: Max supported coded height in pixels */
+  unsigned int nMaxMBCount; /**< OUT: Max supported macroblock count;
+                               CodedWidth*CodedHeight/256 must be <=
+                               nMaxMBCount */
+  unsigned short nMinWidth; /**< OUT: Min supported coded width in pixels */
+  unsigned short nMinHeight; /**< OUT: Min supported coded height in pixels */
+  unsigned char
+      bIsHistogramSupported; /**< OUT: 1 if Y component histogram output is
+                                supported, 0 if not. Note: histogram is
+                                computed on original picture data before any
+                                post-processing like scaling, cropping, etc. is
+                                applied */
+  unsigned char nCounterBitDepth; /**< OUT: histogram counter bit depth */
+  unsigned short nMaxHistogramBins; /**< OUT: Max number of histogram bins */
+  unsigned int reserved3[10]; /**< Reserved for future use - set to zero */
+} CUVIDDECODECAPS;
+
+/**************************************************************************************************************/
+//! \struct CUVIDDECODECREATEINFO
+//! This structure is used in cuvidCreateDecoder API
+/**************************************************************************************************************/
+typedef struct _CUVIDDECODECREATEINFO {
+  unsigned long ulWidth; /**< IN: Coded sequence width in pixels */
+  unsigned long ulHeight; /**< IN: Coded sequence height in pixels */
+  unsigned long ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode
+                                        surfaces */
+  cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */
+  cudaVideoChromaFormat ChromaFormat; /**< IN: cudaVideoChromaFormat_XXX */
+  unsigned long ulCreationFlags; /**< IN: Decoder creation flags
+                                    (cudaVideoCreateFlags_XXX) */
+  unsigned long bitDepthMinus8; /**< IN: The value "BitDepth minus 8" */
+  unsigned long
+      ulIntraDecodeOnly; /**< IN: Set 1 only if video has all intra frames
+                            (default value is 0). This will optimize video
+                            memory for Intra frames only decoding. The support
+                            is limited to specific codecs - H264, HEVC, VP9, the
+                            flag will be ignored for codecs which are not
+                            supported. However decoding might fail if the flag
+                            is enabled in case of supported codecs for regular
+                            bit streams having P and/or B frames.          */
+  unsigned long ulMaxWidth; /**< IN: Coded sequence max width in pixels used
+                               with reconfigure Decoder           */
+  unsigned long ulMaxHeight; /**< IN: Coded sequence max height in pixels used
+                                with reconfigure Decoder          */
+  unsigned long Reserved1; /**< Reserved for future use - set to zero */
+
+  /**
+   * IN: area of the frame that should be displayed
+   */
+  struct {
+    short left;
+    short top;
+    short right;
+    short bottom;
+  } display_area;
+
+  cudaVideoSurfaceFormat OutputFormat; /**< IN: cudaVideoSurfaceFormat_XXX */
+  cudaVideoDeinterlaceMode
+      DeinterlaceMode; /**< IN: cudaVideoDeinterlaceMode_XXX */
+  unsigned long ulTargetWidth; /**< IN: Post-processed output width (Should be
+                                  aligned to 2)           */
+  unsigned long ulTargetHeight; /**< IN: Post-processed output height (Should be
+                                   aligned to 2)          */
+  unsigned long ulNumOutputSurfaces; /**< IN: Maximum number of output surfaces
+                                        simultaneously mapped        */
+  CUvideoctxlock vidLock; /**< IN: If non-NULL, context lock used for
+                             synchronizing ownership of the cuda context. Needed
+                             for cudaVideoCreate_PreferCUDA decode     */
+
+  /**
+   * IN: target rectangle in the output frame (for aspect ratio conversion).
+   * If a null rectangle is specified, {0,0,ulTargetWidth,ulTargetHeight} will
+   * be used.
+   */
+  struct {
+    short left;
+    short top;
+    short right;
+    short bottom;
+  } target_rect;
+
+  unsigned long
+      enableHistogram; /**< IN: enable histogram output, if supported */
+  unsigned long Reserved2[4]; /**< Reserved for future use - set to zero */
+} CUVIDDECODECREATEINFO;
+
+/*********************************************************/
+//! \struct CUVIDH264DPBENTRY
+//! H.264 DPB entry
+//! This structure is used in CUVIDH264PICPARAMS structure
+/*********************************************************/
+typedef struct _CUVIDH264DPBENTRY {
+  int PicIdx; /**< picture index of reference frame */
+  int FrameIdx; /**< frame_num (short-term) or LongTermFrameIdx (long-term) */
+  int is_long_term; /**< 0=short term reference, 1=long term reference */
+  int not_existing; /**< non-existing reference frame (corresponding PicIdx
+                       should be set to -1) */
+  int used_for_reference; /**< 0=unused, 1=top_field, 2=bottom_field,
+                             3=both_fields                    */
+  int FieldOrderCnt[2]; /**< field order count of top and bottom fields */
+} CUVIDH264DPBENTRY;
+
+/************************************************************/
+//! \struct CUVIDH264MVCEXT
+//! H.264 MVC picture parameters ext
+//! This structure is used in CUVIDH264PICPARAMS structure
+/************************************************************/
+typedef struct _CUVIDH264MVCEXT {
+  int num_views_minus1; /**< Max number of coded views minus 1 in video;
+                           range: 0 to 1023 */
+  int view_id; /**< view identifier */
+  unsigned char
+      inter_view_flag; /**< 1 if used for inter-view prediction, 0 if not */
+  unsigned char num_inter_view_refs_l0; /**< number of inter-view ref pics in
+                                           RefPicList0 */
+  unsigned char num_inter_view_refs_l1; /**< number of inter-view ref pics in
+                                           RefPicList1 */
+  unsigned char MVCReserved8Bits; /**< Reserved bits */
+  int InterViewRefsL0[16]; /**< view id of the i-th view component for
+                              inter-view prediction in RefPicList0 */
+  int InterViewRefsL1[16]; /**< view id of the i-th view component for
+                              inter-view prediction in RefPicList1 */
+} CUVIDH264MVCEXT;
+
+/*********************************************************/
+//! \struct CUVIDH264SVCEXT
+//! H.264 SVC picture parameters ext
+//! This structure is used in CUVIDH264PICPARAMS structure
+/*********************************************************/
+typedef struct _CUVIDH264SVCEXT {
+  unsigned char profile_idc;
+  unsigned char level_idc;
+  unsigned char DQId;
+  unsigned char DQIdMax;
+  unsigned char disable_inter_layer_deblocking_filter_idc;
+  unsigned char ref_layer_chroma_phase_y_plus1;
+  signed char inter_layer_slice_alpha_c0_offset_div2;
+  signed char inter_layer_slice_beta_offset_div2;
+
+  unsigned short DPBEntryValidFlag;
+  unsigned char inter_layer_deblocking_filter_control_present_flag;
+  unsigned char extended_spatial_scalability_idc;
+  unsigned char adaptive_tcoeff_level_prediction_flag;
+  unsigned char slice_header_restriction_flag;
+  unsigned char chroma_phase_x_plus1_flag;
+  unsigned char chroma_phase_y_plus1;
+
+  unsigned char tcoeff_level_prediction_flag;
+  unsigned char constrained_intra_resampling_flag;
+  unsigned char ref_layer_chroma_phase_x_plus1_flag;
+  unsigned char store_ref_base_pic_flag;
+  unsigned char Reserved8BitsA;
+  unsigned char Reserved8BitsB;
+
+  short scaled_ref_layer_left_offset;
+  short scaled_ref_layer_top_offset;
+  short scaled_ref_layer_right_offset;
+  short scaled_ref_layer_bottom_offset;
+  unsigned short Reserved16Bits;
+  struct _CUVIDPICPARAMS*
+      pNextLayer; /**< Points to the picparams for the next layer to be decoded.
+                       Linked list ends at the target layer. */
+  int bRefBaseLayer; /**< Boolean flag: whether to store ref base pic */
+} CUVIDH264SVCEXT;
+
+/******************************************************/
+//! \struct CUVIDH264PICPARAMS
+//! H.264 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/******************************************************/
+typedef struct _CUVIDH264PICPARAMS {
+  // SPS (sequence parameter set) fields
+  int log2_max_frame_num_minus4;
+  int pic_order_cnt_type;
+  int log2_max_pic_order_cnt_lsb_minus4;
+  int delta_pic_order_always_zero_flag;
+  int frame_mbs_only_flag;
+  int direct_8x8_inference_flag;
+  int num_ref_frames; // NOTE: shall meet level 4.1 restrictions
+  unsigned char residual_colour_transform_flag;
+  unsigned char bit_depth_luma_minus8; // Must be 0 (only 8-bit supported)
+  unsigned char bit_depth_chroma_minus8; // Must be 0 (only 8-bit supported)
+  unsigned char qpprime_y_zero_transform_bypass_flag;
+  // PPS (picture parameter set) fields
+  int entropy_coding_mode_flag;
+  int pic_order_present_flag;
+  int num_ref_idx_l0_active_minus1;
+  int num_ref_idx_l1_active_minus1;
+  int weighted_pred_flag;
+  int weighted_bipred_idc;
+  int pic_init_qp_minus26;
+  int deblocking_filter_control_present_flag;
+  int redundant_pic_cnt_present_flag;
+  int transform_8x8_mode_flag;
+  int MbaffFrameFlag;
+  int constrained_intra_pred_flag;
+  int chroma_qp_index_offset;
+  int second_chroma_qp_index_offset;
+  int ref_pic_flag;
+  int frame_num;
+  int CurrFieldOrderCnt[2];
+  // DPB (decoded picture buffer)
+  CUVIDH264DPBENTRY dpb[16]; // List of reference frames within the DPB
+  // Quantization Matrices (raster-order)
+  unsigned char WeightScale4x4[6][16];
+  unsigned char WeightScale8x8[2][64];
+  // FMO/ASO
+  unsigned char fmo_aso_enable;
+  unsigned char num_slice_groups_minus1;
+  unsigned char slice_group_map_type;
+  signed char pic_init_qs_minus26;
+  unsigned int slice_group_change_rate_minus1;
+
+  union {
+    unsigned long long slice_group_map_addr;
+    const unsigned char* pMb2SliceGroupMap;
+  } fmo;
+
+  unsigned int Reserved[12];
+
+  // SVC/MVC extension data; the two variants share storage (union)
+  union {
+    CUVIDH264MVCEXT mvcext;
+    CUVIDH264SVCEXT svcext;
+  };
+} CUVIDH264PICPARAMS;
+
+/********************************************************/
+//! \struct CUVIDMPEG2PICPARAMS
+//! MPEG-2 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/********************************************************/
+typedef struct _CUVIDMPEG2PICPARAMS {
+  int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
+  int BackwardRefIdx; // Picture index of backward reference (B-frames)
+  int picture_coding_type;
+  int full_pel_forward_vector;
+  int full_pel_backward_vector;
+  int f_code[2][2]; // presumably [fwd/bwd][horiz/vert] - TODO confirm
+  int intra_dc_precision;
+  int frame_pred_frame_dct;
+  int concealment_motion_vectors;
+  int q_scale_type;
+  int intra_vlc_format;
+  int alternate_scan;
+  int top_field_first;
+  // Quantization matrices (raster order)
+  unsigned char QuantMatrixIntra[64];
+  unsigned char QuantMatrixInter[64];
+} CUVIDMPEG2PICPARAMS;
+
+// MPEG-4 has VOP (video object plane) types instead of picture types
+#define I_VOP 0
+#define P_VOP 1
+#define B_VOP 2
+#define S_VOP 3
+
+/*******************************************************/
+//! \struct CUVIDMPEG4PICPARAMS
+//! MPEG-4 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/*******************************************************/
+typedef struct _CUVIDMPEG4PICPARAMS {
+  int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
+  int BackwardRefIdx; // Picture index of backward reference (B-frames)
+  // VOL (video object layer) fields
+  int video_object_layer_width;
+  int video_object_layer_height;
+  int vop_time_increment_bitcount;
+  int top_field_first;
+  int resync_marker_disable;
+  int quant_type;
+  int quarter_sample;
+  int short_video_header;
+  int divx_flags;
+  // VOP (video object plane) fields
+  int vop_coding_type;
+  int vop_coded;
+  int vop_rounding_type;
+  int alternate_vertical_scan_flag;
+  int interlaced;
+  int vop_fcode_forward;
+  int vop_fcode_backward;
+  int trd[2];
+  int trb[2];
+  // Quantization matrices (raster order)
+  unsigned char QuantMatrixIntra[64];
+  unsigned char QuantMatrixInter[64];
+  int gmc_enabled;
+} CUVIDMPEG4PICPARAMS;
+
+/********************************************************/
+//! \struct CUVIDVC1PICPARAMS
+//! VC1 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/********************************************************/
+typedef struct _CUVIDVC1PICPARAMS {
+  int ForwardRefIdx; /**< Picture index of forward reference (P/B-frames) */
+  int BackwardRefIdx; /**< Picture index of backward reference (B-frames)  */
+  int FrameWidth; /**< Actual frame width                              */
+  int FrameHeight; /**< Actual frame height                             */
+  // PICTURE-level fields
+  int intra_pic_flag; /**< Set to 1 for I,BI frames */
+  int ref_pic_flag; /**< Set to 1 for I,P frames  */
+  int progressive_fcm; /**< Progressive frame        */
+  // SEQUENCE-level fields
+  int profile;
+  int postprocflag;
+  int pulldown;
+  int interlace;
+  int tfcntrflag;
+  int finterpflag;
+  int psf;
+  int multires;
+  int syncmarker;
+  int rangered;
+  int maxbframes;
+  // ENTRYPOINT-level fields
+  int panscan_flag;
+  int refdist_flag;
+  int extended_mv;
+  int dquant;
+  int vstransform;
+  int loopfilter;
+  int fastuvmc;
+  int overlap;
+  int quantizer;
+  int extended_dmv;
+  int range_mapy_flag;
+  int range_mapy;
+  int range_mapuv_flag;
+  int range_mapuv;
+  int rangeredfrm; // range reduction state
+} CUVIDVC1PICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDJPEGPICPARAMS
+//! JPEG picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDJPEGPICPARAMS {
+  int Reserved; // Placeholder: no JPEG-specific parameters are defined here
+} CUVIDJPEGPICPARAMS;
+
+/*******************************************************/
+//! \struct CUVIDHEVCPICPARAMS
+//! HEVC picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/*******************************************************/
+typedef struct _CUVIDHEVCPICPARAMS {
+  // SPS (sequence parameter set) fields
+  int pic_width_in_luma_samples;
+  int pic_height_in_luma_samples;
+  unsigned char log2_min_luma_coding_block_size_minus3;
+  unsigned char log2_diff_max_min_luma_coding_block_size;
+  unsigned char log2_min_transform_block_size_minus2;
+  unsigned char log2_diff_max_min_transform_block_size;
+  unsigned char pcm_enabled_flag;
+  unsigned char log2_min_pcm_luma_coding_block_size_minus3;
+  unsigned char log2_diff_max_min_pcm_luma_coding_block_size;
+  unsigned char pcm_sample_bit_depth_luma_minus1;
+
+  unsigned char pcm_sample_bit_depth_chroma_minus1;
+  unsigned char pcm_loop_filter_disabled_flag;
+  unsigned char strong_intra_smoothing_enabled_flag;
+  unsigned char max_transform_hierarchy_depth_intra;
+  unsigned char max_transform_hierarchy_depth_inter;
+  unsigned char amp_enabled_flag;
+  unsigned char separate_colour_plane_flag;
+  unsigned char log2_max_pic_order_cnt_lsb_minus4;
+
+  unsigned char num_short_term_ref_pic_sets;
+  unsigned char long_term_ref_pics_present_flag;
+  unsigned char num_long_term_ref_pics_sps;
+  unsigned char sps_temporal_mvp_enabled_flag;
+  unsigned char sample_adaptive_offset_enabled_flag;
+  unsigned char scaling_list_enable_flag;
+  unsigned char IrapPicFlag;
+  unsigned char IdrPicFlag;
+
+  unsigned char bit_depth_luma_minus8;
+  unsigned char bit_depth_chroma_minus8;
+  // sps/pps extension fields
+  unsigned char log2_max_transform_skip_block_size_minus2;
+  unsigned char log2_sao_offset_scale_luma;
+  unsigned char log2_sao_offset_scale_chroma;
+  unsigned char high_precision_offsets_enabled_flag;
+  unsigned char reserved1[10];
+
+  // PPS (picture parameter set) fields
+  unsigned char dependent_slice_segments_enabled_flag;
+  unsigned char slice_segment_header_extension_present_flag;
+  unsigned char sign_data_hiding_enabled_flag;
+  unsigned char cu_qp_delta_enabled_flag;
+  unsigned char diff_cu_qp_delta_depth;
+  signed char init_qp_minus26;
+  signed char pps_cb_qp_offset;
+  signed char pps_cr_qp_offset;
+
+  unsigned char constrained_intra_pred_flag;
+  unsigned char weighted_pred_flag;
+  unsigned char weighted_bipred_flag;
+  unsigned char transform_skip_enabled_flag;
+  unsigned char transquant_bypass_enabled_flag;
+  unsigned char entropy_coding_sync_enabled_flag;
+  unsigned char log2_parallel_merge_level_minus2;
+  unsigned char num_extra_slice_header_bits;
+
+  unsigned char loop_filter_across_tiles_enabled_flag;
+  unsigned char loop_filter_across_slices_enabled_flag;
+  unsigned char output_flag_present_flag;
+  unsigned char num_ref_idx_l0_default_active_minus1;
+  unsigned char num_ref_idx_l1_default_active_minus1;
+  unsigned char lists_modification_present_flag;
+  unsigned char cabac_init_present_flag;
+  unsigned char pps_slice_chroma_qp_offsets_present_flag;
+
+  unsigned char deblocking_filter_override_enabled_flag;
+  unsigned char pps_deblocking_filter_disabled_flag;
+  signed char pps_beta_offset_div2;
+  signed char pps_tc_offset_div2;
+  unsigned char tiles_enabled_flag;
+  unsigned char uniform_spacing_flag;
+  unsigned char num_tile_columns_minus1;
+  unsigned char num_tile_rows_minus1;
+
+  unsigned short column_width_minus1[21];
+  unsigned short row_height_minus1[21];
+
+  // sps and pps extension HEVC-main 444
+  unsigned char sps_range_extension_flag;
+  unsigned char transform_skip_rotation_enabled_flag;
+  unsigned char transform_skip_context_enabled_flag;
+  unsigned char implicit_rdpcm_enabled_flag;
+
+  unsigned char explicit_rdpcm_enabled_flag;
+  unsigned char extended_precision_processing_flag;
+  unsigned char intra_smoothing_disabled_flag;
+  unsigned char persistent_rice_adaptation_enabled_flag;
+
+  unsigned char cabac_bypass_alignment_enabled_flag;
+  unsigned char pps_range_extension_flag;
+  unsigned char cross_component_prediction_enabled_flag;
+  unsigned char chroma_qp_offset_list_enabled_flag;
+
+  unsigned char diff_cu_chroma_qp_offset_depth;
+  unsigned char chroma_qp_offset_list_len_minus1;
+  signed char cb_qp_offset_list[6];
+
+  signed char cr_qp_offset_list[6];
+  unsigned char reserved2[2];
+
+  unsigned int reserved3[8];
+
+  // RefPicSets
+  int NumBitsForShortTermRPSInSlice;
+  int NumDeltaPocsOfRefRpsIdx;
+  int NumPocTotalCurr;
+  int NumPocStCurrBefore;
+  int NumPocStCurrAfter;
+  int NumPocLtCurr;
+  int CurrPicOrderCntVal;
+  int RefPicIdx[16]; // [refpic] Indices of valid reference pictures (-1 if
+                     // unused for reference)
+  int PicOrderCntVal[16]; // [refpic]
+  unsigned char IsLongTerm[16]; // [refpic] 0=not a long-term reference,
+                                // 1=long-term reference
+  unsigned char
+      RefPicSetStCurrBefore[8]; // [0..NumPocStCurrBefore-1] -> refpic (0..15)
+  unsigned char
+      RefPicSetStCurrAfter[8]; // [0..NumPocStCurrAfter-1] -> refpic (0..15)
+  unsigned char RefPicSetLtCurr[8]; // [0..NumPocLtCurr-1] -> refpic (0..15)
+  unsigned char RefPicSetInterLayer0[8];
+  unsigned char RefPicSetInterLayer1[8];
+  unsigned int reserved4[12];
+
+  // scaling lists (diag order)
+  unsigned char ScalingList4x4[6][16]; // [matrixId][i]
+  unsigned char ScalingList8x8[6][64]; // [matrixId][i]
+  unsigned char ScalingList16x16[6][64]; // [matrixId][i]
+  unsigned char ScalingList32x32[2][64]; // [matrixId][i]
+  unsigned char ScalingListDCCoeff16x16[6]; // [matrixId]
+  unsigned char ScalingListDCCoeff32x32[2]; // [matrixId]
+} CUVIDHEVCPICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDVP8PICPARAMS
+//! VP8 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDVP8PICPARAMS {
+  int width;
+  int height;
+  unsigned int first_partition_size;
+  // Reference frame indices (last / golden / alt-ref)
+  unsigned char LastRefIdx;
+  unsigned char GoldenRefIdx;
+  unsigned char AltRefIdx;
+
+  union {
+    struct {
+      unsigned char frame_type : 1; /**< 0 = KEYFRAME, 1 = INTERFRAME  */
+      unsigned char version : 3;
+      unsigned char show_frame : 1;
+      unsigned char
+          update_mb_segmentation_data : 1; /**< Must be 0 if segmentation is not
+                                              enabled */
+      unsigned char Reserved2Bits : 2;
+    } vp8_frame_tag;
+
+    unsigned char wFrameTagFlags; // Whole-byte view of vp8_frame_tag (8 bits)
+  };
+
+  unsigned char Reserved1[4];
+  unsigned int Reserved2[3];
+} CUVIDVP8PICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDVP9PICPARAMS
+//! VP9 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDVP9PICPARAMS {
+  unsigned int width;
+  unsigned int height;
+
+  // Reference frame indices (last / golden / alt-ref) and color space
+  unsigned char LastRefIdx;
+  unsigned char GoldenRefIdx;
+  unsigned char AltRefIdx;
+  unsigned char colorSpace;
+
+  unsigned short profile : 3;
+  unsigned short frameContextIdx : 2;
+  unsigned short frameType : 1;
+  unsigned short showFrame : 1;
+  unsigned short errorResilient : 1;
+  unsigned short frameParallelDecoding : 1;
+  unsigned short subSamplingX : 1;
+  unsigned short subSamplingY : 1;
+  unsigned short intraOnly : 1;
+  unsigned short allow_high_precision_mv : 1;
+  unsigned short refreshEntropyProbs : 1;
+  unsigned short reserved2Bits : 2; // brings the bit-fields above to 16 bits
+
+  unsigned short reserved16Bits;
+
+  unsigned char refFrameSignBias[4];
+
+  unsigned char bitDepthMinus8Luma;
+  unsigned char bitDepthMinus8Chroma;
+  unsigned char loopFilterLevel;
+  unsigned char loopFilterSharpness;
+
+  unsigned char modeRefLfEnabled;
+  unsigned char log2_tile_columns;
+  unsigned char log2_tile_rows;
+
+  unsigned char segmentEnabled : 1;
+  unsigned char segmentMapUpdate : 1;
+  unsigned char segmentMapTemporalUpdate : 1;
+  unsigned char segmentFeatureMode : 1;
+  unsigned char reserved4Bits : 4; // brings the segment flags above to 8 bits
+
+  unsigned char segmentFeatureEnable[8][4];
+  short segmentFeatureData[8][4];
+  unsigned char mb_segment_tree_probs[7];
+  unsigned char segment_pred_probs[3];
+  unsigned char reservedSegment16Bits[2];
+
+  int qpYAc;
+  int qpYDc;
+  int qpChDc;
+  int qpChAc;
+
+  unsigned int activeRefIdx[3];
+  unsigned int resetFrameContext;
+  unsigned int mcomp_filter_type;
+  unsigned int mbRefLfDelta[4];
+  unsigned int mbModeLfDelta[2];
+  unsigned int frameTagSize;
+  unsigned int offsetToDctParts;
+  unsigned int reserved128Bits[4];
+
+} CUVIDVP9PICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDAV1PICPARAMS
+//! AV1 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDAV1PICPARAMS {
+  unsigned int
+      width; // coded width, if superres enabled then it is upscaled width
+  unsigned int height; // coded height
+  unsigned int frame_offset; // defined as order_hint in AV1 specification
+  int decodePicIdx; // decoded output pic index; if film grain enabled, it will
+                    // keep decoded (without film grain) output. It can be used
+                    // as reference frame for future frames
+
+  // sequence header (bit-field widths below sum to 32)
+  unsigned int profile : 3; // 0 = profile0, 1 = profile1, 2 = profile2
+  unsigned int
+      use_128x128_superblock : 1; // superblock size 0:64x64, 1: 128x128
+  unsigned int
+      subsampling_x : 1; // (subsampling_x, _y) 1,1 = 420, 1,0 = 422, 0,0 = 444
+  unsigned int subsampling_y : 1;
+  unsigned int mono_chrome : 1; // for monochrome content, mono_chrome = 1 and
+                                // (subsampling_x, _y) should be 1,1
+  unsigned int bit_depth_minus8 : 4; // bit depth minus 8
+  unsigned int enable_filter_intra : 1; // tool enable in seq level, 0 : disable
+                                        // 1: frame header control
+  unsigned int enable_intra_edge_filter : 1; // intra edge filtering process, 0
+                                             // : disable 1: enabled
+  unsigned int
+      enable_interintra_compound : 1; // interintra, 0 : not present 1: present
+  unsigned int
+      enable_masked_compound : 1; // 1: mode info for inter blocks may contain
+                                  // the syntax element compound_type. 0: syntax
+                                  // element compound_type will not be present
+  unsigned int enable_dual_filter : 1; // vertical and horiz filter selection,
+                                       // 1: enable and 0: disable
+  unsigned int enable_order_hint : 1; // order hint, and related tools, 1:
+                                      // enable and 0: disable
+  unsigned int order_hint_bits_minus1 : 3; // is used to compute OrderHintBits
+  unsigned int
+      enable_jnt_comp : 1; // joint compound modes, 1: enable and 0: disable
+  unsigned int enable_superres : 1; // superres in seq level, 0 : disable 1:
+                                    // frame level control
+  unsigned int enable_cdef : 1; // cdef filtering in seq level, 0 : disable 1:
+                                // frame level control
+  unsigned int
+      enable_restoration : 1; // loop restoration filtering in seq level, 0 :
+                              // disable 1: frame level control
+  unsigned int enable_fgs : 1; // defined as film_grain_params_present in AV1
+                               // specification
+  unsigned int reserved0_7bits : 7; // reserved bits; must be set to 0
+
+  // frame header (bit-field widths below sum to 32)
+  unsigned int
+      frame_type : 2; // 0:Key frame, 1:Inter frame, 2:intra only, 3:s-frame
+  unsigned int show_frame : 1; // show_frame = 1 implies that frame should be
+                               // immediately output once decoded
+  unsigned int disable_cdf_update : 1; // CDF update during symbol decoding, 1:
+                                       // disabled, 0: enabled
+  unsigned int allow_screen_content_tools : 1; // 1: intra blocks may use
+                                               // palette encoding, 0: palette
+                                               // encoding is never used
+  unsigned int force_integer_mv : 1; // 1: motion vectors will always be
+                                     // integers, 0: can contain fractional bits
+  unsigned int coded_denom : 3; // coded_denom of the superres scale as
+                                // specified in AV1 specification
+  unsigned int allow_intrabc : 1; // 1: intra block copy may be used, 0: intra
+                                  // block copy is not allowed
+  unsigned int allow_high_precision_mv : 1; // 1/8 precision mv enable
+  unsigned int
+      interp_filter : 3; // interpolation filter. Refer to section 6.8.9 of the
+                         // AV1 specification Version 1.0.0 with Errata 1
+  unsigned int
+      switchable_motion_mode : 1; // defined as is_motion_mode_switchable in AV1
+                                  // specification
+  unsigned int use_ref_frame_mvs : 1; // 1: current frame can use the previous
+                                      // frame mv information, 0: will not use.
+  unsigned int disable_frame_end_update_cdf : 1; // 1: indicates that the end of
+                                                 // frame CDF update is disabled
+  unsigned int delta_q_present : 1; // quantizer index delta values are present
+                                    // in the block level
+  unsigned int delta_q_res : 2; // left shift which should be applied to decoded
+                                // quantizer index delta values
+  unsigned int using_qmatrix : 1; // 1: quantizer matrix will be used to compute
+                                  // quantizers
+  unsigned int coded_lossless : 1; // 1: all segments use lossless coding
+  unsigned int use_superres : 1; // 1: superres enabled for frame
+  unsigned int tx_mode : 2; // 0: ONLY4x4, 1: LARGEST, 2: SELECT
+  unsigned int reference_mode : 1; // 0: SINGLE, 1: SELECT
+  unsigned int
+      allow_warped_motion : 1; // 1: allow_warped_motion may be present, 0:
+                               // allow_warped_motion will not be present
+  unsigned int
+      reduced_tx_set : 1; // 1: frame is restricted to subset of the full set of
+                          // transform types, 0: no such restriction
+  unsigned int skip_mode : 1; // 1: most of the mode info is skipped, 0: mode
+                              // info is not skipped
+  unsigned int reserved1_3bits : 3; // reserved bits; must be set to 0
+
+  // tiling info
+  unsigned int
+      num_tile_cols : 8; // number of tiles across the frame, max is 64
+  unsigned int num_tile_rows : 8; // number of tiles down the frame, max is 64
+  unsigned int context_update_tile_id : 16; // specifies which tile to use for
+                                            // the CDF update
+  unsigned short tile_widths[64]; // Width of each column in superblocks
+  unsigned short tile_heights[64]; // height of each row in superblocks
+
+  // CDEF - refer to section 6.10.14 of the AV1 specification Version 1.0.0 with
+  // Errata 1
+  unsigned char cdef_damping_minus_3 : 2; // controls the amount of damping in
+                                          // the deringing filter
+  unsigned char cdef_bits : 2; // the number of bits needed to specify which
+                               // CDEF filter to apply
+  unsigned char reserved2_4bits : 4; // reserved bits; must be set to 0
+  unsigned char
+      cdef_y_strength[8]; // 0-3 bits: y_pri_strength, 4-7 bits y_sec_strength
+  unsigned char cdef_uv_strength[8]; // 0-3 bits: uv_pri_strength, 4-7 bits
+                                     // uv_sec_strength
+
+  // SkipModeFrames
+  unsigned char SkipModeFrame0 : 4; // specifies the frames to use for compound
+                                    // prediction when skip_mode is equal to 1.
+  unsigned char SkipModeFrame1 : 4;
+
+  // qp information - refer to section 6.8.11 of the AV1 specification
+  // Version 1.0.0 with Errata 1
+  unsigned char base_qindex; // indicates the base frame qindex. Defined as
+                             // base_q_idx in AV1 specification
+  char qp_y_dc_delta_q; // indicates the Y DC quantizer relative to base_q_idx.
+                        // Defined as DeltaQYDc in AV1 specification
+  char qp_u_dc_delta_q; // indicates the U DC quantizer relative to base_q_idx.
+                        // Defined as DeltaQUDc in AV1 specification
+  char qp_v_dc_delta_q; // indicates the V DC quantizer relative to base_q_idx.
+                        // Defined as DeltaQVDc in AV1 specification
+  char qp_u_ac_delta_q; // indicates the U AC quantizer relative to base_q_idx.
+                        // Defined as DeltaQUAc in AV1 specification
+  char qp_v_ac_delta_q; // indicates the V AC quantizer relative to base_q_idx.
+                        // Defined as DeltaQVAc in AV1 specification
+  unsigned char qm_y; // specifies the level in the quantizer matrix that should
+                      // be used for luma plane decoding
+  unsigned char qm_u; // specifies the level in the quantizer matrix that should
+                      // be used for chroma U plane decoding
+  unsigned char qm_v; // specifies the level in the quantizer matrix that should
+                      // be used for chroma V plane decoding
+
+  // segmentation - refer to section 6.8.13 of the AV1 specification
+  // Version 1.0.0 with Errata 1
+  unsigned char segmentation_enabled : 1; // 1 indicates that this frame makes
+                                          // use of the segmentation tool
+  unsigned char
+      segmentation_update_map : 1; // 1 indicates that the segmentation map is
+                                   // updated during the decoding of this frame
+  unsigned char
+      segmentation_update_data : 1; // 1 indicates that new parameters are about
+                                    // to be specified for each segment
+  unsigned char
+      segmentation_temporal_update : 1; // 1 indicates that the updates to the
+                                        // segmentation map are coded relative
+                                        // to the existing segmentation map
+  unsigned char reserved3_4bits : 4; // reserved bits; must be set to 0
+  short segmentation_feature_data[8][8]; // specifies the feature data for a
+                                         // segment feature
+  unsigned char
+      segmentation_feature_mask[8]; // indicates that the corresponding feature
+                                    // is unused or feature value is coded
+
+  // loopfilter - refer to section 6.8.10 of the AV1 specification Version 1.0.0
+  // with Errata 1
+  unsigned char loop_filter_level[2]; // contains loop filter strength values
+  unsigned char loop_filter_level_u; // loop filter strength value of U plane
+  unsigned char loop_filter_level_v; // loop filter strength value of V plane
+  unsigned char loop_filter_sharpness; // indicates the sharpness level
+  char loop_filter_ref_deltas[8]; // contains the adjustment needed for the
+                                  // filter level based on the chosen reference
+                                  // frame
+  char loop_filter_mode_deltas[2]; // contains the adjustment needed for the
+                                   // filter level based on the chosen mode
+  unsigned char
+      loop_filter_delta_enabled : 1; // indicates that the filter level depends
+                                     // on the mode and reference frame used to
+                                     // predict a block
+  unsigned char
+      loop_filter_delta_update : 1; // indicates that additional syntax elements
+                                    // are present that specify which mode and
+                                    // reference frame deltas are to be updated
+  unsigned char delta_lf_present : 1; // specifies whether loop filter delta
+                                      // values are present in the block level
+  unsigned char delta_lf_res : 2; // specifies the left shift to apply to the
+                                  // decoded loop filter values
+  unsigned char
+      delta_lf_multi : 1; // separate loop filter deltas for Hy,Vy,U,V edges
+  unsigned char reserved4_2bits : 2; // reserved bits; must be set to 0
+
+  // restoration - refer to section 6.10.15 of the AV1 specification
+  // Version 1.0.0 with Errata 1
+  unsigned char lr_unit_size[3]; // specifies the size of loop restoration
+                                 // units: 0: 32, 1: 64, 2: 128, 3: 256
+  unsigned char lr_type[3]; // used to compute FrameRestorationType
+
+  // reference frames
+  unsigned char primary_ref_frame; // specifies which reference frame contains
+                                   // the CDF values and other state that should
+                                   // be loaded at the start of the frame
+  unsigned char ref_frame_map[8]; // frames in dpb that can be used as reference
+                                  // for current or future frames
+
+  unsigned char temporal_layer_id : 4; // temporal layer id
+  unsigned char spatial_layer_id : 4; // spatial layer id
+
+  unsigned char reserved5_32bits[4]; // reserved bits; must be set to 0
+
+  // ref frame list
+  struct {
+    unsigned int width;
+    unsigned int height;
+    unsigned char index;
+    unsigned char reserved24Bits[3]; // reserved bits; must be set to 0
+  } ref_frame[7]; // frames used as reference frame for current frame.
+
+  // global motion
+  struct {
+    unsigned char invalid : 1;
+    unsigned char wmtype : 2; // defined as GmType in AV1 specification
+    unsigned char reserved5Bits : 5; // reserved bits; must be set to 0
+    char reserved24Bits[3]; // reserved bits; must be set to 0
+    int wmmat[6]; // defined as gm_params[] in AV1 specification
+  } global_motion[7]; // global motion params for reference frames
+
+  // film grain params - refer to section 6.8.20 of the AV1 specification
+  // Version 1.0.0 with Errata 1
+  unsigned short apply_grain : 1;
+  unsigned short overlap_flag : 1;
+  unsigned short scaling_shift_minus8 : 2;
+  unsigned short chroma_scaling_from_luma : 1;
+  unsigned short ar_coeff_lag : 2;
+  unsigned short ar_coeff_shift_minus6 : 2;
+  unsigned short grain_scale_shift : 2;
+  unsigned short clip_to_restricted_range : 1;
+  unsigned short reserved6_4bits : 4; // reserved bits; must be set to 0
+  unsigned char num_y_points;
+  unsigned char scaling_points_y[14][2];
+  unsigned char num_cb_points;
+  unsigned char scaling_points_cb[10][2];
+  unsigned char num_cr_points;
+  unsigned char scaling_points_cr[10][2];
+  unsigned char reserved7_8bits; // reserved bits; must be set to 0
+  unsigned short random_seed;
+  short ar_coeffs_y[24];
+  short ar_coeffs_cb[25];
+  short ar_coeffs_cr[25];
+  unsigned char cb_mult;
+  unsigned char cb_luma_mult;
+  short cb_offset;
+  unsigned char cr_mult;
+  unsigned char cr_luma_mult;
+  short cr_offset;
+
+  int reserved[7]; // reserved bits; must be set to 0
+} CUVIDAV1PICPARAMS;
+
+/******************************************************************************************/
+//! \struct CUVIDPICPARAMS
+//! Picture parameters for decoding
+//! This structure is used in cuvidDecodePicture API
+//! IN  for cuvidDecodePicture
+/******************************************************************************************/
+typedef struct _CUVIDPICPARAMS {
+  int PicWidthInMbs; /**< IN: Coded frame width in macroblocks */
+  int FrameHeightInMbs; /**< IN: Coded frame height in macroblocks */
+  int CurrPicIdx; /**< IN: Output index of the current picture */
+  int field_pic_flag; /**< IN: 0=frame picture, 1=field picture */
+  int bottom_field_flag; /**< IN: 0=top field, 1=bottom field (ignored if
+                            field_pic_flag=0) */
+  int second_field; /**< IN: Second field of a complementary field pair */
+  // Bitstream data
+  unsigned int
+      nBitstreamDataLen; /**< IN: Number of bytes in bitstream data buffer */
+  const unsigned char* pBitstreamData; /**< IN: Ptr to bitstream data for this
+                                          picture (slice-layer) */
+  unsigned int nNumSlices; /**< IN: Number of slices in this picture */
+  const unsigned int*
+      pSliceDataOffsets; /**< IN: nNumSlices entries, contains offset of each
+                            slice within the bitstream data buffer */
+  int ref_pic_flag; /**< IN: This picture is a reference picture */
+  int intra_pic_flag; /**< IN: This picture is entirely intra coded */
+  unsigned int Reserved[30]; /**< Reserved for future use */
+
+  // IN: Codec-specific data (one union member per codec)
+  union {
+    CUVIDMPEG2PICPARAMS mpeg2; /**< Also used for MPEG-1 */
+    CUVIDH264PICPARAMS h264;
+    CUVIDVC1PICPARAMS vc1;
+    CUVIDMPEG4PICPARAMS mpeg4;
+    CUVIDJPEGPICPARAMS jpeg;
+    CUVIDHEVCPICPARAMS hevc;
+    CUVIDVP8PICPARAMS vp8;
+    CUVIDVP9PICPARAMS vp9;
+    CUVIDAV1PICPARAMS av1;
+    unsigned int CodecReserved[1024];
+  } CodecSpecific;
+} CUVIDPICPARAMS;
+
+/******************************************************/
+//! \struct CUVIDPROCPARAMS
+//! Picture parameters for postprocessing
+//! Passed as pVPP to cuvidMapVideoFrame / cuvidMapVideoFrame64
+/******************************************************/
+typedef struct _CUVIDPROCPARAMS {
+  int progressive_frame; /**< IN: Input is progressive (deinterlace_mode will be
+                            ignored) */
+  int second_field; /**< IN: Output the second field (ignored if deinterlace
+                       mode is Weave) */
+  int top_field_first; /**< IN: Input frame is top field first (1st field is
+                          top, 2nd field is bottom) */
+  int unpaired_field; /**< IN: Input only contains one field (2nd field is
+                         invalid) */
+  // The fields below are used for raw YUV input
+  unsigned int reserved_flags; /**< Reserved for future use (set to zero) */
+  unsigned int reserved_zero; /**< Reserved (set to zero) */
+  unsigned long long
+      raw_input_dptr; /**< IN: Input CUdeviceptr for raw YUV extensions */
+  unsigned int raw_input_pitch; /**< IN: pitch in bytes of raw YUV input (should
+                                   be aligned appropriately) */
+  unsigned int
+      raw_input_format; /**< IN: Input YUV format (cudaVideoCodec_enum) */
+  unsigned long long
+      raw_output_dptr; /**< IN: Output CUdeviceptr for raw YUV extensions */
+  unsigned int raw_output_pitch; /**< IN: pitch in bytes of raw YUV output
+                                    (should be aligned appropriately) */
+  unsigned int Reserved1; /**< Reserved for future use (set to zero) */
+  CUstream output_stream; /**< IN: stream object used by cuvidMapVideoFrame */
+  unsigned int Reserved[46]; /**< Reserved for future use (set to zero) */
+  unsigned long long*
+      histogram_dptr; /**< OUT: Output CUdeviceptr for histogram extensions */
+  void* Reserved2[1]; /**< Reserved for future use (set to zero) */
+} CUVIDPROCPARAMS;
+
+/*********************************************************************************************************/
+//! \struct CUVIDGETDECODESTATUS
+//! Struct for reporting decode status.
+//! This structure is used in cuvidGetDecodeStatus API.
+/*********************************************************************************************************/
+typedef struct _CUVIDGETDECODESTATUS {
+  cuvidDecodeStatus decodeStatus; // the reported decode status
+  unsigned int reserved[31]; // reserved; no use documented in this header
+  void* pReserved[8]; // reserved; no use documented in this header
+} CUVIDGETDECODESTATUS;
+
+/****************************************************/
+//! \struct CUVIDRECONFIGUREDECODERINFO
+//! Struct for decoder reset
+//! This structure is used in cuvidReconfigureDecoder() API
+/****************************************************/
+typedef struct _CUVIDRECONFIGUREDECODERINFO {
+  unsigned int ulWidth; /**< IN: Coded sequence width in pixels, MUST be <=
+                           ulMaxWidth defined at CUVIDDECODECREATEINFO */
+  unsigned int ulHeight; /**< IN: Coded sequence height in pixels, MUST be <=
+                            ulMaxHeight defined at CUVIDDECODECREATEINFO */
+  unsigned int ulTargetWidth; /**< IN: Post processed output width */
+  unsigned int ulTargetHeight; /**< IN: Post processed output height */
+  unsigned int ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode
+                                       surfaces */
+  unsigned int reserved1[12]; /**< Reserved for future use. Set to Zero */
+
+  /**
+   * IN: Area of frame to be displayed. Use-case: Source Cropping
+   */
+  struct {
+    short left;
+    short top;
+    short right;
+    short bottom;
+  } display_area;
+
+  /**
+   * IN: Target Rectangle in the OutputFrame. Use-case: Aspect ratio Conversion
+   */
+  struct {
+    short left;
+    short top;
+    short right;
+    short bottom;
+  } target_rect;
+
+  unsigned int reserved2[11]; /**< Reserved for future use. Set to Zero */
+} CUVIDRECONFIGUREDECODERINFO;
+
+/***********************************************************************************************************/
+//! VIDEO_DECODER
+//!
+//! In order to minimize decode latencies, there should be always at least 2
+//! pictures in the decode queue at any time, in order to make sure that all
+//! decode engines are always busy.
+//!
+//! Overall data flow:
+//!  - cuvidGetDecoderCaps(...)
+//!  - cuvidCreateDecoder(...)
+//!  - For each picture:
+//!    + cuvidDecodePicture(N)
+//!    + cuvidMapVideoFrame(N-4)
+//!    + do some processing in cuda
+//!    + cuvidUnmapVideoFrame(N-4)
+//!    + cuvidDecodePicture(N+1)
+//!    + cuvidMapVideoFrame(N-3)
+//!    + ...
+//!  - cuvidDestroyDecoder(...)
+//!
+//! NOTE:
+//! - When the cuda context is created from a D3D device, the D3D device must
+//!   also be created with the D3DCREATE_MULTITHREADED flag.
+//!
+//! - There is a limit to how many pictures can be mapped simultaneously
+//!   (ulNumOutputSurfaces)
+//!
+//! - cuvidDecodePicture may block the calling thread if there are too many
+//!   pictures pending in the decode queue
+/***********************************************************************************************************/
+
+/**********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc)
+//! Queries decode capabilities of NVDEC-HW based on CodecType, ChromaFormat and
+//! BitDepthMinus8 parameters.
+//! 1. Application fills IN parameters CodecType, ChromaFormat and
+//! BitDepthMinus8 of CUVIDDECODECAPS structure
+//! 2. On calling cuvidGetDecoderCaps, driver fills OUT parameters if the IN
+//! parameters are supported
+//!    If IN parameters passed to the driver are not supported by NVDEC-HW, then
+//!    all OUT params are set to 0.
+//! E.g. on Geforce GTX 960:
+//!   App fills - eCodecType = cudaVideoCodec_H264;
+//!   eChromaFormat = cudaVideoChromaFormat_420; nBitDepthMinus8 = 0;
+//!   Given IN parameters are supported, hence driver fills: bIsSupported = 1;
+//!   nMinWidth = 48; nMinHeight = 16; nMaxWidth = 4096; nMaxHeight = 4096;
+//!   nMaxMBCount = 65536;
+//! CodedWidth*CodedHeight/256 must be less than or equal to nMaxMBCount
+/**********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS* pdc);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder,
+//! CUVIDDECODECREATEINFO *pdci)
+//! Create the decoder object based on pdci and return a handle to it
+/*****************************************************************************************************/
+extern CUresult CUDAAPI
+cuvidCreateDecoder(CUvideodecoder* phDecoder, CUVIDDECODECREATEINFO* pdci);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder)
+//! Destroy the decoder object
+/*****************************************************************************************************/
+extern CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder,
+//! CUVIDPICPARAMS *pPicParams)
+//! Decode a single picture (field or frame). Kicks off HW decoding
+/*****************************************************************************************************/
+extern CUresult CUDAAPI
+cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS* pPicParams);
+
+/************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidGetDecodeStatus(CUvideodecoder hDecoder,
+//! int nPicIdx, CUVIDGETDECODESTATUS *pDecodeStatus)
+//! Get the decode status for the frame corresponding to nPicIdx. Supported on
+//! Maxwell and later GPUs, currently for HEVC, H264 and JPEG codecs. Returns
+//! CUDA_ERROR_NOT_SUPPORTED for an unsupported GPU or codec.
+/************************************************************************************************************/
+extern CUresult CUDAAPI cuvidGetDecodeStatus(
+    CUvideodecoder hDecoder,
+    int nPicIdx,
+    CUVIDGETDECODESTATUS* pDecodeStatus);
+
+/*********************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidReconfigureDecoder(CUvideodecoder hDecoder,
+//! CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams)
+//! Used to reuse a single decoder for multiple clips. Currently supports
+//! resolution, resize, display area and target area parameter changes for the
+//! same codec. Must be called during CUVIDPARSERPARAMS::pfnSequenceCallback
+/*********************************************************************************************************/
+extern CUresult CUDAAPI cuvidReconfigureDecoder(
+    CUvideodecoder hDecoder,
+    CUVIDRECONFIGUREDECODERINFO* pDecReconfigParams);
+
+#if !defined(__CUVID_DEVPTR64) || defined(__CUVID_INTERNAL)
+/************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder,
+//!                                         int nPicIdx, unsigned int *pDevPtr,
+//!                                         unsigned int *pPitch,
+//!                                         CUVIDPROCPARAMS *pVPP);
+//! Post-process and map video frame corresponding to nPicIdx for use in cuda.
+//! Returns cuda device pointer and associated pitch of the video frame
+/************************************************************************************************************************/
+extern CUresult CUDAAPI cuvidMapVideoFrame(
+    CUvideodecoder hDecoder,
+    int nPicIdx,
+    unsigned int* pDevPtr,
+    unsigned int* pPitch,
+    CUVIDPROCPARAMS* pVPP);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder,
+//! unsigned int DevPtr) Unmap a previously mapped video frame
+/*****************************************************************************************************/
+extern CUresult CUDAAPI
+cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr);
+#endif
+
+/****************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder,
+//!     int nPicIdx, unsigned long long *pDevPtr,
+//!     unsigned int *pPitch,
+//!     CUVIDPROCPARAMS *pVPP);
+//! Post-process and map video frame corresponding to nPicIdx for use in cuda.
+//! Returns cuda device pointer and associated pitch of the video frame
+/****************************************************************************************************************************/
+extern CUresult CUDAAPI cuvidMapVideoFrame64(
+    CUvideodecoder hDecoder,
+    int nPicIdx,
+    unsigned long long* pDevPtr,
+    unsigned int* pPitch,
+    CUVIDPROCPARAMS* pVPP);
+
+/**************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder,
+//! unsigned long long DevPtr); Unmap a previously mapped video frame
+/**************************************************************************************************/
+extern CUresult CUDAAPI
+cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
+
+#if defined(__CUVID_DEVPTR64) && !defined(__CUVID_INTERNAL)
+#define cuvidMapVideoFrame cuvidMapVideoFrame64
+#define cuvidUnmapVideoFrame cuvidUnmapVideoFrame64
+#endif // __CUVID_DEVPTR64 && !__CUVID_INTERNAL
+
+/********************************************************************************************************************/
+//!
+//! Context-locking: to facilitate multi-threaded implementations, the following
+//! 4 functions provide a simple mutex-style host synchronization. If a non-NULL
+//! context is specified in CUVIDDECODECREATEINFO, the codec library will
+//! acquire the mutex associated with the given context before making any cuda
+//! calls. A multi-threaded application could create a lock associated with a
+//! context handle so that multiple threads can safely share the same cuda
+//! context:
+//!  - Use cuCtxPopCurrent immediately after context creation in order to
+//!    create a 'floating' context that can be passed to
+//!    cuvidCtxLockCreate.
+//!  - When using a floating context, all cuda calls should only be made within
+//!    a cuvidCtxLock/cuvidCtxUnlock section.
+//!
+//! NOTE: This is a safer alternative to cuCtxPushCurrent and cuCtxPopCurrent,
+//! and is not related to video decoder in any way (implemented as a critical
+//! section associated with cuCtx{Push|Pop}Current calls).
+/********************************************************************************************************************/
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock,
+//! CUcontext ctx) This API is used to create CtxLock object
+/********************************************************************************************************************/
+extern CUresult CUDAAPI
+cuvidCtxLockCreate(CUvideoctxlock* pLock, CUcontext ctx);
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck)
+//! This API is used to free CtxLock object
+/********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck);
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck,
+//! unsigned int reserved_flags) This API is used to acquire ctxlock
+/********************************************************************************************************************/
+extern CUresult CUDAAPI
+cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags);
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck,
+//! unsigned int reserved_flags) This API is used to release ctxlock
+/********************************************************************************************************************/
+extern CUresult CUDAAPI
+cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags);
+
+/**********************************************************************************************/
+
+#if defined(__cplusplus)
+}
+
+// Auto-lock helper for C++: ctor calls cuvidCtxLock, dtor cuvidCtxUnlock
+class CCtxAutoLock {
+ private:
+  CUvideoctxlock m_ctx; // lock handle passed to the constructor
+  // NOTE(review): implicitly copyable; every copy also calls cuvidCtxUnlock
+ public:
+  CCtxAutoLock(CUvideoctxlock ctx) : m_ctx(ctx) {
+    cuvidCtxLock(m_ctx, 0); // returned CUresult is ignored
+  }
+
+  ~CCtxAutoLock() {
+    cuvidCtxUnlock(m_ctx, 0); // returned CUresult is ignored
+  }
+};
+#endif /* __cplusplus */
+
+#endif // __CUDA_VIDEO_H__
diff --git a/src/torchcodec/_core/nvcuvid_include/nvcuvid.h b/src/torchcodec/_core/nvcuvid_include/nvcuvid.h
new file mode 100644
index 000000000..f0d9446d7
--- /dev/null
+++ b/src/torchcodec/_core/nvcuvid_include/nvcuvid.h
@@ -0,0 +1,610 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2024 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/********************************************************************************************************************/
+//! \file nvcuvid.h
+//!   NVDECODE API provides video decoding interface to NVIDIA GPU devices.
+//! \date 2015-2024
+//!  This file contains the interface constants, structure definitions and
+//!  function prototypes.
+/********************************************************************************************************************/
+
+#if !defined(__NVCUVID_H__)
+#define __NVCUVID_H__
+
+#include "cuviddec.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#define MAX_CLOCK_TS 3
+
+/***********************************************/
+//!
+//! High-level helper APIs for video sources
+//!
+/***********************************************/
+
+typedef void* CUvideosource;
+typedef void* CUvideoparser;
+typedef long long CUvideotimestamp;
+
+/************************************************************************/
+//! \enum cudaVideoState
+//! Video source state enums
+//! Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs
+/************************************************************************/
+typedef enum {
+  cudaVideoState_Error = -1, /**< Error state (invalid source) */
+  cudaVideoState_Stopped =
+      0, /**< Source is stopped (or reached end-of-stream)  */
+  cudaVideoState_Started = 1 /**< Source is running and delivering data */
+} cudaVideoState;
+
+/************************************************************************/
+//! \enum cudaAudioCodec
+//! Audio compression enums
+//! Used in CUAUDIOFORMAT structure
+/************************************************************************/
+typedef enum {
+  cudaAudioCodec_MPEG1 = 0, /**< MPEG-1 Audio               */
+  cudaAudioCodec_MPEG2, /**< MPEG-2 Audio               */
+  cudaAudioCodec_MP3, /**< MPEG-1 Layer III Audio     */
+  cudaAudioCodec_AC3, /**< Dolby Digital (AC3) Audio  */
+  cudaAudioCodec_LPCM, /**< PCM Audio                  */
+  cudaAudioCodec_AAC, /**< AAC Audio                  */
+} cudaAudioCodec;
+
+/************************************************************************/
+//! \ingroup STRUCTS
+//! \struct TIMECODESET
+//! Used to store Time code set extracted from H264 and HEVC codecs
+/************************************************************************/
+typedef struct _TIMECODESET {
+  unsigned int time_offset_value;
+  unsigned short n_frames;
+  unsigned char clock_timestamp_flag;
+  unsigned char units_field_based_flag;
+  unsigned char counting_type;
+  unsigned char full_timestamp_flag;
+  unsigned char discontinuity_flag;
+  unsigned char cnt_dropped_flag;
+  unsigned char seconds_value;
+  unsigned char minutes_value;
+  unsigned char hours_value;
+  unsigned char seconds_flag;
+  unsigned char minutes_flag;
+  unsigned char hours_flag;
+  unsigned char time_offset_length;
+  unsigned char reserved;
+} TIMECODESET;
+
+/************************************************************************/
+//! \ingroup STRUCTS
+//! \struct TIMECODE
+//! Used to extract Time code in H264 and HEVC codecs
+/************************************************************************/
+typedef struct _TIMECODE {
+  TIMECODESET time_code_set[MAX_CLOCK_TS];
+  unsigned char num_clock_ts;
+} TIMECODE;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct SEIMASTERINGDISPLAYINFO
+//! Used to extract mastering display color volume SEI in H264 and HEVC codecs
+/**********************************************************************************/
+typedef struct _SEIMASTERINGDISPLAYINFO {
+  unsigned short display_primaries_x[3];
+  unsigned short display_primaries_y[3];
+  unsigned short white_point_x;
+  unsigned short white_point_y;
+  unsigned int max_display_mastering_luminance;
+  unsigned int min_display_mastering_luminance;
+} SEIMASTERINGDISPLAYINFO;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct SEICONTENTLIGHTLEVELINFO
+//! Used to extract content light level info SEI in H264 and HEVC codecs
+/**********************************************************************************/
+typedef struct _SEICONTENTLIGHTLEVELINFO {
+  unsigned short max_content_light_level;
+  unsigned short max_pic_average_light_level;
+  unsigned int reserved;
+} SEICONTENTLIGHTLEVELINFO;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct TIMECODEMPEG2
+//! Used to extract Time code in MPEG2 codec
+/**********************************************************************************/
+typedef struct _TIMECODEMPEG2 {
+  unsigned char drop_frame_flag;
+  unsigned char time_code_hours;
+  unsigned char time_code_minutes;
+  unsigned char marker_bit;
+  unsigned char time_code_seconds;
+  unsigned char time_code_pictures;
+} TIMECODEMPEG2;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct SEIALTERNATIVETRANSFERCHARACTERISTICS
+//! Used to extract alternative transfer characteristics SEI in H264 and HEVC
+//! codecs
+/**********************************************************************************/
+typedef struct _SEIALTERNATIVETRANSFERCHARACTERISTICS {
+  unsigned char preferred_transfer_characteristics;
+} SEIALTERNATIVETRANSFERCHARACTERISTICS;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUSEIMESSAGE;
+//! Used in CUVIDSEIMESSAGEINFO structure
+/**********************************************************************************/
+typedef struct _CUSEIMESSAGE {
+  unsigned char sei_message_type; /**< OUT: SEI Message Type      */
+  unsigned char reserved[3];
+  unsigned int sei_message_size; /**< OUT: SEI Message Size      */
+} CUSEIMESSAGE;
+
+/************************************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDEOFORMAT
+//! Video format
+//! Used in cuvidGetSourceVideoFormat API
+/************************************************************************************************/
+typedef struct {
+  cudaVideoCodec codec; /**< OUT: Compression format          */
+
+  /**
+   * OUT: frame rate = numerator / denominator (for example: 30000/1001)
+   */
+  struct {
+    /** OUT: frame rate numerator   (0 = unspecified or variable frame rate) */
+    unsigned int numerator;
+    /** OUT: frame rate denominator (0 = unspecified or variable frame rate) */
+    unsigned int denominator;
+  } frame_rate;
+
+  unsigned char progressive_sequence; /**< OUT: 0=interlaced, 1=progressive */
+  unsigned char bit_depth_luma_minus8; /**< OUT: high bit depth luma. E.g, 2 for
+                                          10-bitdepth, 4 for 12-bitdepth   */
+  unsigned char
+      bit_depth_chroma_minus8; /**< OUT: high bit depth chroma. E.g, 2 for
+                                  10-bitdepth, 4 for 12-bitdepth */
+  unsigned char
+      min_num_decode_surfaces; /**< OUT: Minimum number of decode surfaces to be
+                                  allocated for correct decoding. The client can
+                                  send this value in ulNumDecodeSurfaces (in
+                                  CUVIDDECODECREATEINFO structure). This
+                                  guarantees correct functionality and optimal
+                                  video memory usage but not necessarily the
+                                  best performance, which depends on the design
+                                  of the overall application. The optimal number
+                                  of decode surfaces (in terms of performance
+                                  and memory utilization) should be decided by
+                                  experimentation for each application, but it
+                                  cannot go below min_num_decode_surfaces.
+                                  If this value is used for
+                                  ulNumDecodeSurfaces then it must be
+                                  returned to parser during sequence
+                                  callback. */
+  unsigned int coded_width; /**< OUT: coded frame width in pixels */
+  unsigned int coded_height; /**< OUT: coded frame height in pixels */
+
+  /**
+   * area of the frame that should be displayed
+   * typical example:
+   * coded_width = 1920, coded_height = 1088
+   * display_area = { 0,0,1920,1080 }
+   */
+  struct {
+    int left; /**< OUT: left position of display rect    */
+    int top; /**< OUT: top position of display rect     */
+    int right; /**< OUT: right position of display rect   */
+    int bottom; /**< OUT: bottom position of display rect  */
+  } display_area;
+
+  cudaVideoChromaFormat chroma_format; /**< OUT:  Chroma format */
+  unsigned int bitrate; /**< OUT: video bitrate (bps, 0=unknown)   */
+
+  /**
+   * OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc)
+   */
+  struct {
+    int x;
+    int y;
+  } display_aspect_ratio;
+
+  /**
+   * Video Signal Description
+   * Refer section E.2.1 (VUI parameters semantics) of H264 spec file
+   */
+  struct {
+    unsigned char video_format : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC,
+                                       3-SECAM, 4-MAC, 5-Unspecified     */
+    unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level
+                                                and luma and chroma range */
+    unsigned char reserved_zero_bits : 4; /**< Reserved bits */
+    unsigned char color_primaries; /**< OUT: chromaticity coordinates of source
+                                      primaries                  */
+    unsigned char
+        transfer_characteristics; /**< OUT: opto-electronic transfer
+                                     characteristic of the source picture */
+    unsigned char matrix_coefficients; /**< OUT: used in deriving luma and
+                                          chroma signals from RGB primaries   */
+  } video_signal_description;
+
+  unsigned int seqhdr_data_length; /**< OUT: Additional bytes following
+                                      (CUVIDEOFORMATEX)                  */
+} CUVIDEOFORMAT;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDOPERATINGPOINTINFO
+//! Operating point information of scalable bitstream
+/****************************************************************/
+typedef struct {
+  cudaVideoCodec codec;
+
+  union {
+    struct {
+      unsigned char operating_points_cnt;
+      unsigned char reserved24_bits[3];
+      unsigned short operating_points_idc[32];
+    } av1;
+
+    unsigned char CodecReserved[1024];
+  };
+} CUVIDOPERATINGPOINTINFO;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSEIMESSAGEINFO
+//! Used in cuvidParseVideoData API with PFNVIDSEIMSGCALLBACK pfnGetSEIMsg
+/**********************************************************************************/
+typedef struct _CUVIDSEIMESSAGEINFO {
+  void* pSEIData; /**< OUT: SEI Message Data      */
+  CUSEIMESSAGE* pSEIMessage; /**< OUT: SEI Message Info      */
+  unsigned int sei_message_count; /**< OUT: SEI Message Count     */
+  unsigned int picIdx; /**< OUT: SEI Message Pic Index */
+} CUVIDSEIMESSAGEINFO;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDAV1SEQHDR
+//! AV1 specific sequence header information
+/****************************************************************/
+typedef struct {
+  unsigned int max_width;
+  unsigned int max_height;
+  unsigned char reserved[1016];
+} CUVIDAV1SEQHDR;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDEOFORMATEX
+//! Video format including raw sequence header information
+//! Used in cuvidGetSourceVideoFormat API
+/****************************************************************/
+typedef struct {
+  CUVIDEOFORMAT format; /**< OUT: CUVIDEOFORMAT structure */
+
+  union {
+    CUVIDAV1SEQHDR av1;
+    unsigned char raw_seqhdr_data[1024]; /**< OUT: Sequence header data    */
+  };
+} CUVIDEOFORMATEX;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUAUDIOFORMAT
+//! Audio formats
+//! Used in cuvidGetSourceAudioFormat API
+/****************************************************************/
+typedef struct {
+  cudaAudioCodec codec; /**< OUT: Compression format */
+  unsigned int channels; /**< OUT: number of audio channels */
+  unsigned int samplespersec; /**< OUT: sampling frequency */
+  unsigned int bitrate; /**< OUT: For uncompressed, can also be used to
+                           determine bits per sample */
+  unsigned int reserved1; /**< Reserved for future use */
+  unsigned int reserved2; /**< Reserved for future use */
+} CUAUDIOFORMAT;
+
+/***************************************************************/
+//! \enum CUvideopacketflags
+//! Data packet flags
+//! Used in CUVIDSOURCEDATAPACKET structure
+/***************************************************************/
+typedef enum {
+  CUVID_PKT_ENDOFSTREAM =
+      0x01, /**< Set when this is the last packet for this stream */
+  CUVID_PKT_TIMESTAMP = 0x02, /**< Timestamp is valid */
+  CUVID_PKT_DISCONTINUITY =
+      0x04, /**< Set when a discontinuity has to be signalled */
+  CUVID_PKT_ENDOFPICTURE =
+      0x08, /**< Set when the packet contains exactly one frame or one field */
+  CUVID_PKT_NOTIFY_EOS =
+      0x10, /**< If this flag is set along with CUVID_PKT_ENDOFSTREAM, an
+               additional (dummy) display callback will be invoked with null
+               value of CUVIDPARSERDISPINFO which should be interpreted as end
+               of the stream.                                   */
+} CUvideopacketflags;
+
+/*****************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSOURCEDATAPACKET
+//! Data Packet
+//! Used in cuvidParseVideoData API
+//! IN for cuvidParseVideoData
+/*****************************************************************************/
+typedef struct _CUVIDSOURCEDATAPACKET {
+  unsigned long flags; /**< IN: Combination of CUVID_PKT_XXX flags */
+  unsigned long payload_size; /**< IN: number of bytes in the payload (may be
+                                 zero if EOS flag is set) */
+  const unsigned char* payload; /**< IN: Pointer to packet payload data (may be
+                                   NULL if EOS flag is set) */
+  CUvideotimestamp
+      timestamp; /**< IN: Presentation time stamp (10MHz clock), only valid if
+                          CUVID_PKT_TIMESTAMP flag is set */
+} CUVIDSOURCEDATAPACKET;
+
+// Callback for packet delivery
+typedef int(CUDAAPI* PFNVIDSOURCECALLBACK)(void*, CUVIDSOURCEDATAPACKET*);
+
+/**************************************************************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSOURCEPARAMS
+//! Describes parameters needed in cuvidCreateVideoSource API
+//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource
+//! doesn't have audio demuxer for all supported containers. It's recommended to
+//! clients to use their own or third party demuxer if audio support is needed.
+/**************************************************************************************************************************/
+typedef struct _CUVIDSOURCEPARAMS {
+  unsigned int
+      ulClockRate; /**< IN: Time stamp units in Hz (0=default=10000000Hz) */
+  unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */
+  unsigned int uReserved : 31; /**< Reserved for future use - set to zero */
+  unsigned int uReserved1[6]; /**< Reserved for future use - set to zero */
+  void* pUserData; /**< IN: User private data passed in to the data handlers */
+  PFNVIDSOURCECALLBACK
+  pfnVideoDataHandler; /**< IN: Called to deliver video packets */
+  PFNVIDSOURCECALLBACK
+  pfnAudioDataHandler; /**< IN: Called to deliver audio packets. */
+  void* pvReserved2[8]; /**< Reserved for future use - set to NULL */
+} CUVIDSOURCEPARAMS;
+
+/**********************************************/
+//! \ingroup ENUMS
+//! \enum CUvideosourceformat_flags
+//! CUvideosourceformat_flags
+//! Used in cuvidGetSourceVideoFormat API
+/**********************************************/
+typedef enum {
+  CUVID_FMT_EXTFORMATINFO =
+      0x100 /**< Return extended format structure (CUVIDEOFORMATEX) */
+} CUvideosourceformat_flags;
+
+#if !defined(__APPLE__)
+/***************************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char
+//! *pszFileName, CUVIDSOURCEPARAMS *pParams) Create CUvideosource object.
+//! CUvideosource spawns demultiplexer thread that provides two callbacks:
+//! pfnVideoDataHandler() and pfnAudioDataHandler()
+//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource
+//! doesn't have audio demuxer for all supported containers. It's recommended to
+//! clients to use their own or third party demuxer if audio support is needed.
+/***************************************************************************************************************************/
+CUresult CUDAAPI cuvidCreateVideoSource(
+    CUvideosource* pObj,
+    const char* pszFileName,
+    CUVIDSOURCEPARAMS* pParams);
+
+/***************************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const
+//! wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams) Create video source
+/***************************************************************************************************************************/
+CUresult CUDAAPI cuvidCreateVideoSourceW(
+    CUvideosource* pObj,
+    const wchar_t* pwszFileName,
+    CUVIDSOURCEPARAMS* pParams);
+
+/********************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj)
+//! Destroy video source
+/********************************************************************/
+CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj);
+
+/******************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj,
+//! cudaVideoState state) Set video source state to: cudaVideoState_Started - to
+//! signal the source to run and deliver data; cudaVideoState_Stopped - to stop
+//! the source from delivering the data; cudaVideoState_Error - invalid source
+/******************************************************************************************/
+CUresult CUDAAPI
+cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state);
+
+/******************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj)
+//! Get video source state
+//! Returns:
+//! cudaVideoState_Started - if Source is running and delivering data
+//! cudaVideoState_Stopped - if Source is stopped or reached end-of-stream
+//! cudaVideoState_Error   - if Source is in error state
+/******************************************************************************************/
+cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj);
+
+/******************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj,
+//! CUVIDEOFORMAT *pvidfmt, unsigned int flags) Gets video source format in
+//! pvidfmt, flags is set to combination of CUvideosourceformat_flags as per
+//! requirement
+/******************************************************************************************************************/
+CUresult CUDAAPI cuvidGetSourceVideoFormat(
+    CUvideosource obj,
+    CUVIDEOFORMAT* pvidfmt,
+    unsigned int flags);
+
+/**************************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj,
+//! CUAUDIOFORMAT *paudfmt, unsigned int flags) Get audio source format. NVDECODE
+//! API is intended for HW accelerated video decoding so CUvideosource doesn't
+//! have audio demuxer for all supported containers. It's recommended to clients
+//! to use their own or third party demuxer if audio support is needed.
+/**************************************************************************************************************************/
+CUresult CUDAAPI cuvidGetSourceAudioFormat(
+    CUvideosource obj,
+    CUAUDIOFORMAT* paudfmt,
+    unsigned int flags);
+
+#endif
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDPARSERDISPINFO
+//! Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture
+/**********************************************************************************/
+typedef struct _CUVIDPARSERDISPINFO {
+  int picture_index; /**< OUT: Index of the current picture */
+  int progressive_frame; /**< OUT: 1 if progressive frame; 0 otherwise */
+  int top_field_first; /**< OUT: 1 if top field is displayed first; 0 otherwise
+                        */
+  int repeat_first_field; /**< OUT: Number of additional fields (1=ivtc, 2=frame
+                             doubling, 4=frame tripling, -1=unpaired field) */
+  CUvideotimestamp timestamp; /**< OUT: Presentation time stamp */
+} CUVIDPARSERDISPINFO;
+
+/***********************************************************************************************************************/
+//! Parser callbacks
+//! The parser calls these synchronously from within cuvidParseVideoData()
+//! whenever there is a sequence change or a picture is ready to be decoded
+//! and/or displayed. The first argument of each callback is the
+//! "void *pUserData" member of structure CUVIDPARSERPARAMS. If a callback
+//! returns failure, it is propagated by cuvidParseVideoData() to the
+//! application. The parser picks operating point 0 and outputAllLayers flag 0
+//! if PFNVIDOPPOINTCALLBACK is not set, or if it returns -1 or an invalid
+//! operating point. Return values from the callbacks are interpreted as below:
+//! PFNVIDSEQUENCECALLBACK : 0: fail, 1: succeeded, > 1: override dpb size of
+//!   parser (set by CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces at creation)
+//! PFNVIDDECODECALLBACK   : 0: fail, >=1: succeeded
+//! PFNVIDDISPLAYCALLBACK  : 0: fail, >=1: succeeded
+//! PFNVIDOPPOINTCALLBACK  : <0: fail, >=0: succeeded (bit 0-9: OperatingPoint,
+//!   bit 10-10: outputAllLayers, bit 11-30: reserved)
+//! PFNVIDSEIMSGCALLBACK   : 0: fail, >=1: succeeded
+/***********************************************************************************************************************/
+typedef int(CUDAAPI* PFNVIDSEQUENCECALLBACK)(void*, CUVIDEOFORMAT*);
+typedef int(CUDAAPI* PFNVIDDECODECALLBACK)(void*, CUVIDPICPARAMS*);
+typedef int(CUDAAPI* PFNVIDDISPLAYCALLBACK)(void*, CUVIDPARSERDISPINFO*);
+typedef int(CUDAAPI* PFNVIDOPPOINTCALLBACK)(void*, CUVIDOPERATINGPOINTINFO*);
+typedef int(CUDAAPI* PFNVIDSEIMSGCALLBACK)(void*, CUVIDSEIMESSAGEINFO*);
+
+/**************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDPARSERPARAMS
+//! Used in cuvidCreateVideoParser API
+/**************************************/
+typedef struct _CUVIDPARSERPARAMS {
+  cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */
+  unsigned int ulMaxNumDecodeSurfaces; /**< IN: Max # of decode surfaces (parser
+                                          will cycle through these)          */
+  unsigned int
+      ulClockRate; /**< IN: Timestamp units in Hz (0=default=10000000Hz) */
+  unsigned int ulErrorThreshold; /**< IN: % Error threshold (0-100) for calling
+                                    pfnDecodePicture (100=always call
+                                    pfnDecodePicture even if picture bitstream
+                                    is fully corrupted) */
+  unsigned int ulMaxDisplayDelay; /**< IN: Max display queue delay (improves
+                                     pipelining of decode with display) 0=no
+                                     delay (recommended values: 2..4) */
+  unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */
+  unsigned int uReserved : 31; /**< Reserved for future use - set to zero */
+  unsigned int uReserved1[4]; /**< IN: Reserved for future use - set to 0 */
+  void* pUserData; /**< IN: User data for callbacks */
+  PFNVIDSEQUENCECALLBACK
+  pfnSequenceCallback; /**< IN: Called before decoding frames and/or
+                          whenever there is a fmt change */
+  PFNVIDDECODECALLBACK pfnDecodePicture; /**< IN: Called when a picture is ready
+                                            to be decoded (decode order) */
+  PFNVIDDISPLAYCALLBACK
+  pfnDisplayPicture; /**< IN: Called whenever a picture is ready to be
+                        displayed (display order)  */
+  PFNVIDOPPOINTCALLBACK
+  pfnGetOperatingPoint; /**< IN: Called from AV1 sequence header to get
+                           operating point of a AV1 scalable bitstream */
+  PFNVIDSEIMSGCALLBACK pfnGetSEIMsg; /**< IN: Called when all SEI messages are
+                                        parsed for particular frame        */
+  void* pvReserved2[5]; /**< Reserved for future use - set to NULL */
+  CUVIDEOFORMATEX* pExtVideoInfo; /**< IN: [Optional] sequence header data from
+                                     system layer                   */
+} CUVIDPARSERPARAMS;
+
+/************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj,
+//! CUVIDPARSERPARAMS *pParams) Create video parser object and initialize
+/************************************************************************************************/
+CUresult CUDAAPI
+cuvidCreateVideoParser(CUvideoparser* pObj, CUVIDPARSERPARAMS* pParams);
+
+/************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj,
+//! CUVIDSOURCEDATAPACKET *pPacket) Parse the video data from source data packet
+//! in pPacket Extracts parameter sets like SPS, PPS, bitstream etc. from
+//! pPacket and calls back pfnDecodePicture with CUVIDPICPARAMS data for kicking
+//! of HW decoding calls back pfnSequenceCallback with CUVIDEOFORMAT data for
+//! initial sequence header or when the decoder encounters a video format change
+//! calls back pfnDisplayPicture with CUVIDPARSERDISPINFO data to display a
+//! video frame
+/************************************************************************************************/
+CUresult CUDAAPI
+cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET* pPacket);
+
+/************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj)
+//! Destroy the video parser
+/************************************************************************************************/
+CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj);
+
+/**********************************************************************************************/
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif // __NVCUVID_H__
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
index d618b8d9f..801032011 100644
--- a/src/torchcodec/_core/ops.py
+++ b/src/torchcodec/_core/ops.py
@@ -275,7 +275,8 @@ def _add_video_stream_abstract(
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
-    device: Optional[str] = None,
+    device: str = "cpu",
+    device_variant: str = "default",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
@@ -293,7 +294,8 @@ def add_video_stream_abstract(
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
-    device: Optional[str] = None,
+    device: str = "cpu",
+    device_variant: str = "default",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
index 05c391766..de2653e21 100644
--- a/src/torchcodec/decoders/_video_decoder.py
+++ b/src/torchcodec/decoders/_video_decoder.py
@@ -143,12 +143,31 @@ def __init__(
         if isinstance(device, torch_device):
             device = str(device)
 
+        # If device looks like "cuda:0:beta", make it "cuda:0" and set
+        # device_variant to "beta"
+        # TODONVDEC P2 Consider alternative ways of exposing custom device
+        # variants, and if we want this new decoder backend to be a "device
+        # variant" at all.
+        device_variant = "default"
+        if device is not None:
+            device_split = device.split(":")
+            if len(device_split) == 3:
+                device_variant = device_split[2]
+                device = ":".join(device_split[0:2])
+
+        # TODONVDEC P0 Support approximate mode. Not ideal to validate that here
+        # either, but validating this at a lower level forces to add yet another
+        # (temporary) validation API to the device interface
+        if device_variant == "beta" and seek_mode != "exact":
+            raise ValueError("Seek mode must be exact for BETA CUDA interface.")
+
         core.add_video_stream(
             self._decoder,
             stream_index=stream_index,
             dimension_order=dimension_order,
             num_threads=num_ffmpeg_threads,
             device=device,
+            device_variant=device_variant,
             custom_frame_mappings=custom_frame_mappings_data,
         )
 
diff --git a/test/resources/testsrc2.mp4 b/test/resources/testsrc2.mp4
new file mode 100644
index 000000000..4694b4533
Binary files /dev/null and b/test/resources/testsrc2.mp4 differ
diff --git a/test/test_decoders.py b/test/test_decoders.py
index 5f128e3e0..bfeeca47a 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -43,6 +43,7 @@
     SINE_MONO_S32,
     SINE_MONO_S32_44100,
     SINE_MONO_S32_8000,
+    TEST_SRC_2_720P,
 )
 
 
@@ -1401,6 +1402,80 @@ def test_get_frames_at_tensor_indices(self):
         decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.int))
         decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.float))
 
+    # TODONVDEC P1 unskip equality assertion checks on FFmpeg 4. The comparison
+    # checks are failing on a small fraction of pixels, e.g.:
+    #
+    # E   Mismatched elements: 648586 / 82944000 (0.8%)
+    # E   Greatest absolute difference: 164 at index (20, 2, 27, 96)
+    # E   Greatest relative difference: inf at index (5, 1, 112, 186)
+    #
+    # So we're skipping them to unblock for now, but we should call
+    # assert_tensor_close_on_at_least or something like that.
+
+    @needs_cuda
+    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize("contiguous_indices", (True, False))
+    def test_beta_cuda_interface_get_frame_at(self, asset, contiguous_indices):
+        ref_decoder = VideoDecoder(asset.path, device="cuda")
+        beta_decoder = VideoDecoder(asset.path, device="cuda:0:beta")
+
+        assert ref_decoder.metadata == beta_decoder.metadata
+
+        if contiguous_indices:
+            indices = range(len(ref_decoder))
+        else:
+            indices = range(0, len(ref_decoder), 10)
+
+        for frame_index in indices:
+            ref_frame = ref_decoder.get_frame_at(frame_index)
+            beta_frame = beta_decoder.get_frame_at(frame_index)
+            if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+                torch.testing.assert_close(
+                    beta_frame.data, ref_frame.data, rtol=0, atol=0
+                )
+
+            assert beta_frame.pts_seconds == ref_frame.pts_seconds
+            assert beta_frame.duration_seconds == ref_frame.duration_seconds
+
+    @needs_cuda
+    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize("contiguous_indices", (True, False))
+    def test_beta_cuda_interface_get_frames_at(self, asset, contiguous_indices):
+        ref_decoder = VideoDecoder(asset.path, device="cuda")
+        beta_decoder = VideoDecoder(asset.path, device="cuda:0:beta")
+
+        assert ref_decoder.metadata == beta_decoder.metadata
+
+        if contiguous_indices:
+            indices = range(len(ref_decoder))
+        else:
+            indices = range(0, len(ref_decoder), 10)
+        indices = list(indices)
+
+        ref_frames = ref_decoder.get_frames_at(indices)
+        beta_frames = beta_decoder.get_frames_at(indices)
+        if get_ffmpeg_major_version() > 4:  # TODONVDEC P1 see above
+            torch.testing.assert_close(
+                beta_frames.data, ref_frames.data, rtol=0, atol=0
+            )
+        torch.testing.assert_close(beta_frames.pts_seconds, ref_frames.pts_seconds)
+        torch.testing.assert_close(
+            beta_frames.duration_seconds, ref_frames.duration_seconds
+        )
+
+    @needs_cuda
+    def test_beta_cuda_interface_error(self):
+        with pytest.raises(RuntimeError, match="Can only do H264 for now"):
+            VideoDecoder(AV1_VIDEO.path, device="cuda:0:beta")
+        with pytest.raises(RuntimeError, match="Can only do H264 for now"):
+            VideoDecoder(H265_VIDEO.path, device="cuda:0:beta")
+        with pytest.raises(
+            ValueError, match="Seek mode must be exact for BETA CUDA interface."
+        ):
+            VideoDecoder(NASA_VIDEO.path, device="cuda:0:beta", seek_mode="approximate")
+        with pytest.raises(RuntimeError, match="Unsupported device"):
+            VideoDecoder(NASA_VIDEO.path, device="cuda:0:bad_variant")
+
 
 class TestAudioDecoder:
     @pytest.mark.parametrize("asset", (NASA_AUDIO, NASA_AUDIO_MP3, SINE_MONO_S32))
diff --git a/test/utils.py b/test/utils.py
index b186dbb41..3ce603a9f 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -678,3 +678,13 @@ def sample_format(self) -> str:
     },
     frames={0: {}},  # Not needed for now
 )
+
+# ffmpeg -f lavfi -i testsrc2=duration=2:size=1280x720:rate=30 -c:v libx264 -profile:v baseline -level 3.1 -pix_fmt yuv420p -b:v 2500k -r 30 -movflags +faststart output_720p_2s.mp4
+TEST_SRC_2_720P = TestVideo(
+    filename="testsrc2.mp4",
+    default_stream_index=0,
+    stream_infos={
+        0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
+    },
+    frames={0: {}},  # Not needed for now
+)