meta-pytorch · NicolasHug · Oct 4, 2025 · Sep 25, 2025 · Sep 26, 2025 · Sep 30, 2025
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -109,32 +109,52 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
       caps.nMaxMBCount);
 
   // Decoder creation parameters, taken from DALI
-  CUVIDDECODECREATEINFO decoder_info = {};
-  decoder_info.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
-  decoder_info.ChromaFormat = videoFormat->chroma_format;
-  decoder_info.CodecType = videoFormat->codec;
-  decoder_info.ulHeight = videoFormat->coded_height;
-  decoder_info.ulWidth = videoFormat->coded_width;
-  decoder_info.ulMaxHeight = videoFormat->coded_height;
-  decoder_info.ulMaxWidth = videoFormat->coded_width;
-  decoder_info.ulTargetHeight =
+  CUVIDDECODECREATEINFO decoderParams = {};
+  decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
+  decoderParams.ChromaFormat = videoFormat->chroma_format;
+  decoderParams.OutputFormat = cudaVideoSurfaceFormat_NV12;
+  decoderParams.ulCreationFlags = cudaVideoCreate_Default;
+  decoderParams.CodecType = videoFormat->codec;
+  decoderParams.ulHeight = videoFormat->coded_height;
+  decoderParams.ulWidth = videoFormat->coded_width;
+  decoderParams.ulMaxHeight = videoFormat->coded_height;
+  decoderParams.ulMaxWidth = videoFormat->coded_width;
+  decoderParams.ulTargetHeight =
       videoFormat->display_area.bottom - videoFormat->display_area.top;
-  decoder_info.ulTargetWidth =
+  decoderParams.ulTargetWidth =
       videoFormat->display_area.right - videoFormat->display_area.left;
-  decoder_info.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
-  decoder_info.ulNumOutputSurfaces = 2;
-  decoder_info.display_area.left = videoFormat->display_area.left;
-  decoder_info.display_area.right = videoFormat->display_area.right;
-  decoder_info.display_area.top = videoFormat->display_area.top;
-  decoder_info.display_area.bottom = videoFormat->display_area.bottom;
+  decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
+  decoderParams.ulNumOutputSurfaces = 2;
+  decoderParams.display_area.left = videoFormat->display_area.left;
+  decoderParams.display_area.right = videoFormat->display_area.right;
+  decoderParams.display_area.top = videoFormat->display_area.top;
+  decoderParams.display_area.bottom = videoFormat->display_area.bottom;
 
   CUvideodecoder* decoder = new CUvideodecoder();
-  result = cuvidCreateDecoder(decoder, &decoder_info);
+  result = cuvidCreateDecoder(decoder, &decoderParams);
   TORCH_CHECK(
       result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
   return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
 }
 
+// Translate an FFmpeg codec id into the matching cudaVideoCodec value. Any
+// codec that is not listed below is rejected through TORCH_CHECK.
+cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
+  // TODONVDEC P0: support more codecs. Candidates and their cudaVideoCodec:
+  //   AV_CODEC_ID_AV1   -> cudaVideoCodec_AV1
+  //   AV_CODEC_ID_MPEG4 -> cudaVideoCodec_MPEG4
+  //   AV_CODEC_ID_VP8   -> cudaVideoCodec_VP8
+  //   AV_CODEC_ID_VP9   -> cudaVideoCodec_VP9
+  //   AV_CODEC_ID_MJPEG -> cudaVideoCodec_JPEG
+  if (codecId == AV_CODEC_ID_H264) {
+    return cudaVideoCodec_H264;
+  }
+  if (codecId == AV_CODEC_ID_HEVC) {
+    return cudaVideoCodec_HEVC;
+  }
+  TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
+}
+
 } // namespace
 
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -160,36 +180,100 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
   }
 }
 
-void BetaCudaDeviceInterface::initialize(const AVStream* avStream) {
+void BetaCudaDeviceInterface::initialize(
+    const AVStream* avStream,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
   torch::Tensor dummyTensorForCudaInitialization = torch::empty(
       {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
 
   TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
   timeBase_ = avStream->time_base;
 
   auto cudaDevice = torch::Device(torch::kCUDA);
   defaultCudaInterface_ =
       std::unique_ptr<DeviceInterface>(createDeviceInterface(cudaDevice));
   AVCodecContext dummyCodecContext = {};
-  defaultCudaInterface_->initialize(avStream);
+  defaultCudaInterface_->initialize(avStream, avFormatCtx);
   defaultCudaInterface_->registerHardwareDeviceWithCodec(&dummyCodecContext);
 
-  const AVCodecParameters* codecpar = avStream->codecpar;
-  TORCH_CHECK(codecpar != nullptr, "CodecParameters cannot be null");
+  const AVCodecParameters* codecPar = avStream->codecpar;
+  TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");
+  // Sets up bitstreamFilter_ only if the codec/container pair needs one.
+  initializeBSF(codecPar, avFormatCtx);
+
+  // Create parser. Default values that aren't obvious are taken from DALI.
+  CUVIDPARSERPARAMS parserParams = {};
+  parserParams.CodecType = validateCodecSupport(codecPar->codec_id);
+  parserParams.ulMaxNumDecodeSurfaces = 8;
+  parserParams.ulMaxDisplayDelay = 0;
+  // Callback setup, all are triggered by the parser within a call
+  // to cuvidParseVideoData
+  parserParams.pUserData = this;
+  parserParams.pfnSequenceCallback = pfnSequenceCallback;
+  parserParams.pfnDecodePicture = pfnDecodePictureCallback;
+  parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;
 
+  CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
   TORCH_CHECK(
-      // TODONVDEC P0 support more
-      avStream->codecpar->codec_id == AV_CODEC_ID_H264,
-      "Can only do H264 for now");
+      result == CUDA_SUCCESS, "Failed to create video parser: ", result);
+}
 
+void BetaCudaDeviceInterface::initializeBSF(
+    const AVCodecParameters* codecPar,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
   // Setup bit stream filters (BSF):
   // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
-  // This is only needed for some formats, like H264 or HEVC.  TODONVDEC P1: For
-  // now we apply BSF unconditionally, but it should be optional  and dependent
-  // on codec and container.
-  const AVBitStreamFilter* avBSF = av_bsf_get_by_name("h264_mp4toannexb");
+  // Only some codec/container pairs need a filter; see the switch below.
+
+  TORCH_CHECK(codecPar != nullptr, "codecPar cannot be null");
+  TORCH_CHECK(avFormatCtx != nullptr, "AVFormatContext cannot be null");
+  TORCH_CHECK(
+      avFormatCtx->iformat != nullptr,
+      "AVFormatContext->iformat cannot be null");
+  std::string filterName;
+
+  // Matching logic is taken from DALI
+  switch (codecPar->codec_id) {
+    case AV_CODEC_ID_H264: {
+      const std::string formatName = avFormatCtx->iformat->long_name
+          ? avFormatCtx->iformat->long_name
+          : "";
+
+      if (formatName == "QuickTime / MOV" ||
+          formatName == "FLV (Flash Video)" ||
+          formatName == "Matroska / WebM" || formatName == "raw H.264 video") {
+        filterName = "h264_mp4toannexb";
+      }
+      break;
+    }
+
+    case AV_CODEC_ID_HEVC: {
+      const std::string formatName = avFormatCtx->iformat->long_name
+          ? avFormatCtx->iformat->long_name
+          : "";
+
+      if (formatName == "QuickTime / MOV" ||
+          formatName == "FLV (Flash Video)" ||
+          formatName == "Matroska / WebM" || formatName == "raw HEVC video") {
+        filterName = "hevc_mp4toannexb";
+      }
+      break;
+    }
+
+    default:
+      // No bitstream filter is set up for any other codec yet.
+      // TODONVDEC P1 MPEG4 will need one!
+      break;
+  }
+
+  if (filterName.empty()) {
+    // Only initialize BSF if we actually need one
+    return;
+  }
+
+  const AVBitStreamFilter* avBSF = av_bsf_get_by_name(filterName.c_str());
   TORCH_CHECK(
-      avBSF != nullptr, "Failed to find h264_mp4toannexb bitstream filter");
+      avBSF != nullptr, "Failed to find bitstream filter: ", filterName);
 
   AVBSFContext* avBSFContext = nullptr;
   int retVal = av_bsf_alloc(avBSF, &avBSFContext);
@@ -200,7 +284,7 @@ void BetaCudaDeviceInterface::initialize(const AVStream* avStream) {
 
   bitstreamFilter_.reset(avBSFContext);
 
-  retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecpar);
+  retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecPar);
   TORCH_CHECK(
       retVal >= AVSUCCESS,
       "Failed to copy codec parameters: ",
@@ -211,22 +295,6 @@ void BetaCudaDeviceInterface::initialize(const AVStream* avStream) {
       retVal == AVSUCCESS,
       "Failed to initialize bitstream filter: ",
       getFFMPEGErrorStringFromErrorCode(retVal));
-
-  // Create parser. Default values that aren't obvious are taken from DALI.
-  CUVIDPARSERPARAMS parserParams = {};
-  parserParams.CodecType = cudaVideoCodec_H264;
-  parserParams.ulMaxNumDecodeSurfaces = 8;
-  parserParams.ulMaxDisplayDelay = 0;
-  // Callback setup, all are triggered by the parser within a call
-  // to cuvidParseVideoData
-  parserParams.pUserData = this;
-  parserParams.pfnSequenceCallback = pfnSequenceCallback;
-  parserParams.pfnDecodePicture = pfnDecodePictureCallback;
-  parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;
-
-  CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
-  TORCH_CHECK(
-      result == CUDA_SUCCESS, "Failed to create video parser: ", result);
 }
 
 // This callback is called by the parser within cuvidParseVideoData when there
@@ -360,6 +428,10 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
   readyFrames_.pop();
 
+  // TODONVDEC P1 we need to set the procParams.output_stream field to the
+  // current CUDA stream and ensure proper synchronization. There's a related
+  // NVDECTODO in CudaDeviceInterface.cpp where we do the necessary
+  // synchronization for NPP.
   CUVIDPROCPARAMS procParams = {};
   procParams.progressive_frame = dispInfo.progressive_frame;
   procParams.top_field_first = dispInfo.top_field_first;

diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -37,7 +37,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   explicit BetaCudaDeviceInterface(const torch::Device& device);
   virtual ~BetaCudaDeviceInterface();
 
-  void initialize(const AVStream* avStream) override;
+  void initialize(
+      const AVStream* avStream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) override;
 
   void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
@@ -61,6 +63,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
  private:
   // Apply bitstream filter, modifies packet in-place
   void applyBSF(ReferenceAVPacket& packet);
+  void initializeBSF(
+      const AVCodecParameters* codecPar,
+      const UniqueDecodingAVFormatContext& avFormatCtx);
 
   UniqueAVFrame convertCudaFrameToAVFrame(
       CUdeviceptr framePtr,

diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -46,7 +46,9 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
       device_.type() == torch::kCPU, "Unsupported device: ", device_.str());
 }
 
-void CpuDeviceInterface::initialize(const AVStream* avStream) {
+void CpuDeviceInterface::initialize(
+    const AVStream* avStream,
+    [[maybe_unused]] const UniqueDecodingAVFormatContext& avFormatCtx) {
   TORCH_CHECK(avStream != nullptr, "avStream is null");
   timeBase_ = avStream->time_base;
 }

diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -23,7 +23,9 @@ class CpuDeviceInterface : public DeviceInterface {
     return std::nullopt;
   }
 
-  virtual void initialize(const AVStream* avStream) override;
+  virtual void initialize(
+      const AVStream* avStream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) override;
 
   virtual void initializeVideo(
       const VideoStreamOptions& videoStreamOptions,

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -203,14 +203,16 @@ CudaDeviceInterface::~CudaDeviceInterface() {
   }
 }
 
-void CudaDeviceInterface::initialize(const AVStream* avStream) {
+void CudaDeviceInterface::initialize(
+    const AVStream* avStream,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
   TORCH_CHECK(avStream != nullptr, "avStream is null");
   timeBase_ = avStream->time_base;
 
   cpuInterface_ = createDeviceInterface(torch::kCPU);
   TORCH_CHECK(
       cpuInterface_ != nullptr, "Failed to create CPU device interface");
-  cpuInterface_->initialize(avStream);
+  cpuInterface_->initialize(avStream, avFormatCtx);
   cpuInterface_->initializeVideo(
       VideoStreamOptions(),
       {},

diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -20,7 +20,9 @@ class CudaDeviceInterface : public DeviceInterface {
 
   std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) override;
 
-  void initialize(const AVStream* avStream) override;
+  void initialize(
+      const AVStream* avStream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) override;
 
   void initializeVideo(
       const VideoStreamOptions& videoStreamOptions,

diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -52,7 +52,9 @@ class DeviceInterface {
   };
 
   // Initialize the device with parameters generic to all kinds of decoding.
-  virtual void initialize(const AVStream* avStream) = 0;
+  virtual void initialize(
+      const AVStream* avStream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) = 0;
 
   // Initialize the device with parameters specific to video decoding. There is
   // a default empty implementation.

diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -439,7 +439,7 @@ void SingleStreamDecoder::addStream(
   TORCH_CHECK(
       deviceInterface_ != nullptr,
       "Failed to create device interface. This should never happen, please report.");
-  deviceInterface_->initialize(streamInfo.stream);
+  deviceInterface_->initialize(streamInfo.stream, formatContext_);
 
   // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within
   // addStream() which is supposed to be generic

diff --git a/test/resources/testsrc2_h265.mp4 b/test/resources/testsrc2_h265.mp4