Add h265 support

NicolasHug · NicolasHug · commit 12c75e76b211 · 2025-10-03T15:45:52.000+01:00
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -138,6 +138,24 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
 }
 
+// Maps an FFmpeg AVCodecID to the matching cudaVideoCodec value.
+// Fails through TORCH_CHECK for every codec id that is not listed below.
+cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
+  switch (codecId) {
+    case AV_CODEC_ID_H264:
+      return cudaVideoCodec_H264;
+    case AV_CODEC_ID_HEVC:
+      return cudaVideoCodec_HEVC;
+    // TODONVDEC P0: support more codecs, e.g. AV1 -> cudaVideoCodec_AV1,
+    // MPEG4 -> cudaVideoCodec_MPEG4, VP8 -> cudaVideoCodec_VP8,
+    // VP9 -> cudaVideoCodec_VP9, MJPEG -> cudaVideoCodec_JPEG.
+    default:
+      break;
+  }
+  TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
+}
+
 } // namespace
 
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -163,29 +181,62 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
   }
 }
 
-void BetaCudaDeviceInterface::initializeInterface(AVStream* avStream) {
-  torch::Tensor dummyTensorForCudaInitialization = torch::empty(
-      {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
+void BetaCudaDeviceInterface::initializeBSF(
+    const AVCodecParameters* codecPar,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
+  // Setup bit stream filters (BSF):
+  // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
+  // This is only needed for some formats, like H264 or HEVC.
 
-  TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
-  timeBase_ = avStream->time_base;
+  TORCH_CHECK(codecPar != nullptr, "codecPar cannot be null");
+  TORCH_CHECK(avFormatCtx != nullptr, "AVFormatContext cannot be null");
+  TORCH_CHECK(
+      avFormatCtx->iformat != nullptr,
+      "AVFormatContext->iformat cannot be null");
+  std::string filterName;
+
+  // Matching logic is taken from DALI
+  switch (codecPar->codec_id) {
+    case AV_CODEC_ID_H264: {
+      const std::string formatName = avFormatCtx->iformat->long_name
+          ? avFormatCtx->iformat->long_name
+          : "";
+
+      if (formatName == "QuickTime / MOV" ||
+          formatName == "FLV (Flash Video)" ||
+          formatName == "Matroska / WebM" || formatName == "raw H.264 video") {
+        filterName = "h264_mp4toannexb";
+      }
+      break;
+    }
 
-  const AVCodecParameters* codecpar = avStream->codecpar;
-  TORCH_CHECK(codecpar != nullptr, "CodecParameters cannot be null");
+    case AV_CODEC_ID_HEVC: {
+      const std::string formatName = avFormatCtx->iformat->long_name
+          ? avFormatCtx->iformat->long_name
+          : "";
 
-  TORCH_CHECK(
-      // TODONVDEC P0 support more
-      avStream->codecpar->codec_id == AV_CODEC_ID_H264,
-      "Can only do H264 for now");
+      if (formatName == "QuickTime / MOV" ||
+          formatName == "FLV (Flash Video)" ||
+          formatName == "Matroska / WebM" || formatName == "raw HEVC video") {
+        filterName = "hevc_mp4toannexb";
+      }
+      break;
+    }
 
-  // Setup bit stream filters (BSF):
-  // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
-  // This is only needed for some formats, like H264 or HEVC.  TODONVDEC P1: For
-  // now we apply BSF unconditionally, but it should be optional  and dependent
-  // on codec and container.
-  const AVBitStreamFilter* avBSF = av_bsf_get_by_name("h264_mp4toannexb");
+    default:
+      // No bitstream filter needed for other codecs
+      // TODONVDEC P1 MPEG4 will need one!
+      break;
+  }
+
+  if (filterName.empty()) {
+    // Only initialize BSF if we actually need one
+    return;
+  }
+
+  const AVBitStreamFilter* avBSF = av_bsf_get_by_name(filterName.c_str());
   TORCH_CHECK(
-      avBSF != nullptr, "Failed to find h264_mp4toannexb bitstream filter");
+      avBSF != nullptr, "Failed to find bitstream filter: ", filterName);
 
   AVBSFContext* avBSFContext = nullptr;
   int retVal = av_bsf_alloc(avBSF, &avBSFContext);
@@ -196,7 +247,7 @@ void BetaCudaDeviceInterface::initializeInterface(AVStream* avStream) {
 
   bitstreamFilter_.reset(avBSFContext);
 
-  retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecpar);
+  retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecPar);
   TORCH_CHECK(
       retVal >= AVSUCCESS,
       "Failed to copy codec parameters: ",
@@ -207,10 +258,25 @@ void BetaCudaDeviceInterface::initializeInterface(AVStream* avStream) {
       retVal == AVSUCCESS,
       "Failed to initialize bitstream filter: ",
       getFFMPEGErrorStringFromErrorCode(retVal));
+}
+
+void BetaCudaDeviceInterface::initializeInterface(
+    const AVStream* avStream,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
+  torch::Tensor dummyTensorForCudaInitialization = torch::empty(
+      {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
+
+  TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
+  timeBase_ = avStream->time_base;
+
+  const AVCodecParameters* codecPar = avStream->codecpar;
+  TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");
+
+  initializeBSF(codecPar, avFormatCtx);
 
   // Create parser. Default values that aren't obvious are taken from DALI.
   CUVIDPARSERPARAMS parserParams = {};
-  parserParams.CodecType = cudaVideoCodec_H264;
+  parserParams.CodecType = validateCodecSupport(codecPar->codec_id);
   parserParams.ulMaxNumDecodeSurfaces = 8;
   parserParams.ulMaxDisplayDelay = 0;
   // Callback setup, all are triggered by the parser within a call
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -37,7 +37,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   explicit BetaCudaDeviceInterface(const torch::Device& device);
   virtual ~BetaCudaDeviceInterface();
 
-  void initializeInterface(AVStream* stream) override;
+  void initializeInterface(
+      const AVStream* stream,
+      const UniqueDecodingAVFormatContext& avFormatCtx) override;
 
   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
@@ -63,6 +65,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
  private:
   // Apply bitstream filter, modifies packet in-place
   void applyBSF(ReferenceAVPacket& packet);
+  void initializeBSF(
+      const AVCodecParameters* codecPar,
+      const UniqueDecodingAVFormatContext& avFormatCtx);
 
   UniqueAVFrame convertCudaFrameToAVFrame(
       CUdeviceptr framePtr,
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -55,7 +55,9 @@ class DeviceInterface {
   virtual void initializeContext(
       [[maybe_unused]] AVCodecContext* codecContext) {}
 
-  virtual void initializeInterface([[maybe_unused]] AVStream* stream) {}
+  virtual void initializeInterface(
+      [[maybe_unused]] const AVStream* stream,
+      [[maybe_unused]] const UniqueDecodingAVFormatContext& avFormatCtx) {}
 
   virtual void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -462,7 +462,7 @@ void SingleStreamDecoder::addStream(
   if (mediaType == AVMEDIA_TYPE_VIDEO) {
     if (deviceInterface_) {
       deviceInterface_->initializeContext(codecContext);
-      deviceInterface_->initializeInterface(streamInfo.stream);
+      deviceInterface_->initializeInterface(streamInfo.stream, formatContext_);
     }
   }
 
diff --git a/test/resources/testsrc2_h265.mp4 b/test/resources/testsrc2_h265.mp4
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -44,6 +44,7 @@
     SINE_MONO_S32_44100,
     SINE_MONO_S32_8000,
     TEST_SRC_2_720P,
+    TEST_SRC_2_720P_H265,
 )
 
 
@@ -1415,7 +1416,9 @@ def test_get_frames_at_tensor_indices(self):
     # assert_tensor_close_on_at_least or something like that.
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frame_at(
@@ -1445,7 +1448,9 @@ def test_beta_cuda_interface_get_frame_at(
             assert beta_frame.duration_seconds == ref_frame.duration_seconds
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("contiguous_indices", (True, False))
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frames_at(
@@ -1476,7 +1481,9 @@ def test_beta_cuda_interface_get_frames_at(
         )
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
@@ -1498,7 +1505,9 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
             assert beta_frame.duration_seconds == ref_frame.duration_seconds
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
         ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
@@ -1521,7 +1530,9 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
         )
 
     @needs_cuda
-    @pytest.mark.parametrize("asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE))
+    @pytest.mark.parametrize(
+        "asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
+    )
     @pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
     def test_beta_cuda_interface_backwards(self, asset, seek_mode):
 
@@ -1541,12 +1552,24 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
             assert beta_frame.pts_seconds == ref_frame.pts_seconds
             assert beta_frame.duration_seconds == ref_frame.duration_seconds
 
+    @needs_cuda
+    def test_beta_cuda_interface_small_h265(self):
+        # The beta interface rejects this 128x128 H265 video, while the default
+        # CUDA interface decodes it without error.
+        # TODONVDEC P2 investigate why/how the default interface can decode it.
+        VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
+        # The beta interface fails on input validation checks taken from DALI.
+        # `match` is a regex (re.search): raw string with the literal dot escaped.
+        with pytest.raises(
+            RuntimeError,
+            match=r"Video is too small in at least one dimension\. Provided: 128x128 vs supported:144x144",
+        ):
+            VideoDecoder(H265_VIDEO.path, device="cuda:0:beta").get_frame_at(0)
+
     @needs_cuda
     def test_beta_cuda_interface_error(self):
-        with pytest.raises(RuntimeError, match="Can only do H264 for now"):
+        with pytest.raises(RuntimeError, match="Unsupported codec type: av1"):
             VideoDecoder(AV1_VIDEO.path, device="cuda:0:beta")
-        with pytest.raises(RuntimeError, match="Can only do H264 for now"):
-            VideoDecoder(H265_VIDEO.path, device="cuda:0:beta")
         with pytest.raises(RuntimeError, match="Unsupported device"):
             VideoDecoder(NASA_VIDEO.path, device="cuda:0:bad_variant")
 
diff --git a/test/utils.py b/test/utils.py
@@ -688,3 +688,12 @@ def sample_format(self) -> str:
     },
     frames={0: {}},  # Not needed for now
 )
+# Presumably generated with: ffmpeg -f lavfi -i testsrc2=duration=10:size=1280x720:rate=30 -c:v libx265 -crf 23 -preset medium output.mp4
+TEST_SRC_2_720P_H265 = TestVideo(
+    filename="testsrc2_h265.mp4",
+    default_stream_index=0,
+    stream_infos={
+        0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
+    },
+    frames={0: {}},  # Not needed for now
+)

Original file line number	Diff line number	Diff line change
`@@ -462,7 +462,7 @@ void SingleStreamDecoder::addStream(`
`462`	`462`	`if (mediaType == AVMEDIA_TYPE_VIDEO) {`
`463`	`463`	`if (deviceInterface_) {`
`464`	`464`	`deviceInterface_->initializeContext(codecContext);`
`465`		`- deviceInterface_->initializeInterface(streamInfo.stream);`
	`465`	`+ deviceInterface_->initializeInterface(streamInfo.stream, formatContext_);`
`466`	`466`	`}`
`467`	`467`	`}`
`468`	`468`