Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
78ab058
Let's just commit 3k loc in a single commit
NicolasHug Sep 25, 2025
b45decc
Fixes
NicolasHug Sep 26, 2025
316f218
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Sep 30, 2025
d0192ec
GetCache -> getCache
NicolasHug Sep 30, 2025
515deb5
Make UniqueCUvideodecoder a pointer on CUvideodecoder, not void
NicolasHug Sep 30, 2025
13fad10
Make device and device_variant have a default instead of being std::o…
NicolasHug Sep 30, 2025
eb8de72
Remove old registerDeviceInterface
NicolasHug Sep 30, 2025
4f7a4fb
Call std::memset
NicolasHug Sep 30, 2025
dcf3124
remove unnecessary cuda_runtime.h include, update cmake accordingly
NicolasHug Sep 30, 2025
0ad7370
abstract frameBuffer_ into a FrameBuffer class
NicolasHug Sep 30, 2025
aad142e
Cleanup BSF logic
NicolasHug Sep 30, 2025
2592888
Return int in callback instead of unsigned char
NicolasHug Sep 30, 2025
b5fe9bc
define width and height as unsigned int
NicolasHug Sep 30, 2025
5605c90
Rework frame ordering and pts matching
NicolasHug Oct 1, 2025
7494259
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Oct 1, 2025
560b376
Fix cuda context initialization
NicolasHug Oct 1, 2025
88196c5
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug Oct 1, 2025
2a78b84
Renaming
NicolasHug Oct 1, 2025
5d194e5
Comment
NicolasHug Oct 1, 2025
d1e51b3
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Oct 2, 2025
f9c7297
Skip equality check on ffmpeg 4
NicolasHug Oct 2, 2025
b7bbfb2
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug Oct 2, 2025
390fd7c
Refac, simplify
NicolasHug Oct 2, 2025
f55dcc0
Update comment
NicolasHug Oct 2, 2025
7e4dd10
Define constant, add TODO for AVRational
NicolasHug Oct 2, 2025
f614846
Use uint32_t types
NicolasHug Oct 2, 2025
aa6e253
Create packet.reset() and add P0 TODO
NicolasHug Oct 2, 2025
186eaa4
Add TODO
NicolasHug Oct 2, 2025
1cb4890
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug Oct 2, 2025
c5b32a4
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-rewor…
NicolasHug Oct 2, 2025
70873bf
lint
NicolasHug Oct 2, 2025
8e73bcf
Add TODOs and more explicit initialization
NicolasHug Oct 3, 2025
12c75e7
Add h265 support
NicolasHug Oct 2, 2025
9b63504
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-param…
NicolasHug Oct 3, 2025
718a3e3
Merge branch 'nvdec-params-and-todos' into nvdec-h265
NicolasHug Oct 3, 2025
c42388f
put initializeBSF below
NicolasHug Oct 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 121 additions & 49 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,32 +109,52 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
caps.nMaxMBCount);

// Decoder creation parameters, taken from DALI
CUVIDDECODECREATEINFO decoder_info = {};
decoder_info.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
decoder_info.ChromaFormat = videoFormat->chroma_format;
decoder_info.CodecType = videoFormat->codec;
decoder_info.ulHeight = videoFormat->coded_height;
decoder_info.ulWidth = videoFormat->coded_width;
decoder_info.ulMaxHeight = videoFormat->coded_height;
decoder_info.ulMaxWidth = videoFormat->coded_width;
decoder_info.ulTargetHeight =
CUVIDDECODECREATEINFO decoderParams = {};
decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
decoderParams.ChromaFormat = videoFormat->chroma_format;
decoderParams.OutputFormat = cudaVideoSurfaceFormat_NV12;
decoderParams.ulCreationFlags = cudaVideoCreate_Default;
decoderParams.CodecType = videoFormat->codec;
decoderParams.ulHeight = videoFormat->coded_height;
decoderParams.ulWidth = videoFormat->coded_width;
decoderParams.ulMaxHeight = videoFormat->coded_height;
decoderParams.ulMaxWidth = videoFormat->coded_width;
decoderParams.ulTargetHeight =
videoFormat->display_area.bottom - videoFormat->display_area.top;
decoder_info.ulTargetWidth =
decoderParams.ulTargetWidth =
videoFormat->display_area.right - videoFormat->display_area.left;
decoder_info.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
decoder_info.ulNumOutputSurfaces = 2;
decoder_info.display_area.left = videoFormat->display_area.left;
decoder_info.display_area.right = videoFormat->display_area.right;
decoder_info.display_area.top = videoFormat->display_area.top;
decoder_info.display_area.bottom = videoFormat->display_area.bottom;
decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
decoderParams.ulNumOutputSurfaces = 2;
decoderParams.display_area.left = videoFormat->display_area.left;
decoderParams.display_area.right = videoFormat->display_area.right;
decoderParams.display_area.top = videoFormat->display_area.top;
decoderParams.display_area.bottom = videoFormat->display_area.bottom;

CUvideodecoder* decoder = new CUvideodecoder();
result = cuvidCreateDecoder(decoder, &decoder_info);
result = cuvidCreateDecoder(decoder, &decoderParams);
TORCH_CHECK(
result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
}

// Translates an FFmpeg codec id into the corresponding NVDEC codec enum value.
// Only H264 and HEVC are mapped; any other codec id fails the TORCH_CHECK
// below with a message that includes the FFmpeg codec name.
cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
  if (codecId == AV_CODEC_ID_H264) {
    return cudaVideoCodec_H264;
  }
  if (codecId == AV_CODEC_ID_HEVC) {
    return cudaVideoCodec_HEVC;
  }

  // TODONVDEC P0: support more codecs. Candidate mappings:
  //   AV_CODEC_ID_AV1   -> cudaVideoCodec_AV1
  //   AV_CODEC_ID_MPEG4 -> cudaVideoCodec_MPEG4
  //   AV_CODEC_ID_VP8   -> cudaVideoCodec_VP8
  //   AV_CODEC_ID_VP9   -> cudaVideoCodec_VP9
  //   AV_CODEC_ID_MJPEG -> cudaVideoCodec_JPEG
  TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
}

} // namespace

BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
Expand All @@ -160,36 +180,100 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
}
}

void BetaCudaDeviceInterface::initialize(const AVStream* avStream) {
void BetaCudaDeviceInterface::initialize(
const AVStream* avStream,
const UniqueDecodingAVFormatContext& avFormatCtx) {
torch::Tensor dummyTensorForCudaInitialization = torch::empty(
{1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));

TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
timeBase_ = avStream->time_base;

auto cudaDevice = torch::Device(torch::kCUDA);
defaultCudaInterface_ =
std::unique_ptr<DeviceInterface>(createDeviceInterface(cudaDevice));
AVCodecContext dummyCodecContext = {};
defaultCudaInterface_->initialize(avStream);
defaultCudaInterface_->initialize(avStream, avFormatCtx);
defaultCudaInterface_->registerHardwareDeviceWithCodec(&dummyCodecContext);

const AVCodecParameters* codecpar = avStream->codecpar;
TORCH_CHECK(codecpar != nullptr, "CodecParameters cannot be null");
TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
timeBase_ = avStream->time_base;

const AVCodecParameters* codecPar = avStream->codecpar;
TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");

initializeBSF(codecPar, avFormatCtx);

// Create parser. Default values that aren't obvious are taken from DALI.
CUVIDPARSERPARAMS parserParams = {};
parserParams.CodecType = validateCodecSupport(codecPar->codec_id);
parserParams.ulMaxNumDecodeSurfaces = 8;
parserParams.ulMaxDisplayDelay = 0;
// Callback setup, all are triggered by the parser within a call
// to cuvidParseVideoData
parserParams.pUserData = this;
parserParams.pfnSequenceCallback = pfnSequenceCallback;
parserParams.pfnDecodePicture = pfnDecodePictureCallback;
parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;

CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
TORCH_CHECK(
// TODONVDEC P0 support more
avStream->codecpar->codec_id == AV_CODEC_ID_H264,
"Can only do H264 for now");
result == CUDA_SUCCESS, "Failed to create video parser: ", result);
}

void BetaCudaDeviceInterface::initializeBSF(
const AVCodecParameters* codecPar,
const UniqueDecodingAVFormatContext& avFormatCtx) {
// Setup bit stream filters (BSF):
// https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
// This is only needed for some formats, like H264 or HEVC. TODONVDEC P1: For
// now we apply BSF unconditionally, but it should be optional and dependent
// on codec and container.
const AVBitStreamFilter* avBSF = av_bsf_get_by_name("h264_mp4toannexb");
// This is only needed for some formats, like H264 or HEVC.

TORCH_CHECK(codecPar != nullptr, "codecPar cannot be null");
TORCH_CHECK(avFormatCtx != nullptr, "AVFormatContext cannot be null");
TORCH_CHECK(
avFormatCtx->iformat != nullptr,
"AVFormatContext->iformat cannot be null");
std::string filterName;

// Matching logic is taken from DALI
switch (codecPar->codec_id) {
case AV_CODEC_ID_H264: {
const std::string formatName = avFormatCtx->iformat->long_name
? avFormatCtx->iformat->long_name
: "";

if (formatName == "QuickTime / MOV" ||
formatName == "FLV (Flash Video)" ||
formatName == "Matroska / WebM" || formatName == "raw H.264 video") {
filterName = "h264_mp4toannexb";
}
break;
}

case AV_CODEC_ID_HEVC: {
const std::string formatName = avFormatCtx->iformat->long_name
? avFormatCtx->iformat->long_name
: "";

if (formatName == "QuickTime / MOV" ||
formatName == "FLV (Flash Video)" ||
formatName == "Matroska / WebM" || formatName == "raw HEVC video") {
filterName = "hevc_mp4toannexb";
}
break;
}

default:
// No bitstream filter needed for other codecs
// TODONVDEC P1 MPEG4 will need one!
break;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I would prefer putting this switching logic into a function - I know we have a style difference there. :) My rationale is that I find it useful to think in terms of pure functions when I can, and this can definitely be a pure function, and then we can simply say in this scope:

std::string filterName = toFilterName(codecPar->codec_id);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC we'd need toFilterName to return "" when no match exists, and it would be up to the caller to check against that?


if (filterName.empty()) {
// Only initialize BSF if we actually need one
return;
}

const AVBitStreamFilter* avBSF = av_bsf_get_by_name(filterName.c_str());
TORCH_CHECK(
avBSF != nullptr, "Failed to find h264_mp4toannexb bitstream filter");
avBSF != nullptr, "Failed to find bitstream filter: ", filterName);

AVBSFContext* avBSFContext = nullptr;
int retVal = av_bsf_alloc(avBSF, &avBSFContext);
Expand All @@ -200,7 +284,7 @@ void BetaCudaDeviceInterface::initialize(const AVStream* avStream) {

bitstreamFilter_.reset(avBSFContext);

retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecpar);
retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecPar);
TORCH_CHECK(
retVal >= AVSUCCESS,
"Failed to copy codec parameters: ",
Expand All @@ -211,22 +295,6 @@ void BetaCudaDeviceInterface::initialize(const AVStream* avStream) {
retVal == AVSUCCESS,
"Failed to initialize bitstream filter: ",
getFFMPEGErrorStringFromErrorCode(retVal));

// Create parser. Default values that aren't obvious are taken from DALI.
CUVIDPARSERPARAMS parserParams = {};
parserParams.CodecType = cudaVideoCodec_H264;
parserParams.ulMaxNumDecodeSurfaces = 8;
parserParams.ulMaxDisplayDelay = 0;
// Callback setup, all are triggered by the parser within a call
// to cuvidParseVideoData
parserParams.pUserData = this;
parserParams.pfnSequenceCallback = pfnSequenceCallback;
parserParams.pfnDecodePicture = pfnDecodePictureCallback;
parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;

CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
TORCH_CHECK(
result == CUDA_SUCCESS, "Failed to create video parser: ", result);
}

// This callback is called by the parser within cuvidParseVideoData when there
Expand Down Expand Up @@ -360,6 +428,10 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
readyFrames_.pop();

// TODONVDEC P1 we need to set the procParams.output_stream field to the
// current CUDA stream and ensure proper synchronization. There's a related
// NVDECTODO in CudaDeviceInterface.cpp where we do the necessary
// synchronization for NPP.
CUVIDPROCPARAMS procParams = {};
procParams.progressive_frame = dispInfo.progressive_frame;
procParams.top_field_first = dispInfo.top_field_first;
Expand Down
7 changes: 6 additions & 1 deletion src/torchcodec/_core/BetaCudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
explicit BetaCudaDeviceInterface(const torch::Device& device);
virtual ~BetaCudaDeviceInterface();

void initialize(const AVStream* avStream) override;
void initialize(
const AVStream* avStream,
const UniqueDecodingAVFormatContext& avFormatCtx) override;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just merged #902, so there are going to be merge conflicts. They should be easy to resolve: we just rename initializeInterface() to initialize(), and keep the new param.


void convertAVFrameToFrameOutput(
UniqueAVFrame& avFrame,
Expand All @@ -61,6 +63,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
private:
// Apply bitstream filter, modifies packet in-place
void applyBSF(ReferenceAVPacket& packet);
void initializeBSF(
const AVCodecParameters* codecPar,
const UniqueDecodingAVFormatContext& avFormatCtx);

UniqueAVFrame convertCudaFrameToAVFrame(
CUdeviceptr framePtr,
Expand Down
4 changes: 3 additions & 1 deletion src/torchcodec/_core/CpuDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
device_.type() == torch::kCPU, "Unsupported device: ", device_.str());
}

void CpuDeviceInterface::initialize(const AVStream* avStream) {
void CpuDeviceInterface::initialize(
const AVStream* avStream,
[[maybe_unused]] const UniqueDecodingAVFormatContext& avFormatCtx) {
TORCH_CHECK(avStream != nullptr, "avStream is null");
timeBase_ = avStream->time_base;
}
Expand Down
4 changes: 3 additions & 1 deletion src/torchcodec/_core/CpuDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ class CpuDeviceInterface : public DeviceInterface {
return std::nullopt;
}

virtual void initialize(const AVStream* avStream) override;
virtual void initialize(
const AVStream* avStream,
const UniqueDecodingAVFormatContext& avFormatCtx) override;

virtual void initializeVideo(
const VideoStreamOptions& videoStreamOptions,
Expand Down
6 changes: 4 additions & 2 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,14 +203,16 @@ CudaDeviceInterface::~CudaDeviceInterface() {
}
}

void CudaDeviceInterface::initialize(const AVStream* avStream) {
void CudaDeviceInterface::initialize(
const AVStream* avStream,
const UniqueDecodingAVFormatContext& avFormatCtx) {
TORCH_CHECK(avStream != nullptr, "avStream is null");
timeBase_ = avStream->time_base;

cpuInterface_ = createDeviceInterface(torch::kCPU);
TORCH_CHECK(
cpuInterface_ != nullptr, "Failed to create CPU device interface");
cpuInterface_->initialize(avStream);
cpuInterface_->initialize(avStream, avFormatCtx);
cpuInterface_->initializeVideo(
VideoStreamOptions(),
{},
Expand Down
4 changes: 3 additions & 1 deletion src/torchcodec/_core/CudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ class CudaDeviceInterface : public DeviceInterface {

std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) override;

void initialize(const AVStream* avStream) override;
void initialize(
const AVStream* avStream,
const UniqueDecodingAVFormatContext& avFormatCtx) override;

void initializeVideo(
const VideoStreamOptions& videoStreamOptions,
Expand Down
4 changes: 3 additions & 1 deletion src/torchcodec/_core/DeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ class DeviceInterface {
};

// Initialize the device with parameters generic to all kinds of decoding.
virtual void initialize(const AVStream* avStream) = 0;
virtual void initialize(
const AVStream* avStream,
const UniqueDecodingAVFormatContext& avFormatCtx) = 0;

// Initialize the device with parameters specific to video decoding. There is
// a default empty implementation.
Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/_core/SingleStreamDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ void SingleStreamDecoder::addStream(
TORCH_CHECK(
deviceInterface_ != nullptr,
"Failed to create device interface. This should never happen, please report.");
deviceInterface_->initialize(streamInfo.stream);
deviceInterface_->initialize(streamInfo.stream, formatContext_);

// TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within
// addStream() which is supposed to be generic
Expand Down
Binary file added test/resources/testsrc2_h265.mp4
Binary file not shown.
Loading
Loading