Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
78ab058
Let's just commit 3k loc in a single commit
NicolasHug Sep 25, 2025
b45decc
Fixes
NicolasHug Sep 26, 2025
316f218
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Sep 30, 2025
d0192ec
GetCache -> getCache
NicolasHug Sep 30, 2025
515deb5
Make UniqueCUvideodecoder a pointer on CUvideodecoder, not void
NicolasHug Sep 30, 2025
13fad10
Make device and device_variant have a default instead of being std::o…
NicolasHug Sep 30, 2025
eb8de72
Remove old registerDeviceInterface
NicolasHug Sep 30, 2025
4f7a4fb
Call std::memset
NicolasHug Sep 30, 2025
dcf3124
remove unnecessary cuda_runtime.h include, update cmake accordingly
NicolasHug Sep 30, 2025
0ad7370
abstract frameBuffer_ into a FrameBuffer class
NicolasHug Sep 30, 2025
aad142e
Cleanup BSF logic
NicolasHug Sep 30, 2025
2592888
Return int in callback instead of unsigned char
NicolasHug Sep 30, 2025
b5fe9bc
define width and height as unsigned int
NicolasHug Sep 30, 2025
5605c90
Rework frame ordering and pts matching
NicolasHug Oct 1, 2025
7494259
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Oct 1, 2025
560b376
Fix cuda context initialization
NicolasHug Oct 1, 2025
88196c5
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug Oct 1, 2025
2a78b84
Renaming
NicolasHug Oct 1, 2025
5d194e5
Comment
NicolasHug Oct 1, 2025
d1e51b3
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Oct 2, 2025
f9c7297
Skip equality check on ffmpeg 4
NicolasHug Oct 2, 2025
b7bbfb2
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug Oct 2, 2025
390fd7c
Refac, simplify
NicolasHug Oct 2, 2025
f55dcc0
Update comment
NicolasHug Oct 2, 2025
7e4dd10
Define constant, add TODO for AVRational
NicolasHug Oct 2, 2025
f614846
Use uint32_t types
NicolasHug Oct 2, 2025
aa6e253
Create packet.reset() and add P0 TODO
NicolasHug Oct 2, 2025
186eaa4
Add TODO
NicolasHug Oct 2, 2025
1cb4890
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug Oct 2, 2025
c5b32a4
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-rewor…
NicolasHug Oct 2, 2025
70873bf
lint
NicolasHug Oct 2, 2025
8e73bcf
Add TODOs and more explicit initialization
NicolasHug Oct 3, 2025
12c75e7
Add h265 support
NicolasHug Oct 2, 2025
7ea3ca9
Add h265 support
NicolasHug Oct 2, 2025
8ad66ce
Add AV1 support
NicolasHug Oct 3, 2025
9b63504
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-param…
NicolasHug Oct 3, 2025
718a3e3
Merge branch 'nvdec-params-and-todos' into nvdec-h265
NicolasHug Oct 3, 2025
c42388f
put initializeBSF below
NicolasHug Oct 3, 2025
29e72f9
Merge branch 'nvdec-h265' into nvdec-av1
NicolasHug Oct 3, 2025
ea86677
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-av1
NicolasHug Oct 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,9 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
return cudaVideoCodec_H264;
case AV_CODEC_ID_HEVC:
return cudaVideoCodec_HEVC;
case AV_CODEC_ID_AV1:
return cudaVideoCodec_AV1;
// TODONVDEC P0: support more codecs
// case AV_CODEC_ID_AV1: return cudaVideoCodec_AV1;
// case AV_CODEC_ID_MPEG4: return cudaVideoCodec_MPEG4;
// case AV_CODEC_ID_VP8: return cudaVideoCodec_VP8;
// case AV_CODEC_ID_VP9: return cudaVideoCodec_VP9;
Expand Down Expand Up @@ -195,6 +196,7 @@ void BetaCudaDeviceInterface::initialize(

TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
timeBase_ = avStream->time_base;
frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;

const AVCodecParameters* codecPar = avStream->codecpar;
TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");
Expand Down Expand Up @@ -494,14 +496,19 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
avFrame->format = AV_PIX_FMT_CUDA;
avFrame->pts = dispInfo.timestamp;

// TODONVDEC P0: Zero division error!!!
// TODONVDEC P0: Move AVRational arithmetic to FFMPEGCommon, and put the
// similar SingleStreamDecoder stuff there too.
unsigned int frameRateNum = videoFormat_.frame_rate.numerator;
unsigned int frameRateDen = videoFormat_.frame_rate.denominator;
int64_t duration = static_cast<int64_t>((frameRateDen * timeBase_.den)) /
(frameRateNum * timeBase_.num);
setDuration(avFrame, duration);
// TODONVDEC P2: We compute the duration based on average frame rate info:
// either from NVCUVID if it's valid, otherwise from FFmpeg as fallback. But
// both of these are based on average frame rate, so if the video has
// variable frame rate, the durations may be off. We should try to see if we
// can set the duration more accurately. Unfortunately it's not given by
// dispInfo. One option would be to set it based on the pts difference between
// consecutive frames, if the next frame is already available.
int frameRateNum = static_cast<int>(videoFormat_.frame_rate.numerator);
int frameRateDen = static_cast<int>(videoFormat_.frame_rate.denominator);
AVRational frameRate = (frameRateNum > 0 && frameRateDen > 0)
? AVRational{frameRateNum, frameRateDen}
: frameRateAvgFromFFmpeg_;
setDuration(avFrame, computeSafeDuration(frameRate, timeBase_));

// We need to assign the frame colorspace. This is crucial for proper color
// conversion. NVCUVID stores that in the matrix_coefficients field, but
Expand Down
3 changes: 2 additions & 1 deletion src/torchcodec/_core/BetaCudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
// isFlushing_)
bool isFlushing_ = false;

AVRational timeBase_ = {0, 0};
AVRational timeBase_ = {0, 1};
AVRational frameRateAvgFromFFmpeg_ = {0, 1};

UniqueAVBSFContext bitstreamFilter_;

Expand Down
22 changes: 22 additions & 0 deletions src/torchcodec/_core/FFMPEGCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,4 +501,26 @@ AVIOContext* avioAllocContext(
seek);
}

// Converts a timestamp expressed in `timeBase` units into seconds.
double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
  // av_q2d is intentionally not used here, so that the pts is multiplied by
  // the numerator first and only then divided by the denominator.
  const auto ptsAsDouble = static_cast<double>(pts);
  return (ptsAsDouble * timeBase.num) / timeBase.den;
}

// Converts a time in seconds into `timeBase` units, rounded to the nearest
// integer pts value.
int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
  const double roundedPts =
      std::round(seconds * timeBase.den / timeBase.num);
  return static_cast<int64_t>(roundedPts);
}

// Computes the duration of a single frame, in `timeBase` units, from a frame
// rate:
//
//   duration = (frameRate.den * timeBase.den) / (timeBase.num * frameRate.num)
//
// Returns 0 when any of the four terms is zero or negative, so the division
// below can never divide by zero. The result uses integer division.
int64_t computeSafeDuration(
    const AVRational& frameRate,
    const AVRational& timeBase) {
  const bool frameRateIsValid = frameRate.num > 0 && frameRate.den > 0;
  const bool timeBaseIsValid = timeBase.num > 0 && timeBase.den > 0;
  if (!(frameRateIsValid && timeBaseIsValid)) {
    return 0;
  }

  // Widen to 64 bits before multiplying so the products are computed as
  // int64_t rather than int.
  const int64_t numerator = static_cast<int64_t>(frameRate.den) * timeBase.den;
  const int64_t denominator =
      static_cast<int64_t>(timeBase.num) * frameRate.num;
  return numerator / denominator;
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved all of these here from SingleStreamDecoder, addressing @scotts's #910 (comment)

} // namespace facebook::torchcodec
6 changes: 6 additions & 0 deletions src/torchcodec/_core/FFMPEGCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,4 +232,10 @@ AVIOContext* avioAllocContext(
AVIOWriteFunction write_packet,
AVIOSeekFunction seek);

double ptsToSeconds(int64_t pts, const AVRational& timeBase);
int64_t secondsToClosestPts(double seconds, const AVRational& timeBase);
int64_t computeSafeDuration(
const AVRational& frameRate,
const AVRational& timeBase);

} // namespace facebook::torchcodec
10 changes: 0 additions & 10 deletions src/torchcodec/_core/SingleStreamDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,6 @@
namespace facebook::torchcodec {
namespace {

// Returns the number of seconds that `pts` represents when expressed in
// `timeBase` units.
double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
  // The multiplication must happen before the division, which is why av_q2d
  // is not used.
  const auto ticks = static_cast<double>(pts);
  return (ticks * timeBase.num) / timeBase.den;
}

// Returns the pts value, in `timeBase` units, that is closest to `seconds`.
int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
  const double nearest = std::round(seconds * timeBase.den / timeBase.num);
  return static_cast<int64_t>(nearest);
}

// Some videos aren't properly encoded and do not specify pts values for
// packets, and thus for frames. Unset values correspond to INT64_MIN. When that
// happens, we fallback to the dts value which hopefully exists and is correct.
Expand Down
84 changes: 73 additions & 11 deletions test/test_decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -1417,13 +1417,23 @@ def test_get_frames_at_tensor_indices(self):

@needs_cuda
@pytest.mark.parametrize(
"asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
"asset",
(
NASA_VIDEO,
TEST_SRC_2_720P,
BT709_FULL_RANGE,
TEST_SRC_2_720P_H265,
AV1_VIDEO,
),
)
@pytest.mark.parametrize("contiguous_indices", (True, False))
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
def test_beta_cuda_interface_get_frame_at(
self, asset, contiguous_indices, seek_mode
):
if asset == AV1_VIDEO and seek_mode == "approximate":
pytest.skip("AV1 asset doesn't work with approximate mode")

ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
beta_decoder = VideoDecoder(
asset.path, device="cuda:0:beta", seek_mode=seek_mode
Expand All @@ -1449,13 +1459,23 @@ def test_beta_cuda_interface_get_frame_at(

@needs_cuda
@pytest.mark.parametrize(
"asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
"asset",
(
NASA_VIDEO,
TEST_SRC_2_720P,
BT709_FULL_RANGE,
TEST_SRC_2_720P_H265,
AV1_VIDEO,
),
)
@pytest.mark.parametrize("contiguous_indices", (True, False))
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
def test_beta_cuda_interface_get_frames_at(
self, asset, contiguous_indices, seek_mode
):
if asset == AV1_VIDEO and seek_mode == "approximate":
pytest.skip("AV1 asset doesn't work with approximate mode")

ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
beta_decoder = VideoDecoder(
asset.path, device="cuda:0:beta", seek_mode=seek_mode
Expand All @@ -1482,10 +1502,20 @@ def test_beta_cuda_interface_get_frames_at(

@needs_cuda
@pytest.mark.parametrize(
"asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
"asset",
(
NASA_VIDEO,
TEST_SRC_2_720P,
BT709_FULL_RANGE,
TEST_SRC_2_720P_H265,
AV1_VIDEO,
),
)
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
if asset == AV1_VIDEO and seek_mode == "approximate":
pytest.skip("AV1 asset doesn't work with approximate mode")

ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
beta_decoder = VideoDecoder(
asset.path, device="cuda:0:beta", seek_mode=seek_mode
Expand All @@ -1499,17 +1529,30 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
for pts in timestamps:
ref_frame = ref_decoder.get_frame_played_at(pts)
beta_frame = beta_decoder.get_frame_played_at(pts)
torch.testing.assert_close(beta_frame.data, ref_frame.data, rtol=0, atol=0)
if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above
torch.testing.assert_close(
beta_frame.data, ref_frame.data, rtol=0, atol=0
)

assert beta_frame.pts_seconds == ref_frame.pts_seconds
assert beta_frame.duration_seconds == ref_frame.duration_seconds

@needs_cuda
@pytest.mark.parametrize(
"asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
"asset",
(
NASA_VIDEO,
TEST_SRC_2_720P,
BT709_FULL_RANGE,
TEST_SRC_2_720P_H265,
AV1_VIDEO,
),
)
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
if asset == AV1_VIDEO and seek_mode == "approximate":
pytest.skip("AV1 asset doesn't work with approximate mode")

ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
beta_decoder = VideoDecoder(
asset.path, device="cuda:0:beta", seek_mode=seek_mode
Expand All @@ -1523,18 +1566,30 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):

ref_frames = ref_decoder.get_frames_played_at(timestamps)
beta_frames = beta_decoder.get_frames_played_at(timestamps)
torch.testing.assert_close(beta_frames.data, ref_frames.data, rtol=0, atol=0)
if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above
torch.testing.assert_close(
beta_frames.data, ref_frames.data, rtol=0, atol=0
)
torch.testing.assert_close(beta_frames.pts_seconds, ref_frames.pts_seconds)
torch.testing.assert_close(
beta_frames.duration_seconds, ref_frames.duration_seconds
)

@needs_cuda
@pytest.mark.parametrize(
"asset", (NASA_VIDEO, TEST_SRC_2_720P, BT709_FULL_RANGE, TEST_SRC_2_720P_H265)
"asset",
(
NASA_VIDEO,
TEST_SRC_2_720P,
BT709_FULL_RANGE,
TEST_SRC_2_720P_H265,
AV1_VIDEO,
),
)
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
def test_beta_cuda_interface_backwards(self, asset, seek_mode):
if asset == AV1_VIDEO and seek_mode == "approximate":
pytest.skip("AV1 asset doesn't work with approximate mode")

ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
beta_decoder = VideoDecoder(
Expand All @@ -1543,11 +1598,20 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):

assert ref_decoder.metadata == beta_decoder.metadata

for frame_index in [0, 100, 10, 50, 20, 200, 150, 389]:
for frame_index in [0, 1, 2, 1, 0, 100, 10, 50, 20, 200, 150, 150, 150, 389, 2]:
# This is ugly, but OK: the indices values above are relevant for
# the NASA_VIDEO. We need to avoid going out of bounds for other
# videos so we cap the frame_index. This test still serves its
# purpose: no matter what the range of the video, we're still doing
# backwards seeks.
frame_index = min(frame_index, len(ref_decoder) - 1)

ref_frame = ref_decoder.get_frame_at(frame_index)
beta_frame = beta_decoder.get_frame_at(frame_index)
torch.testing.assert_close(beta_frame.data, ref_frame.data, rtol=0, atol=0)
if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above
torch.testing.assert_close(
beta_frame.data, ref_frame.data, rtol=0, atol=0
)

assert beta_frame.pts_seconds == ref_frame.pts_seconds
assert beta_frame.duration_seconds == ref_frame.duration_seconds
Expand All @@ -1568,8 +1632,6 @@ def test_beta_cuda_interface_small_h265(self):

@needs_cuda
def test_beta_cuda_interface_error(self):
with pytest.raises(RuntimeError, match="Unsupported codec type: av1"):
VideoDecoder(AV1_VIDEO.path, device="cuda:0:beta")
with pytest.raises(RuntimeError, match="Unsupported device"):
VideoDecoder(NASA_VIDEO.path, device="cuda:0:bad_variant")

Expand Down
Loading