Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/decoders/BenchmarkDecodersMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ void runNDecodeIterations(
decoder->addVideoStreamDecoder(-1);
for (double pts : ptsList) {
decoder->setCursorPtsInSeconds(pts);
torch::Tensor tensor = decoder->getNextDecodedOutput().frame;
torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame;
}
if (i + 1 == warmupIterations) {
start = std::chrono::high_resolution_clock::now();
Expand Down Expand Up @@ -95,7 +95,7 @@ void runNdecodeIterationsGrabbingConsecutiveFrames(
VideoDecoder::createFromFilePath(videoPath);
decoder->addVideoStreamDecoder(-1);
for (int j = 0; j < consecutiveFrameCount; ++j) {
torch::Tensor tensor = decoder->getNextDecodedOutput().frame;
torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame;
}
if (i + 1 == warmupIterations) {
start = std::chrono::high_resolution_clock::now();
Expand Down
8 changes: 4 additions & 4 deletions src/torchcodec/decoders/_core/VideoDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getDecodedOutputWithFilter(
if (activeStreamIndices_.size() == 0) {
throw std::runtime_error("No active streams configured.");
}
VLOG(9) << "Starting getNextDecodedOutput()";
VLOG(9) << "Starting getNextDecodedOutputNoDemux()";
resetDecodeStats();
if (maybeDesiredPts_.has_value()) {
VLOG(9) << "maybeDesiredPts_=" << *maybeDesiredPts_;
Expand Down Expand Up @@ -922,7 +922,7 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
return output;
}

VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestamp(
VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestampNoDemux(
double seconds) {
for (auto& [streamIndex, stream] : streams_) {
double frameStartTime = ptsToSeconds(stream.currentPts, stream.timeBase);
Expand Down Expand Up @@ -981,7 +981,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex(
}
int64_t pts = stream.allFrames[frameIndex].pts;
setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase));
return getNextDecodedOutput();
return getNextDecodedOutputNoDemux();
}

VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndexes(
Expand Down Expand Up @@ -1134,7 +1134,7 @@ VideoDecoder::getFramesDisplayedByTimestampInRange(
return output;
}

VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutput() {
VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux() {
return getDecodedOutputWithFilter(
[this](int frameStreamIndex, AVFrame* frame) {
StreamInfo& activeStream = streams_[frameStreamIndex];
Expand Down
19 changes: 10 additions & 9 deletions src/torchcodec/decoders/_core/VideoDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ class VideoDecoder {

// ---- SINGLE FRAME SEEK AND DECODING API ----
// Places the cursor at the first frame on or after the position in seconds.
// Calling getNextFrameAsTensor() will return the first frame at or after this
// position.
// Calling getNextDecodedOutputNoDemux() will return the first frame at or
// after this position.
void setCursorPtsInSeconds(double seconds);
struct DecodedOutput {
// The actual decoded output as a Tensor.
Expand All @@ -180,13 +180,14 @@ class VideoDecoder {
};
// Decodes the frame at the current cursor position and advances the cursor to
// the next frame.
DecodedOutput getNextDecodedOutput();
// Decodes the frame that is visible at a given timestamp. Frames in the video
// have a presentation timestamp and a duration. For example, if a frame has
// presentation timestamp of 5.0s and a duration of 1.0s, it will be visible
// in the timestamp range [5.0, 6.0). i.e. it will be returned when this
// function is called with seconds=5.0 or seconds=5.999, etc.
DecodedOutput getFrameDisplayedAtTimestamp(double seconds);
DecodedOutput getNextDecodedOutputNoDemux();
// Decodes the first frame in any added stream that is visible at a given
// timestamp. Frames in the video have a presentation timestamp and a
// duration. For example, if a frame has a presentation timestamp of 5.0s and
// a duration of 1.0s, it will be visible in the timestamp range [5.0, 6.0),
// i.e. it will be returned when this function is called with seconds=5.0 or
// seconds=5.999, etc.
DecodedOutput getFrameDisplayedAtTimestampNoDemux(double seconds);
DecodedOutput getFrameAtIndex(int streamIndex, int64_t frameIndex);
struct BatchDecodedOutput {
torch::Tensor frames;
Expand Down
4 changes: 2 additions & 2 deletions src/torchcodec/decoders/_core/VideoDecoderOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) {
auto videoDecoder = unwrapTensorToGetDecoder(decoder);
VideoDecoder::DecodedOutput result;
try {
result = videoDecoder->getNextDecodedOutput();
result = videoDecoder->getNextDecodedOutputNoDemux();
} catch (const VideoDecoder::EndOfFileException& e) {
throw pybind11::stop_iteration(e.what());
}
Expand All @@ -159,7 +159,7 @@ OpsDecodedOutput get_next_frame(at::Tensor& decoder) {

OpsDecodedOutput get_frame_at_pts(at::Tensor& decoder, double seconds) {
auto videoDecoder = unwrapTensorToGetDecoder(decoder);
auto result = videoDecoder->getFrameDisplayedAtTimestamp(seconds);
auto result = videoDecoder->getFrameDisplayedAtTimestampNoDemux(seconds);
return makeOpsDecodedOutput(result);
}

Expand Down
37 changes: 19 additions & 18 deletions test/decoders/VideoDecoderTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ TEST(VideoDecoderTest, RespectsWidthAndHeightFromOptions) {
streamOptions.width = 100;
streamOptions.height = 120;
decoder->addVideoStreamDecoder(-1, streamOptions);
torch::Tensor tensor = decoder->getNextDecodedOutput().frame;
torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame;
EXPECT_EQ(tensor.sizes(), std::vector<long>({3, 120, 100}));
}

Expand All @@ -163,7 +163,7 @@ TEST(VideoDecoderTest, RespectsOutputTensorDimensionOrderFromOptions) {
VideoDecoder::VideoStreamDecoderOptions streamOptions;
streamOptions.dimensionOrder = "NHWC";
decoder->addVideoStreamDecoder(-1, streamOptions);
torch::Tensor tensor = decoder->getNextDecodedOutput().frame;
torch::Tensor tensor = decoder->getNextDecodedOutputNoDemux().frame;
EXPECT_EQ(tensor.sizes(), std::vector<long>({270, 480, 3}));
}

Expand All @@ -172,12 +172,12 @@ TEST_P(VideoDecoderTest, ReturnsFirstTwoFramesOfVideo) {
std::unique_ptr<VideoDecoder> ourDecoder =
createDecoderFromPath(path, GetParam());
ourDecoder->addVideoStreamDecoder(-1);
auto output = ourDecoder->getNextDecodedOutput();
auto output = ourDecoder->getNextDecodedOutputNoDemux();
torch::Tensor tensor0FromOurDecoder = output.frame;
EXPECT_EQ(tensor0FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
EXPECT_EQ(output.ptsSeconds, 0.0);
EXPECT_EQ(output.pts, 0);
output = ourDecoder->getNextDecodedOutput();
output = ourDecoder->getNextDecodedOutputNoDemux();
torch::Tensor tensor1FromOurDecoder = output.frame;
EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000);
Expand Down Expand Up @@ -219,12 +219,12 @@ TEST(GPUVideoDecoderTest, ReturnsFirstTwoFramesOfVideo) {
ASSERT_TRUE(streamOptions.device.is_cuda());
ASSERT_EQ(streamOptions.device.type(), torch::DeviceType::CUDA);
ourDecoder->addVideoStreamDecoder(-1, streamOptions);
auto output = ourDecoder->getNextDecodedOutput();
auto output = ourDecoder->getNextDecodedOutputNoDemux();
torch::Tensor tensor1FromOurDecoder = output.frame;
EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
EXPECT_EQ(output.ptsSeconds, 0.0);
EXPECT_EQ(output.pts, 0);
output = ourDecoder->getNextDecodedOutput();
output = ourDecoder->getNextDecodedOutputNoDemux();
torch::Tensor tensor2FromOurDecoder = output.frame;
EXPECT_EQ(tensor2FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000);
Expand Down Expand Up @@ -306,30 +306,31 @@ TEST_P(VideoDecoderTest, SeeksCloseToEof) {
createDecoderFromPath(path, GetParam());
ourDecoder->addVideoStreamDecoder(-1);
ourDecoder->setCursorPtsInSeconds(388388. / 30'000);
auto output = ourDecoder->getNextDecodedOutput();
auto output = ourDecoder->getNextDecodedOutputNoDemux();
EXPECT_EQ(output.ptsSeconds, 388'388. / 30'000);
output = ourDecoder->getNextDecodedOutput();
output = ourDecoder->getNextDecodedOutputNoDemux();
EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000);
EXPECT_THROW(ourDecoder->getNextDecodedOutput(), std::exception);
EXPECT_THROW(ourDecoder->getNextDecodedOutputNoDemux(), std::exception);
}

TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) {
std::string path = getResourcePath("nasa_13013.mp4");
std::unique_ptr<VideoDecoder> ourDecoder =
createDecoderFromPath(path, GetParam());
ourDecoder->addVideoStreamDecoder(-1);
auto output = ourDecoder->getFrameDisplayedAtTimestamp(6.006);
auto output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(6.006);
EXPECT_EQ(output.ptsSeconds, 6.006);
// The frame's duration is 0.033367 according to ffprobe,
// so the next frame is displayed at timestamp=6.039367.
const double kNextFramePts = 6.039366666666667;
// The frame that is displayed a microsecond before the next frame is still
// the previous frame.
output = ourDecoder->getFrameDisplayedAtTimestamp(kNextFramePts - 1e-6);
output =
ourDecoder->getFrameDisplayedAtTimestampNoDemux(kNextFramePts - 1e-6);
EXPECT_EQ(output.ptsSeconds, 6.006);
// The frame that is displayed at the exact pts of the next frame is the next
// frame.
output = ourDecoder->getFrameDisplayedAtTimestamp(kNextFramePts);
output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(kNextFramePts);
EXPECT_EQ(output.ptsSeconds, kNextFramePts);

// This is the timestamp of the last frame in this video.
Expand All @@ -339,7 +340,7 @@ TEST_P(VideoDecoderTest, GetsFrameDisplayedAtTimestamp) {
kPtsOfLastFrameInVideoStream + kDurationOfLastFrameInVideoStream;
// Sanity check: make sure duration is strictly positive.
EXPECT_GT(kPtsPlusDurationOfLastFrame, kPtsOfLastFrameInVideoStream);
output = ourDecoder->getFrameDisplayedAtTimestamp(
output = ourDecoder->getFrameDisplayedAtTimestampNoDemux(
kPtsPlusDurationOfLastFrame - 1e-6);
EXPECT_EQ(output.ptsSeconds, kPtsOfLastFrameInVideoStream);
}
Expand All @@ -350,7 +351,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
createDecoderFromPath(path, GetParam());
ourDecoder->addVideoStreamDecoder(-1);
ourDecoder->setCursorPtsInSeconds(6.0);
auto output = ourDecoder->getNextDecodedOutput();
auto output = ourDecoder->getNextDecodedOutputNoDemux();
torch::Tensor tensor6FromOurDecoder = output.frame;
EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000);
torch::Tensor tensor6FromFFMPEG =
Expand All @@ -366,7 +367,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 180);

ourDecoder->setCursorPtsInSeconds(6.1);
output = ourDecoder->getNextDecodedOutput();
output = ourDecoder->getNextDecodedOutputNoDemux();
torch::Tensor tensor61FromOurDecoder = output.frame;
EXPECT_EQ(output.ptsSeconds, 183'183. / 30'000);
torch::Tensor tensor61FromFFMPEG =
Expand All @@ -386,7 +387,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
EXPECT_LT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 10);

ourDecoder->setCursorPtsInSeconds(10.0);
output = ourDecoder->getNextDecodedOutput();
output = ourDecoder->getNextDecodedOutputNoDemux();
torch::Tensor tensor10FromOurDecoder = output.frame;
EXPECT_EQ(output.ptsSeconds, 300'300. / 30'000);
torch::Tensor tensor10FromFFMPEG =
Expand All @@ -403,7 +404,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {
EXPECT_GT(ourDecoder->getDecodeStats().numPacketsSentToDecoder, 60);

ourDecoder->setCursorPtsInSeconds(6.0);
output = ourDecoder->getNextDecodedOutput();
output = ourDecoder->getNextDecodedOutputNoDemux();
tensor6FromOurDecoder = output.frame;
EXPECT_EQ(output.ptsSeconds, 180'180. / 30'000);
EXPECT_TRUE(torch::equal(tensor6FromOurDecoder, tensor6FromFFMPEG));
Expand All @@ -418,7 +419,7 @@ TEST_P(VideoDecoderTest, SeeksToFrameWithSpecificPts) {

constexpr double kPtsOfLastFrameInVideoStream = 389'389. / 30'000; // ~12.9
ourDecoder->setCursorPtsInSeconds(kPtsOfLastFrameInVideoStream);
output = ourDecoder->getNextDecodedOutput();
output = ourDecoder->getNextDecodedOutputNoDemux();
torch::Tensor tensor7FromOurDecoder = output.frame;
EXPECT_EQ(output.ptsSeconds, 389'389. / 30'000);
torch::Tensor tensor7FromFFMPEG =
Expand Down