Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 63 additions & 51 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
" vs supported:",
caps.nMaxMBCount);

// Decoder creation parameters, taken from DALI
// Decoder creation parameters, most are taken from DALI
CUVIDDECODECREATEINFO decoderParams = {};
decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
decoderParams.ChromaFormat = videoFormat->chroma_format;
Expand All @@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
decoderParams.ulTargetWidth =
videoFormat->display_area.right - videoFormat->display_area.left;
decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
decoderParams.ulNumOutputSurfaces = 2;
// We should only ever need 1 output surface, since we process frames
// sequentially, and we always unmap the previous frame before mapping a new
// one.
// TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
// benchmark to see if this makes any difference.
decoderParams.ulNumOutputSurfaces = 1;
decoderParams.display_area.left = videoFormat->display_area.left;
decoderParams.display_area.right = videoFormat->display_area.right;
decoderParams.display_area.top = videoFormat->display_area.top;
Expand Down Expand Up @@ -166,10 +171,14 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
}

BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
// TODONVDEC P0: we probably need to free the frames that have been decoded by
// NVDEC but not yet "mapped" - i.e. those that are still in readyFrames_?

if (decoder_) {
// DALI doesn't seem to do any particular cleanup of the decoder before
// sending it to the cache, so we probably don't need to do anything either.
// Just to be safe, we flush.
// What happens to those decode surfaces that haven't yet been mapped is
// unclear.
flush();
unmapPreviousFrame();
NVDECCache::getCache(device_.index())
.returnDecoder(&videoFormat_, std::move(decoder_));
}
Expand Down Expand Up @@ -320,7 +329,7 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);

if (!decoder_) {
// TODONVDEC P0: consider re-configuring an existing decoder instead of
// TODONVDEC P2: consider re-configuring an existing decoder instead of
// re-creating one. See docs, see DALI.
decoder_ = createDecoder(videoFormat);
}
Expand All @@ -341,13 +350,20 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
packet.get() && packet->data && packet->size > 0,
"sendPacket received an empty packet, this is unexpected, please report.");

applyBSF(packet);
// Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or
// the original one if no BSF is needed. This new filtered packet must be
// allocated outside of applyBSF: if it were allocated inside applyBSF, it
// would be destroyed at the end of the function, leaving us with a dangling
// reference.
AutoAVPacket filteredAutoPacket;
ReferenceAVPacket filteredPacket(filteredAutoPacket);
ReferenceAVPacket& packetToSend = applyBSF(packet, filteredPacket);

CUVIDSOURCEDATAPACKET cuvidPacket = {};
cuvidPacket.payload = packet->data;
cuvidPacket.payload_size = packet->size;
cuvidPacket.payload = packetToSend->data;
cuvidPacket.payload_size = packetToSend->size;
cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
cuvidPacket.timestamp = packet->pts;
cuvidPacket.timestamp = packetToSend->pts;

return sendCuvidPacket(cuvidPacket);
}
Expand All @@ -366,9 +382,11 @@ int BetaCudaDeviceInterface::sendCuvidPacket(
return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
}

void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF(
ReferenceAVPacket& packet,
ReferenceAVPacket& filteredPacket) {
if (!bitstreamFilter_) {
return;
return packet;
}

int retVal = av_bsf_send_packet(bitstreamFilter_.get(), packet.get());
Expand All @@ -377,41 +395,26 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
"Failed to send packet to bitstream filter: ",
getFFMPEGErrorStringFromErrorCode(retVal));

// Create a temporary packet to receive the filtered data
// TODO P1: the docs mention there can theoretically be multiple output
// packets for a single input, i.e. we may need to call av_bsf_receive_packet
// more than once. We should figure out whether that applies to the BSF we're
// using.
AutoAVPacket filteredAutoPacket;
ReferenceAVPacket filteredPacket(filteredAutoPacket);
retVal = av_bsf_receive_packet(bitstreamFilter_.get(), filteredPacket.get());
TORCH_CHECK(
retVal >= AVSUCCESS,
"Failed to receive packet from bitstream filter: ",
getFFMPEGErrorStringFromErrorCode(retVal));

// Free the original packet's data which isn't needed anymore, and move the
// fields of the filtered packet into the original packet. The filtered packet
// fields are re-set by av_packet_move_ref, so when it goes out of scope and
// gets destructed, it's not going to affect the original packet.
packet.reset(filteredPacket);
// TODONVDEC P0: consider cleaner ways to do this. Maybe we should let
// applyBSF return a new packet, and maybe that new packet needs to be a field
// on the interface to avoid complex lifetime issues.
return filteredPacket;
}

// Parser triggers this callback within cuvidParseVideoData when a frame is
// ready to be decoded, i.e. the parser received all the necessary packets for a
// given frame. It means we can send that frame to be decoded by the hardware
// NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
if (isFlushing_) {
return 0;
}

TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");

// Send frame to be decoded by NVDEC - non-blocking call.
CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);

Expand All @@ -432,6 +435,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
// packets, or to stop if EOF was already sent.
return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
}

CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
readyFrames_.pop();

Expand All @@ -450,34 +454,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
// to "map" it to an "output surface" before we can use its data. This is a
// blocking calls that waits until the frame is fully decoded and ready to be
// used.
// When a frame is mapped to an output surface, it needs to be unmapped
// eventually, so that the decoder can re-use the output surface. Failing to
// unmap will cause map to eventually fail. DALI unmaps frames almost
// immediately after mapping them: they do the color-conversion in-between,
// which involves a copy of the data, so that works.
// We, OTOH, will do the color-conversion later, outside of ReceiveFrame(). So
// we unmap here: just before mapping a new frame. At that point we know that
// the previously-mapped frame is no longer needed: it was either
// color-converted (with a copy), or that's a frame that was discarded in
// SingleStreamDecoder. Either way, the underlying output surface can be
// safely re-used.
unmapPreviousFrame();
CUresult result = cuvidMapVideoFrame(
*decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);

if (result != CUDA_SUCCESS) {
return AVERROR_EXTERNAL;
}
previouslyMappedFrame_ = framePtr;

avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);

// Unmap the frame so that the decoder can reuse its corresponding output
// surface. Whether this is blocking is unclear?
cuvidUnmapVideoFrame(*decoder_.get(), framePtr);
// TODONVDEC P0: Get clarity on this:
// We assume that the framePtr is still valid after unmapping. That framePtr
// is now part of the avFrame, which we'll return to the caller, and the
// caller will immediately use it for color-conversion, at which point a copy
// happens. After the copy, it doesn't matter whether framePtr is still valid.
// And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
// the copy is made, so there should be no risk of overwriting the data before
// the copy.
// Buuuut yeah, we need get more clarity on what actually happens, and on
// what's needed. IIUC DALI makes the color-conversion copy immediately after
// cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
// in between. So maybe we should do the same.

return AVSUCCESS;
}

void BetaCudaDeviceInterface::unmapPreviousFrame() {
if (previouslyMappedFrame_ == 0) {
return;
}
CUresult result =
cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
TORCH_CHECK(
result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
previouslyMappedFrame_ = 0;
}

UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
CUdeviceptr framePtr,
unsigned int pitch,
Expand Down Expand Up @@ -554,16 +565,17 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
}

void BetaCudaDeviceInterface::flush() {
isFlushing_ = true;

// The NVCUVID docs mention that after seeking, i.e. when flush() is called,
// we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
// don't say whether this should be an empty packet, or whether it should be a
// flag on the next non-empty packet. It doesn't matter: neither work :)
// Sending an EOF packet, however, does work. So we do that. And we re-set the
// eofSent_ flag to false because that's not a true EOF notification.
sendEOFPacket();

isFlushing_ = false;
eofSent_ = false;

std::queue<CUVIDPARSERDISPINFO> emptyQueue;
std::swap(readyFrames_, emptyQueue);

eofSent_ = false;
}

void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
Expand Down
15 changes: 9 additions & 6 deletions src/torchcodec/_core/BetaCudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,18 @@ class BetaCudaDeviceInterface : public DeviceInterface {

private:
int sendCuvidPacket(CUVIDSOURCEDATAPACKET& cuvidPacket);
// Apply bitstream filter, modifies packet in-place
void applyBSF(ReferenceAVPacket& packet);

void initializeBSF(
const AVCodecParameters* codecPar,
const UniqueDecodingAVFormatContext& avFormatCtx);
// Apply bitstream filter, returns filtered packet or original if no filter
// needed.
ReferenceAVPacket& applyBSF(
ReferenceAVPacket& packet,
ReferenceAVPacket& filteredPacket);

CUdeviceptr previouslyMappedFrame_ = 0;
void unmapPreviousFrame();

UniqueAVFrame convertCudaFrameToAVFrame(
CUdeviceptr framePtr,
Expand All @@ -82,10 +89,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {

bool eofSent_ = false;

// Flush flag to prevent decode operations during flush (like DALI's
// isFlushing_)
bool isFlushing_ = false;

AVRational timeBase_ = {0, 1};
AVRational frameRateAvgFromFFmpeg_ = {0, 1};

Expand Down
7 changes: 0 additions & 7 deletions src/torchcodec/_core/FFMPEGCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,6 @@ AVPacket* ReferenceAVPacket::operator->() {
return avPacket_;
}

void ReferenceAVPacket::reset(ReferenceAVPacket& other) {
if (this != &other) {
av_packet_unref(avPacket_);
av_packet_move_ref(avPacket_, other.avPacket_);
}
}

AVCodecOnlyUseForCallingAVFindBestStream
makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
Expand Down
1 change: 0 additions & 1 deletion src/torchcodec/_core/FFMPEGCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ class ReferenceAVPacket {
~ReferenceAVPacket();
AVPacket* get();
AVPacket* operator->();
void reset(ReferenceAVPacket& other);
};

// av_find_best_stream is not const-correct before commit:
Expand Down
Loading