diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index fa7ff7a9c..9d8ba7b3c 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
       " vs supported:",
       caps.nMaxMBCount);
 
-  // Decoder creation parameters, taken from DALI
+  // Decoder creation parameters, most are taken from DALI
   CUVIDDECODECREATEINFO decoderParams = {};
   decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
   decoderParams.ChromaFormat = videoFormat->chroma_format;
@@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   decoderParams.ulTargetWidth =
       videoFormat->display_area.right - videoFormat->display_area.left;
   decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
-  decoderParams.ulNumOutputSurfaces = 2;
+  // We should only ever need 1 output surface, since we process frames
+  // sequentially, and we always unmap the previous frame before mapping a new
+  // one.
+  // TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
+  // benchmark to see if this makes any difference.
+  decoderParams.ulNumOutputSurfaces = 1;
   decoderParams.display_area.left = videoFormat->display_area.left;
   decoderParams.display_area.right = videoFormat->display_area.right;
   decoderParams.display_area.top = videoFormat->display_area.top;
@@ -166,10 +171,14 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
 }
 
 BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
-  // TODONVDEC P0: we probably need to free the frames that have been decoded by
-  // NVDEC but not yet "mapped" - i.e. those that are still in readyFrames_?
-  if (decoder_) {
+  // DALI doesn't seem to do any particular cleanup of the decoder before
+  // sending it to the cache, so we probably don't need to do anything either.
+  // Just to be safe, we flush.
+  // What happens to those decode surfaces that haven't yet been mapped is
+  // unclear.
+  flush();
+  unmapPreviousFrame();
   NVDECCache::getCache(device_.index())
       .returnDecoder(&videoFormat_, std::move(decoder_));
 }
@@ -320,7 +329,7 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
   decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);
 
   if (!decoder_) {
-    // TODONVDEC P0: consider re-configuring an existing decoder instead of
+    // TODONVDEC P2: consider re-configuring an existing decoder instead of
     // re-creating one. See docs, see DALI.
     decoder_ = createDecoder(videoFormat);
   }
@@ -341,13 +350,20 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
       packet.get() && packet->data && packet->size > 0,
       "sendPacket received an empty packet, this is unexpected, please report.");
 
-  applyBSF(packet);
+  // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or
+  // the original one if no BSF is needed. This new filtered packet must be
+  // allocated outside of applyBSF: if it were allocated inside applyBSF, it
+  // would be destroyed at the end of the function, leaving us with a dangling
+  // reference.
+  AutoAVPacket filteredAutoPacket;
+  ReferenceAVPacket filteredPacket(filteredAutoPacket);
+  ReferenceAVPacket& packetToSend = applyBSF(packet, filteredPacket);
 
   CUVIDSOURCEDATAPACKET cuvidPacket = {};
-  cuvidPacket.payload = packet->data;
-  cuvidPacket.payload_size = packet->size;
+  cuvidPacket.payload = packetToSend->data;
+  cuvidPacket.payload_size = packetToSend->size;
   cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
-  cuvidPacket.timestamp = packet->pts;
+  cuvidPacket.timestamp = packetToSend->pts;
 
   return sendCuvidPacket(cuvidPacket);
 }
@@ -366,9 +382,11 @@ int BetaCudaDeviceInterface::sendCuvidPacket(
   return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
 }
 
-void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
+ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF(
+    ReferenceAVPacket& packet,
+    ReferenceAVPacket& filteredPacket) {
   if (!bitstreamFilter_) {
-    return;
+    return packet;
   }
 
   int retVal = av_bsf_send_packet(bitstreamFilter_.get(), packet.get());
@@ -377,27 +395,17 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
       "Failed to send packet to bitstream filter: ",
       getFFMPEGErrorStringFromErrorCode(retVal));
 
-  // Create a temporary packet to receive the filtered data
   // TODO P1: the docs mention there can theoretically be multiple output
   // packets for a single input, i.e. we may need to call av_bsf_receive_packet
   // more than once. We should figure out whether that applies to the BSF we're
   // using.
-  AutoAVPacket filteredAutoPacket;
-  ReferenceAVPacket filteredPacket(filteredAutoPacket);
   retVal = av_bsf_receive_packet(bitstreamFilter_.get(), filteredPacket.get());
   TORCH_CHECK(
       retVal >= AVSUCCESS,
       "Failed to receive packet from bitstream filter: ",
       getFFMPEGErrorStringFromErrorCode(retVal));
 
-  // Free the original packet's data which isn't needed anymore, and move the
-  // fields of the filtered packet into the original packet. The filtered packet
-  // fields are re-set by av_packet_move_ref, so when it goes out of scope and
-  // gets destructed, it's not going to affect the original packet.
-  packet.reset(filteredPacket);
-  // TODONVDEC P0: consider cleaner ways to do this. Maybe we should let
-  // applyBSF return a new packet, and maybe that new packet needs to be a field
-  // on the interface to avoid complex lifetime issues.
+  return filteredPacket;
 }
 
 // Parser triggers this callback within cuvidParseVideoData when a frame is
@@ -405,13 +413,8 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
 // given frame. It means we can send that frame to be decoded by the hardware
 // NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
 int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
-  if (isFlushing_) {
-    return 0;
-  }
-
   TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
   TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
 
-  // Send frame to be decoded by NVDEC - non-blocking call.
   CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);
 
@@ -432,6 +435,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
     // packets, or to stop if EOF was already sent.
     return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
   }
+
   CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
   readyFrames_.pop();
 
@@ -450,34 +454,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   // to "map" it to an "output surface" before we can use its data. This is a
   // blocking calls that waits until the frame is fully decoded and ready to be
   // used.
+  // When a frame is mapped to an output surface, it needs to be unmapped
+  // eventually, so that the decoder can re-use the output surface. Failing to
+  // unmap will cause map to eventually fail. DALI unmaps frames almost
+  // immediately after mapping them: they do the color-conversion in-between,
+  // which involves a copy of the data, so that works.
+  // We, OTOH, will do the color-conversion later, outside of receiveFrame(). So
+  // we unmap here: just before mapping a new frame. At that point we know that
+  // the previously-mapped frame is no longer needed: it was either
+  // color-converted (with a copy), or it's a frame that was discarded in
+  // SingleStreamDecoder. Either way, the underlying output surface can be
+  // safely re-used.
+  unmapPreviousFrame();
   CUresult result = cuvidMapVideoFrame(
       *decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
-
   if (result != CUDA_SUCCESS) {
     return AVERROR_EXTERNAL;
   }
+  previouslyMappedFrame_ = framePtr;
 
   avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);
 
-  // Unmap the frame so that the decoder can reuse its corresponding output
-  // surface. Whether this is blocking is unclear?
-  cuvidUnmapVideoFrame(*decoder_.get(), framePtr);
-  // TODONVDEC P0: Get clarity on this:
-  // We assume that the framePtr is still valid after unmapping. That framePtr
-  // is now part of the avFrame, which we'll return to the caller, and the
-  // caller will immediately use it for color-conversion, at which point a copy
-  // happens. After the copy, it doesn't matter whether framePtr is still valid.
-  // And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
-  // the copy is made, so there should be no risk of overwriting the data before
-  // the copy.
-  // Buuuut yeah, we need get more clarity on what actually happens, and on
-  // what's needed. IIUC DALI makes the color-conversion copy immediately after
-  // cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
-  // in between. So maybe we should do the same.
-
   return AVSUCCESS;
 }
 
+void BetaCudaDeviceInterface::unmapPreviousFrame() {
+  if (previouslyMappedFrame_ == 0) {
+    return;
+  }
+  CUresult result =
+      cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
+  TORCH_CHECK(
+      result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
+  previouslyMappedFrame_ = 0;
+}
+
 UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
     CUdeviceptr framePtr,
     unsigned int pitch,
@@ -554,16 +565,17 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
 }
 
 void BetaCudaDeviceInterface::flush() {
-  isFlushing_ = true;
-
+  // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
+  // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
+  // don't say whether this should be an empty packet, or whether it should be a
+  // flag on the next non-empty packet. It doesn't matter: neither works :)
+  // Sending an EOF packet, however, does work. So we do that. And we reset the
+  // eofSent_ flag to false because that's not a true EOF notification.
   sendEOFPacket();
-
-  isFlushing_ = false;
+  eofSent_ = false;
 
   std::queue<CUVIDPARSERDISPINFO> emptyQueue;
   std::swap(readyFrames_, emptyQueue);
-
-  eofSent_ = false;
 }
 
 void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
index d4b9fc0a5..61f1450b6 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.h
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -63,11 +63,18 @@ class BetaCudaDeviceInterface : public DeviceInterface {
  private:
   int sendCuvidPacket(CUVIDSOURCEDATAPACKET& cuvidPacket);
 
-  // Apply bitstream filter, modifies packet in-place
-  void applyBSF(ReferenceAVPacket& packet);
+
   void initializeBSF(
       const AVCodecParameters* codecPar,
       const UniqueDecodingAVFormatContext& avFormatCtx);
+  // Apply bitstream filter. Returns the filtered packet, or the original
+  // packet if no filter is needed.
+  ReferenceAVPacket& applyBSF(
+      ReferenceAVPacket& packet,
+      ReferenceAVPacket& filteredPacket);
+
+  CUdeviceptr previouslyMappedFrame_ = 0;
+  void unmapPreviousFrame();
 
   UniqueAVFrame convertCudaFrameToAVFrame(
       CUdeviceptr framePtr,
       unsigned int pitch,
@@ -82,10 +89,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
 
   bool eofSent_ = false;
 
-  // Flush flag to prevent decode operations during flush (like DALI's
-  // isFlushing_)
-  bool isFlushing_ = false;
-
   AVRational timeBase_ = {0, 1};
   AVRational frameRateAvgFromFFmpeg_ = {0, 1};
 
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
index 93176917c..94c666d79 100644
--- a/src/torchcodec/_core/FFMPEGCommon.cpp
+++ b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -33,13 +33,6 @@ AVPacket* ReferenceAVPacket::operator->() {
   return avPacket_;
 }
 
-void ReferenceAVPacket::reset(ReferenceAVPacket& other) {
-  if (this != &other) {
-    av_packet_unref(avPacket_);
-    av_packet_move_ref(avPacket_, other.avPacket_);
-  }
-}
-
 AVCodecOnlyUseForCallingAVFindBestStream
 makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
 #if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
index 6bdcffc28..58589fe60 100644
--- a/src/torchcodec/_core/FFMPEGCommon.h
+++ b/src/torchcodec/_core/FFMPEGCommon.h
@@ -135,7 +135,6 @@ class ReferenceAVPacket {
   ~ReferenceAVPacket();
   AVPacket* get();
   AVPacket* operator->();
-  void reset(ReferenceAVPacket& other);
 };
 
 // av_find_best_stream is not const-correct before commit:
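
Notes for reviewers: the sketches below are illustrations only and are not part of the patch.

The TODO P1 comment in applyBSF() refers to the general FFmpeg bitstream-filter contract: one av_bsf_send_packet() call may be answered by zero or more successful av_bsf_receive_packet() calls. The sketch below shows that drain loop in isolation. filterOnePacket() and consumePacket are hypothetical names for this illustration; only the av_bsf_* and av_packet_* calls are real FFmpeg API.

```cpp
// Sketch of the generic FFmpeg BSF drain loop: one av_bsf_send_packet() call
// may be followed by zero or more successful av_bsf_receive_packet() calls.
// filterOnePacket() and consumePacket are illustrative names only.
extern "C" {
#include <libavcodec/bsf.h>
#include <libavutil/error.h>
}

int filterOnePacket(
    AVBSFContext* bsf,
    AVPacket* in,
    int (*consumePacket)(AVPacket*)) {
  int ret = av_bsf_send_packet(bsf, in);
  if (ret < 0) {
    return ret;
  }
  AVPacket* out = av_packet_alloc();
  if (out == nullptr) {
    return AVERROR(ENOMEM);
  }
  // The BSF may buffer internally, so keep receiving until it asks for the
  // next input packet (AVERROR(EAGAIN)) or reports a real error.
  while ((ret = av_bsf_receive_packet(bsf, out)) == 0) {
    int consumeRet = consumePacket(out);  // e.g. fill a CUVIDSOURCEDATAPACKET
    av_packet_unref(out);
    if (consumeRet < 0) {
      ret = consumeRet;
      break;
    }
  }
  av_packet_free(&out);
  return (ret == AVERROR(EAGAIN)) ? 0 : ret;
}
```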
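
The ulNumOutputSurfaces = 1 choice and the previouslyMappedFrame_ / unmapPreviousFrame() pair implement one rule: at most one NVDEC output surface is mapped at a time, and it is released just before the next cuvidMapVideoFrame() call, or at teardown. The sketch below phrases that same bookkeeping as a small RAII-style helper. MappedOutputSurface is a hypothetical class, not code from the patch, and it assumes a 64-bit build where CUdeviceptr matches the device-pointer type cuvidMapVideoFrame() expects, as the existing code already assumes.

```cpp
// Sketch: at most one NVDEC output surface stays mapped; mapping a new frame
// first releases the previous one, mirroring unmapPreviousFrame() above.
#include <cuda.h>
#include <nvcuvid.h>

class MappedOutputSurface {
 public:
  explicit MappedOutputSurface(CUvideodecoder decoder) : decoder_(decoder) {}
  MappedOutputSurface(const MappedOutputSurface&) = delete;
  MappedOutputSurface& operator=(const MappedOutputSurface&) = delete;
  ~MappedOutputSurface() {
    unmap();
  }

  // Unmaps the previously mapped frame (if any), then maps pictureIndex.
  // Returns the mapped device pointer, or 0 on failure.
  CUdeviceptr remap(
      int pictureIndex,
      unsigned int* pitch,
      CUVIDPROCPARAMS* procParams) {
    unmap();  // The decoder may now reuse the previous output surface.
    CUdeviceptr framePtr = 0;
    CUresult result = cuvidMapVideoFrame(
        decoder_, pictureIndex, &framePtr, pitch, procParams);
    if (result != CUDA_SUCCESS) {
      return 0;
    }
    mapped_ = framePtr;
    return framePtr;
  }

 private:
  void unmap() {
    if (mapped_ != 0) {
      cuvidUnmapVideoFrame(decoder_, mapped_);
      mapped_ = 0;
    }
  }

  CUvideodecoder decoder_;
  CUdeviceptr mapped_ = 0;
};
```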
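
For the flush() change: the approach described in the new comment is to drain the parser with an empty packet flagged CUVID_PKT_ENDOFSTREAM, then undo the EOF bookkeeping since decoding continues after the seek. A hedged sketch of that drain step follows; drainParser() is a hypothetical free function, and in the patch the equivalent work goes through sendEOFPacket() and sendCuvidPacket(). The exact parser behavior should be double-checked against the NVDEC documentation.

```cpp
// Sketch of the "flush by sending EOS" approach described in the new flush()
// comment. drainParser() is illustrative; in the patch this path goes through
// sendEOFPacket() and sendCuvidPacket().
#include <nvcuvid.h>

void drainParser(CUvideoparser parser) {
  CUVIDSOURCEDATAPACKET eosPacket = {};
  eosPacket.payload = nullptr;
  eosPacket.payload_size = 0;
  // CUVID_PKT_ENDOFSTREAM tells the parser to emit whatever it still holds
  // through the display callback. (CUVID_PKT_DISCONTINUITY is the flag the
  // docs suggest for seeks, but the patch reports it did not work here.)
  eosPacket.flags = CUVID_PKT_ENDOFSTREAM;
  CUresult result = cuvidParseVideoData(parser, &eosPacket);
  (void)result;  // Error handling omitted in this sketch.
  // Because this EOS is only a drain and decoding resumes after the seek, any
  // "EOF was sent" bookkeeping (eofSent_ in the patch) must be reset, and
  // frames queued by the display callback can be discarded.
}
```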