diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index fa7ff7a9c..9d8ba7b3c 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
       " vs supported:",
       caps.nMaxMBCount);
 
-  // Decoder creation parameters, taken from DALI
+  // Decoder creation parameters, most are taken from DALI
   CUVIDDECODECREATEINFO decoderParams = {};
   decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
   decoderParams.ChromaFormat = videoFormat->chroma_format;
@@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   decoderParams.ulTargetWidth =
       videoFormat->display_area.right - videoFormat->display_area.left;
   decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
-  decoderParams.ulNumOutputSurfaces = 2;
+  // We should only ever need 1 output surface, since we process frames
+  // sequentially, and we always unmap the previous frame before mapping a new
+  // one.
+  // TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
+  // benchmark to see if this makes any difference.
+  decoderParams.ulNumOutputSurfaces = 1;
   decoderParams.display_area.left = videoFormat->display_area.left;
   decoderParams.display_area.right = videoFormat->display_area.right;
   decoderParams.display_area.top = videoFormat->display_area.top;
@@ -166,10 +171,14 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
 }
 
 BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
-  // TODONVDEC P0: we probably need to free the frames that have been decoded by
-  // NVDEC but not yet "mapped" - i.e. those that are still in readyFrames_?
-  if (decoder_) {
+  // DALI doesn't seem to do any particular cleanup of the decoder before
+  // sending it to the cache, so we probably don't need to do anything either.
+  // Just to be safe, we flush.
+  // What happens to those decode surfaces that haven't yet been mapped is
+  // unclear.
+  flush();
+  unmapPreviousFrame();
   NVDECCache::getCache(device_.index())
       .returnDecoder(&videoFormat_, std::move(decoder_));
 }
@@ -320,7 +329,7 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
   decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);
 
   if (!decoder_) {
-    // TODONVDEC P0: consider re-configuring an existing decoder instead of
+    // TODONVDEC P2: consider re-configuring an existing decoder instead of
     // re-creating one. See docs, see DALI.
     decoder_ = createDecoder(videoFormat);
   }
@@ -341,13 +350,20 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
       packet.get() && packet->data && packet->size > 0,
       "sendPacket received an empty packet, this is unexpected, please report.");
 
-  applyBSF(packet);
+  // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or
+  // the original one if no BSF is needed. This new filtered packet must be
+  // allocated outside of applyBSF: if it were allocated inside applyBSF, it
+  // would be destroyed at the end of the function, leaving us with a dangling
+  // reference.
+  AutoAVPacket filteredAutoPacket;
+  ReferenceAVPacket filteredPacket(filteredAutoPacket);
+  ReferenceAVPacket& packetToSend = applyBSF(packet, filteredPacket);
 
   CUVIDSOURCEDATAPACKET cuvidPacket = {};
-  cuvidPacket.payload = packet->data;
-  cuvidPacket.payload_size = packet->size;
+  cuvidPacket.payload = packetToSend->data;
+  cuvidPacket.payload_size = packetToSend->size;
   cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
-  cuvidPacket.timestamp = packet->pts;
+  cuvidPacket.timestamp = packetToSend->pts;
 
   return sendCuvidPacket(cuvidPacket);
 }
@@ -366,9 +382,11 @@ int BetaCudaDeviceInterface::sendCuvidPacket(
   return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
 }
 
-void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
+ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF(
+    ReferenceAVPacket& packet,
+    ReferenceAVPacket& filteredPacket) {
   if (!bitstreamFilter_) {
-    return;
+    return packet;
   }
 
   int retVal = av_bsf_send_packet(bitstreamFilter_.get(), packet.get());
@@ -377,27 +395,17 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
       "Failed to send packet to bitstream filter: ",
       getFFMPEGErrorStringFromErrorCode(retVal));
 
-  // Create a temporary packet to receive the filtered data
   // TODO P1: the docs mention there can theoretically be multiple output
   // packets for a single input, i.e. we may need to call av_bsf_receive_packet
   // more than once. We should figure out whether that applies to the BSF we're
   // using.
-  AutoAVPacket filteredAutoPacket;
-  ReferenceAVPacket filteredPacket(filteredAutoPacket);
   retVal = av_bsf_receive_packet(bitstreamFilter_.get(), filteredPacket.get());
   TORCH_CHECK(
       retVal >= AVSUCCESS,
       "Failed to receive packet from bitstream filter: ",
       getFFMPEGErrorStringFromErrorCode(retVal));
 
-  // Free the original packet's data which isn't needed anymore, and move the
-  // fields of the filtered packet into the original packet. The filtered packet
-  // fields are re-set by av_packet_move_ref, so when it goes out of scope and
-  // gets destructed, it's not going to affect the original packet.
-  packet.reset(filteredPacket);
-  // TODONVDEC P0: consider cleaner ways to do this. Maybe we should let
-  // applyBSF return a new packet, and maybe that new packet needs to be a field
-  // on the interface to avoid complex lifetime issues.
+  return filteredPacket;
 }
 
 // Parser triggers this callback within cuvidParseVideoData when a frame is
@@ -405,13 +413,8 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
 // given frame. It means we can send that frame to be decoded by the hardware
 // NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
 int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
-  if (isFlushing_) {
-    return 0;
-  }
-
   TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
   TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
 
-  // Send frame to be decoded by NVDEC - non-blocking call.
   CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);
 
@@ -432,6 +435,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
     // packets, or to stop if EOF was already sent.
     return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
   }
+
   CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
   readyFrames_.pop();
 
@@ -450,34 +454,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   // to "map" it to an "output surface" before we can use its data. This is a
   // blocking calls that waits until the frame is fully decoded and ready to be
   // used.
+  // When a frame is mapped to an output surface, it needs to be unmapped
+  // eventually, so that the decoder can re-use the output surface. Failing to
+  // unmap will cause map to eventually fail. DALI unmaps frames almost
+  // immediately after mapping them: they do the color-conversion in-between,
+  // which involves a copy of the data, so that works.
+  // We, OTOH, will do the color-conversion later, outside of receiveFrame(). So
+  // we unmap here: just before mapping a new frame. At that point we know that
+  // the previously-mapped frame is no longer needed: it was either
+  // color-converted (with a copy), or it's a frame that was discarded in
+  // SingleStreamDecoder. Either way, the underlying output surface can be
+  // safely re-used.
+  unmapPreviousFrame();
   CUresult result = cuvidMapVideoFrame(
       *decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
-
   if (result != CUDA_SUCCESS) {
     return AVERROR_EXTERNAL;
   }
+  previouslyMappedFrame_ = framePtr;
 
   avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);
 
-  // Unmap the frame so that the decoder can reuse its corresponding output
-  // surface. Whether this is blocking is unclear?
-  cuvidUnmapVideoFrame(*decoder_.get(), framePtr);
-  // TODONVDEC P0: Get clarity on this:
-  // We assume that the framePtr is still valid after unmapping. That framePtr
-  // is now part of the avFrame, which we'll return to the caller, and the
-  // caller will immediately use it for color-conversion, at which point a copy
-  // happens. After the copy, it doesn't matter whether framePtr is still valid.
-  // And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
-  // the copy is made, so there should be no risk of overwriting the data before
-  // the copy.
-  // Buuuut yeah, we need get more clarity on what actually happens, and on
-  // what's needed. IIUC DALI makes the color-conversion copy immediately after
-  // cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
-  // in between. So maybe we should do the same.
-
   return AVSUCCESS;
 }
 
+void BetaCudaDeviceInterface::unmapPreviousFrame() {
+  if (previouslyMappedFrame_ == 0) {
+    return;
+  }
+  CUresult result =
+      cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
+  TORCH_CHECK(
+      result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
+  previouslyMappedFrame_ = 0;
+}
+
 UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
     CUdeviceptr framePtr,
     unsigned int pitch,
@@ -554,16 +565,17 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
 }
 
 void BetaCudaDeviceInterface::flush() {
-  isFlushing_ = true;
-
+  // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
+  // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
+  // don't say whether this should be an empty packet, or whether it should be a
+  // flag on the next non-empty packet. It doesn't matter: neither works :)
+  // Sending an EOF packet, however, does work. So we do that. And we reset the
+  // eofSent_ flag to false because that's not a true EOF notification.
   sendEOFPacket();
-
-  isFlushing_ = false;
+  eofSent_ = false;
 
   std::queue<CUVIDPARSERDISPINFO> emptyQueue;
   std::swap(readyFrames_, emptyQueue);
-
-  eofSent_ = false;
 }
 
 void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
index d4b9fc0a5..61f1450b6 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.h
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -63,11 +63,18 @@ class BetaCudaDeviceInterface : public DeviceInterface {
  private:
   int sendCuvidPacket(CUVIDSOURCEDATAPACKET& cuvidPacket);
 
-  // Apply bitstream filter, modifies packet in-place
-  void applyBSF(ReferenceAVPacket& packet);
+
   void initializeBSF(
       const AVCodecParameters* codecPar,
       const UniqueDecodingAVFormatContext& avFormatCtx);
+  // Apply bitstream filter. Returns the filtered packet, or the original
+  // packet if no filter is needed.
+  ReferenceAVPacket& applyBSF(
+      ReferenceAVPacket& packet,
+      ReferenceAVPacket& filteredPacket);
+
+  CUdeviceptr previouslyMappedFrame_ = 0;
+  void unmapPreviousFrame();
 
   UniqueAVFrame convertCudaFrameToAVFrame(
       CUdeviceptr framePtr,
       unsigned int pitch,
@@ -82,10 +89,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
 
   bool eofSent_ = false;
 
-  // Flush flag to prevent decode operations during flush (like DALI's
-  // isFlushing_)
-  bool isFlushing_ = false;
-
   AVRational timeBase_ = {0, 1};
   AVRational frameRateAvgFromFFmpeg_ = {0, 1};
 
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
index 93176917c..94c666d79 100644
--- a/src/torchcodec/_core/FFMPEGCommon.cpp
+++ b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -33,13 +33,6 @@ AVPacket* ReferenceAVPacket::operator->() {
   return avPacket_;
 }
 
-void ReferenceAVPacket::reset(ReferenceAVPacket& other) {
-  if (this != &other) {
-    av_packet_unref(avPacket_);
-    av_packet_move_ref(avPacket_, other.avPacket_);
-  }
-}
-
 AVCodecOnlyUseForCallingAVFindBestStream
 makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
 #if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
diff --git a/src/torchcodec/_core/FFMPEGCommon.h b/src/torchcodec/_core/FFMPEGCommon.h
index 6bdcffc28..58589fe60 100644
--- a/src/torchcodec/_core/FFMPEGCommon.h
+++ b/src/torchcodec/_core/FFMPEGCommon.h
@@ -135,7 +135,6 @@ class ReferenceAVPacket {
   ~ReferenceAVPacket();
   AVPacket* get();
   AVPacket* operator->();
-  void reset(ReferenceAVPacket& other);
 };
 
 // av_find_best_stream is not const-correct before commit:
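
Notes for reviewers: the sketches below are illustrations only and are not part of the patch.

The TODO P1 comment in applyBSF() refers to the general FFmpeg bitstream-filter contract: one av_bsf_send_packet() call may be answered by zero or more successful av_bsf_receive_packet() calls. The sketch below shows that drain loop in isolation. filterOnePacket() and consumePacket are hypothetical names for this illustration; only the av_bsf_* and av_packet_* calls are real FFmpeg API.

```cpp
// Sketch of the generic FFmpeg BSF drain loop: one av_bsf_send_packet() call
// may be followed by zero or more successful av_bsf_receive_packet() calls.
// filterOnePacket() and consumePacket are illustrative names only.
extern "C" {
#include <libavcodec/bsf.h>
#include <libavutil/error.h>
}

int filterOnePacket(
    AVBSFContext* bsf,
    AVPacket* in,
    int (*consumePacket)(AVPacket*)) {
  int ret = av_bsf_send_packet(bsf, in);
  if (ret < 0) {
    return ret;
  }
  AVPacket* out = av_packet_alloc();
  if (out == nullptr) {
    return AVERROR(ENOMEM);
  }
  // The BSF may buffer internally, so keep receiving until it asks for the
  // next input packet (AVERROR(EAGAIN)) or reports a real error.
  while ((ret = av_bsf_receive_packet(bsf, out)) == 0) {
    int consumeRet = consumePacket(out);  // e.g. fill a CUVIDSOURCEDATAPACKET
    av_packet_unref(out);
    if (consumeRet < 0) {
      ret = consumeRet;
      break;
    }
  }
  av_packet_free(&out);
  return (ret == AVERROR(EAGAIN)) ? 0 : ret;
}
```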
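
The ulNumOutputSurfaces = 1 choice and the previouslyMappedFrame_ / unmapPreviousFrame() pair implement one rule: at most one NVDEC output surface is mapped at a time, and it is released just before the next cuvidMapVideoFrame() call, or at teardown. The sketch below phrases that same bookkeeping as a small RAII-style helper. MappedOutputSurface is a hypothetical class, not code from the patch, and it assumes a 64-bit build where CUdeviceptr matches the device-pointer type cuvidMapVideoFrame() expects, as the existing code already assumes.

```cpp
// Sketch: at most one NVDEC output surface stays mapped; mapping a new frame
// first releases the previous one, mirroring unmapPreviousFrame() above.
#include <cuda.h>
#include <nvcuvid.h>

class MappedOutputSurface {
 public:
  explicit MappedOutputSurface(CUvideodecoder decoder) : decoder_(decoder) {}
  MappedOutputSurface(const MappedOutputSurface&) = delete;
  MappedOutputSurface& operator=(const MappedOutputSurface&) = delete;
  ~MappedOutputSurface() {
    unmap();
  }

  // Unmaps the previously mapped frame (if any), then maps pictureIndex.
  // Returns the mapped device pointer, or 0 on failure.
  CUdeviceptr remap(
      int pictureIndex,
      unsigned int* pitch,
      CUVIDPROCPARAMS* procParams) {
    unmap();  // The decoder may now reuse the previous output surface.
    CUdeviceptr framePtr = 0;
    CUresult result = cuvidMapVideoFrame(
        decoder_, pictureIndex, &framePtr, pitch, procParams);
    if (result != CUDA_SUCCESS) {
      return 0;
    }
    mapped_ = framePtr;
    return framePtr;
  }

 private:
  void unmap() {
    if (mapped_ != 0) {
      cuvidUnmapVideoFrame(decoder_, mapped_);
      mapped_ = 0;
    }
  }

  CUvideodecoder decoder_;
  CUdeviceptr mapped_ = 0;
};
```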
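
For the flush() change: the approach described in the new comment is to drain the parser with an empty packet flagged CUVID_PKT_ENDOFSTREAM, then undo the EOF bookkeeping since decoding continues after the seek. A hedged sketch of that drain step follows; drainParser() is a hypothetical free function, and in the patch the equivalent work goes through sendEOFPacket() and sendCuvidPacket(). The exact parser behavior should be double-checked against the NVDEC documentation.

```cpp
// Sketch of the "flush by sending EOS" approach described in the new flush()
// comment. drainParser() is illustrative; in the patch this path goes through
// sendEOFPacket() and sendCuvidPacket().
#include <nvcuvid.h>

void drainParser(CUvideoparser parser) {
  CUVIDSOURCEDATAPACKET eosPacket = {};
  eosPacket.payload = nullptr;
  eosPacket.payload_size = 0;
  // CUVID_PKT_ENDOFSTREAM tells the parser to emit whatever it still holds
  // through the display callback. (CUVID_PKT_DISCONTINUITY is the flag the
  // docs suggest for seeks, but the patch reports it did not work here.)
  eosPacket.flags = CUVID_PKT_ENDOFSTREAM;
  CUresult result = cuvidParseVideoData(parser, &eosPacket);
  (void)result;  // Error handling omitted in this sketch.
  // Because this EOS is only a drain and decoding resumes after the seek, any
  // "EOF was sent" bookkeeping (eofSent_ in the patch) must be reset, and
  // frames queued by the display callback can be discarded.
}
```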