2 changes: 2 additions & 0 deletions src/core/cuda/stream_pool.h
@@ -58,6 +58,8 @@ struct StreamPool {

private:
// For now we keep only one stream in the pool
// TODO: If this ever changes, the use of non-stream-ordered `DeferredBuffer`s
// in `core/data/buffer.h` will no longer be safe.
std::unique_ptr<cudaStream_t> cached_stream_{nullptr};
};

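For context, a minimal sketch of the single-stream behaviour the new comment relies on: `get_cached_stream()` hands every caller the one lazily created stream, so all GPU work on a device is enqueued in order. The body, flags, and error handling below are assumptions for illustration, not this repository's actual implementation; only `cached_stream_` and the function name come from the diff and the note in `buffer.h`.

```cpp
#include <cuda_runtime.h>
#include <memory>

// Illustrative sketch only (not the code in this PR).
struct StreamPool {
  // Lazily create a single stream and return the same handle to every caller, so all
  // GPU work issued through the pool on this device is ordered on one stream.
  cudaStream_t get_cached_stream()
  {
    if (cached_stream_ == nullptr) {
      cached_stream_ = std::make_unique<cudaStream_t>();
      cudaStreamCreate(cached_stream_.get());  // error handling omitted in this sketch
    }
    return *cached_stream_;
  }

 private:
  // For now we keep only one stream in the pool
  std::unique_ptr<cudaStream_t> cached_stream_{nullptr};
};
```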
17 changes: 17 additions & 0 deletions src/core/data/buffer.h
@@ -25,6 +25,23 @@ namespace legate {
template <typename VAL, int32_t DIM = 1>
using Buffer = Legion::DeferredBuffer<VAL, DIM>;

// Note on using temporary buffers in CUDA tasks:
// We use Legion `DeferredBuffer`s, whose lifetime is not connected with the CUDA stream(s) used to
launch kernels. The buffer is allocated immediately at the point when `create_buffer` is called,
// whereas the kernel that uses it is placed on a stream, and may run at a later point. Normally
// `DeferredBuffer`s are deallocated automatically by Legion once all the kernels launched in the
// task are complete. However, `DeferredBuffer`s can also be deallocated immediately using
// `destroy()`, which is useful for operations that want to deallocate intermediate memory as soon
// as possible. This deallocation is not synchronized with the task stream, i.e. it may happen
// before a kernel which uses the buffer has actually completed. This is safe as long as we use the
// same stream on all GPU tasks running on the same device (which is guaranteed by the current
// implementation of `get_cached_stream()`), because then all the actual uses of the buffer are done
in order on the one stream. It is important that all library CUDA code uses
`get_cached_stream()`, and that all CUDA operations (including library calls) are enqueued
exclusively on that stream. This analysis additionally assumes that no code outside of Legate is
// concurrently allocating from the eager pool, and that it's OK for kernels to access a buffer even
// after it's technically been deallocated.

template <typename VAL, int32_t DIM>
Buffer<VAL, DIM> create_buffer(const Legion::Point<DIM>& extents,
Legion::Memory::Kind kind = Legion::Memory::Kind::NO_MEMKIND,
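To make the note above concrete, here is a minimal sketch of the pattern it describes: allocate a temporary buffer, launch a kernel on the cached stream, then destroy the buffer eagerly. The kernel, the memory kind, and the assumption that `get_cached_stream()` returns a `cudaStream_t` are illustrative choices, not code from this PR; `ptr()` and `destroy()` are the Legion `DeferredBuffer` methods the note refers to.

```cpp
#include <cuda_runtime.h>

#include "core/data/buffer.h"  // path as added in this PR; adjust to the build's include layout

cudaStream_t get_cached_stream();  // assumed signature for the helper mentioned in the note

__global__ void square_kernel(double* data, size_t n)  // hypothetical kernel for illustration
{
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) data[idx] *= data[idx];
}

void example_gpu_task_body(size_t n)
{
  // All work goes on the one cached stream shared by every GPU task on this device.
  cudaStream_t stream = get_cached_stream();

  // The allocation happens immediately, even though the kernel below is only enqueued
  // on the stream and may run later.
  auto temp = legate::create_buffer<double, 1>(Legion::Point<1>(n),
                                               Legion::Memory::Kind::GPU_FB_MEM);

  const size_t threads = 256;
  const size_t blocks  = (n + threads - 1) / threads;
  square_kernel<<<blocks, threads, 0, stream>>>(temp.ptr(Legion::Point<1>(0)), n);

  // Eager deallocation: not synchronized with `stream`, but safe because any later reuse
  // of this memory is also enqueued on the same stream and cannot overtake the kernel above.
  temp.destroy();
}
```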