Skip to content

Commit

Permalink
SYCL: Use host-pinned memory to copy reduction/scan result (kokkos#6500)
Browse files Browse the repository at this point in the history
* SYCL: Use host-pinned memory to copy reduction/scan result

* Remove unused variable

* m_shared_memory_lock -> m_host_scratch_lock; improve comments

* Add comment for choosing memcpy over fence+deep_copy

* m_[host_]scratch_lock->m_scratch_buffers_lock
  • Loading branch information
masterleinad committed Oct 26, 2023
1 parent 6056c6b commit 0975671
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 65 deletions.
30 changes: 29 additions & 1 deletion core/src/SYCL/Kokkos_SYCL_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ Kokkos::View<uint32_t*, SYCLDeviceUSMSpace> sycl_global_unique_token_locks(
}

SYCLInternal::~SYCLInternal() {
if (!was_finalized || m_scratchSpace || m_scratchFlags) {
if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) {
std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call "
"Kokkos::Experimental::SYCL::finalize()"
<< std::endl;
Expand Down Expand Up @@ -199,11 +199,15 @@ void SYCLInternal::finalize() {
using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>;
if (nullptr != m_scratchSpace)
RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace));
if (nullptr != m_scratchHost)
RecordSYCL::decrement(RecordSYCL::get_record(m_scratchHost));
if (nullptr != m_scratchFlags)
RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags));
m_syclDev = -1;
m_scratchSpaceCount = 0;
m_scratchSpace = nullptr;
m_scratchHostCount = 0;
m_scratchHost = nullptr;
m_scratchFlagsCount = 0;
m_scratchFlags = nullptr;

Expand Down Expand Up @@ -250,6 +254,30 @@ sycl::device_ptr<void> SYCLInternal::scratch_space(const std::size_t size) {
return m_scratchSpace;
}

// Return a host-pinned (SYCL host USM) scratch buffer of at least `size`
// bytes, growing (never shrinking) the cached allocation on demand.
// Used to stage reduction/scan results so they can be memcpy'd to the
// caller after a fence instead of requiring a device-to-host DeepCopy.
// NOTE: callers must serialize access (see m_scratch_buffers_lock users);
// this function itself performs no locking.
sycl::host_ptr<void> SYCLInternal::scratch_host(const std::size_t size) {
  // Fix: the verification label previously said "scratch_unified" (a
  // leftover from the CUDA backend naming); use the actual function name
  // so a failed-initialization diagnostic points at the right call site.
  if (verify_is_initialized("scratch_host") &&
      m_scratchHostCount < scratch_count(size)) {
    m_scratchHostCount = scratch_count(size);

    using Record = Kokkos::Impl::SharedAllocationRecord<
        Kokkos::Experimental::SYCLHostUSMSpace, void>;

    // Release the old, too-small buffer before allocating the larger one.
    if (m_scratchHost) Record::decrement(Record::get_record(m_scratchHost));

    // Guard against std::size_t overflow when scaling by the grain size.
    std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort(
        m_scratchHostCount, sizeScratchGrain);
    Record* const r = Record::allocate(
        Kokkos::Experimental::SYCLHostUSMSpace(*m_queue),
        "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size);

    Record::increment(r);

    m_scratchHost = reinterpret_cast<size_type*>(r->data());
  }

  return m_scratchHost;
}

sycl::device_ptr<void> SYCLInternal::scratch_flags(const std::size_t size) {
if (verify_is_initialized("scratch_flags") &&
m_scratchFlagsCount < scratch_count(size)) {
Expand Down
3 changes: 3 additions & 0 deletions core/src/SYCL/Kokkos_SYCL_Instance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class SYCLInternal {

sycl::device_ptr<void> scratch_space(const std::size_t size);
sycl::device_ptr<void> scratch_flags(const std::size_t size);
sycl::host_ptr<void> scratch_host(const std::size_t size);
int acquire_team_scratch_space();
sycl::device_ptr<void> resize_team_scratch_space(int scratch_pool_id,
std::int64_t bytes,
Expand All @@ -60,6 +61,8 @@ class SYCLInternal {

std::size_t m_scratchSpaceCount = 0;
sycl::device_ptr<size_type> m_scratchSpace = nullptr;
std::size_t m_scratchHostCount = 0;
sycl::host_ptr<size_type> m_scratchHost = nullptr;
std::size_t m_scratchFlagsCount = 0;
sycl::device_ptr<size_type> m_scratchFlags = nullptr;
// mutex to access shared memory
Expand Down
12 changes: 6 additions & 6 deletions core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
int m_shmem_size;
sycl::device_ptr<char> m_global_scratch_ptr;
size_t m_scratch_size[2];
// Only let one ParallelFor/Reduce modify the team scratch memory. The
// constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_lock;
// Only let one ParallelFor instance at a time use the team scratch memory.
// The constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
int m_scratch_pool_id = -1;

template <typename FunctorWrapper>
Expand Down Expand Up @@ -141,9 +141,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_league_size(arg_policy.league_size()),
m_team_size(arg_policy.team_size()),
m_vector_size(arg_policy.impl_vector_length()),
m_scratch_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
m_scratch_buffers_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
// FIXME_SYCL optimize
if (m_team_size < 0)
m_team_size =
Expand Down
37 changes: 24 additions & 13 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_result_ptr_device_accessible(
MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
typename View::memory_space>::accessible),
m_shared_memory_lock(
m_scratch_buffers_lock(
m_space.impl_internal_space_instance()->m_mutexScratchSpace) {}

private:
Expand All @@ -95,6 +95,11 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const unsigned int value_count =
m_functor_reducer.get_reducer().value_count();
sycl::device_ptr<value_type> results_ptr;
auto host_result_ptr =
(m_result_ptr && !m_result_ptr_device_accessible)
? static_cast<sycl::host_ptr<value_type>>(
instance.scratch_host(sizeof(value_type) * value_count))
: nullptr;

sycl::event last_reduction_event;

Expand All @@ -109,8 +114,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
#endif
results_ptr = static_cast<sycl::device_ptr<value_type>>(
instance.scratch_space(sizeof(value_type) * value_count));
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(host_result_ptr);
cgh.single_task([=]() {
const CombinedFunctorReducerType& functor_reducer =
functor_reducer_wrapper.get_functor();
Expand Down Expand Up @@ -148,8 +155,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,

results_ptr = static_cast<sycl::device_ptr<value_type>>(
instance.scratch_space(sizeof(value_type) * value_count * n_wgroups));
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(host_result_ptr);
auto scratch_flags = static_cast<sycl::device_ptr<unsigned int>>(
instance.scratch_flags(sizeof(unsigned int)));

Expand Down Expand Up @@ -296,11 +305,13 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
// At this point, the reduced value is written to the entry in results_ptr
// and all that is left is to copy it back to the given result pointer if
// necessary.
if (m_result_ptr && !m_result_ptr_device_accessible) {
Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
Kokkos::Experimental::SYCLDeviceUSMSpace>(
m_space, m_result_ptr, results_ptr,
sizeof(*m_result_ptr) * value_count);
// Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower.
if (host_result_ptr) {
m_space.fence(
"Kokkos::Impl::ParallelReduce<SYCL, MDRangePolicy>::execute: result "
"not device-accessible");
std::memcpy(m_result_ptr, host_result_ptr,
sizeof(value_type) * value_count);
}

return last_reduction_event;
Expand Down Expand Up @@ -335,9 +346,9 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;

// Only let one Parallel/Scan modify the shared memory. The
// constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_shared_memory_lock;
// Only let one ParallelReduce instance at a time use the host scratch memory.
// The constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
};

#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */
31 changes: 20 additions & 11 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_result_ptr_device_accessible(
MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
typename View::memory_space>::accessible),
m_shared_memory_lock(
m_scratch_buffers_lock(
p.space().impl_internal_space_instance()->m_mutexScratchSpace) {}

private:
Expand All @@ -70,8 +70,15 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const unsigned int value_count =
m_functor_reducer.get_reducer().value_count();
sycl::device_ptr<value_type> results_ptr = nullptr;
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto host_result_ptr =
(m_result_ptr && !m_result_ptr_device_accessible)
? static_cast<sycl::host_ptr<value_type>>(
instance.scratch_host(sizeof(value_type) * value_count))
: nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(host_result_ptr);

sycl::event last_reduction_event;

Expand Down Expand Up @@ -320,11 +327,13 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
// At this point, the reduced value is written to the entry in results_ptr
// and all that is left is to copy it back to the given result pointer if
// necessary.
if (m_result_ptr && !m_result_ptr_device_accessible) {
Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
Kokkos::Experimental::SYCLDeviceUSMSpace>(
space, m_result_ptr, results_ptr,
sizeof(*m_result_ptr) * value_count);
// Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower.
if (host_result_ptr) {
space.fence(
"Kokkos::Impl::ParallelReduce<SYCL, RangePolicy>::execute: result "
"not device-accessible");
std::memcpy(m_result_ptr, host_result_ptr,
sizeof(*m_result_ptr) * value_count);
}

return last_reduction_event;
Expand Down Expand Up @@ -354,9 +363,9 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;

// Only let one Parallel/Scan modify the shared memory. The
// constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_shared_memory_lock;
// Only let one ParallelReduce instance at a time use the host scratch memory.
// The constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
};

#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */
43 changes: 28 additions & 15 deletions core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
const size_type m_league_size;
int m_team_size;
const size_type m_vector_size;
// Only let one ParallelFor/Reduce modify the team scratch memory. The
// constructor acquires the mutex which is released in the destructor.
std::scoped_lock<std::mutex> m_scratch_lock;
// Only let one ParallelReduce instance at a time use the team scratch memory
// and the host scratch memory. The constructor acquires the mutex which is
// released in the destructor.
std::scoped_lock<std::mutex> m_scratch_buffers_lock;
int m_scratch_pool_id = -1;

template <typename PolicyType, typename CombinedFunctorReducerWrapper>
Expand All @@ -79,6 +80,11 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_functor_reducer.get_reducer().value_count();
std::size_t size = std::size_t(m_league_size) * m_team_size * m_vector_size;
value_type* results_ptr = nullptr;
auto host_result_ptr =
(m_result_ptr && !m_result_ptr_device_accessible)
? static_cast<sycl::host_ptr<value_type>>(
instance.scratch_host(sizeof(value_type) * value_count))
: nullptr;

sycl::event last_reduction_event;

Expand All @@ -89,8 +95,10 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
results_ptr =
static_cast<sycl::device_ptr<value_type>>(instance.scratch_space(
sizeof(value_type) * std::max(value_count, 1u)));
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(host_result_ptr);

auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
// FIXME_SYCL accessors seem to need a size greater than zero at least
Expand Down Expand Up @@ -164,8 +172,11 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
auto team_reduction_factory =
[&](sycl::local_accessor<value_type, 1> local_mem,
sycl::device_ptr<value_type> results_ptr) {
sycl::global_ptr<value_type> device_accessible_result_ptr =
m_result_ptr_device_accessible ? m_result_ptr : nullptr;
auto device_accessible_result_ptr =
m_result_ptr_device_accessible
? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
: static_cast<sycl::global_ptr<value_type>>(
host_result_ptr);
auto lambda = [=](sycl::nd_item<2> item) {
auto n_wgroups = item.get_group_range()[1];
int wgroup_size =
Expand Down Expand Up @@ -358,11 +369,13 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
// At this point, the reduced value is written to the entry in results_ptr
// and all that is left is to copy it back to the given result pointer if
// necessary.
if (m_result_ptr && !m_result_ptr_device_accessible) {
Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
Kokkos::Experimental::SYCLDeviceUSMSpace>(
space, m_result_ptr, results_ptr,
sizeof(*m_result_ptr) * value_count);
// Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower.
if (host_result_ptr) {
space.fence(
"Kokkos::Impl::ParallelReduce<SYCL, TeamPolicy>::execute: result not "
"device-accessible");
std::memcpy(m_result_ptr, host_result_ptr,
sizeof(*m_result_ptr) * value_count);
}

return last_reduction_event;
Expand Down Expand Up @@ -448,9 +461,9 @@ class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
m_league_size(arg_policy.league_size()),
m_team_size(arg_policy.team_size()),
m_vector_size(arg_policy.impl_vector_length()),
m_scratch_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
m_scratch_buffers_lock(arg_policy.space()
.impl_internal_space_instance()
->m_team_scratch_mutex) {
initialize();
}
};
Expand Down

0 comments on commit 0975671

Please sign in to comment.