OpenMPTarget: Update hierarchical parallelism. (kokkos#6043)

* OpenMPTarget: Update hierarchical parallelism. * OpenMPTarget: Update initialize routine. * OpenMPTarget: Remove num_teams for Intel GPUs. * OpenMPTarget: fix comment. * OpenMPTarget: Oversubscribe number of teams. * OpenMPTarget: Move KOKKOS_IMPL_HIERARCHICAL_INTEL_GPU macro to a central location. * OpenMPTarget: Add num_teams clause for Intel GPUs too. * OpenMPTarget: Moving the undef for Intel GPUs into files that include the macro. * OpenMPTarget: Updated macro name and added to print_configuration. * OpenMPTarget: Adding impl to macro. * OpenMPTarget: Fix typo for Intel GPUs. * OpenMPTarget: Fix print_configuration. * OpenMPTarget: Rename variable names. * OpenMPTarget: clang format. --------- Co-authored-by: Rahulkumar Gayatri <rgayatri@lbl.gov>
masterleinad · May 3, 2023 · 4b6d971 · 4b6d971
1 parent 5b1f341
commit 4b6d971
Show file tree

Hide file tree

Showing 5 changed files with 190 additions and 101 deletions.
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
@@ -67,13 +67,16 @@ void OpenMPTargetExec::verify_initialized(const char* const label) {
     msg.append(" ERROR: not initialized");
     Kokkos::Impl::throw_runtime_exception(msg);
   }
+  OpenMPTargetExec::MAX_ACTIVE_THREADS =
+      Kokkos::Experimental::OpenMPTarget().concurrency();
 }
 
 void* OpenMPTargetExec::m_scratch_ptr         = nullptr;
 int64_t OpenMPTargetExec::m_scratch_size      = 0;
 int* OpenMPTargetExec::m_lock_array           = nullptr;
 uint64_t OpenMPTargetExec::m_lock_size        = 0;
 uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr;
+int OpenMPTargetExec::MAX_ACTIVE_THREADS      = 0;
 
 void OpenMPTargetExec::clear_scratch() {
   Kokkos::Experimental::OpenMPTargetSpace space;
@@ -100,11 +103,23 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0,
   const int64_t shmem_size =
       shmem_size_L0 + shmem_size_L1;  // L0 + L1 scratch memory per team.
   const int64_t padding = shmem_size * 10 / 100;  // Padding per team.
+
+  // Maximum active teams possible.
+  // The number should not exceed the maximum in-flight teams possible or the
+  // league_size.
+  int max_active_teams =
+      std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size);
+
+  // max_active_teams is the number of active teams on the given hardware.
+  // We set the number of teams to be twice the number of max_active_teams for
+  // the compiler to pick the right number in its case.
+  omp_set_num_teams(max_active_teams * 2);
+
   // Total amount of scratch memory allocated is depenedent
   // on the maximum number of in-flight teams possible.
   int64_t total_size =
       (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) *
-      std::min(MAX_ACTIVE_THREADS / team_size, league_size);
+      max_active_teams * 2;
 
   if (total_size > m_scratch_size) {
     space.deallocate(m_scratch_ptr, m_scratch_size);

diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -31,6 +31,7 @@
 #include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp>
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp>
 #include <impl/Kokkos_ExecSpaceManager.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
 
 #include <sstream>
 
@@ -66,13 +67,40 @@ void OpenMPTargetInternal::fence(const std::string& name,
   }
 }
 int OpenMPTargetInternal::concurrency() const {
-  return 128000;  // FIXME_OPENMPTARGET
+  int max_threads = 2048 * 80;
+#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
+  int max_threads_sm = 2048;
+#if defined(KOKKOS_ARCH_AMPERE86)
+  max_threads = max_threads_sm * 84;
+#elif defined(KOKKOS_ARCH_AMPERE80)
+  max_threads = max_threads_sm * 108;
+#elif defined(KOKKOS_ARCH_VOLTA72)
+  max_threads = max_threads_sm * 84;
+#elif defined(KOKKOS_ARCH_VOLTA70)
+  max_threads = max_threads_sm * 80;
+#elif defined(KOKKOS_ARCH_PASCAL60) || defined(KOKKOS_ARCH_PASCAL61)
+  max_threads = max_threads_sm * 60;
+#endif
+#elif defined(KOKKOS_ARCH_INTEL_GPU)
+#pragma omp target map(max_threads)
+  { max_threads = omp_get_num_procs(); }
+
+  // Multiply the number of processors with the SIMD length.
+  max_threads *= 32;
+#endif
+
+  return max_threads;
 }
 const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; }
 void OpenMPTargetInternal::print_configuration(std::ostream& os,
                                                bool /*verbose*/) const {
   // FIXME_OPENMPTARGET
   os << "Using OpenMPTarget\n";
+#if defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
+  os << "Defined KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU: Workaround "
+        "for "
+        "hierarchical parallelism for Intel GPUs.";
+#endif
 }
 
 void OpenMPTargetInternal::impl_finalize() {

diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -26,6 +26,12 @@
 #include <Kokkos_Atomic.hpp>
 #include "Kokkos_OpenMPTarget_Abort.hpp"
 
+// Intel architectures prefer the classical hierarchical parallelism that relies
+// on OpenMP.
+#if defined(KOKKOS_ARCH_INTEL_GPU)
+#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
+#endif
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
@@ -727,8 +733,7 @@ class OpenMPTargetExec {
   // teams possible is calculated based on NVIDIA's Volta GPU. In
   // future this value should be based on the chosen architecture for the
   // OpenMPTarget backend.
-  static constexpr int MAX_ACTIVE_THREADS = 2080 * 80;
-  static constexpr int MAX_ACTIVE_TEAMS   = MAX_ACTIVE_THREADS / 32;
+  static int MAX_ACTIVE_THREADS;
 
  private:
   static void* scratch_ptr;

diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
@@ -115,44 +115,68 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // mode but works in the Debug mode.
 
     // Maximum active teams possible.
-    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
-    // nteams should not exceed the maximum in-flight teams possible.
-    const auto nteams =
-        league_size < max_active_teams ? league_size : max_active_teams;
+    int max_active_teams = omp_get_max_teams();
+
+    // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the
+    // omp_set_num_teams in the resize_scratch routine, the call is not
+    // respected. Hence we need to use `num_teams` routine to restrict the
+    // number of teams generated to max_active_teams. Hopefully we can avoid the
+    // num_teams clause in the future and let compiler pick the right number of
+    // teams. This is not true for Intel architectures.
 
     // If the league size is <=0, do not launch the kernel.
-    if (nteams <= 0) return;
+    if (max_active_teams <= 0) return;
 
 // Performing our own scheduling of teams to avoid separation of code between
 // teams-distribute and parallel. Gave a 2x performance boost in test cases with
 // the clang compiler. atomic_compare_exchange can be avoided since the standard
 // guarantees that the number of teams specified in the `num_teams` clause is
 // always less than or equal to the maximum concurrently running teams.
-#pragma omp target teams num_teams(nteams) thread_limit(team_size) \
-    map(to                                                         \
-        : a_functor) is_device_ptr(scratch_ptr)
+#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
+#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \
+    num_teams(max_active_teams) is_device_ptr(scratch_ptr)
 #pragma omp parallel
     {
+      if (omp_get_num_teams() > max_active_teams)
+        Kokkos::abort("`omp_set_num_teams` call was not respected.\n");
+
       const int blockIdx = omp_get_team_num();
       const int gridDim  = omp_get_num_teams();
 
       // Iterate through the number of teams until league_size and assign the
       // league_id accordingly
       // Guarantee that the compilers respect the `num_teams` clause
-      if (gridDim <= nteams) {
-        for (int league_id = blockIdx; league_id < league_size;
-             league_id += gridDim) {
-          typename Policy::member_type team(
-              league_id, league_size, team_size, vector_length, scratch_ptr,
-              blockIdx, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_void<TagType>::value)
-            m_functor(team);
-          else
-            m_functor(TagType(), team);
-        }
-      } else
-        Kokkos::abort("`num_teams` clause was not respected.\n");
+      for (int league_id = blockIdx; league_id < league_size;
+           league_id += gridDim) {
+        typename Policy::member_type team(league_id, league_size, team_size,
+                                          vector_length, scratch_ptr, blockIdx,
+                                          shmem_size_L0, shmem_size_L1);
+        if constexpr (std::is_void_v<TagType>)
+          m_functor(team);
+        else
+          m_functor(TagType(), team);
+      }
     }
+#else
+#pragma omp target teams distribute firstprivate(a_functor) \
+    is_device_ptr(scratch_ptr) num_teams(max_active_teams)  \
+        thread_limit(team_size)
+    for (int i = 0; i < league_size; i++) {
+#pragma omp parallel
+      {
+        if (omp_get_num_teams() > max_active_teams)
+          Kokkos::abort("`omp_set_num_teams` call was not respected.\n");
+
+        typename Policy::member_type team(i, league_size, team_size,
+                                          vector_length, scratch_ptr, i,
+                                          shmem_size_L0, shmem_size_L1);
+        if constexpr (std::is_void_v<TagType>)
+          m_functor(team);
+        else
+          m_functor(TagType(), team);
+      }
+    }
+#endif
   }
 
  public: