Skip to content

Commit

Permalink
OpenMPTarget: Update hierarchical parallelism. (kokkos#6043)
Browse files Browse the repository at this point in the history
* OpenMPTarget: Update hierarchical parallelism.

* OpenMPTarget: Update initialize routine.

* OpenMPTarget: Remove num_teams for Intel GPUs.

* OpenMPTarget: fix comment.

* OpenMPTarget: Oversubscribe number of teams.

* OpenMPTarget: Move KOKKOS_IMPL_HIERARCHICAL_INTEL_GPU macro to a central location.

* OpenMPTarget: Add num_teams clause for Intel GPUs too.

* OpenMPTarget: Moving the undef for Intel GPUs into files that include the macro.

* OpenMPTarget: Updated macro name and added to print_configuration.

* OpenMPTarget: Adding impl to macro.

* OpenMPTarget: Fix typo for Intel GPUs.

* OpenMPTarget: Fix print_configuration.

* OpenMPTarget: Rename variable names.

* OpenMPTarget: clang format.

---------

Co-authored-by: Rahulkumar Gayatri <rgayatri@lbl.gov>
  • Loading branch information
Rahulkumar Gayatri and Rahulkumar Gayatri committed May 3, 2023
1 parent 5b1f341 commit 4b6d971
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 101 deletions.
17 changes: 16 additions & 1 deletion core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,16 @@ void OpenMPTargetExec::verify_initialized(const char* const label) {
msg.append(" ERROR: not initialized");
Kokkos::Impl::throw_runtime_exception(msg);
}
OpenMPTargetExec::MAX_ACTIVE_THREADS =
Kokkos::Experimental::OpenMPTarget().concurrency();
}

void* OpenMPTargetExec::m_scratch_ptr = nullptr;
int64_t OpenMPTargetExec::m_scratch_size = 0;
int* OpenMPTargetExec::m_lock_array = nullptr;
uint64_t OpenMPTargetExec::m_lock_size = 0;
uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr;
int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0;

void OpenMPTargetExec::clear_scratch() {
Kokkos::Experimental::OpenMPTargetSpace space;
Expand All @@ -100,11 +103,23 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0,
const int64_t shmem_size =
shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team.
const int64_t padding = shmem_size * 10 / 100; // Padding per team.

// Maximum active teams possible.
// The number should not exceed the maximum in-flight teams possible or the
// league_size.
int max_active_teams =
std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size);

// max_active_teams is the number of active teams on the given hardware.
// We set the number of teams to be twice the number of max_active_teams for
// the compiler to pick the right number in its case.
omp_set_num_teams(max_active_teams * 2);

// Total amount of scratch memory allocated is depenedent
// on the maximum number of in-flight teams possible.
int64_t total_size =
(shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) *
std::min(MAX_ACTIVE_THREADS / team_size, league_size);
max_active_teams * 2;

if (total_size > m_scratch_size) {
space.deallocate(m_scratch_ptr, m_scratch_size);
Expand Down
30 changes: 29 additions & 1 deletion core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp>
#include <impl/Kokkos_ExecSpaceManager.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>

#include <sstream>

Expand Down Expand Up @@ -66,13 +67,40 @@ void OpenMPTargetInternal::fence(const std::string& name,
}
}
int OpenMPTargetInternal::concurrency() const {
return 128000; // FIXME_OPENMPTARGET
int max_threads = 2048 * 80;
#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
int max_threads_sm = 2048;
#if defined(KOKKOS_ARCH_AMPERE86)
max_threads = max_threads_sm * 84;
#elif defined(KOKKOS_ARCH_AMPERE80)
max_threads = max_threads_sm * 108;
#elif defined(KOKKOS_ARCH_VOLTA72)
max_threads = max_threads_sm * 84;
#elif defined(KOKKOS_ARCH_VOLTA70)
max_threads = max_threads_sm * 80;
#elif defined(KOKKOS_ARCH_PASCAL60) || defined(KOKKOS_ARCH_PASCAL61)
max_threads = max_threads_sm * 60;
#endif
#elif defined(KOKKOS_ARCH_INTEL_GPU)
#pragma omp target map(max_threads)
{ max_threads = omp_get_num_procs(); }

// Multiply the number of processors with the SIMD length.
max_threads *= 32;
#endif

return max_threads;
}
const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; }
void OpenMPTargetInternal::print_configuration(std::ostream& os,
bool /*verbose*/) const {
// FIXME_OPENMPTARGET
os << "Using OpenMPTarget\n";
#if defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
os << "Defined KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU: Workaround "
"for "
"hierarchical parallelism for Intel GPUs.";
#endif
}

void OpenMPTargetInternal::impl_finalize() {
Expand Down
9 changes: 7 additions & 2 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@
#include <Kokkos_Atomic.hpp>
#include "Kokkos_OpenMPTarget_Abort.hpp"

// Intel architectures prefer the classical hierarchical parallelism that relies
// on OpenMP.
#if defined(KOKKOS_ARCH_INTEL_GPU)
#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
#endif

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

Expand Down Expand Up @@ -727,8 +733,7 @@ class OpenMPTargetExec {
// teams possible is calculated based on NVIDIA's Volta GPU. In
// future this value should be based on the chosen architecture for the
// OpenMPTarget backend.
static constexpr int MAX_ACTIVE_THREADS = 2080 * 80;
static constexpr int MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32;
static int MAX_ACTIVE_THREADS;

private:
static void* scratch_ptr;
Expand Down
66 changes: 45 additions & 21 deletions core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,44 +115,68 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
// mode but works in the Debug mode.

// Maximum active teams possible.
int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
// nteams should not exceed the maximum in-flight teams possible.
const auto nteams =
league_size < max_active_teams ? league_size : max_active_teams;
int max_active_teams = omp_get_max_teams();

// FIXME_OPENMPTARGET: Although the maximum number of teams is set using the
// omp_set_num_teams in the resize_scratch routine, the call is not
// respected. Hence we need to use `num_teams` routine to restrict the
// number of teams generated to max_active_teams. Hopefully we can avoid the
// num_teams clause in the future and let compiler pick the right number of
// teams. This is not true for Intel architectures.

// If the league size is <=0, do not launch the kernel.
if (nteams <= 0) return;
if (max_active_teams <= 0) return;

// Performing our own scheduling of teams to avoid separation of code between
// teams-distribute and parallel. Gave a 2x performance boost in test cases with
// the clang compiler. atomic_compare_exchange can be avoided since the standard
// guarantees that the number of teams specified in the `num_teams` clause is
// always less than or equal to the maximum concurrently running teams.
#pragma omp target teams num_teams(nteams) thread_limit(team_size) \
map(to \
: a_functor) is_device_ptr(scratch_ptr)
#if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \
num_teams(max_active_teams) is_device_ptr(scratch_ptr)
#pragma omp parallel
{
if (omp_get_num_teams() > max_active_teams)
Kokkos::abort("`omp_set_num_teams` call was not respected.\n");

const int blockIdx = omp_get_team_num();
const int gridDim = omp_get_num_teams();

// Iterate through the number of teams until league_size and assign the
// league_id accordingly
// Guarantee that the compilers respect the `num_teams` clause
if (gridDim <= nteams) {
for (int league_id = blockIdx; league_id < league_size;
league_id += gridDim) {
typename Policy::member_type team(
league_id, league_size, team_size, vector_length, scratch_ptr,
blockIdx, shmem_size_L0, shmem_size_L1);
if constexpr (std::is_void<TagType>::value)
m_functor(team);
else
m_functor(TagType(), team);
}
} else
Kokkos::abort("`num_teams` clause was not respected.\n");
for (int league_id = blockIdx; league_id < league_size;
league_id += gridDim) {
typename Policy::member_type team(league_id, league_size, team_size,
vector_length, scratch_ptr, blockIdx,
shmem_size_L0, shmem_size_L1);
if constexpr (std::is_void_v<TagType>)
m_functor(team);
else
m_functor(TagType(), team);
}
}
#else
#pragma omp target teams distribute firstprivate(a_functor) \
is_device_ptr(scratch_ptr) num_teams(max_active_teams) \
thread_limit(team_size)
for (int i = 0; i < league_size; i++) {
#pragma omp parallel
{
if (omp_get_num_teams() > max_active_teams)
Kokkos::abort("`omp_set_num_teams` call was not respected.\n");

typename Policy::member_type team(i, league_size, team_size,
vector_length, scratch_ptr, i,
shmem_size_L0, shmem_size_L1);
if constexpr (std::is_void_v<TagType>)
m_functor(team);
else
m_functor(TagType(), team);
}
}
#endif
}

public:
Expand Down

0 comments on commit 4b6d971

Please sign in to comment.