Skip to content

Commit

Permalink
kokkos#5635: Serial/OpenMP: Parallel_scan with return value for TeamT…
Browse files Browse the repository at this point in the history
…hreadRange (kokkos#6090)

* kokkos#5635: Add parallel_scan overloads with return value

* kokkos#5635: Add UTs for parallel_scan with return value

* kokkos#5635: Add static_assert to compare value types

* kokkos#5635: Fix compliance with Kokkos semantics

* kokkos#5635: Move some tests for parallel_scan to TestTeamScan

* kokkos#5635: Remove overloads and test for ThreadVectorRangeBoundariesStruct

* kokkos#5635: Remove default value assignment for scan variables

* kokkos#5635: Fix failing test
  • Loading branch information
thearusable committed Jul 31, 2023
1 parent 39de959 commit 929cac2
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 7 deletions.
33 changes: 26 additions & 7 deletions core/src/impl/Kokkos_HostThreadTeam.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -864,19 +864,21 @@ KOKKOS_INLINE_FUNCTION

//----------------------------------------------------------------------------

template <typename iType, class Closure, class Member>
template <typename iType, class Closure, class Member, typename ValueType>
KOKKOS_INLINE_FUNCTION
std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
std::enable_if_t<!Kokkos::is_reducer<ValueType>::value &&
Impl::is_host_thread_team_member<Member>::value>
parallel_scan(Impl::TeamThreadRangeBoundariesStruct<iType, Member> const&
loop_boundaries,
Closure const& closure) {
// Extract ValueType from the closure

using value_type = typename Kokkos::Impl::FunctorAnalysis<
Closure const& closure, ValueType& return_val) {
// Extract ValueType from the Closure
using ClosureValueType = typename Kokkos::Impl::FunctorAnalysis<
Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure,
void>::value_type;
static_assert(std::is_same<ClosureValueType, ValueType>::value,
"Non-matching value types of closure and return type");

value_type accum = 0;
ValueType accum = ValueType();

// Intra-member scan
for (iType i = loop_boundaries.start; i < loop_boundaries.end;
Expand All @@ -891,6 +893,23 @@ KOKKOS_INLINE_FUNCTION
i += loop_boundaries.increment) {
closure(i, accum, true);
}

return_val = accum;
}

// Team-level scan over a TeamThreadRange for callers that do not need the
// final accumulated value.
//
// The scan value type is deduced from the closure's signature through
// FunctorAnalysis; the work itself is delegated to the overload that takes a
// `return_val` reference, and the value it reports back is simply dropped.
template <typename iType, class Closure, class Member>
KOKKOS_INLINE_FUNCTION
    std::enable_if_t<Impl::is_host_thread_team_member<Member>::value>
    parallel_scan(Impl::TeamThreadRangeBoundariesStruct<iType, Member> const&
                      loop_boundaries,
                  Closure const& closure) {
  // Value type the closure scans over, as reported by FunctorAnalysis.
  using ScanAnalysis = Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure, void>;
  using ScanValue = typename ScanAnalysis::value_type;

  // Intentionally left uninitialized: it only serves as the output slot of
  // the three-argument overload and is never read here.
  ScanValue discarded_total;
  parallel_scan(loop_boundaries, closure, discarded_total);
}

template <typename iType, class ClosureType, class Member>
Expand Down
128 changes: 128 additions & 0 deletions core/unit_test/TestTeamScan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,132 @@ TEST(TEST_CATEGORY, team_scan) {
TestTeamScan<TEST_EXECSPACE, double>{}(2596, 1311);
}

// Temporary: this condition will be progressively relaxed as parallel_scan
// with a return value is implemented for more backends.
#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP)
#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \
!defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_SYCL) && \
!defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_OPENMPTARGET) && \
!defined(KOKKOS_ENABLE_HPX)
// Test functor for the team-level parallel_scan overload that reports the
// final accumulated value through an output argument.
//
// Used in two roles:
//  * operator()(const member_type&) is the team kernel: it fills one row of
//    `a_d`, scans it into `a_r`, and stores the value returned by the scan
//    in `a_s`.
//  * operator()(int32_t, int32_t) is the host-side driver: it allocates the
//    views, launches the kernel and checks both the per-element scan results
//    and the returned totals against a serial reference.
template <class ExecutionSpace, class DataType>
struct TestTeamScanRetVal {
  using execution_space = ExecutionSpace;
  using value_type      = DataType;
  using policy_type     = Kokkos::TeamPolicy<execution_space>;
  using member_type     = typename policy_type::member_type;
  using view_1d_type    = Kokkos::View<value_type*, execution_space>;
  using view_2d_type    = Kokkos::View<value_type**, execution_space>;

  view_2d_type a_d;  // scan input, one row per league member (M x N)
  view_2d_type a_r;  // inclusive scan of each row of a_d (M x N)
  view_1d_type a_s;  // value handed back by parallel_scan, one per row (M)
  int32_t M = 0;     // league size / number of rows
  int32_t N = 0;     // number of scanned elements per row

  // Team kernel: initialise, scan and record the scan's return value for the
  // row owned by this team.
  KOKKOS_FUNCTION
  void operator()(const member_type& team) const {
    const auto row = team.league_rank();

    const int first    = 0;
    const int32_t last = N;
    const auto range   = Kokkos::TeamThreadRange(team, first, last);

    // Input data: a_d(row, col) = row * N + col.
    Kokkos::parallel_for(
        range, [&](const int col) { a_d(row, col) = row * N + col; });

    // Output slot for the scan; parallel_scan is expected to overwrite it.
    DataType total;
    Kokkos::parallel_scan(
        range,
        [&](int col, DataType& running, const bool is_final) {
          running += a_d(row, col);
          if (is_final) {
            a_r(row, col) = running;
          }
        },
        total);

    // Save return value from parallel_scan (written once per team).
    Kokkos::single(Kokkos::PerTeam(team), [&]() { a_s(row) = total; });
  }

  // Host driver: run the kernel for an _M x _N problem and validate results.
  auto operator()(int32_t _M, int32_t _N) {
    // Human-readable identifier prepended to every per-element failure.
    std::stringstream id_stream;
    id_stream << Kokkos::Impl::demangle(typeid(*this).name()) << "(/*M=*/"
              << _M << ", /*N=*/" << _N << ")";
    std::string const test_id = id_stream.str();

    M   = _M;
    N   = _N;
    a_d = view_2d_type("a_d", M, N);
    a_r = view_2d_type("a_r", M, N);
    a_s = view_1d_type("a_s", M);

    // Execute calculations
    Kokkos::parallel_for(policy_type(M, Kokkos::AUTO), *this);
    Kokkos::fence();

    // Bring inputs and results to the host for checking.
    auto host_input = Kokkos::create_mirror_view(a_d);
    Kokkos::deep_copy(host_input, a_d);
    auto host_scan = Kokkos::create_mirror_view(a_r);
    Kokkos::deep_copy(host_scan, a_r);
    auto a_os = Kokkos::create_mirror_view(a_s);
    Kokkos::deep_copy(a_os, a_s);

    // Integral types are compared exactly, floating-point ones within a
    // tolerance.
    constexpr bool exact_compare = std::is_integral<value_type>::value;
    // Each fp addition is subject to small losses in precision and these
    // compound over the loop, so we set the base error to be the machine
    // epsilon and then add in another epsilon each iteration. For example,
    // with CUDA backend + 32-bit float + large N values (e.g. 1,000) + high
    // thread-counts (e.g. 1024), this test will fail w/o epsilon
    // accommodation.
    constexpr value_type epsilon = std::numeric_limits<value_type>::epsilon();

    // NOTE: the identifiers used inside the ASSERT_* expressions below show
    // up verbatim in gtest failure output.
    for (int32_t i = 0; i < M; ++i) {
      value_type scan_ref = 0;  // serial reference running sum for row i
      value_type abs_err  = 0;  // accumulated tolerance for row i

      for (int32_t j = 0; j < N; ++j) {
        scan_ref += host_input(i, j);
        const value_type scan_calc = host_scan(i, j);

        if (!exact_compare) {
          abs_err += epsilon;
          ASSERT_NEAR(scan_ref, scan_calc, abs_err)
              << test_id
              << " calculated scan output value differs from reference at "
                 "indices i="
              << i << " and j=" << j;
        } else {
          ASSERT_EQ(scan_ref, scan_calc)
              << test_id
              << " calculated scan output value differs from reference at "
                 "indices i="
              << i << " and j=" << j;
        }
      }

      // Validate return value from parallel_scan
      if (!exact_compare) {
        ASSERT_NEAR(scan_ref, a_os(i), abs_err);
      } else {
        ASSERT_EQ(scan_ref, a_os(i));
      }
    }
  }
};

// Exercises team-level parallel_scan with a return value over a range of
// value types and (league size, range length) combinations, including empty
// leagues and empty ranges. Every call uses a freshly constructed functor.
TEST(TEST_CATEGORY, team_scan_ret_val) {
  using ScanI32 = TestTeamScanRetVal<TEST_EXECSPACE, int32_t>;
  using ScanU32 = TestTeamScanRetVal<TEST_EXECSPACE, uint32_t>;
  using ScanI64 = TestTeamScanRetVal<TEST_EXECSPACE, int64_t>;
  using ScanU64 = TestTeamScanRetVal<TEST_EXECSPACE, uint64_t>;
  using ScanF32 = TestTeamScanRetVal<TEST_EXECSPACE, float>;
  using ScanF64 = TestTeamScanRetVal<TEST_EXECSPACE, double>;

  // Degenerate sizes: empty league and/or empty range.
  ScanI32{}(0, 0);
  ScanI32{}(0, 1);
  ScanI32{}(1, 0);

  // Integral types, growing problem sizes.
  ScanU32{}(99, 32);
  ScanU32{}(139, 64);
  ScanU32{}(163, 128);
  ScanI64{}(433, 256);
  ScanU64{}(976, 512);
  ScanU64{}(1234, 1024);

  // Large league with assorted range lengths, mixing fp and integral types.
  ScanF32{}(2596, 34);
  ScanF64{}(2596, 59);
  ScanF32{}(2596, 65);
  ScanF64{}(2596, 371);
  ScanI64{}(2596, 987);
  ScanF64{}(2596, 1311);
}
#endif
#endif

} // namespace Test

0 comments on commit 929cac2

Please sign in to comment.