232 changes: 196 additions & 36 deletions openmp/libomptarget/DeviceRTL/include/State.h
@@ -13,16 +13,105 @@
#define OMPTARGET_STATE_H

#include "Debug.h"
#include "Mapping.h"
#include "Types.h"
#include "Utils.h"

#pragma omp begin declare target device_type(nohost)

namespace _OMP {

namespace memory {

/// Allocate \p Size bytes in shared memory, if possible, for \p Reason.
///
/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
void *allocShared(uint64_t Size, const char *Reason);

/// Free \p Ptr, allocated via allocShared, for \p Reason.
///
/// Note: See the restrictions on __kmpc_free_shared for proper usage.
void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);

/// Allocate \p Size bytes in global memory, if possible, for \p Reason.
void *allocGlobal(uint64_t Size, const char *Reason);

/// Return a pointer to the dynamic shared memory buffer.
void *getDynamicBuffer();

/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
void freeGlobal(void *Ptr, const char *Reason);

} // namespace memory
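For orientation, a minimal usage sketch of this API (hypothetical code, not
part of the patch; the function name, size, and reason strings are
illustrative). It assumes the stack discipline documented for
__kmpc_alloc_shared: allocations are released in reverse allocation order,
with the same size, by the same thread:

void scratchExample() {
  void *Scratch = memory::allocShared(64, "temporary scratch buffer");
  // ... use Scratch within the region ...
  memory::freeShared(Scratch, 64, "temporary scratch buffer");
}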

namespace state {

inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;

struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

struct TeamStateTy {
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used if the threads have not
  /// set a custom default. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  uint32_t HasThreadState;
  ParallelRegionFnTy ParallelRegionFnVar;
};

extern TeamStateTy TeamState;
#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  state::ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy *PreviousTS) {
    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
    PreviousThreadState = PreviousTS;
  }
};

extern ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)

/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD);

@@ -37,6 +126,7 @@ enum ValueKind {
  VK_RunSchedChunk,
  VK_ParallelRegionFn,
  VK_ParallelTeamSize,
  VK_HasThreadState,
};

/// TODO
@@ -54,14 +144,90 @@ struct DateEnvironmentRAII {
/// TODO
void resetStateForThread(uint32_t TId);

uint32_t &lookup32(ValueKind VK, bool IsReadonly, IdentTy *Ident);
void *&lookupPtr(ValueKind VK, bool IsReadonly);
inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
                                       IdentTy *Ident, bool ForceTeamState) {
  if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||
                 !TeamState.HasThreadState))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(!ThreadStates[TId])) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
    TeamState.HasThreadState = true;
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}

inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var,
                            bool ForceTeamState) {
  auto TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() &&
                   TeamState.HasThreadState && ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

__attribute__((always_inline, flatten)) inline uint32_t &
lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::NThreadsVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident,
                                 ForceTeamState);
  case state::VK_Level:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::LevelVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident, ForceTeamState);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::ActiveLevelVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident,
                                 ForceTeamState);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::MaxActiveLevelsVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident,
                                 ForceTeamState);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::RunSchedVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident,
                                 ForceTeamState);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::RunSchedChunkVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident,
                                 ForceTeamState);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  case state::VK_HasThreadState:
    return TeamState.HasThreadState;
  default:
    break;
  }
  __builtin_unreachable();
}

__attribute__((always_inline, flatten)) inline void *&
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}
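Note the `uint32_t state::ICVStateTy::*Var` parameters above: these are C++
pointers-to-data-member, letting a single helper select any 32-bit ICV field
on whichever state object (team-wide or per-thread) wins the lookup. A
self-contained sketch of the idiom, with illustrative names only:

struct MemberPtrExample {
  uint32_t A;
  uint32_t B;
};

// Pick a field of Obj at runtime, exactly as the helpers above do with
// `TeamState.ICVState.*Var` and `ThreadStates[TId]->ICVState.*Var`.
inline uint32_t &selectField(MemberPtrExample &Obj,
                             uint32_t MemberPtrExample::*Field) {
  return Obj.*Field;
}
// Usage: selectField(X, &MemberPtrExample::B) = 42; // writes X.B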

/// A class without actual state, used to provide a convenient interface for
/// looking up and updating ICV values; objects of it can be declared at
/// global scope.
template <typename Ty, ValueKind Kind> struct Value {
  __attribute__((flatten, always_inline)) operator Ty() {
    return lookup(/* IsReadonly */ true, /* IdentTy */ nullptr);
    return lookup(/* IsReadonly */ true, /* IdentTy */ nullptr,
                  /* ForceTeamState */ false);
  }

  __attribute__((flatten, always_inline)) Value &operator=(const Ty &Other) {
@@ -79,21 +245,29 @@ template <typename Ty, ValueKind Kind> struct Value {
    return *this;
  }

  __attribute__((flatten, always_inline)) void
  assert_eq(const Ty &V, IdentTy *Ident = nullptr,
            bool ForceTeamState = false) {
    ASSERT(lookup(/* IsReadonly */ true, Ident, ForceTeamState) == V);
  }

private:
  __attribute__((flatten, always_inline)) Ty &lookup(bool IsReadonly,
                                                     IdentTy *Ident) {
    Ty &t = lookup32(Kind, IsReadonly, Ident);
  __attribute__((flatten, always_inline)) Ty &
  lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
    Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState);
    return t;
  }

  __attribute__((flatten, always_inline)) Ty &inc(int UpdateVal,
                                                  IdentTy *Ident) {
    return (lookup(/* IsReadonly */ false, Ident) += UpdateVal);
    return (lookup(/* IsReadonly */ false, Ident, /* ForceTeamState */ false) +=
            UpdateVal);
  }

  __attribute__((flatten, always_inline)) Ty &set(Ty UpdateVal,
                                                  IdentTy *Ident) {
    return (lookup(/* IsReadonly */ false, Ident) = UpdateVal);
    return (lookup(/* IsReadonly */ false, Ident, /* ForceTeamState */ false) =
            UpdateVal);
  }

  template <typename VTy, typename Ty2> friend struct ValueRAII;
@@ -104,7 +278,8 @@ template <typename Ty, ValueKind Kind> struct Value {
/// we can declare in global scope.
template <typename Ty, ValueKind Kind> struct PtrValue {
  __attribute__((flatten, always_inline)) operator Ty() {
    return lookup(/* IsReadonly */ true, /* IdentTy */ nullptr);
    return lookup(/* IsReadonly */ true, /* IdentTy */ nullptr,
                  /* ForceTeamState */ false);
  }

  __attribute__((flatten, always_inline)) PtrValue &operator=(const Ty Other) {
@@ -113,18 +288,23 @@ template <typename Ty, ValueKind Kind> struct PtrValue {
  }

private:
  Ty &lookup(bool IsReadonly, IdentTy *) { return lookupPtr(Kind, IsReadonly); }
  Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) {
    return lookupPtr(Kind, IsReadonly, ForceTeamState);
  }

  Ty &set(Ty UpdateVal) {
    return (lookup(/* IsReadonly */ false, /* IdentTy */ nullptr) = UpdateVal);
    return (lookup(/* IsReadonly */ false, /* IdentTy */ nullptr,
                   /* ForceTeamState */ false) = UpdateVal);
  }

  template <typename VTy, typename Ty2> friend struct ValueRAII;
};

template <typename VTy, typename Ty> struct ValueRAII {
  ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident)
      : Ptr(Active ? &V.lookup(/* IsReadonly */ false, Ident) : nullptr),
  ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident,
            bool ForceTeamState = false)
      : Ptr(Active ? &V.lookup(/* IsReadonly */ false, Ident, ForceTeamState)
                   : (Ty *)utils::UndefPtr),
        Val(OldValue), Active(Active) {
    if (!Active)
      return;
@@ -149,6 +329,9 @@ inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk;
/// TODO
inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;

/// TODO
inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState;

/// TODO
inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
ParallelRegionFn;
@@ -181,29 +364,6 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;

} // namespace icv
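Taken together, the Value and PtrValue wrappers make ICV access read like
plain variable access at call sites. A sketch of what this enables (the
statements are illustrative; the operators are the ones defined above):

uint32_t CurLevel = icv::Level;  // read via operator Ty(), readonly lookup
++icv::Level;                    // modification funneled through inc(); may
                                 // lazily create a ThreadStateTy for this
                                 // thread if thread states are in use
state::ParallelTeamSize = 32;    // write funneled through set()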

namespace memory {

/// Allocate \p Size bytes in shared memory, if possible, for \p Reason.
///
/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
void *allocShared(uint64_t Size, const char *Reason);

/// Free \p Ptr, allocated via allocShared, for \p Reason.
///
/// Note: See the restrictions on __kmpc_free_shared for proper usage.
void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);

/// Allocate \p Size bytes in global memory, if possible, for \p Reason.
void *allocGlobal(uint64_t Size, const char *Reason);

/// Return a pointer to the dynamic shared memory buffer.
void *getDynamicBuffer();

/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
void freeGlobal(void *Ptr, const char *Reason);

} // namespace memory

} // namespace _OMP

#pragma omp end declare target
7 changes: 7 additions & 0 deletions openmp/libomptarget/DeviceRTL/include/Utils.h
@@ -14,6 +14,8 @@

#include "Types.h"

#pragma omp begin declare target device_type(nohost)

namespace _OMP {
namespace utils {

@@ -72,10 +74,15 @@ template <typename Ty1, typename Ty2> inline Ty1 align_down(Ty1 V, Ty2 Align) {
  return V - V % Align;
}

/// A pointer variable that has by design an `undef` value. Use with care.
__attribute__((loader_uninitialized)) static void *const UndefPtr;
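The intended consumer is ValueRAII in State.h: an inactive guard never
dereferences its pointer, so initializing it would only waste device
instructions, and loader_uninitialized makes the deliberate `undef` value
explicit. A simplified sketch of the guard pattern that keeps this safe (not
the verbatim class):

struct GuardSketch {
  uint32_t *Ptr; // may hold utils::UndefPtr when !Active
  uint32_t Val;
  bool Active;
  ~GuardSketch() {
    if (!Active)
      return;   // Ptr is never read on this path
    *Ptr = Val; // restore only if the guard was armed
  }
};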

#define OMP_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
#define OMP_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)

} // namespace utils
} // namespace _OMP

#pragma omp end declare target

#endif
44 changes: 29 additions & 15 deletions openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -85,14 +85,21 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
  FunctionTracingRAII();

  uint32_t TId = mapping::getThreadIdInBlock();
  // Handle the serialized case first, same for SPMD/non-SPMD.
  if (OMP_UNLIKELY(!if_expr || icv::Level)) {

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) nested parallel regions
  // 3) parallel in task or other thread state inducing construct
  if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false);

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  if (mapping::isSPMDMode()) {
    // Avoid the race between the read of the `icv::Level` above and the write
@@ -103,18 +110,21 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                          1u, TId == 0, ident);
                                          1u, TId == 0, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
                                     ident);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident);
                                     ident, /* ForceTeamState */ true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                               /* ForceTeamState */ true);

    // Synchronize all threads after the main thread (TId == 0) set up the
    // team state properly.
    synchronize::threadsAligned();

    ASSERT(state::ParallelTeamSize == NumThreads);
    ASSERT(icv::ActiveLevel == 1u);
    ASSERT(icv::Level == 1u);
    state::ParallelTeamSize.assert_eq(NumThreads, ident,
                                      /* ForceTeamState */ true);
    icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
    icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);

    if (TId < NumThreads)
      invokeMicrotask(TId, 0, fn, args, nargs);
@@ -128,9 +138,9 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned();

    ASSERT(state::ParallelTeamSize == 1u);
    ASSERT(icv::ActiveLevel == 0u);
    ASSERT(icv::Level == 0u);
    state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
    icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
    icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
    return;
  }

@@ -213,11 +223,15 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                          1u, true, ident);
                                          1u, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
                                          (void *)nullptr, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /* ForceTeamState */ true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /* ForceTeamState */ true);

    // Master signals work to activate workers.
    synchronize::threads();
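The switch from plain ASSERT comparisons to assert_eq with
/* ForceTeamState */ true is not cosmetic. A sketch of the difference
(illustrative, mirroring the calls above):

// May read a thread's private ICV copy if one exists, via operator Ty():
ASSERT(icv::Level == 1u);
// Compares against the team-wide storage directly, which is exactly what
// the SPMD prologue/epilogue just initialized:
icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);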
147 changes: 13 additions & 134 deletions openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -12,10 +12,8 @@
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

@@ -180,30 +178,15 @@ void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

namespace {

struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

bool ICVStateTy::operator==(const ICVStateTy &Other) const {
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
@@ -212,99 +195,38 @@ void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

struct TeamStateTy {
/// TODO: provide a proper init function.
void init(bool IsSPMD);

bool operator==(const TeamStateTy &) const;

void assertEqual(TeamStateTy &Other) const;

/// ICVs
///
/// Preallocated storage for ICV values that are used if the threads have not
/// set a custom default. The latter is supported but unlikely and slow(er).
///
///{
ICVStateTy ICVState;
///}

uint32_t ParallelTeamSize;
ParallelRegionFnTy ParallelRegionFnVar;
};

TeamStateTy SHARED(TeamState);

void TeamStateTy::init(bool IsSPMD) {
void state::TeamStateTy::init(bool IsSPMD) {
ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD);
ICVState.LevelVar = 0;
ICVState.ActiveLevelVar = 0;
ICVState.MaxActiveLevelsVar = 1;
ICVState.RunSchedVar = omp_sched_static;
ICVState.RunSchedChunkVar = 1;
ParallelTeamSize = 1;
HasThreadState = false;
ParallelRegionFnVar = nullptr;
}

bool TeamStateTy::operator==(const TeamStateTy &Other) const {
bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
return (ICVState == Other.ICVState) &
(HasThreadState == Other.HasThreadState) &
(ParallelTeamSize == Other.ParallelTeamSize);
}

void TeamStateTy::assertEqual(TeamStateTy &Other) const {
void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
ICVState.assertEqual(Other.ICVState);
ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
ASSERT(HasThreadState == Other.HasThreadState);
}

struct ThreadStateTy {

/// ICVs have preallocated storage in the TeamStateTy which is used if a
/// thread has not set a custom value. The latter is supported but unlikely.
/// When it happens we will allocate dynamic memory to hold the values of all
/// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
/// ICV struct to hold them all. This is slower than alternatives but allows
/// users to pay only for what they use.
///
ICVStateTy ICVState;

ThreadStateTy *PreviousThreadState;

void init() {
ICVState = TeamState.ICVState;
PreviousThreadState = nullptr;
}
namespace {

void init(ThreadStateTy *PreviousTS) {
ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
PreviousThreadState = PreviousTS;
}
};
state::TeamStateTy SHARED(TeamState);

__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
state::ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)

uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
if (OMP_LIKELY(!config::mayUseThreadStates() ||
TeamState.ICVState.LevelVar == 0))
return TeamState.ICVState.*Var;
uint32_t TId = mapping::getThreadIdInBlock();
if (OMP_UNLIKELY(!ThreadStates[TId])) {
ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
sizeof(ThreadStateTy), "ICV modification outside data environment"));
ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
ThreadStates[TId]->init();
}
return ThreadStates[TId]->ICVState.*Var;
}

template <typename IntTy> IntTy &lookupImpl(IntTy ICVStateTy::*Var) {
IntTy TId = mapping::getThreadIdInBlock();
if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
return ThreadStates[TId]->ICVState.*Var;
return TeamState.ICVState.*Var;
}

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
int OutOfBoundsVal = -1) {
if (Level == 0)
@@ -320,50 +242,6 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,

} // namespace

uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::NThreadsVar);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident);
  case state::VK_Level:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::LevelVar);
    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::ActiveLevelVar);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::MaxActiveLevelsVar);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::RunSchedVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::RunSchedChunkVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  default:
    break;
  }
  __builtin_unreachable();
}

void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
@@ -382,6 +260,7 @@ void state::enterDataEnvironment(IdentTy *Ident) {
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}
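The push/pop pairing is normally driven by the RAII helper from State.h; a
hypothetical call site (assuming an icv::NThreads wrapper as declared in the
header) looks like:

{
  state::DateEnvironmentRAII DERAII(ident); // push: enterDataEnvironment
  icv::NThreads = 4; // update lands in the freshly pushed ThreadStateTy
} // pop: exitDataEnvironment releases it and restores the previous ICVs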

@@ -394,7 +273,7 @@ void state::exitDataEnvironment() {
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!ThreadStates[TId]))
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;