diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 4651f4fff6aa00..e21998860f217b 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -246,6 +246,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.") LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.") +LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") LANGOPT(RenderScript , 1, 0, "RenderScript") LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 76cfdbcd85f265..c377329e8f6f4d 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2473,6 +2473,10 @@ def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-te Group, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">, Group, Flags<[CC1Option, NoArgumentUnused, HelpHidden]>; +def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">, Group, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, + HelpText<"Assert no thread in a parallel region modifies an ICV">, + MarshallingInfoFlag>; defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime", LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue, PosFlag, diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index bb6847ab87319a..fcaf9d4ed77b38 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1210,6 +1210,8 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) "__omp_rtl_assume_teams_oversubscription"); OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription, "__omp_rtl_assume_threads_oversubscription"); + OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, + "__omp_rtl_assume_no_thread_state"); } } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a16175ebebbca8..32cbb7936f7ee9 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5995,6 +5995,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_openmp_assume_threads_oversubscription, /*Default=*/false)) CmdArgs.push_back("-fopenmp-assume-threads-oversubscription"); + if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state)) + CmdArgs.push_back("-fopenmp-assume-no-thread-state"); break; default: // By default, if Clang doesn't know how to generate useful OpenMP code diff --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp index fa7569cd4ca6b6..3c5d4b8ed39846 100644 --- a/clang/test/OpenMP/target_globals_codegen.cpp +++ b/clang/test/OpenMP/target_globals_codegen.cpp @@ -6,6 +6,7 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // expected-no-diagnostics @@ -16,26 +17,37 @@ // CHECK: @__omp_rtl_debug_kind = weak_odr hidden constant i32 1 // CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111 // CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1 +// CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. // CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +//. +// CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +// CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +// CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1 //. // CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 //. void foo() { #pragma omp target diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h index 5727f1f2bfbf66..94f11b6066a20a 100644 --- a/openmp/libomptarget/DeviceRTL/include/Configuration.h +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -38,8 +38,13 @@ uint32_t getDebugKind(); /// Return the amount of dynamic shared memory that was allocated at launch. uint64_t getDynamicMemorySize(); +/// Return if debugging is enabled for the given debug kind. bool isDebugMode(DebugKind Level); +/// Indicates if this kernel may require thread-specific states, or if it was +/// explicitly disabled by the user. +bool mayUseThreadStates(); + } // namespace config } // namespace _OMP diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp index 349f93a08701c6..e9cc9bb0e318e1 100644 --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -20,7 +20,9 @@ using namespace _OMP; #pragma omp declare target -extern uint32_t __omp_rtl_debug_kind; // defined by CGOpenMPRuntimeGPU +// defined by CGOpenMPRuntimeGPU +extern uint32_t __omp_rtl_debug_kind; +extern uint32_t __omp_rtl_assume_no_thread_state; // TODO: We want to change the name as soon as the old runtime is gone. // This variable should be visibile to the plugin so we override the default @@ -48,4 +50,6 @@ bool config::isDebugMode(config::DebugKind Kind) { return config::getDebugKind() & Kind; } +bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } + #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp index a04f5cccb1738b..a530c5e0b2471f 100644 --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -285,7 +285,8 @@ ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) { - if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0)) + if (OMP_LIKELY(!config::mayUseThreadStates() || + TeamState.ICVState.LevelVar == 0)) return TeamState.ICVState.*Var; uint32_t TId = mapping::getThreadIdInBlock(); if (!ThreadStates[TId]) { @@ -299,13 +300,13 @@ uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) { uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) { uint32_t TId = mapping::getThreadIdInBlock(); - if (OMP_UNLIKELY(ThreadStates[TId])) + if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId])) return ThreadStates[TId]->ICVState.*Var; return TeamState.ICVState.*Var; } uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) { uint64_t TId = mapping::getThreadIdInBlock(); - if (OMP_UNLIKELY(ThreadStates[TId])) + if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId])) return ThreadStates[TId]->ICVState.*Var; return TeamState.ICVState.*Var; } @@ -380,6 +381,9 @@ void state::init(bool IsSPMD) { } void state::enterDataEnvironment(IdentTy *Ident) { + ASSERT(config::mayUseThreadStates() && + "Thread state modified while explicitly disabled!"); + unsigned TId = mapping::getThreadIdInBlock(); ThreadStateTy *NewThreadState = static_cast(__kmpc_alloc_shared(sizeof(ThreadStateTy))); @@ -388,6 +392,9 @@ void state::enterDataEnvironment(IdentTy *Ident) { } void state::exitDataEnvironment() { + ASSERT(config::mayUseThreadStates() && + "Thread state modified while explicitly disabled!"); + unsigned TId = mapping::getThreadIdInBlock(); resetStateForThread(TId); }