diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 8c68470a18472..63493eb78c451 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -43,6 +43,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -670,7 +671,7 @@ struct KernelInfoState : AbstractState {

   /// The parallel regions (identified by the outlined parallel functions) that
   /// can be reached from the associated function.
-  BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
+  BooleanStateWithPtrSetVector<CallBase, /* InsertInvalidates */ false>
       ReachedKnownParallelRegions;

   /// State to track what parallel region we might reach.
@@ -4455,11 +4456,15 @@ struct AAKernelInfoFunction : AAKernelInfo {
     Value *ZeroArg =
         Constant::getNullValue(ParallelRegionFnTy->getParamType(0));

+    const unsigned int WrapperFunctionArgNo = 6;
+
     // Now that we have most of the CFG skeleton it is time for the if-cascade
     // that checks the function pointer we got from the runtime against the
     // parallel regions we expect, if there are any.
     for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
-      auto *ParallelRegion = ReachedKnownParallelRegions[I];
+      auto *CB = ReachedKnownParallelRegions[I];
+      auto *ParallelRegion = dyn_cast<Function>(
+          CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
       BasicBlock *PRExecuteBB = BasicBlock::Create(
           Ctx, "worker_state_machine.parallel_region.execute", Kernel,
           StateMachineEndParallelBB);
@@ -4822,8 +4827,6 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       return;
     }

-    const unsigned int NonWrapperFunctionArgNo = 5;
-    const unsigned int WrapperFunctionArgNo = 6;
     RuntimeFunction RF = It->getSecond();
     switch (RF) {
     // All the functions we know are compatible with SPMD mode.
@@ -4902,28 +4905,10 @@ struct AAKernelInfoCallSite : AAKernelInfo {
     case OMPRTL___kmpc_target_deinit:
       KernelDeinitCB = &CB;
       break;
-    case OMPRTL___kmpc_parallel_51: {
-      auto *ParallelRegionOp =
-          CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts();
-      if (isa<ConstantPointerNull>(ParallelRegionOp))
-        ParallelRegionOp =
-            CB.getArgOperand(NonWrapperFunctionArgNo)->stripPointerCasts();
-      if (auto *ParallelRegion = dyn_cast<Function>(ParallelRegionOp)) {
-        ReachedKnownParallelRegions.insert(ParallelRegion);
-        /// Check nested parallelism
-        auto *FnAA = A.getAAFor<AAKernelInfo>(
-            *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
-        NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
-                             !FnAA->ReachedKnownParallelRegions.empty() ||
-                             !FnAA->ReachedUnknownParallelRegions.empty();
-        break;
-      }
-      // The condition above should usually get the parallel region function
-      // pointer and record it. In the off chance it doesn't we assume the
-      // worst.
-      ReachedUnknownParallelRegions.insert(&CB);
-      break;
-    }
+    case OMPRTL___kmpc_parallel_51:
+      if (!handleParallel51(A, CB))
+        indicatePessimisticFixpoint();
+      return;
     case OMPRTL___kmpc_omp_task:
       // We do not look into tasks right now, just give up.
       SPMDCompatibilityTracker.indicatePessimisticFixpoint();
@@ -4969,14 +4954,21 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       return ChangeStatus::CHANGED;
     }

+    KernelInfoState StateBefore = getState();
+    CallBase &CB = cast<CallBase>(getAssociatedValue());
+    if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
+      if (!handleParallel51(A, CB))
+        return indicatePessimisticFixpoint();
+      return StateBefore == getState() ? ChangeStatus::UNCHANGED
+                                       : ChangeStatus::CHANGED;
+    }
+
     // F is a runtime function that allocates or frees memory, check
     // AAHeapToStack and AAHeapToShared.
-    KernelInfoState StateBefore = getState();
     assert((It->getSecond() == OMPRTL___kmpc_alloc_shared ||
             It->getSecond() == OMPRTL___kmpc_free_shared) &&
            "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
-    CallBase &CB = cast<CallBase>(getAssociatedValue());

     auto *HeapToStackAA = A.getAAFor<AAHeapToStack>(
         *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
@@ -5008,6 +5000,32 @@ struct AAKernelInfoCallSite : AAKernelInfo {
     return StateBefore == getState() ? ChangeStatus::UNCHANGED
                                      : ChangeStatus::CHANGED;
   }
+
+  /// Deal with a __kmpc_parallel_51 call (\p CB). Returns true if the call was
+  /// handled; if a problem occurred, false is returned.
+  bool handleParallel51(Attributor &A, CallBase &CB) {
+    const unsigned int NonWrapperFunctionArgNo = 5;
+    const unsigned int WrapperFunctionArgNo = 6;
+    auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
+                                     ? NonWrapperFunctionArgNo
+                                     : WrapperFunctionArgNo;
+
+    auto *ParallelRegion = dyn_cast<Function>(
+        CB.getArgOperand(ParallelRegionOpArgNo)->stripPointerCasts());
+    if (!ParallelRegion)
+      return false;
+
+    ReachedKnownParallelRegions.insert(&CB);
+    /// Check nested parallelism
+    auto *FnAA = A.getAAFor<AAKernelInfo>(
+        *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
+    NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
+                         !FnAA->ReachedKnownParallelRegions.empty() ||
+                         !FnAA->ReachedKnownParallelRegions.isValidState() ||
+                         !FnAA->ReachedUnknownParallelRegions.isValidState() ||
+                         !FnAA->ReachedUnknownParallelRegions.empty();
+    return true;
+  }
 };

 struct AAFoldRuntimeCall
diff --git a/openmp/libomptarget/test/offloading/bug64959.c b/openmp/libomptarget/test/offloading/bug64959.c
new file mode 100644
index 0000000000000..500911fa3f316
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/bug64959.c
@@ -0,0 +1,57 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+// TODO: This requires malloc support for the threads states.
+// UNSUPPORTED: amdgcn-amd-amdhsa
+
+#include <omp.h>
+#include <stdio.h>
+#define N 10
+
+int isCPU() { return 1; }
+
+#pragma omp begin declare variant match(device = {kind(gpu)})
+int isCPU() { return 0; }
+#pragma omp end declare variant
+
+int main(void) {
+  long int aa = 0;
+  int res = 0;
+
+  int ng = 12;
+  int cmom = 14;
+  int nxyz;
+
+#pragma omp target map(from : nxyz, ng, cmom)
+  {
+    nxyz = isCPU() ? 2 : 5000;
+    ng = isCPU() ? 2 : 12;
+    cmom = isCPU() ? 2 : 14;
+  }
+
+#pragma omp target teams distribute num_teams(nxyz)                           \
+    thread_limit(ng *(cmom - 1)) map(tofrom : aa)
+  for (int gid = 0; gid < nxyz; gid++) {
+#pragma omp parallel for collapse(2)
+    for (unsigned int g = 0; g < ng; g++) {
+      for (unsigned int l = 0; l < cmom - 1; l++) {
+        int a = 0;
+#pragma omp parallel for reduction(+ : a)
+        for (int i = 0; i < N; i++) {
+          a += i;
+        }
+#pragma omp atomic
+        aa += a;
+      }
+    }
+  }
+  long exp = (long)ng * (cmom - 1) * nxyz * (N * (N - 1) / 2);
+  printf("The result is = %ld exp:%ld!\n", aa, exp);
+  if (aa != exp) {
+    printf("Failed %ld\n", aa);
+    return 1;
+  }
+  // CHECK: Success
+  printf("Success\n");
+  return 0;
+}