diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 8c68470a18472..63493eb78c451 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -43,6 +43,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -670,7 +671,7 @@ struct KernelInfoState : AbstractState {

   /// The parallel regions (identified by the outlined parallel functions) that
   /// can be reached from the associated function.
-  BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
+  BooleanStateWithPtrSetVector<CallBase, /* InsertInvalidates */ false>
       ReachedKnownParallelRegions;

   /// State to track what parallel region we might reach.
@@ -4455,11 +4456,15 @@ struct AAKernelInfoFunction : AAKernelInfo {
     Value *ZeroArg =
         Constant::getNullValue(ParallelRegionFnTy->getParamType(0));

+    const unsigned int WrapperFunctionArgNo = 6;
+
     // Now that we have most of the CFG skeleton it is time for the if-cascade
     // that checks the function pointer we got from the runtime against the
     // parallel regions we expect, if there are any.
     for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
-      auto *ParallelRegion = ReachedKnownParallelRegions[I];
+      auto *CB = ReachedKnownParallelRegions[I];
+      auto *ParallelRegion = dyn_cast<Function>(
+          CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
       BasicBlock *PRExecuteBB = BasicBlock::Create(
           Ctx, "worker_state_machine.parallel_region.execute", Kernel,
           StateMachineEndParallelBB);
@@ -4822,8 +4827,6 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       return;
     }

-    const unsigned int NonWrapperFunctionArgNo = 5;
-    const unsigned int WrapperFunctionArgNo = 6;
     RuntimeFunction RF = It->getSecond();
     switch (RF) {
     // All the functions we know are compatible with SPMD mode.
@@ -4902,28 +4905,10 @@ struct AAKernelInfoCallSite : AAKernelInfo {
     case OMPRTL___kmpc_target_deinit:
       KernelDeinitCB = &CB;
       break;
-    case OMPRTL___kmpc_parallel_51: {
-      auto *ParallelRegionOp =
-          CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts();
-      if (isa<ConstantPointerNull>(ParallelRegionOp))
-        ParallelRegionOp =
-            CB.getArgOperand(NonWrapperFunctionArgNo)->stripPointerCasts();
-      if (auto *ParallelRegion = dyn_cast<Function>(ParallelRegionOp)) {
-        ReachedKnownParallelRegions.insert(ParallelRegion);
-        /// Check nested parallelism
-        auto *FnAA = A.getAAFor<AAKernelInfo>(
-            *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
-        NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
-                             !FnAA->ReachedKnownParallelRegions.empty() ||
-                             !FnAA->ReachedUnknownParallelRegions.empty();
-        break;
-      }
-      // The condition above should usually get the parallel region function
-      // pointer and record it. In the off chance it doesn't we assume the
-      // worst.
-      ReachedUnknownParallelRegions.insert(&CB);
-      break;
-    }
+    case OMPRTL___kmpc_parallel_51:
+      if (!handleParallel51(A, CB))
+        indicatePessimisticFixpoint();
+      return;
     case OMPRTL___kmpc_omp_task:
       // We do not look into tasks right now, just give up.
       SPMDCompatibilityTracker.indicatePessimisticFixpoint();
@@ -4969,14 +4954,21 @@ struct AAKernelInfoCallSite : AAKernelInfo {
       return ChangeStatus::CHANGED;
     }

+    KernelInfoState StateBefore = getState();
+    CallBase &CB = cast<CallBase>(getAssociatedValue());
+    if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
+      if (!handleParallel51(A, CB))
+        return indicatePessimisticFixpoint();
+      return StateBefore == getState() ? ChangeStatus::UNCHANGED
+                                       : ChangeStatus::CHANGED;
+    }
+
     // F is a runtime function that allocates or frees memory, check
     // AAHeapToStack and AAHeapToShared.
-    KernelInfoState StateBefore = getState();
     assert((It->getSecond() == OMPRTL___kmpc_alloc_shared ||
             It->getSecond() == OMPRTL___kmpc_free_shared) &&
            "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
-    CallBase &CB = cast<CallBase>(getAssociatedValue());

     auto *HeapToStackAA = A.getAAFor<AAHeapToStack>(
         *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
@@ -5008,6 +5000,32 @@ struct AAKernelInfoCallSite : AAKernelInfo {
     return StateBefore == getState() ? ChangeStatus::UNCHANGED
                                      : ChangeStatus::CHANGED;
   }
+
+  /// Deal with a __kmpc_parallel_51 call (\p CB). Returns true if the call was
+  /// handled; if a problem occurred, false is returned.
+  bool handleParallel51(Attributor &A, CallBase &CB) {
+    const unsigned int NonWrapperFunctionArgNo = 5;
+    const unsigned int WrapperFunctionArgNo = 6;
+    auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
+                                     ? NonWrapperFunctionArgNo
+                                     : WrapperFunctionArgNo;
+
+    auto *ParallelRegion = dyn_cast<Function>(
+        CB.getArgOperand(ParallelRegionOpArgNo)->stripPointerCasts());
+    if (!ParallelRegion)
+      return false;
+
+    ReachedKnownParallelRegions.insert(&CB);
+    /// Check nested parallelism
+    auto *FnAA = A.getAAFor<AAKernelInfo>(
+        *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
+    NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
+                         !FnAA->ReachedKnownParallelRegions.empty() ||
+                         !FnAA->ReachedKnownParallelRegions.isValidState() ||
+                         !FnAA->ReachedUnknownParallelRegions.isValidState() ||
+                         !FnAA->ReachedUnknownParallelRegions.empty();
+    return true;
+  }
 };

 struct AAFoldRuntimeCall
diff --git a/openmp/libomptarget/test/offloading/bug64959.c b/openmp/libomptarget/test/offloading/bug64959.c
new file mode 100644
index 0000000000000..500911fa3f316
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/bug64959.c
@@ -0,0 +1,57 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+// TODO: This requires malloc support for the threads states.
+// UNSUPPORTED: amdgcn-amd-amdhsa
+
+#include <omp.h>
+#include <stdio.h>
+#define N 10
+
+int isCPU() { return 1; }
+
+#pragma omp begin declare variant match(device = {kind(gpu)})
+int isCPU() { return 0; }
+#pragma omp end declare variant
+
+int main(void) {
+  long int aa = 0;
+  int res = 0;
+
+  int ng = 12;
+  int cmom = 14;
+  int nxyz;
+
+#pragma omp target map(from : nxyz, ng, cmom)
+  {
+    nxyz = isCPU() ? 2 : 5000;
+    ng = isCPU() ? 2 : 12;
+    cmom = isCPU() ? 2 : 14;
+  }
+
+#pragma omp target teams distribute num_teams(nxyz)                           \
+    thread_limit(ng *(cmom - 1)) map(tofrom : aa)
+  for (int gid = 0; gid < nxyz; gid++) {
+#pragma omp parallel for collapse(2)
+    for (unsigned int g = 0; g < ng; g++) {
+      for (unsigned int l = 0; l < cmom - 1; l++) {
+        int a = 0;
+#pragma omp parallel for reduction(+ : a)
+        for (int i = 0; i < N; i++) {
+          a += i;
+        }
+#pragma omp atomic
+        aa += a;
+      }
+    }
+  }
+  long exp = (long)ng * (cmom - 1) * nxyz * (N * (N - 1) / 2);
+  printf("The result is = %ld exp:%ld!\n", aa, exp);
+  if (aa != exp) {
+    printf("Failed %ld\n", aa);
+    return 1;
+  }
+  // CHECK: Success
+  printf("Success\n");
+  return 0;
+}