From 5fbb19e5048a3357dec2c2dd535ed0402102c30e Mon Sep 17 00:00:00 2001 From: ergawy Date: Tue, 9 Dec 2025 09:11:05 -0600 Subject: [PATCH 1/2] [OpenMP][MLIR] Hoist static `alloca`s emitted by private `init` regions to the allocation IP of the construct Having more than 1 descritpr (allocatable or array) on the same `private` clause triggers a runtime crash on GPUs at the moment. For SPMD kernels, the issue happens because the initialization logic includes: * Allocating a number of temporary structs (these are emitted by flang when `fir` is lowered to `mlir.llvm`). * There is a conditional branch that determines whether we will allocate storage for the descriptor and initialize array bounds from the original descriptor or whether we will initialize the private descriptor to null. Because of these 2 things, temp allocations needed for descriptors beyond the 1st one are preceded by branching which causes the observed the runtime crash. This PR solves this issue by hoisting these static `alloca`s instructions to the suitable allca IP of the parent construct. --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 75 ++++++++++++++++-- .../openmp-private-allloca-hoisting.mlir | 79 +++++++++++++++++++ 2 files changed, 146 insertions(+), 8 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 03d67a52853f6..af83981495557 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1596,10 +1596,64 @@ static llvm::Expected initPrivateVar( return phis[0]; } +/// Beginning with \p startBlock, this function visits all reachable successor +/// blocks. For each such block, static alloca instructions (i.e. non-array +/// allocas) are collected. Then, these collected alloca instructions are moved +/// to the \p allocaIP insertion point. +/// +/// This is useful in cases where, for example, more than one allocatable or +/// array are privatized. In such cases, we allocate a number of temporary +/// descriptors to handle the initialization logic. Additonally, for each +/// private value, there is branching logic based on the value of the origianl +/// private variable's allocation state. Therefore, we end up with descriptor +/// alloca instructions preceded by conditional branches which casues runtime +/// issues at least on the GPU. +static void hoistStaticAllocasToAllocaIP( + llvm::BasicBlock *startBlock, + const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP) { + llvm::SmallVector inlinedBlocks{startBlock}; + llvm::SmallPtrSet seenBlocks; + llvm::SmallVector staticAllocas; + + while (!inlinedBlocks.empty()) { + llvm::BasicBlock *curBlock = inlinedBlocks.front(); + inlinedBlocks.erase(inlinedBlocks.begin()); + llvm::Instruction *terminator = curBlock->getTerminator(); + + for (llvm::Instruction &inst : *curBlock) { + if (auto *allocaInst = mlir::dyn_cast(&inst)) { + if (!allocaInst->isArrayAllocation()) { +#ifdef EXPENSIVE_CHECKS + assert(llvm::count(staticInitAllocas, allocaInst) == 0); +#endif + staticAllocas.push_back(allocaInst); + } + } + } + + if (!terminator || !terminator->isTerminator() || + terminator->getNumSuccessors() == 0) + continue; + + for (unsigned i = 0; i < terminator->getNumSuccessors(); ++i) { + llvm::BasicBlock *successor = terminator->getSuccessor(i); + + if (!seenBlocks.contains(successor)) { + inlinedBlocks.push_back(successor); + seenBlocks.insert(successor); + } + } + } + + for (llvm::Instruction *staticAlloca : staticAllocas) + staticAlloca->moveBefore(allocaIP.getPoint()); +} + static llvm::Error initPrivateVars(llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, PrivateVarsInfo &privateVarsInfo, + const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, llvm::DenseMap *mappedPrivateVars = nullptr) { if (privateVarsInfo.blockArgs.empty()) return llvm::Error::success(); @@ -1624,6 +1678,8 @@ initPrivateVars(llvm::IRBuilderBase &builder, setInsertPointForPossiblyEmptyBlock(builder); } + hoistStaticAllocasToAllocaIP(privInitBlock, allocaIP); + return llvm::Error::success(); } @@ -2575,7 +2631,8 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, deferredStores, isByRef))) return failure(); - if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo), + if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo, + allocaIP), opInst) .failed()) return failure(); @@ -2764,9 +2821,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, assert(afterAllocas.get()->getSinglePredecessor()); builder.restoreIP(codeGenIP); - if (handleError( - initPrivateVars(builder, moduleTranslation, privateVarsInfo), - *opInst) + if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo, + allocaIP), + *opInst) .failed()) return llvm::make_error(); @@ -2967,7 +3024,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, deferredStores, isByRef))) return failure(); - if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo), + if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo, + allocaIP), opInst) .failed()) return failure(); @@ -5288,8 +5346,9 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, if (handleError(afterAllocas, opInst).failed()) return llvm::make_error(); - if (handleError(initPrivateVars(builder, moduleTranslation, privVarsInfo), - opInst) + if (handleError( + initPrivateVars(builder, moduleTranslation, privVarsInfo, allocaIP), + opInst) .failed()) return llvm::make_error(); @@ -6090,7 +6149,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, builder.restoreIP(codeGenIP); if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo, - &mappedPrivateVars), + allocaIP, &mappedPrivateVars), *targetOp) .failed()) return llvm::make_error(); diff --git a/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir b/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir new file mode 100644 index 0000000000000..5371f2814085e --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-private-allloca-hoisting.mlir @@ -0,0 +1,79 @@ +// Tests that static alloca's in `omp.private ... init` regions are hoisted to +// the parent construct's alloca IP. +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +llvm.func @foo1() +llvm.func @foo2() +llvm.func @foo3() +llvm.func @foo4() + +omp.private {type = private} @multi_block.privatizer : f32 init { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %alloca1 = llvm.alloca %0 x !llvm.struct<(i64)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + + %1 = llvm.load %arg0 : !llvm.ptr -> f32 + + %c1 = llvm.mlir.constant(1 : i32) : i32 + %c2 = llvm.mlir.constant(2 : i32) : i32 + %cond1 = llvm.icmp "eq" %c1, %c2 : i32 + llvm.cond_br %cond1, ^bb1, ^bb2 + +^bb1: + llvm.call @foo1() : () -> () + llvm.br ^bb3 + +^bb2: + llvm.call @foo2() : () -> () + llvm.br ^bb3 + +^bb3: + llvm.store %1, %arg1 : f32, !llvm.ptr + + omp.yield(%arg1 : !llvm.ptr) +} + +omp.private {type = private} @multi_block.privatizer2 : f32 init { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %alloca1 = llvm.alloca %0 x !llvm.struct<(ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + + %1 = llvm.load %arg0 : !llvm.ptr -> f32 + + %c1 = llvm.mlir.constant(1 : i32) : i32 + %c2 = llvm.mlir.constant(2 : i32) : i32 + %cond1 = llvm.icmp "eq" %c1, %c2 : i32 + llvm.cond_br %cond1, ^bb1, ^bb2 + +^bb1: + llvm.call @foo3() : () -> () + llvm.br ^bb3 + +^bb2: + llvm.call @foo4() : () -> () + llvm.br ^bb3 + +^bb3: + llvm.store %1, %arg1 : f32, !llvm.ptr + + omp.yield(%arg1 : !llvm.ptr) +} + +llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { + omp.parallel private(@multi_block.privatizer %arg0 -> %arg2, + @multi_block.privatizer2 %arg1 -> %arg3 : !llvm.ptr, !llvm.ptr) { + %0 = llvm.load %arg2 : !llvm.ptr -> f32 + %1 = llvm.load %arg3 : !llvm.ptr -> f32 + omp.terminator + } + llvm.return +} + +// CHECK: define internal void @parallel_op_private_multi_block..omp_par({{.*}}) {{.*}} { +// CHECK: omp.par.entry: +// Varify that both allocas were hoisted to the parallel region's entry block. +// CHECK: %{{.*}} = alloca { i64 }, align 8 +// CHECK-NEXT: %{{.*}} = alloca { ptr }, align 8 +// CHECK-NEXT: br label %omp.region.after_alloca +// CHECK: omp.region.after_alloca: +// CHECK: } From c9c432fd2a6fddb2b121f6d56eb2293e7c848f5f Mon Sep 17 00:00:00 2001 From: ergawy Date: Thu, 11 Dec 2025 02:43:35 -0600 Subject: [PATCH 2/2] review comments, Tom --- .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index af83981495557..012ad1d339683 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1635,14 +1635,11 @@ static void hoistStaticAllocasToAllocaIP( terminator->getNumSuccessors() == 0) continue; - for (unsigned i = 0; i < terminator->getNumSuccessors(); ++i) { - llvm::BasicBlock *successor = terminator->getSuccessor(i); - + for (llvm::BasicBlock *successor : llvm::successors(terminator)) if (!seenBlocks.contains(successor)) { inlinedBlocks.push_back(successor); seenBlocks.insert(successor); } - } } for (llvm::Instruction *staticAlloca : staticAllocas)