[MLIR][OpenMP][OMPIRBuilder] Improve shared memory checks #161864
base: users/skatrak/flang-generic-10-shared-alloca-pass
Conversation
@llvm/pr-subscribers-flang-openmp @llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-llvm @llvm/pr-subscribers-offload @llvm/pr-subscribers-mlir-openmp

Author: Sergio Afonso (skatrak)

Changes

This patch refines the checks that decide whether to use device shared memory or regular stack allocations. In particular, it adds support for parallel regions residing in standalone target device functions. The changes are:
Patch is 26.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161864.diff

7 Files Affected:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index d8e5f8cf5a45e..410912ba375a3 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -3292,8 +3292,8 @@ class OpenMPIRBuilder {
ArrayRef<InsertPointTy> DeallocIPs)>;
using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
- Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
- InsertPointTy CodeGenIP)>;
+ Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocIP,
+ InsertPointTy CodeGenIP, ArrayRef<InsertPointTy> DeallocIPs)>;
/// Generator for '#omp target'
///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index a18db939b5876..530477e6d2f6d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -312,6 +312,12 @@ getTargetKernelExecMode(Function &Kernel) {
return static_cast<OMPTgtExecModeFlags>(KernelMode->getZExtValue());
}
+static bool isGenericKernel(Function &Fn) {
+ std::optional<omp::OMPTgtExecModeFlags> ExecMode =
+ getTargetKernelExecMode(Fn);
+ return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC);
+}
+
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
@@ -1535,11 +1541,9 @@ static void targetParallelCallback(
IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
: Builder.getInt32(1);
- // If this is not a Generic kernel, we can skip generating the wrapper.
- std::optional<omp::OMPTgtExecModeFlags> ExecMode =
- getTargetKernelExecMode(*OuterFn);
+ // If this is a Generic kernel, we need to generate the wrapper.
Value *WrapperFn;
- if (ExecMode && (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC))
+ if (isGenericKernel(*OuterFn))
WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn);
else
WrapperFn = Constant::getNullValue(PtrTy);
@@ -1812,13 +1816,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
auto OI = [&]() -> std::unique_ptr<OutlineInfo> {
if (Config.isTargetDevice()) {
- std::optional<omp::OMPTgtExecModeFlags> ExecMode =
- getTargetKernelExecMode(*OuterFn);
-
- // If OuterFn is not a Generic kernel, skip custom allocation. This causes
- // the CodeExtractor to follow its default behavior. Otherwise, we need to
- // use device shared memory to allocate argument structures.
- if (ExecMode && *ExecMode & OMP_TGT_EXEC_MODE_GENERIC)
+ // If OuterFn is a Generic kernel, we need to use device shared memory to
+ // allocate argument structures. Otherwise, we use stack allocations as
+ // usual.
+ if (isGenericKernel(*OuterFn))
return std::make_unique<DeviceSharedMemOutlineInfo>(*this);
}
return std::make_unique<OutlineInfo>();
@@ -7806,8 +7807,9 @@ static Expected<Function *> createOutlinedFunction(
Argument &Arg = std::get<1>(InArg);
Value *InputCopy = nullptr;
- llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
- ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB(
+ Arg, Input, InputCopy, AllocaIP, Builder.saveIP(),
+ OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin()));
if (!AfterIP)
return AfterIP.takeError();
Builder.restoreIP(*AfterIP);
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 1e5b8145d5cdc..d231a778a8a97 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -745,8 +745,10 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) {
EXPECT_EQ(OutlinedFn->getArg(2)->getType(),
PointerType::get(M->getContext(), 0));
EXPECT_EQ(&OutlinedFn->getEntryBlock(), PrivAI->getParent());
- EXPECT_TRUE(OutlinedFn->hasOneUse());
- User *Usr = OutlinedFn->user_back();
+ EXPECT_TRUE(OutlinedFn->hasNUses(2));
+ User *Usr = *OutlinedFn->users().begin();
+ User *WrapperUsr = *++OutlinedFn->users().begin();
+
ASSERT_TRUE(isa<CallInst>(Usr));
CallInst *Parallel51CI = dyn_cast<CallInst>(Usr);
ASSERT_NE(Parallel51CI, nullptr);
@@ -757,6 +759,20 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) {
EXPECT_TRUE(
isa<GlobalVariable>(Parallel51CI->getArgOperand(0)->stripPointerCasts()));
EXPECT_EQ(Parallel51CI, Usr);
+
+ ASSERT_TRUE(isa<CallInst>(WrapperUsr));
+ CallInst *OutlinedCI = dyn_cast<CallInst>(WrapperUsr);
+ ASSERT_NE(OutlinedCI, nullptr);
+ EXPECT_EQ(OutlinedCI->getCalledFunction(), OutlinedFn);
+
+ Function *WrapperFn = OutlinedCI->getFunction();
+ EXPECT_TRUE(WrapperFn->hasInternalLinkage());
+ EXPECT_EQ(WrapperFn->arg_size(), 2U);
+ EXPECT_EQ(WrapperFn->getArg(0)->getType(),
+ IntegerType::getInt16Ty(M->getContext()));
+ EXPECT_EQ(WrapperFn->getArg(1)->getType(),
+ IntegerType::getInt32Ty(M->getContext()));
+
M->setDataLayout(oldDLStr);
}
@@ -6396,7 +6412,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
- llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+ llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
@@ -6561,7 +6578,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
- llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+ llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
@@ -6763,12 +6781,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) {
return Builder.saveIP();
};
- auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
- OpenMPIRBuilder::InsertPointTy,
- OpenMPIRBuilder::InsertPointTy CodeGenIP) {
- Builder.restoreIP(CodeGenIP);
- return Builder.saveIP();
- };
+ auto SimpleArgAccessorCB =
+ [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy,
+ OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy>) {
+ Builder.restoreIP(CodeGenIP);
+ return Builder.saveIP();
+ };
SmallVector<Value *> Inputs;
OpenMPIRBuilder::MapInfosTy CombinedInfos;
@@ -6862,12 +6881,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) {
Function *OutlinedFn = nullptr;
SmallVector<Value *> CapturedArgs;
- auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
- OpenMPIRBuilder::InsertPointTy,
- OpenMPIRBuilder::InsertPointTy CodeGenIP) {
- Builder.restoreIP(CodeGenIP);
- return Builder.saveIP();
- };
+ auto SimpleArgAccessorCB =
+ [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy,
+ OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy>) {
+ Builder.restoreIP(CodeGenIP);
+ return Builder.saveIP();
+ };
OpenMPIRBuilder::MapInfosTy CombinedInfos;
auto GenMapInfoCB =
@@ -6961,7 +6981,8 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
- llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+ llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 3accca891ba9c..1183865dc3645 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1128,9 +1128,10 @@ struct DeferredStore {
} // namespace
/// Check whether allocations for the given operation might potentially have to
-/// be done in device shared memory. That means we're compiling for a offloading
-/// target, the operation is an `omp::TargetOp` or nested inside of one and that
-/// target region represents a Generic (non-SPMD) kernel.
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target and either the operation is not associated with a target
+/// region (i.e. it is neither an `omp::TargetOp` nor nested inside of one) or
+/// the target region it belongs to represents a Generic (non-SPMD) kernel.
///
/// This represents a necessary but not sufficient set of conditions to use
/// device shared memory in place of regular allocas. For some variables, the
@@ -1146,7 +1147,7 @@ mightAllocInDeviceSharedMemory(Operation &op,
if (!targetOp)
targetOp = op.getParentOfType<omp::TargetOp>();
- return targetOp &&
+ return !targetOp ||
targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) ==
omp::TargetExecMode::generic;
}
@@ -1160,18 +1161,36 @@ mightAllocInDeviceSharedMemory(Operation &op,
/// operation that owns the specified block argument.
static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
Operation *parentOp = value.getOwner()->getParentOp();
- auto targetOp = dyn_cast<omp::TargetOp>(parentOp);
- if (!targetOp)
- targetOp = parentOp->getParentOfType<omp::TargetOp>();
- assert(targetOp && "expected a parent omp.target operation");
-
+ auto moduleOp = parentOp->getParentOfType<ModuleOp>();
for (auto *user : value.getUsers()) {
if (auto parallelOp = dyn_cast<omp::ParallelOp>(user)) {
if (llvm::is_contained(parallelOp.getReductionVars(), value))
return true;
} else if (auto parallelOp = user->getParentOfType<omp::ParallelOp>()) {
- if (parentOp->isProperAncestor(parallelOp))
- return true;
+ if (parentOp->isProperAncestor(parallelOp)) {
+ // If the value is only used inside the nested parallel region as a
+ // private clause operand, it does not require device shared memory.
+ bool isPrivateClauseUse = false;
+ if (auto argIface = dyn_cast<omp::BlockArgOpenMPOpInterface>(user)) {
+ if (auto privateSyms = llvm::cast_or_null<ArrayAttr>(
+ user->getAttr("private_syms"))) {
+ for (auto [var, sym] :
+ llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) {
+ if (var != value)
+ continue;
+
+ auto privateOp = cast<omp::PrivateClauseOp>(
+ moduleOp.lookupSymbol(cast<SymbolRefAttr>(sym)));
+ if (privateOp.getCopyRegion().empty()) {
+ isPrivateClauseUse = true;
+ break;
+ }
+ }
+ }
+ }
+ if (!isPrivateClauseUse)
+ return true;
+ }
}
}
@@ -1196,8 +1215,8 @@ allocReductionVars(T op, ArrayRef<BlockArgument> reductionArgs,
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
- bool useDeviceSharedMem =
- isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+ bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
+ mightAllocInDeviceSharedMemory(*op, *ompBuilder);
// delay creating stores until after all allocas
deferredStores.reserve(op.getNumReductionVars());
@@ -1318,8 +1337,8 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
return success();
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
- bool useDeviceSharedMem =
- isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+ bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
+ mightAllocInDeviceSharedMemory(*op, *ompBuilder);
llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
auto allocaIP = llvm::IRBuilderBase::InsertPoint(
@@ -1540,8 +1559,8 @@ static LogicalResult createReductionsAndCleanup(
reductionRegions, privateReductionVariables, moduleTranslation, builder,
"omp.reduction.cleanup");
- bool useDeviceSharedMem =
- isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+ bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
+ mightAllocInDeviceSharedMemory(*op, *ompBuilder);
if (useDeviceSharedMem) {
for (auto [var, reductionDecl] :
llvm::zip_equal(privateReductionVariables, reductionDecls))
@@ -1721,7 +1740,7 @@ allocatePrivateVars(T op, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool mightUseDeviceSharedMem =
- isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
+ isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);
unsigned int allocaAS =
moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace();
@@ -1839,7 +1858,7 @@ cleanupPrivateVars(T op, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool mightUseDeviceSharedMem =
- isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
+ isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);
for (auto [privDecl, llvmPrivVar, blockArg] :
llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars,
@@ -5265,42 +5284,68 @@ handleDeclareTargetMapVar(MapInfoData &mapData,
// a store of the kernel argument into this allocated memory which
// will then be loaded from, ByCopy will use the allocated memory
// directly.
-static llvm::IRBuilderBase::InsertPoint
-createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg,
- llvm::Value *input, llvm::Value *&retVal,
- llvm::IRBuilderBase &builder,
- llvm::OpenMPIRBuilder &ompBuilder,
- LLVM::ModuleTranslation &moduleTranslation,
- llvm::IRBuilderBase::InsertPoint allocaIP,
- llvm::IRBuilderBase::InsertPoint codeGenIP) {
+static llvm::IRBuilderBase::InsertPoint createDeviceArgumentAccessor(
+ omp::TargetOp targetOp, MapInfoData &mapData, llvm::Argument &arg,
+ llvm::Value *input, llvm::Value *&retVal, llvm::IRBuilderBase &builder,
+ llvm::OpenMPIRBuilder &ompBuilder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ llvm::IRBuilderBase::InsertPoint allocIP,
+ llvm::IRBuilderBase::InsertPoint codeGenIP,
+ llvm::ArrayRef<llvm::IRBuilderBase::InsertPoint> deallocIPs) {
assert(ompBuilder.Config.isTargetDevice() &&
"function only supported for target device codegen");
- builder.restoreIP(allocaIP);
+ builder.restoreIP(allocIP);
omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef;
LLVM::TypeToLLVMIRTranslator typeToLLVMIRTranslator(
ompBuilder.M.getContext());
unsigned alignmentValue = 0;
+ BlockArgument mlirArg;
// Find the associated MapInfoData entry for the current input
- for (size_t i = 0; i < mapData.MapClause.size(); ++i)
+ for (size_t i = 0; i < mapData.MapClause.size(); ++i) {
if (mapData.OriginalValue[i] == input) {
auto mapOp = cast<omp::MapInfoOp>(mapData.MapClause[i]);
capture = mapOp.getMapCaptureType();
// Get information of alignment of mapped object
alignmentValue = typeToLLVMIRTranslator.getPreferredAlignment(
mapOp.getVarType(), ompBuilder.M.getDataLayout());
+ // Get the corresponding target entry block argument
+ mlirArg =
+ cast<omp::BlockArgOpenMPOpInterface>(*targetOp).getMapBlockArgs()[i];
break;
}
+ }
unsigned int allocaAS = ompBuilder.M.getDataLayout().getAllocaAddrSpace();
unsigned int defaultAS =
ompBuilder.M.getDataLayout().getProgramAddressSpace();
- // Create the alloca for the argument the current point.
- llvm::Value *v = builder.CreateAlloca(arg.getType(), allocaAS);
+ // Create the allocation for the argument.
+ llvm::Value *v = nullptr;
+ if (mightAllocInDeviceSharedMemory(*targetOp, ompBuilder) &&
+ mustAllocPrivateVarInDeviceSharedMemory(mlirArg)) {
+ // Use the beginning of the codeGenIP rather than the usual allocation point
+ // for shared memory allocations because otherwise these would be done prior
+ // to the target initialization call. Also, the exit block (where the
+ // deallocation is placed) is only executed if the initialization call
+ // succeeds.
+ builder.SetInsertPoint(codeGenIP.getBlock()->getFirstInsertionPt());
+ v = ompBuilder.createOMPAllocShared(builder, arg.getType());
+
+ // Create deallocations at all provided deallocation points and then restore
+ // the insertion point to right after the new allocations.
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ for (auto deallocIP : deallocIPs) {
+ builder.SetInsertPoint(deallocIP.getBlock(), deallocIP.getPoint());
+ ompBuilder.createOMPFreeShared(builder, v, arg.getType());
+ }
+ } else {
+ // Use the current point, which was previously set to allocIP.
+ v = builder.CreateAlloca(arg.getType(), allocaAS);
- if (allocaAS != defaultAS && arg.getType()->isPointerTy())
- v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS));
+ if (allocaAS != defaultAS && arg.getType()->isPointerTy())
+ v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS));
+ }
builder.CreateStore(&arg, v);
@@ -5890,8 +5935,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
};
auto argAccessorCB = [&](llvm::Argument &arg, llvm::Value *input,
- llvm::Value *&retVal, InsertPointTy allocaIP,
- InsertPointTy codeGenIP)
+ llvm::Value *&retVal, InsertPointTy allocIP,
+ InsertPointTy codeGenIP,
+ llvm::ArrayRef<InsertPointTy> deallocIPs)
-> llvm::OpenMPIRBuilder::InsertPointOrErrorTy {
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.SetCurrentDebugLocation(llvm::DebugLoc());
@@ -5905,9 +5951,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
return codeGenIP;
}
- return createDeviceArgumentAccessor(mapData, arg, input, retVal, builder,
- *ompBuilder, moduleTranslation,
- allocaIP, codeGenIP);
+ return createDeviceArgumentAccessor(targetOp, mapData, arg, input, retVal,
+ builder, *ompBuilder, moduleTranslation,
+ allocIP, codeGenIP, deallocIPs);
};
llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs runtimeAttrs;
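
To illustrate the new private-clause exemption in mustAllocPrivateVarInDeviceSharedMemory, here is a minimal MLIR sketch (hypothetical names, not taken from the patch's tests; module attributes omitted). The mapped block argument is only used as a private clause operand of the nested parallel region, and the privatizer has no copy region, so every thread gets a fresh local copy and the original value never has to be visible across threads. A firstprivate-style privatizer, whose copy region reads the original inside the parallel region, would still require the device shared memory path.

omp.private {type = private} @priv_i32 : i32

llvm.func @target_fn(%arg0: !llvm.ptr) {
  %map = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr
  omp.target map_entries(%map -> %mapped : !llvm.ptr) {
    // Only use of %mapped: a private clause with an empty copy region, so a
    // regular alloca suffices for it.
    omp.parallel private(@priv_i32 %mapped -> %priv : !llvm.ptr) {
      %c0 = llvm.mlir.constant(0 : i32) : i32
      llvm.store %c0, %priv : i32, !llvm.ptr
      omp.terminator
    }
    omp.terminator
  }
  llvm.return
}
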
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
index c3ce2f62c486e..c1a7c9736910a 100644
--- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
@@ -55,15 +55,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: define weak_odr protected amdgpu_kernel void @[[FUNC0:.*]](
// CHECK-SAME: ptr %[[TMP:.*]], ptr %[[TMP0:.*]]) #{{[0-9]+}} {
// ...
[truncated]
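
For reference, the headline case this patch enables is an omp.parallel region inside a standalone target device function, where there is no enclosing omp.target and therefore no known kernel execution mode. A minimal sketch (hypothetical names) of that situation:

module attributes {omp.is_target_device = true} {
  llvm.func @standalone_device_fn(%arg0: !llvm.ptr) {
    // No enclosing omp.target: the execution mode is unknown, so the updated
    // checks conservatively treat this as Generic and allocate the parallel
    // region's argument structures in device shared memory.
    omp.parallel {
      %0 = llvm.load %arg0 : !llvm.ptr -> i32
      omp.terminator
    }
    llvm.return
  }
}

On the LLVM side, isGenericKernel implements the same conservative choice: when getTargetKernelExecMode finds no execution mode for the function, it is treated as Generic.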
|
@llvm/pr-subscribers-mlir-llvm Author: Sergio Afonso (skatrak) ChangesThis patch refines checks to decide whether to use device shared memory or regular stack allocations. In particular, it adds support for parallel regions residing on standalone target device functions. The changes are:
Patch is 26.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161864.diff 7 Files Affected:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index d8e5f8cf5a45e..410912ba375a3 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -3292,8 +3292,8 @@ class OpenMPIRBuilder {
ArrayRef<InsertPointTy> DeallocIPs)>;
using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
- Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
- InsertPointTy CodeGenIP)>;
+ Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocIP,
+ InsertPointTy CodeGenIP, ArrayRef<InsertPointTy> DeallocIPs)>;
/// Generator for '#omp target'
///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index a18db939b5876..530477e6d2f6d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -312,6 +312,12 @@ getTargetKernelExecMode(Function &Kernel) {
return static_cast<OMPTgtExecModeFlags>(KernelMode->getZExtValue());
}
+static bool isGenericKernel(Function &Fn){
+ std::optional<omp::OMPTgtExecModeFlags> ExecMode =
+ getTargetKernelExecMode(Fn);
+ return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC);
+}
+
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
@@ -1535,11 +1541,9 @@ static void targetParallelCallback(
IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
: Builder.getInt32(1);
- // If this is not a Generic kernel, we can skip generating the wrapper.
- std::optional<omp::OMPTgtExecModeFlags> ExecMode =
- getTargetKernelExecMode(*OuterFn);
+ // If this is a Generic kernel, we can generate the wrapper.
Value *WrapperFn;
- if (ExecMode && (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC))
+ if (isGenericKernel(*OuterFn))
WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn);
else
WrapperFn = Constant::getNullValue(PtrTy);
@@ -1812,13 +1816,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
auto OI = [&]() -> std::unique_ptr<OutlineInfo> {
if (Config.isTargetDevice()) {
- std::optional<omp::OMPTgtExecModeFlags> ExecMode =
- getTargetKernelExecMode(*OuterFn);
-
- // If OuterFn is not a Generic kernel, skip custom allocation. This causes
- // the CodeExtractor to follow its default behavior. Otherwise, we need to
- // use device shared memory to allocate argument structures.
- if (ExecMode && *ExecMode & OMP_TGT_EXEC_MODE_GENERIC)
+ // If OuterFn is a Generic kernel, we need to use device shared memory to
+ // allocate argument structures. Otherwise, we use stack allocations as
+ // usual.
+ if (isGenericKernel(*OuterFn))
return std::make_unique<DeviceSharedMemOutlineInfo>(*this);
}
return std::make_unique<OutlineInfo>();
@@ -7806,8 +7807,9 @@ static Expected<Function *> createOutlinedFunction(
Argument &Arg = std::get<1>(InArg);
Value *InputCopy = nullptr;
- llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
- ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB(
+ Arg, Input, InputCopy, AllocaIP, Builder.saveIP(),
+ OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin()));
if (!AfterIP)
return AfterIP.takeError();
Builder.restoreIP(*AfterIP);
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 1e5b8145d5cdc..d231a778a8a97 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -745,8 +745,10 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) {
EXPECT_EQ(OutlinedFn->getArg(2)->getType(),
PointerType::get(M->getContext(), 0));
EXPECT_EQ(&OutlinedFn->getEntryBlock(), PrivAI->getParent());
- EXPECT_TRUE(OutlinedFn->hasOneUse());
- User *Usr = OutlinedFn->user_back();
+ EXPECT_TRUE(OutlinedFn->hasNUses(2));
+ User *Usr = *OutlinedFn->users().begin();
+ User *WrapperUsr = *++OutlinedFn->users().begin();
+
ASSERT_TRUE(isa<CallInst>(Usr));
CallInst *Parallel51CI = dyn_cast<CallInst>(Usr);
ASSERT_NE(Parallel51CI, nullptr);
@@ -757,6 +759,20 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) {
EXPECT_TRUE(
isa<GlobalVariable>(Parallel51CI->getArgOperand(0)->stripPointerCasts()));
EXPECT_EQ(Parallel51CI, Usr);
+
+ ASSERT_TRUE(isa<CallInst>(WrapperUsr));
+ CallInst *OutlinedCI = dyn_cast<CallInst>(WrapperUsr);
+ ASSERT_NE(OutlinedCI, nullptr);
+ EXPECT_EQ(OutlinedCI->getCalledFunction(), OutlinedFn);
+
+ Function *WrapperFn = OutlinedCI->getFunction();
+ EXPECT_TRUE(WrapperFn->hasInternalLinkage());
+ EXPECT_EQ(WrapperFn->arg_size(), 2U);
+ EXPECT_EQ(WrapperFn->getArg(0)->getType(),
+ IntegerType::getInt16Ty(M->getContext()));
+ EXPECT_EQ(WrapperFn->getArg(1)->getType(),
+ IntegerType::getInt32Ty(M->getContext()));
+
M->setDataLayout(oldDLStr);
}
@@ -6396,7 +6412,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
- llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+ llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
@@ -6561,7 +6578,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
- llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+ llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
@@ -6763,12 +6781,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) {
return Builder.saveIP();
};
- auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
- OpenMPIRBuilder::InsertPointTy,
- OpenMPIRBuilder::InsertPointTy CodeGenIP) {
- Builder.restoreIP(CodeGenIP);
- return Builder.saveIP();
- };
+ auto SimpleArgAccessorCB =
+ [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy,
+ OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy>) {
+ Builder.restoreIP(CodeGenIP);
+ return Builder.saveIP();
+ };
SmallVector<Value *> Inputs;
OpenMPIRBuilder::MapInfosTy CombinedInfos;
@@ -6862,12 +6881,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) {
Function *OutlinedFn = nullptr;
SmallVector<Value *> CapturedArgs;
- auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
- OpenMPIRBuilder::InsertPointTy,
- OpenMPIRBuilder::InsertPointTy CodeGenIP) {
- Builder.restoreIP(CodeGenIP);
- return Builder.saveIP();
- };
+ auto SimpleArgAccessorCB =
+ [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy,
+ OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy>) {
+ Builder.restoreIP(CodeGenIP);
+ return Builder.saveIP();
+ };
OpenMPIRBuilder::MapInfosTy CombinedInfos;
auto GenMapInfoCB =
@@ -6961,7 +6981,8 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
- llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+ llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 3accca891ba9c..1183865dc3645 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1128,9 +1128,10 @@ struct DeferredStore {
} // namespace
/// Check whether allocations for the given operation might potentially have to
-/// be done in device shared memory. That means we're compiling for a offloading
-/// target, the operation is an `omp::TargetOp` or nested inside of one and that
-/// target region represents a Generic (non-SPMD) kernel.
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target, the operation is neither an `omp::TargetOp` nor nested
+/// inside of one, or it is and that target region represents a Generic
+/// (non-SPMD) kernel.
///
/// This represents a necessary but not sufficient set of conditions to use
/// device shared memory in place of regular allocas. For some variables, the
@@ -1146,7 +1147,7 @@ mightAllocInDeviceSharedMemory(Operation &op,
if (!targetOp)
targetOp = op.getParentOfType<omp::TargetOp>();
- return targetOp &&
+ return !targetOp ||
targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) ==
omp::TargetExecMode::generic;
}
@@ -1160,18 +1161,36 @@ mightAllocInDeviceSharedMemory(Operation &op,
/// operation that owns the specified block argument.
static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
Operation *parentOp = value.getOwner()->getParentOp();
- auto targetOp = dyn_cast<omp::TargetOp>(parentOp);
- if (!targetOp)
- targetOp = parentOp->getParentOfType<omp::TargetOp>();
- assert(targetOp && "expected a parent omp.target operation");
-
+ auto moduleOp = parentOp->getParentOfType<ModuleOp>();
for (auto *user : value.getUsers()) {
if (auto parallelOp = dyn_cast<omp::ParallelOp>(user)) {
if (llvm::is_contained(parallelOp.getReductionVars(), value))
return true;
} else if (auto parallelOp = user->getParentOfType<omp::ParallelOp>()) {
- if (parentOp->isProperAncestor(parallelOp))
- return true;
+ if (parentOp->isProperAncestor(parallelOp)) {
+ // If it is used directly inside of a parallel region, skip private
+ // clause uses.
+ bool isPrivateClauseUse = false;
+ if (auto argIface = dyn_cast<omp::BlockArgOpenMPOpInterface>(user)) {
+ if (auto privateSyms = llvm::cast_or_null<ArrayAttr>(
+ user->getAttr("private_syms"))) {
+ for (auto [var, sym] :
+ llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) {
+ if (var != value)
+ continue;
+
+ auto privateOp = cast<omp::PrivateClauseOp>(
+ moduleOp.lookupSymbol(cast<SymbolRefAttr>(sym)));
+ if (privateOp.getCopyRegion().empty()) {
+ isPrivateClauseUse = true;
+ break;
+ }
+ }
+ }
+ }
+ if (!isPrivateClauseUse)
+ return true;
+ }
}
}
@@ -1196,8 +1215,8 @@ allocReductionVars(T op, ArrayRef<BlockArgument> reductionArgs,
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
- bool useDeviceSharedMem =
- isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+ bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
+ mightAllocInDeviceSharedMemory(*op, *ompBuilder);
// delay creating stores until after all allocas
deferredStores.reserve(op.getNumReductionVars());
@@ -1318,8 +1337,8 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
return success();
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
- bool useDeviceSharedMem =
- isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+ bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
+ mightAllocInDeviceSharedMemory(*op, *ompBuilder);
llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
auto allocaIP = llvm::IRBuilderBase::InsertPoint(
@@ -1540,8 +1559,8 @@ static LogicalResult createReductionsAndCleanup(
reductionRegions, privateReductionVariables, moduleTranslation, builder,
"omp.reduction.cleanup");
- bool useDeviceSharedMem =
- isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+ bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
+ mightAllocInDeviceSharedMemory(*op, *ompBuilder);
if (useDeviceSharedMem) {
for (auto [var, reductionDecl] :
llvm::zip_equal(privateReductionVariables, reductionDecls))
@@ -1721,7 +1740,7 @@ allocatePrivateVars(T op, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool mightUseDeviceSharedMem =
- isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
+ isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);
unsigned int allocaAS =
moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace();
@@ -1839,7 +1858,7 @@ cleanupPrivateVars(T op, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool mightUseDeviceSharedMem =
- isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
+ isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);
for (auto [privDecl, llvmPrivVar, blockArg] :
llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars,
@@ -5265,42 +5284,68 @@ handleDeclareTargetMapVar(MapInfoData &mapData,
// a store of the kernel argument into this allocated memory which
// will then be loaded from, ByCopy will use the allocated memory
// directly.
-static llvm::IRBuilderBase::InsertPoint
-createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg,
- llvm::Value *input, llvm::Value *&retVal,
- llvm::IRBuilderBase &builder,
- llvm::OpenMPIRBuilder &ompBuilder,
- LLVM::ModuleTranslation &moduleTranslation,
- llvm::IRBuilderBase::InsertPoint allocaIP,
- llvm::IRBuilderBase::InsertPoint codeGenIP) {
+static llvm::IRBuilderBase::InsertPoint createDeviceArgumentAccessor(
+ omp::TargetOp targetOp, MapInfoData &mapData, llvm::Argument &arg,
+ llvm::Value *input, llvm::Value *&retVal, llvm::IRBuilderBase &builder,
+ llvm::OpenMPIRBuilder &ompBuilder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ llvm::IRBuilderBase::InsertPoint allocIP,
+ llvm::IRBuilderBase::InsertPoint codeGenIP,
+ llvm::ArrayRef<llvm::IRBuilderBase::InsertPoint> deallocIPs) {
assert(ompBuilder.Config.isTargetDevice() &&
"function only supported for target device codegen");
- builder.restoreIP(allocaIP);
+ builder.restoreIP(allocIP);
omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef;
LLVM::TypeToLLVMIRTranslator typeToLLVMIRTranslator(
ompBuilder.M.getContext());
unsigned alignmentValue = 0;
+ BlockArgument mlirArg;
// Find the associated MapInfoData entry for the current input
- for (size_t i = 0; i < mapData.MapClause.size(); ++i)
+ for (size_t i = 0; i < mapData.MapClause.size(); ++i) {
if (mapData.OriginalValue[i] == input) {
auto mapOp = cast<omp::MapInfoOp>(mapData.MapClause[i]);
capture = mapOp.getMapCaptureType();
// Get information of alignment of mapped object
alignmentValue = typeToLLVMIRTranslator.getPreferredAlignment(
mapOp.getVarType(), ompBuilder.M.getDataLayout());
+ // Get the corresponding target entry block argument
+ mlirArg =
+ cast<omp::BlockArgOpenMPOpInterface>(*targetOp).getMapBlockArgs()[i];
break;
}
+ }
unsigned int allocaAS = ompBuilder.M.getDataLayout().getAllocaAddrSpace();
unsigned int defaultAS =
ompBuilder.M.getDataLayout().getProgramAddressSpace();
- // Create the alloca for the argument the current point.
- llvm::Value *v = builder.CreateAlloca(arg.getType(), allocaAS);
+ // Create the allocation for the argument.
+ llvm::Value *v = nullptr;
+ if (mightAllocInDeviceSharedMemory(*targetOp, ompBuilder) &&
+ mustAllocPrivateVarInDeviceSharedMemory(mlirArg)) {
+ // Use the beginning of the codeGenIP rather than the usual allocation point
+ // for shared memory allocations because otherwise these would be done prior
+ // to the target initialization call. Also, the exit block (where the
+ // deallocation is placed) is only executed if the initialization call
+ // succeeds.
+ builder.SetInsertPoint(codeGenIP.getBlock()->getFirstInsertionPt());
+ v = ompBuilder.createOMPAllocShared(builder, arg.getType());
+
+ // Create deallocations in all provided deallocation points and then restore
+ // the insertion point to right after the new allocations.
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ for (auto deallocIP : deallocIPs) {
+ builder.SetInsertPoint(deallocIP.getBlock(), deallocIP.getPoint());
+ ompBuilder.createOMPFreeShared(builder, v, arg.getType());
+ }
+ } else {
+ // Use the current point, which was previously set to allocIP.
+ v = builder.CreateAlloca(arg.getType(), allocaAS);
- if (allocaAS != defaultAS && arg.getType()->isPointerTy())
- v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS));
+ if (allocaAS != defaultAS && arg.getType()->isPointerTy())
+ v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS));
+ }
builder.CreateStore(&arg, v);
@@ -5890,8 +5935,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
};
auto argAccessorCB = [&](llvm::Argument &arg, llvm::Value *input,
- llvm::Value *&retVal, InsertPointTy allocaIP,
- InsertPointTy codeGenIP)
+ llvm::Value *&retVal, InsertPointTy allocIP,
+ InsertPointTy codeGenIP,
+ llvm::ArrayRef<InsertPointTy> deallocIPs)
-> llvm::OpenMPIRBuilder::InsertPointOrErrorTy {
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.SetCurrentDebugLocation(llvm::DebugLoc());
@@ -5905,9 +5951,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
return codeGenIP;
}
- return createDeviceArgumentAccessor(mapData, arg, input, retVal, builder,
- *ompBuilder, moduleTranslation,
- allocaIP, codeGenIP);
+ return createDeviceArgumentAccessor(targetOp, mapData, arg, input, retVal,
+ builder, *ompBuilder, moduleTranslation,
+ allocIP, codeGenIP, deallocIPs);
};
llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs runtimeAttrs;
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
index c3ce2f62c486e..c1a7c9736910a 100644
--- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
@@ -55,15 +55,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: define weak_odr protected amdgpu_kernel void @[[FUNC0:.*]](
// CHECK-SAME: ptr %[[TMP:.*]], ptr %[[TMP0:.*]]) #{{[0-9]+}} {
// ...
[truncated]
|
@llvm/pr-subscribers-offload Author: Sergio Afonso (skatrak) ChangesThis patch refines checks to decide whether to use device shared memory or regular stack allocations. In particular, it adds support for parallel regions residing on standalone target device functions. The changes are:
Patch is 26.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161864.diff 7 Files Affected:
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index d8e5f8cf5a45e..410912ba375a3 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -3292,8 +3292,8 @@ class OpenMPIRBuilder {
ArrayRef<InsertPointTy> DeallocIPs)>;
using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
- Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
- InsertPointTy CodeGenIP)>;
+ Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocIP,
+ InsertPointTy CodeGenIP, ArrayRef<InsertPointTy> DeallocIPs)>;
/// Generator for '#omp target'
///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index a18db939b5876..530477e6d2f6d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -312,6 +312,12 @@ getTargetKernelExecMode(Function &Kernel) {
return static_cast<OMPTgtExecModeFlags>(KernelMode->getZExtValue());
}
+static bool isGenericKernel(Function &Fn){
+ std::optional<omp::OMPTgtExecModeFlags> ExecMode =
+ getTargetKernelExecMode(Fn);
+ return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC);
+}
+
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
@@ -1535,11 +1541,9 @@ static void targetParallelCallback(
IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
: Builder.getInt32(1);
- // If this is not a Generic kernel, we can skip generating the wrapper.
- std::optional<omp::OMPTgtExecModeFlags> ExecMode =
- getTargetKernelExecMode(*OuterFn);
+ // If this is a Generic kernel, we can generate the wrapper.
Value *WrapperFn;
- if (ExecMode && (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC))
+ if (isGenericKernel(*OuterFn))
WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn);
else
WrapperFn = Constant::getNullValue(PtrTy);
@@ -1812,13 +1816,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
auto OI = [&]() -> std::unique_ptr<OutlineInfo> {
if (Config.isTargetDevice()) {
- std::optional<omp::OMPTgtExecModeFlags> ExecMode =
- getTargetKernelExecMode(*OuterFn);
-
- // If OuterFn is not a Generic kernel, skip custom allocation. This causes
- // the CodeExtractor to follow its default behavior. Otherwise, we need to
- // use device shared memory to allocate argument structures.
- if (ExecMode && *ExecMode & OMP_TGT_EXEC_MODE_GENERIC)
+ // If OuterFn is a Generic kernel, we need to use device shared memory to
+ // allocate argument structures. Otherwise, we use stack allocations as
+ // usual.
+ if (isGenericKernel(*OuterFn))
return std::make_unique<DeviceSharedMemOutlineInfo>(*this);
}
return std::make_unique<OutlineInfo>();
@@ -7806,8 +7807,9 @@ static Expected<Function *> createOutlinedFunction(
Argument &Arg = std::get<1>(InArg);
Value *InputCopy = nullptr;
- llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
- ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB(
+ Arg, Input, InputCopy, AllocaIP, Builder.saveIP(),
+ OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin()));
if (!AfterIP)
return AfterIP.takeError();
Builder.restoreIP(*AfterIP);
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 1e5b8145d5cdc..d231a778a8a97 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -745,8 +745,10 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) {
EXPECT_EQ(OutlinedFn->getArg(2)->getType(),
PointerType::get(M->getContext(), 0));
EXPECT_EQ(&OutlinedFn->getEntryBlock(), PrivAI->getParent());
- EXPECT_TRUE(OutlinedFn->hasOneUse());
- User *Usr = OutlinedFn->user_back();
+ EXPECT_TRUE(OutlinedFn->hasNUses(2));
+ User *Usr = *OutlinedFn->users().begin();
+ User *WrapperUsr = *++OutlinedFn->users().begin();
+
ASSERT_TRUE(isa<CallInst>(Usr));
CallInst *Parallel51CI = dyn_cast<CallInst>(Usr);
ASSERT_NE(Parallel51CI, nullptr);
@@ -757,6 +759,20 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) {
EXPECT_TRUE(
isa<GlobalVariable>(Parallel51CI->getArgOperand(0)->stripPointerCasts()));
EXPECT_EQ(Parallel51CI, Usr);
+
+ ASSERT_TRUE(isa<CallInst>(WrapperUsr));
+ CallInst *OutlinedCI = dyn_cast<CallInst>(WrapperUsr);
+ ASSERT_NE(OutlinedCI, nullptr);
+ EXPECT_EQ(OutlinedCI->getCalledFunction(), OutlinedFn);
+
+ Function *WrapperFn = OutlinedCI->getFunction();
+ EXPECT_TRUE(WrapperFn->hasInternalLinkage());
+ EXPECT_EQ(WrapperFn->arg_size(), 2U);
+ EXPECT_EQ(WrapperFn->getArg(0)->getType(),
+ IntegerType::getInt16Ty(M->getContext()));
+ EXPECT_EQ(WrapperFn->getArg(1)->getType(),
+ IntegerType::getInt32Ty(M->getContext()));
+
M->setDataLayout(oldDLStr);
}
@@ -6396,7 +6412,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
- llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+ llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
@@ -6561,7 +6578,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
- llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+ llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
@@ -6763,12 +6781,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) {
return Builder.saveIP();
};
- auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
- OpenMPIRBuilder::InsertPointTy,
- OpenMPIRBuilder::InsertPointTy CodeGenIP) {
- Builder.restoreIP(CodeGenIP);
- return Builder.saveIP();
- };
+ auto SimpleArgAccessorCB =
+ [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy,
+ OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy>) {
+ Builder.restoreIP(CodeGenIP);
+ return Builder.saveIP();
+ };
SmallVector<Value *> Inputs;
OpenMPIRBuilder::MapInfosTy CombinedInfos;
@@ -6862,12 +6881,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) {
Function *OutlinedFn = nullptr;
SmallVector<Value *> CapturedArgs;
- auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
- OpenMPIRBuilder::InsertPointTy,
- OpenMPIRBuilder::InsertPointTy CodeGenIP) {
- Builder.restoreIP(CodeGenIP);
- return Builder.saveIP();
- };
+ auto SimpleArgAccessorCB =
+ [&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy,
+ OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy>) {
+ Builder.restoreIP(CodeGenIP);
+ return Builder.saveIP();
+ };
OpenMPIRBuilder::MapInfosTy CombinedInfos;
auto GenMapInfoCB =
@@ -6961,7 +6981,8 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
- llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
+ llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
+ llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 3accca891ba9c..1183865dc3645 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1128,9 +1128,10 @@ struct DeferredStore {
} // namespace
/// Check whether allocations for the given operation might potentially have to
-/// be done in device shared memory. That means we're compiling for a offloading
-/// target, the operation is an `omp::TargetOp` or nested inside of one and that
-/// target region represents a Generic (non-SPMD) kernel.
+/// be done in device shared memory. That means we're compiling for an
+/// offloading target, the operation is neither an `omp::TargetOp` nor nested
+/// inside of one, or it is and that target region represents a Generic
+/// (non-SPMD) kernel.
///
/// This represents a necessary but not sufficient set of conditions to use
/// device shared memory in place of regular allocas. For some variables, the
@@ -1146,7 +1147,7 @@ mightAllocInDeviceSharedMemory(Operation &op,
if (!targetOp)
targetOp = op.getParentOfType<omp::TargetOp>();
- return targetOp &&
+ return !targetOp ||
targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) ==
omp::TargetExecMode::generic;
}
@@ -1160,18 +1161,36 @@ mightAllocInDeviceSharedMemory(Operation &op,
/// operation that owns the specified block argument.
static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
Operation *parentOp = value.getOwner()->getParentOp();
- auto targetOp = dyn_cast<omp::TargetOp>(parentOp);
- if (!targetOp)
- targetOp = parentOp->getParentOfType<omp::TargetOp>();
- assert(targetOp && "expected a parent omp.target operation");
-
+ auto moduleOp = parentOp->getParentOfType<ModuleOp>();
for (auto *user : value.getUsers()) {
if (auto parallelOp = dyn_cast<omp::ParallelOp>(user)) {
if (llvm::is_contained(parallelOp.getReductionVars(), value))
return true;
} else if (auto parallelOp = user->getParentOfType<omp::ParallelOp>()) {
- if (parentOp->isProperAncestor(parallelOp))
- return true;
+ if (parentOp->isProperAncestor(parallelOp)) {
+ // If it is used directly inside of a parallel region, skip private
+ // clause uses.
+ bool isPrivateClauseUse = false;
+ if (auto argIface = dyn_cast<omp::BlockArgOpenMPOpInterface>(user)) {
+ if (auto privateSyms = llvm::cast_or_null<ArrayAttr>(
+ user->getAttr("private_syms"))) {
+ for (auto [var, sym] :
+ llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) {
+ if (var != value)
+ continue;
+
+ auto privateOp = cast<omp::PrivateClauseOp>(
+ moduleOp.lookupSymbol(cast<SymbolRefAttr>(sym)));
+ if (privateOp.getCopyRegion().empty()) {
+ isPrivateClauseUse = true;
+ break;
+ }
+ }
+ }
+ }
+ if (!isPrivateClauseUse)
+ return true;
+ }
}
}
@@ -1196,8 +1215,8 @@ allocReductionVars(T op, ArrayRef<BlockArgument> reductionArgs,
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
- bool useDeviceSharedMem =
- isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+ bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
+ mightAllocInDeviceSharedMemory(*op, *ompBuilder);
// delay creating stores until after all allocas
deferredStores.reserve(op.getNumReductionVars());
@@ -1318,8 +1337,8 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
return success();
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
- bool useDeviceSharedMem =
- isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+ bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
+ mightAllocInDeviceSharedMemory(*op, *ompBuilder);
llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
auto allocaIP = llvm::IRBuilderBase::InsertPoint(
@@ -1540,8 +1559,8 @@ static LogicalResult createReductionsAndCleanup(
reductionRegions, privateReductionVariables, moduleTranslation, builder,
"omp.reduction.cleanup");
- bool useDeviceSharedMem =
- isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
+ bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
+ mightAllocInDeviceSharedMemory(*op, *ompBuilder);
if (useDeviceSharedMem) {
for (auto [var, reductionDecl] :
llvm::zip_equal(privateReductionVariables, reductionDecls))
@@ -1721,7 +1740,7 @@ allocatePrivateVars(T op, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool mightUseDeviceSharedMem =
- isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
+ isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);
unsigned int allocaAS =
moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace();
@@ -1839,7 +1858,7 @@ cleanupPrivateVars(T op, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool mightUseDeviceSharedMem =
- isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
+ isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);
for (auto [privDecl, llvmPrivVar, blockArg] :
llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars,
@@ -5265,42 +5284,68 @@ handleDeclareTargetMapVar(MapInfoData &mapData,
// a store of the kernel argument into this allocated memory which
// will then be loaded from, ByCopy will use the allocated memory
// directly.
-static llvm::IRBuilderBase::InsertPoint
-createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg,
- llvm::Value *input, llvm::Value *&retVal,
- llvm::IRBuilderBase &builder,
- llvm::OpenMPIRBuilder &ompBuilder,
- LLVM::ModuleTranslation &moduleTranslation,
- llvm::IRBuilderBase::InsertPoint allocaIP,
- llvm::IRBuilderBase::InsertPoint codeGenIP) {
+static llvm::IRBuilderBase::InsertPoint createDeviceArgumentAccessor(
+ omp::TargetOp targetOp, MapInfoData &mapData, llvm::Argument &arg,
+ llvm::Value *input, llvm::Value *&retVal, llvm::IRBuilderBase &builder,
+ llvm::OpenMPIRBuilder &ompBuilder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ llvm::IRBuilderBase::InsertPoint allocIP,
+ llvm::IRBuilderBase::InsertPoint codeGenIP,
+ llvm::ArrayRef<llvm::IRBuilderBase::InsertPoint> deallocIPs) {
assert(ompBuilder.Config.isTargetDevice() &&
"function only supported for target device codegen");
- builder.restoreIP(allocaIP);
+ builder.restoreIP(allocIP);
omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef;
LLVM::TypeToLLVMIRTranslator typeToLLVMIRTranslator(
ompBuilder.M.getContext());
unsigned alignmentValue = 0;
+ BlockArgument mlirArg;
// Find the associated MapInfoData entry for the current input
- for (size_t i = 0; i < mapData.MapClause.size(); ++i)
+ for (size_t i = 0; i < mapData.MapClause.size(); ++i) {
if (mapData.OriginalValue[i] == input) {
auto mapOp = cast<omp::MapInfoOp>(mapData.MapClause[i]);
capture = mapOp.getMapCaptureType();
// Get information of alignment of mapped object
alignmentValue = typeToLLVMIRTranslator.getPreferredAlignment(
mapOp.getVarType(), ompBuilder.M.getDataLayout());
+ // Get the corresponding target entry block argument
+ mlirArg =
+ cast<omp::BlockArgOpenMPOpInterface>(*targetOp).getMapBlockArgs()[i];
break;
}
+ }
unsigned int allocaAS = ompBuilder.M.getDataLayout().getAllocaAddrSpace();
unsigned int defaultAS =
ompBuilder.M.getDataLayout().getProgramAddressSpace();
- // Create the alloca for the argument the current point.
- llvm::Value *v = builder.CreateAlloca(arg.getType(), allocaAS);
+ // Create the allocation for the argument.
+ llvm::Value *v = nullptr;
+ if (mightAllocInDeviceSharedMemory(*targetOp, ompBuilder) &&
+ mustAllocPrivateVarInDeviceSharedMemory(mlirArg)) {
+ // Use the beginning of the codeGenIP rather than the usual allocation point
+ // for shared memory allocations because otherwise these would be done prior
+ // to the target initialization call. Also, the exit block (where the
+ // deallocation is placed) is only executed if the initialization call
+ // succeeds.
+ builder.SetInsertPoint(codeGenIP.getBlock()->getFirstInsertionPt());
+ v = ompBuilder.createOMPAllocShared(builder, arg.getType());
+
+ // Create deallocations in all provided deallocation points and then restore
+ // the insertion point to right after the new allocations.
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ for (auto deallocIP : deallocIPs) {
+ builder.SetInsertPoint(deallocIP.getBlock(), deallocIP.getPoint());
+ ompBuilder.createOMPFreeShared(builder, v, arg.getType());
+ }
+ } else {
+ // Use the current point, which was previously set to allocIP.
+ v = builder.CreateAlloca(arg.getType(), allocaAS);
- if (allocaAS != defaultAS && arg.getType()->isPointerTy())
- v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS));
+ if (allocaAS != defaultAS && arg.getType()->isPointerTy())
+ v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS));
+ }
builder.CreateStore(&arg, v);
@@ -5890,8 +5935,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
};
auto argAccessorCB = [&](llvm::Argument &arg, llvm::Value *input,
- llvm::Value *&retVal, InsertPointTy allocaIP,
- InsertPointTy codeGenIP)
+ llvm::Value *&retVal, InsertPointTy allocIP,
+ InsertPointTy codeGenIP,
+ llvm::ArrayRef<InsertPointTy> deallocIPs)
-> llvm::OpenMPIRBuilder::InsertPointOrErrorTy {
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.SetCurrentDebugLocation(llvm::DebugLoc());
@@ -5905,9 +5951,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
return codeGenIP;
}
- return createDeviceArgumentAccessor(mapData, arg, input, retVal, builder,
- *ompBuilder, moduleTranslation,
- allocaIP, codeGenIP);
+ return createDeviceArgumentAccessor(targetOp, mapData, arg, input, retVal,
+ builder, *ompBuilder, moduleTranslation,
+ allocIP, codeGenIP, deallocIPs);
};
llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs runtimeAttrs;
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
index c3ce2f62c486e..c1a7c9736910a 100644
--- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
@@ -55,15 +55,14 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: define weak_odr protected amdgpu_kernel void @[[FUNC0:.*]](
// CHECK-SAME: ptr %[[TMP:.*]], ptr %[[TMP0:.*]]) #{{[0-9]+}} {
// ...
[truncated]
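For context, a minimal sketch (not part of the patch) of the kind of source the `createDeviceArgumentAccessor` changes above affect. The mapped variable is used directly by threads of a parallel region nested inside a Generic target region, so the new checks route its kernel-argument slot through `createOMPAllocShared`/`createOMPFreeShared` rather than a stack alloca:

```cpp
#include <cstdio>
#include <omp.h>

int main() {
  int x = 0;
#pragma omp target map(tofrom : x)
  {
#pragma omp parallel
    {
      // Direct use of the mapped variable inside the nested parallel
      // region: all threads must observe the same storage for `x`, so
      // its argument slot is expected to land in device shared memory.
      if (omp_get_thread_num() == 0)
        x = omp_get_num_threads();
    }
  }
  std::printf("threads on device: %d\n", x);
  return 0;
}
```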
✅ With the latest revision this PR passed the C/C++ code formatter.
This patch refines checks to decide whether to use device shared memory or regular stack allocations. In particular, it adds support for parallel regions residing on standalone target device functions. The changes are:

- Shared memory is introduced for `omp.target` implicit allocations, such as those related to privatization and mapping, as long as they are shared across threads in a nested parallel region.
- Standalone target device functions are interpreted as being part of a Generic kernel, since the fact that they are present in the module after filtering means they must be reachable from a target region.
- Prevent allocations whose only shared uses inside of an `omp.parallel` region are as part of a `private` clause from being moved to device shared memory.
2fb0295 to 33e2524 Compare
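An illustrative sketch (not taken from the patch's tests) of the `private`-clause exemption in the last bullet of the force-pushed commit message: because `private` (unlike `firstprivate`) has no copy region, no thread ever touches the original storage, so the mapped variable's slot can stay a regular stack allocation:

```cpp
#include <omp.h>

int main() {
  int x = 7;
#pragma omp target map(tofrom : x)
  {
    // The only use of `x` inside the nested parallel region is through a
    // private clause, so each thread writes its own fresh copy and device
    // shared memory is unnecessary for the mapped argument.
#pragma omp parallel private(x)
    x = omp_get_thread_num();
  }
  return 0;
}
```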
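And a hypothetical sketch of the standalone device function case from the second bullet: `scale` carries no kernel exec-mode attribute of its own, but since it survived device-side filtering it must be reachable from some target region, so it is now conservatively treated as running inside a Generic kernel:

```cpp
#pragma omp declare target
void scale(int *a, int n, int f) {
  // The parallel region below is outlined for the device; under this
  // patch its argument structure is allocated in device shared memory
  // because the enclosing standalone function is assumed to be part of
  // a Generic kernel.
#pragma omp parallel for
  for (int i = 0; i < n; ++i)
    a[i] *= f;
}
#pragma omp end declare target
```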