4 changes: 2 additions & 2 deletions llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -3292,8 +3292,8 @@ class OpenMPIRBuilder {
ArrayRef<InsertPointTy> DeallocIPs)>;

using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP)>;
Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocIP,
InsertPointTy CodeGenIP, ArrayRef<InsertPointTy> DeallocIPs)>;
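Note: implementations of this callback must now accept the extra `DeallocIPs` parameter, which carries every insertion point where device shared memory created by the accessor has to be freed. A minimal host-side sketch of a conforming callback (shape taken from the unit tests below; `Builder` is the ambient IRBuilder of the caller):

```cpp
auto ArgAccessorCB =
    [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
        llvm::OpenMPIRBuilder::InsertPointTy AllocIP,
        llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
        llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs)
    -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy {
  // Host codegen needs no accessor body: keep emitting at the region entry.
  Builder.restoreIP(CodeGenIP);
  return Builder.saveIP();
};
```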

/// Generator for '#omp target'
///
28 changes: 15 additions & 13 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -312,6 +312,12 @@ getTargetKernelExecMode(Function &Kernel) {
return static_cast<OMPTgtExecModeFlags>(KernelMode->getZExtValue());
}

static bool isGenericKernel(Function &Fn) {
std::optional<omp::OMPTgtExecModeFlags> ExecMode =
getTargetKernelExecMode(Fn);
return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC);
}

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
@@ -1535,11 +1541,9 @@ static void targetParallelCallback(
IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
: Builder.getInt32(1);

// If this is not a Generic kernel, we can skip generating the wrapper.
std::optional<omp::OMPTgtExecModeFlags> ExecMode =
getTargetKernelExecMode(*OuterFn);
// If this is a Generic kernel, we need to generate the parallel wrapper.
Value *WrapperFn;
if (ExecMode && (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC))
if (isGenericKernel(*OuterFn))
WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn);
else
WrapperFn = Constant::getNullValue(PtrTy);
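For reviewers tracing the runtime contract: `__kmpc_parallel_51` only dispatches through the wrapper when the kernel executes in Generic mode, which is why SPMD kernels can pass a null wrapper here. A hedged sketch of the device runtime entry point involved (signature paraphrased from DeviceRTL, not normative):

```cpp
#include <cstdint>

// Approximation of the DeviceRTL entry point that receives both the
// outlined parallel region and the (possibly null) Generic-mode wrapper.
extern "C" void __kmpc_parallel_51(void *ident, int32_t gtid, int32_t if_expr,
                                   int32_t num_threads, int32_t proc_bind,
                                   void *fn, void *wrapper_fn, void **args,
                                   int64_t nargs);
```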
@@ -1812,13 +1816,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(

auto OI = [&]() -> std::unique_ptr<OutlineInfo> {
if (Config.isTargetDevice()) {
std::optional<omp::OMPTgtExecModeFlags> ExecMode =
getTargetKernelExecMode(*OuterFn);

// If OuterFn is not a Generic kernel, skip custom allocation. This causes
// the CodeExtractor to follow its default behavior. Otherwise, we need to
// use device shared memory to allocate argument structures.
if (ExecMode && *ExecMode & OMP_TGT_EXEC_MODE_GENERIC)
// If OuterFn is a Generic kernel, we need to use device shared memory to
// allocate argument structures. Otherwise, we use stack allocations as
// usual.
if (isGenericKernel(*OuterFn))
return std::make_unique<DeviceSharedMemOutlineInfo>(*this);
}
return std::make_unique<OutlineInfo>();
@@ -7806,8 +7807,9 @@ static Expected<Function *> createOutlinedFunction(
Argument &Arg = std::get<1>(InArg);
Value *InputCopy = nullptr;

llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB(
Arg, Input, InputCopy, AllocaIP, Builder.saveIP(),
OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin()));
if (!AfterIP)
return AfterIP.takeError();
Builder.restoreIP(*AfterIP);
55 changes: 38 additions & 17 deletions llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -745,8 +745,10 @@ TEST_F(OpenMPIRBuilderTest, ParallelSimpleGPU) {
EXPECT_EQ(OutlinedFn->getArg(2)->getType(),
PointerType::get(M->getContext(), 0));
EXPECT_EQ(&OutlinedFn->getEntryBlock(), PrivAI->getParent());
EXPECT_TRUE(OutlinedFn->hasOneUse());
User *Usr = OutlinedFn->user_back();
EXPECT_TRUE(OutlinedFn->hasNUses(2));
User *Usr = *OutlinedFn->users().begin();
User *WrapperUsr = *++OutlinedFn->users().begin();

ASSERT_TRUE(isa<CallInst>(Usr));
CallInst *Parallel51CI = dyn_cast<CallInst>(Usr);
ASSERT_NE(Parallel51CI, nullptr);
@@ -757,6 +759,20 @@
EXPECT_TRUE(
isa<GlobalVariable>(Parallel51CI->getArgOperand(0)->stripPointerCasts()));
EXPECT_EQ(Parallel51CI, Usr);

ASSERT_TRUE(isa<CallInst>(WrapperUsr));
CallInst *OutlinedCI = dyn_cast<CallInst>(WrapperUsr);
ASSERT_NE(OutlinedCI, nullptr);
EXPECT_EQ(OutlinedCI->getCalledFunction(), OutlinedFn);

Function *WrapperFn = OutlinedCI->getFunction();
EXPECT_TRUE(WrapperFn->hasInternalLinkage());
EXPECT_EQ(WrapperFn->arg_size(), 2U);
EXPECT_EQ(WrapperFn->getArg(0)->getType(),
IntegerType::getInt16Ty(M->getContext()));
EXPECT_EQ(WrapperFn->getArg(1)->getType(),
IntegerType::getInt32Ty(M->getContext()));

M->setDataLayout(oldDLStr);
}
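The two wrapper parameters asserted above follow the usual device wrapper convention; in clang's GPU codegen the i16 carries a parallelism level and the i32 a thread ID. Illustrative shape only, not a declaration from this patch:

```cpp
#include <cstdint>

// Illustrative only: the wrapper signature the assertions above expect.
void wrapper(int16_t parallel_level, int32_t thread_id);
```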

@@ -6396,7 +6412,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
@@ -6561,7 +6578,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
@@ -6763,12 +6781,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) {
return Builder.saveIP();
};

auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
OpenMPIRBuilder::InsertPointTy,
OpenMPIRBuilder::InsertPointTy CodeGenIP) {
Builder.restoreIP(CodeGenIP);
return Builder.saveIP();
};
auto SimpleArgAccessorCB =
[&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy,
OpenMPIRBuilder::InsertPointTy CodeGenIP,
llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy>) {
Builder.restoreIP(CodeGenIP);
return Builder.saveIP();
};

SmallVector<Value *> Inputs;
OpenMPIRBuilder::MapInfosTy CombinedInfos;
@@ -6862,12 +6881,13 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) {
Function *OutlinedFn = nullptr;
SmallVector<Value *> CapturedArgs;

auto SimpleArgAccessorCB = [&](Argument &, Value *, Value *&,
OpenMPIRBuilder::InsertPointTy,
OpenMPIRBuilder::InsertPointTy CodeGenIP) {
Builder.restoreIP(CodeGenIP);
return Builder.saveIP();
};
auto SimpleArgAccessorCB =
[&](Argument &, Value *, Value *&, OpenMPIRBuilder::InsertPointTy,
OpenMPIRBuilder::InsertPointTy CodeGenIP,
llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy>) {
Builder.restoreIP(CodeGenIP);
return Builder.saveIP();
};

OpenMPIRBuilder::MapInfosTy CombinedInfos;
auto GenMapInfoCB =
@@ -6961,7 +6981,8 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) {
auto SimpleArgAccessorCB =
[&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal,
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
llvm::ArrayRef<llvm::OpenMPIRBuilder::InsertPointTy> DeallocIPs) {
IRBuilderBase::InsertPointGuard guard(Builder);
Builder.SetCurrentDebugLocation(llvm::DebugLoc());
if (!OMPBuilder.Config.isTargetDevice()) {
122 changes: 84 additions & 38 deletions mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1128,9 +1128,10 @@ struct DeferredStore {
} // namespace

/// Check whether allocations for the given operation might potentially have to
/// be done in device shared memory. That means we're compiling for a offloading
/// target, the operation is an `omp::TargetOp` or nested inside of one and that
/// target region represents a Generic (non-SPMD) kernel.
/// be done in device shared memory. That means we're compiling for an
/// offloading target and either the operation is neither an `omp::TargetOp`
/// nor nested inside of one, or it is and the corresponding target region
/// represents a Generic (non-SPMD) kernel.
///
/// This represents a necessary but not sufficient set of conditions to use
/// device shared memory in place of regular allocas. For some variables, the
@@ -1146,7 +1147,7 @@ mightAllocInDeviceSharedMemory(Operation &op,
if (!targetOp)
targetOp = op.getParentOfType<omp::TargetOp>();

return targetOp &&
return !targetOp ||
targetOp.getKernelExecFlags(targetOp.getInnermostCapturedOmpOp()) ==
omp::TargetExecMode::generic;
}
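This predicate is deliberately only the necessary half of the check; call sites pair it with a per-value test, as the later hunks in this file do. A condensed sketch of the combined pattern (variable names mirror the call sites; this is illustrative, not a new API):

```cpp
// Sketch: the context must allow device shared memory AND the specific
// value must require it before the shared-memory allocator is used.
llvm::Value *storage = nullptr;
if (mightAllocInDeviceSharedMemory(*targetOp, *ompBuilder) &&
    mustAllocPrivateVarInDeviceSharedMemory(blockArg))
  storage = ompBuilder->createOMPAllocShared(builder, varType);
else
  storage = builder.CreateAlloca(varType, allocaAS);
```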
@@ -1160,18 +1161,36 @@
/// operation that owns the specified block argument.
static bool mustAllocPrivateVarInDeviceSharedMemory(BlockArgument value) {
Operation *parentOp = value.getOwner()->getParentOp();
auto targetOp = dyn_cast<omp::TargetOp>(parentOp);
if (!targetOp)
targetOp = parentOp->getParentOfType<omp::TargetOp>();
assert(targetOp && "expected a parent omp.target operation");

auto moduleOp = parentOp->getParentOfType<ModuleOp>();
for (auto *user : value.getUsers()) {
if (auto parallelOp = dyn_cast<omp::ParallelOp>(user)) {
if (llvm::is_contained(parallelOp.getReductionVars(), value))
return true;
} else if (auto parallelOp = user->getParentOfType<omp::ParallelOp>()) {
if (parentOp->isProperAncestor(parallelOp))
return true;
if (parentOp->isProperAncestor(parallelOp)) {
// A use inside a nested parallel region forces shared memory unless it
// is only a private clause use whose privatizer has no copy region.
bool isPrivateClauseUse = false;
if (auto argIface = dyn_cast<omp::BlockArgOpenMPOpInterface>(user)) {
if (auto privateSyms = llvm::cast_or_null<ArrayAttr>(
user->getAttr("private_syms"))) {
for (auto [var, sym] :
llvm::zip_equal(argIface.getPrivateVars(), privateSyms)) {
if (var != value)
continue;

auto privateOp = cast<omp::PrivateClauseOp>(
moduleOp.lookupSymbol(cast<SymbolRefAttr>(sym)));
if (privateOp.getCopyRegion().empty()) {
isPrivateClauseUse = true;
break;
}
}
}
}
if (!isPrivateClauseUse)
return true;
}
}
}
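In other words, a use that reaches a nested `omp.parallel` no longer forces shared memory unconditionally: if every such use is a delayed-privatization operand whose `omp.private` op has no copy region, each thread materializes its own copy, so a stack allocation stays safe. A boiled-down, self-contained form of the new condition (function and parameter names are hypothetical, not part of the patch):

```cpp
// Hypothetical condensed form of the loop above: shared memory is required
// only when the value escapes to parallel threads through something other
// than a copy-less private clause.
bool needsSharedMemory(bool isParallelReductionVar,
                       bool usedInsideNestedParallel,
                       bool onlyCopylessPrivateUses) {
  return isParallelReductionVar ||
         (usedInsideNestedParallel && !onlyCopylessPrivateUses);
}
```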

@@ -1196,8 +1215,8 @@ allocReductionVars(T op, ArrayRef<BlockArgument> reductionArgs,
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());

llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool useDeviceSharedMem =
isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);

// delay creating stores until after all allocas
deferredStores.reserve(op.getNumReductionVars());
@@ -1318,8 +1337,8 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
return success();

llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool useDeviceSharedMem =
isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);

llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init");
auto allocaIP = llvm::IRBuilderBase::InsertPoint(
@@ -1540,8 +1559,8 @@ static LogicalResult createReductionsAndCleanup(
reductionRegions, privateReductionVariables, moduleTranslation, builder,
"omp.reduction.cleanup");

bool useDeviceSharedMem =
isa<omp::TeamsOp>(op) && mightAllocInDeviceSharedMemory(*op, *ompBuilder);
bool useDeviceSharedMem = isa<omp::TeamsOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);
if (useDeviceSharedMem) {
for (auto [var, reductionDecl] :
llvm::zip_equal(privateReductionVariables, reductionDecls))
@@ -1721,7 +1740,7 @@ allocatePrivateVars(T op, llvm::IRBuilderBase &builder,

llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool mightUseDeviceSharedMem =
isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);
unsigned int allocaAS =
moduleTranslation.getLLVMModule()->getDataLayout().getAllocaAddrSpace();
@@ -1839,7 +1858,7 @@ cleanupPrivateVars(T op, llvm::IRBuilderBase &builder,

llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool mightUseDeviceSharedMem =
isa<omp::TeamsOp, omp::DistributeOp>(*op) &&
isa<omp::TargetOp, omp::TeamsOp, omp::DistributeOp>(*op) &&
mightAllocInDeviceSharedMemory(*op, *ompBuilder);
for (auto [privDecl, llvmPrivVar, blockArg] :
llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.llvmVars,
@@ -5265,42 +5284,68 @@ handleDeclareTargetMapVar(MapInfoData &mapData,
// a store of the kernel argument into this allocated memory which
// will then be loaded from, ByCopy will use the allocated memory
// directly.
static llvm::IRBuilderBase::InsertPoint
createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg,
llvm::Value *input, llvm::Value *&retVal,
llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder &ompBuilder,
LLVM::ModuleTranslation &moduleTranslation,
llvm::IRBuilderBase::InsertPoint allocaIP,
llvm::IRBuilderBase::InsertPoint codeGenIP) {
static llvm::IRBuilderBase::InsertPoint createDeviceArgumentAccessor(
omp::TargetOp targetOp, MapInfoData &mapData, llvm::Argument &arg,
llvm::Value *input, llvm::Value *&retVal, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder &ompBuilder,
LLVM::ModuleTranslation &moduleTranslation,
llvm::IRBuilderBase::InsertPoint allocIP,
llvm::IRBuilderBase::InsertPoint codeGenIP,
llvm::ArrayRef<llvm::IRBuilderBase::InsertPoint> deallocIPs) {
assert(ompBuilder.Config.isTargetDevice() &&
"function only supported for target device codegen");
builder.restoreIP(allocaIP);
builder.restoreIP(allocIP);

omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef;
LLVM::TypeToLLVMIRTranslator typeToLLVMIRTranslator(
ompBuilder.M.getContext());
unsigned alignmentValue = 0;
BlockArgument mlirArg;
// Find the associated MapInfoData entry for the current input
for (size_t i = 0; i < mapData.MapClause.size(); ++i)
for (size_t i = 0; i < mapData.MapClause.size(); ++i) {
if (mapData.OriginalValue[i] == input) {
auto mapOp = cast<omp::MapInfoOp>(mapData.MapClause[i]);
capture = mapOp.getMapCaptureType();
// Get information of alignment of mapped object
alignmentValue = typeToLLVMIRTranslator.getPreferredAlignment(
mapOp.getVarType(), ompBuilder.M.getDataLayout());
// Get the corresponding target entry block argument
mlirArg =
cast<omp::BlockArgOpenMPOpInterface>(*targetOp).getMapBlockArgs()[i];
break;
}
}

unsigned int allocaAS = ompBuilder.M.getDataLayout().getAllocaAddrSpace();
unsigned int defaultAS =
ompBuilder.M.getDataLayout().getProgramAddressSpace();

// Create the alloca for the argument the current point.
llvm::Value *v = builder.CreateAlloca(arg.getType(), allocaAS);
// Create the allocation for the argument.
llvm::Value *v = nullptr;
if (mightAllocInDeviceSharedMemory(*targetOp, ompBuilder) &&
mustAllocPrivateVarInDeviceSharedMemory(mlirArg)) {
// Use the beginning of the codeGenIP block rather than the usual allocation
// point for shared memory allocations; otherwise they would be emitted prior
// to the target initialization call. Also, the exit block (where the
// deallocation is placed) is only executed if the initialization call
// succeeds.
builder.SetInsertPoint(codeGenIP.getBlock()->getFirstInsertionPt());
v = ompBuilder.createOMPAllocShared(builder, arg.getType());

// Create deallocations in all provided deallocation points and then restore
// the insertion point to right after the new allocations.
llvm::IRBuilderBase::InsertPointGuard guard(builder);
for (auto deallocIP : deallocIPs) {
builder.SetInsertPoint(deallocIP.getBlock(), deallocIP.getPoint());
ompBuilder.createOMPFreeShared(builder, v, arg.getType());
}
} else {
// Use the current point, which was previously set to allocIP.
v = builder.CreateAlloca(arg.getType(), allocaAS);

if (allocaAS != defaultAS && arg.getType()->isPointerTy())
v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS));
if (allocaAS != defaultAS && arg.getType()->isPointerTy())
v = builder.CreateAddrSpaceCast(v, builder.getPtrTy(defaultAS));
}

builder.CreateStore(&arg, v);
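The shared path above bottoms out in the device runtime's team-shared allocator; paraphrased declarations of the entry points that `createOMPAllocShared` / `createOMPFreeShared` emit calls to (see DeviceRTL for the authoritative signatures):

```cpp
#include <cstdint>

// Paraphrased DeviceRTL allocator entry points. The free call repeats the
// size so the runtime can unwind its shared-memory stack.
extern "C" void *__kmpc_alloc_shared(uint64_t size);
extern "C" void __kmpc_free_shared(void *ptr, uint64_t size);
```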

@@ -5890,8 +5935,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
};

auto argAccessorCB = [&](llvm::Argument &arg, llvm::Value *input,
llvm::Value *&retVal, InsertPointTy allocaIP,
InsertPointTy codeGenIP)
llvm::Value *&retVal, InsertPointTy allocIP,
InsertPointTy codeGenIP,
llvm::ArrayRef<InsertPointTy> deallocIPs)
-> llvm::OpenMPIRBuilder::InsertPointOrErrorTy {
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.SetCurrentDebugLocation(llvm::DebugLoc());
@@ -5905,9 +5951,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
return codeGenIP;
}

return createDeviceArgumentAccessor(mapData, arg, input, retVal, builder,
*ompBuilder, moduleTranslation,
allocaIP, codeGenIP);
return createDeviceArgumentAccessor(targetOp, mapData, arg, input, retVal,
builder, *ompBuilder, moduleTranslation,
allocIP, codeGenIP, deallocIPs);
};

llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs runtimeAttrs;