Skip to content

Commit

Permalink
Enable dynamic-sized VLAs for data sharing in OpenMP offloaded target…
Browse files Browse the repository at this point in the history
  • Loading branch information
doru1004 committed Jul 6, 2023
1 parent 1db5b49 commit 1388887
Show file tree
Hide file tree
Showing 6 changed files with 1,436 additions and 50 deletions.
80 changes: 63 additions & 17 deletions clang/lib/CodeGen/CGDecl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,16 @@ namespace {
}
};

/// Scope cleanup that, when emitted, asks the OpenMP runtime code generator
/// to emit the __kmpc_free_shared call for the stored pair.
struct KmpcAllocFree final : EHScopeStack::Cleanup {
  /// Pair handed unchanged to getKmpcFreeShared(); as the name says, the
  /// allocation's address (first) and size (second).
  std::pair<llvm::Value *, llvm::Value *> AddrSizePair;

  KmpcAllocFree(const std::pair<llvm::Value *, llvm::Value *> &Pair)
      : AddrSizePair(Pair) {}

  void Emit(CodeGenFunction &CGF, Flags EmissionFlags) override {
    // EmissionFlags is not consulted: the free call is emitted the same way
    // regardless of how the cleanup is being run.
    CGF.CGM.getOpenMPRuntime().getKmpcFreeShared(CGF, AddrSizePair);
  }
};

struct ExtendGCLifetime final : EHScopeStack::Cleanup {
const VarDecl &Var;
ExtendGCLifetime(const VarDecl *var) : Var(*var) {}
Expand Down Expand Up @@ -1583,28 +1593,59 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
} else {
EnsureInsertPoint();

if (!DidCallStackSave) {
// Save the stack.
Address Stack =
CreateTempAlloca(Int8PtrTy, getPointerAlign(), "saved_stack");
// Delayed globalization for variable length declarations. This ensures that
// the expression representing the length has been emitted and can be used
// by the definition of the VLA. Since this is an escaped declaration, in
// OpenMP we have to use a call to __kmpc_alloc_shared(). The matching
// deallocation call to __kmpc_free_shared() is emitted later.
bool VarAllocated = false;
if (getLangOpts().OpenMPIsDevice) {
auto &RT = CGM.getOpenMPRuntime();
if (RT.isDelayedVariableLengthDecl(*this, &D)) {
// Emit call to __kmpc_alloc_shared() instead of the alloca.
std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
RT.getKmpcAllocShared(*this, &D);

// Save the address of the allocation:
LValue Base = MakeAddrLValue(AddrSizePair.first, D.getType(),
CGM.getContext().getDeclAlign(&D),
AlignmentSource::Decl);
address = Base.getAddress(*this);

// Push a normal cleanup that emits the matching __kmpc_free_shared call
// when the scope containing this __kmpc_alloc_shared allocation is
// exited:
pushKmpcAllocFree(NormalCleanup, AddrSizePair);

// Mark variable as allocated:
VarAllocated = true;
}
}

llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::stacksave);
llvm::Value *V = Builder.CreateCall(F);
Builder.CreateStore(V, Stack);
if (!VarAllocated) {
if (!DidCallStackSave) {
// Save the stack.
Address Stack =
CreateTempAlloca(Int8PtrTy, getPointerAlign(), "saved_stack");

DidCallStackSave = true;
llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::stacksave);
llvm::Value *V = Builder.CreateCall(F);
Builder.CreateStore(V, Stack);

// Push a cleanup block and restore the stack there.
// FIXME: in general circumstances, this should be an EH cleanup.
pushStackRestore(NormalCleanup, Stack);
}
DidCallStackSave = true;

auto VlaSize = getVLASize(Ty);
llvm::Type *llvmTy = ConvertTypeForMem(VlaSize.Type);
// Push a cleanup block and restore the stack there.
// FIXME: in general circumstances, this should be an EH cleanup.
pushStackRestore(NormalCleanup, Stack);
}

auto VlaSize = getVLASize(Ty);
llvm::Type *llvmTy = ConvertTypeForMem(VlaSize.Type);

// Allocate memory for the array.
address = CreateTempAlloca(llvmTy, alignment, "vla", VlaSize.NumElts,
&AllocaAddr);
// Allocate memory for the array.
address = CreateTempAlloca(llvmTy, alignment, "vla", VlaSize.NumElts,
&AllocaAddr);
}

// If we have debug info enabled, properly describe the VLA dimensions for
// this type by registering the vla size expression for each of the
Expand Down Expand Up @@ -2141,6 +2182,11 @@ void CodeGenFunction::pushStackRestore(CleanupKind Kind, Address SPMem) {
EHStack.pushCleanup<CallStackRestore>(Kind, SPMem);
}

/// Pushes a KmpcAllocFree cleanup of kind \p Kind onto the EH scope stack.
/// \p AddrSizePair is forwarded to the cleanup's constructor; the cleanup
/// later passes it to the OpenMP runtime's getKmpcFreeShared().
void CodeGenFunction::pushKmpcAllocFree(
CleanupKind Kind, std::pair<llvm::Value *, llvm::Value *> AddrSizePair) {
EHStack.pushCleanup<KmpcAllocFree>(Kind, AddrSizePair);
}

void CodeGenFunction::pushLifetimeExtendedDestroy(CleanupKind cleanupKind,
Address addr, QualType type,
Destroyer *destroyer,
Expand Down
19 changes: 19 additions & 0 deletions clang/lib/CodeGen/CGOpenMPRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,25 @@ class CGOpenMPRuntime {
/// Returns true if the current target is a GPU.
virtual bool isTargetCodegen() const { return false; }

/// Returns true if the allocation of the variable length declaration \p VD
/// is delayed for the function being emitted by \p CGF.
///
/// This default implementation never delays anything and always returns
/// false; runtimes that support delayed allocation override it.
virtual bool isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                         const VarDecl *VD) const {
  return false;
}

/// Get call to __kmpc_alloc_shared for the declaration \p VD.
///
/// This default implementation must not be reached: it aborts through
/// llvm_unreachable. A runtime has to override it if its
/// isDelayedVariableLengthDecl() can return true.
/// NOTE(review): the GPU runtime override returns the emitted call as
/// `first` and the allocation size as `second` -- confirm other overrides
/// follow the same convention.
virtual std::pair<llvm::Value *, llvm::Value *>
getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) {
llvm_unreachable("not implemented");
}

/// Get call to __kmpc_free_shared for the pair \p AddrSizePair.
///
/// Despite the "get" prefix nothing is returned. This default
/// implementation must not be reached: it aborts through llvm_unreachable,
/// so runtimes that allocate with getKmpcAllocShared() have to override it.
virtual void getKmpcFreeShared(
CodeGenFunction &CGF,
const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
llvm_unreachable("not implemented");
}

/// Emits code for OpenMP 'if' clause using specified \a CodeGen
/// function. Here is the logic:
/// if (Cond) {
Expand Down
111 changes: 78 additions & 33 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ class CheckVarsEscapingDeclContext final
CodeGenFunction &CGF;
llvm::SetVector<const ValueDecl *> EscapedDecls;
llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
RecordDecl *GlobalizedRD = nullptr;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
Expand All @@ -221,10 +222,12 @@ class CheckVarsEscapingDeclContext final
if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
return;
// Variables captured by value must be globalized.
bool IsCaptured = false;
if (auto *CSI = CGF.CapturedStmtInfo) {
if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
// Check if need to capture the variable that was already captured by
// value in the outer region.
IsCaptured = true;
if (!IsForCombinedParallelRegion) {
if (!FD->hasAttrs())
return;
Expand All @@ -251,9 +254,14 @@ class CheckVarsEscapingDeclContext final
VD->getType()->isReferenceType())
// Do not globalize variables with reference type.
return;
if (VD->getType()->isVariablyModifiedType())
EscapedVariableLengthDecls.insert(VD);
else
if (VD->getType()->isVariablyModifiedType()) {
// If not captured at the target region level then mark the escaped
// variable as delayed.
if (IsCaptured)
EscapedVariableLengthDecls.insert(VD);
else
DelayedVariableLengthDecls.insert(VD);
} else
EscapedDecls.insert(VD);
}

Expand Down Expand Up @@ -504,6 +512,12 @@ class CheckVarsEscapingDeclContext final
ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
// Non-owning view over the elements of the EscapedVariableLengthDecls set.
return EscapedVariableLengthDecls.getArrayRef();
}

/// Returns the list of the delayed variables with the variably modified
/// types, i.e. the escaped variable length declarations that were recorded
/// in DelayedVariableLengthDecls rather than EscapedVariableLengthDecls
/// because they were not found among the captured variables.
ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
// Non-owning view over the elements of the DelayedVariableLengthDecls set.
return DelayedVariableLengthDecls.getArrayRef();
}
};
} // anonymous namespace

Expand Down Expand Up @@ -1084,41 +1098,66 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
if (auto *DI = CGF.getDebugInfo())
VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
}
for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
// Use actual memory size of the VLA object including the padding
// for alignment purposes.
llvm::Value *Size = CGF.getTypeSize(VD->getType());
CharUnits Align = CGM.getContext().getDeclAlign(VD);
Size = Bld.CreateNUWAdd(
Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
llvm::Value *AlignVal =
llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());

Size = Bld.CreateUDiv(Size, AlignVal);
Size = Bld.CreateNUWMul(Size, AlignVal);

// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::CallBase *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
VoidPtr->addRetAttr(
llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
CGM.getContext().getTargetInfo().getNewAlign()));

I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
std::pair<llvm::Value *, llvm::Value *>(
{VoidPtr, CGF.getTypeSize(VD->getType())}));
LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
const auto *VD = cast<VarDecl>(ValueD);
std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
getKmpcAllocShared(CGF, VD);
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
CGM.getContext().getDeclAlign(VD),
AlignmentSource::Decl);
I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
Base.getAddress(CGF));
I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
}
I->getSecond().MappedParams->apply(CGF);
}

/// Reports whether \p VD was recorded as a delayed variable length
/// declaration for the function currently being emitted by \p CGF.
///
/// \returns false when no globalization record exists for CGF.CurFn or when
/// \p VD is not in that record's DelayedVariableLengthDecls list.
bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                                     const VarDecl *VD) const {
  const auto Entry = FunctionGlobalizedDecls.find(CGF.CurFn);
  // Without a record for this function nothing can have been delayed.
  const bool HasRecord = Entry != FunctionGlobalizedDecls.end();
  return HasRecord &&
         llvm::is_contained(Entry->getSecond().DelayedVariableLengthDecls, VD);
}

/// Emits a call to __kmpc_alloc_shared that allocates storage for \p VD.
///
/// The requested size is the type size of \p VD rounded up to a multiple of
/// the declaration's alignment. The call's return value is annotated with
/// that alignment.
///
/// \returns the emitted call (first) and the rounded-up size (second).
std::pair<llvm::Value *, llvm::Value *>
CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
                                       const VarDecl *VD) {
  CGBuilderTy &IRB = CGF.Builder;

  // Raw object size first, then the declaration's alignment as constants.
  llvm::Value *TypeSize = CGF.getTypeSize(VD->getType());
  const CharUnits DeclAlign = CGM.getContext().getDeclAlign(VD);
  llvm::Value *AlignMinusOne =
      llvm::ConstantInt::get(CGF.SizeTy, DeclAlign.getQuantity() - 1);
  llvm::Value *AlignVal =
      llvm::ConstantInt::get(CGF.SizeTy, DeclAlign.getQuantity());

  // Round up: ((size + align - 1) / align) * align.
  llvm::Value *Padded = IRB.CreateNUWAdd(TypeSize, AlignMinusOne);
  llvm::Value *Chunks = IRB.CreateUDiv(Padded, AlignVal);
  llvm::Value *AllocSize = IRB.CreateNUWMul(Chunks, AlignVal);

  // Allocate space for this VLA object to be globalized.
  llvm::FunctionCallee AllocFn = OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_alloc_shared);
  llvm::Value *AllocArgs[] = {AllocSize};
  llvm::CallBase *AllocCall =
      CGF.EmitRuntimeCall(AllocFn, AllocArgs, VD->getName());
  AllocCall->addRetAttr(llvm::Attribute::get(CGM.getLLVMContext(),
                                             llvm::Attribute::Alignment,
                                             DeclAlign.getQuantity()));

  return {AllocCall, AllocSize};
}

/// Emits a call to __kmpc_free_shared, passing the two values of
/// \p AddrSizePair (first, then second) as its arguments. This deallocates
/// the memory of a globalized VLA object.
void CGOpenMPRuntimeGPU::getKmpcFreeShared(
    CodeGenFunction &CGF,
    const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
  llvm::FunctionCallee FreeFn = OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_free_shared);
  llvm::Value *FreeArgs[] = {AddrSizePair.first, AddrSizePair.second};
  CGF.EmitRuntimeCall(FreeFn, FreeArgs);
}

void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
bool WithSPMDCheck) {
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
Expand All @@ -1127,7 +1166,8 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,

const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I != FunctionGlobalizedDecls.end()) {
// Deallocate the memory for each globalized VLA object
// Deallocate the memory for each globalized VLA object that was
// globalized in the prolog (i.e. emitGenericVarsProlog).
for (const auto &AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
Expand Down Expand Up @@ -3282,7 +3322,10 @@ void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
TeamAndReductions.second.clear();
ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
VarChecker.getEscapedVariableLengthDecls();
if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
VarChecker.getDelayedVariableLengthDecls();
if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
DelayedVariableLengthDecls.empty())
return;
auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
I->getSecond().MappedParams =
Expand All @@ -3292,6 +3335,8 @@ void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
VarChecker.getEscapedParameters().end());
I->getSecond().EscapedVariableLengthDecls.append(
EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
I->getSecond().DelayedVariableLengthDecls.append(
DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
assert(VD->isCanonicalDecl() && "Expected canonical declaration");
Expand Down
14 changes: 14 additions & 0 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,19 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// by all specializations of OpenMPGPURuntime Targets like AMDGCN
/// and NVPTX.

/// Check if the variable length declaration \p VD is delayed, i.e. whether
/// it was recorded in the DelayedVariableLengthDecls list kept for the
/// function currently being emitted by \p CGF.
bool isDelayedVariableLengthDecl(CodeGenFunction &CGF,
const VarDecl *VD) const override;

/// Emit a call to __kmpc_alloc_shared for the declaration \p VD.
/// \returns the emitted call (first) and the size passed to it (second),
/// which is the type size of \p VD rounded up to its declared alignment.
std::pair<llvm::Value *, llvm::Value *>
getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) override;

/// Emit a call to __kmpc_free_shared whose arguments are the two values of
/// \p AddrSizePair (first, then second). Nothing is returned despite the
/// "get" prefix.
void getKmpcFreeShared(
CodeGenFunction &CGF,
const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) override;

/// Get the GPU warp size.
llvm::Value *getGPUWarpSize(CodeGenFunction &CGF);

Expand Down Expand Up @@ -359,6 +372,7 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
DeclToAddrMapTy LocalVarData;
EscapedParamsTy EscapedParameters;
llvm::SmallVector<const ValueDecl*, 4> EscapedVariableLengthDecls;
llvm::SmallVector<const ValueDecl *, 4> DelayedVariableLengthDecls;
llvm::SmallVector<std::pair<llvm::Value *, llvm::Value *>, 4>
EscapedVariableLengthDeclsAddrs;
std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams;
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/CodeGen/CodeGenFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -2065,6 +2065,8 @@ class CodeGenFunction : public CodeGenTypeCache {
llvm::Value *CompletePtr,
QualType ElementType);
void pushStackRestore(CleanupKind kind, Address SPMem);
/// Push a cleanup of kind \p Kind that emits the __kmpc_free_shared call
/// for \p AddrSizePair when the cleanup is run.
void pushKmpcAllocFree(CleanupKind Kind,
std::pair<llvm::Value *, llvm::Value *> AddrSizePair);
void emitDestroy(Address addr, QualType type, Destroyer *destroyer,
bool useEHCleanupForArray);
llvm::Function *generateDestroyHelper(Address addr, QualType type,
Expand Down

0 comments on commit 1388887

Please sign in to comment.