Skip to content

Commit

Permalink
[OpenMP][FIX] Allocate per launch memory for GPU team reductions (#70752
Browse files Browse the repository at this point in the history
)

We used to perform team reduction on global memory allocated in the
runtime and by clang. This was racy as multiple instances of a kernel,
or different kernels with team reductions, would use the same locations.
Since we now have the kernel launch environment, we can allocate dynamic
memory per-launch, allowing us to move all the state into a non-racy
place.

Fixes: #70249
  • Loading branch information
jdoerfert committed Nov 1, 2023
1 parent 0d3377c commit f9a89e6
Show file tree
Hide file tree
Showing 9 changed files with 231 additions and 195 deletions.
75 changes: 28 additions & 47 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -803,8 +803,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
if (!IsSPMD)
emitGenericVarsEpilog(CGF);

// This is temporary until we remove the fixed sized buffer.
ASTContext &C = CGM.getContext();
RecordDecl *StaticRD = C.buildImplicitRecord(
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
StaticRD->startDefinition();
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
QualType RecTy = C.getRecordType(TeamReductionRec);
auto *Field = FieldDecl::Create(
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
StaticRD->addDecl(Field);
}
StaticRD->completeDefinition();
QualType StaticTy = C.getRecordType(StaticRD);
llvm::Type *LLVMReductionsBufferTy =
CGM.getTypes().ConvertTypeForMem(StaticTy);
const auto &DL = CGM.getModule().getDataLayout();
uint64_t BufferSize =
DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
CGBuilderTy &Bld = CGF.Builder;
OMPBuilder.createTargetDeinit(Bld);
OMPBuilder.createTargetDeinit(Bld, BufferSize);
}

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
Expand Down Expand Up @@ -2998,15 +3020,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
C.getLangOpts().OpenMPCUDAReductionBufNum);
TeamsReductions.push_back(TeamReductionRec);
if (!KernelTeamsReductionPtr) {
KernelTeamsReductionPtr = new llvm::GlobalVariable(
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, nullptr,
"_openmp_teams_reductions_buffer_$_$ptr");
}
llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
/*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
{}, "_openmp_teams_reductions_buffer_$_$ptr");
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
Expand All @@ -3021,7 +3038,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
llvm::Value *Args[] = {
RTLoc,
ThreadId,
GlobalBufferPtr,
KernelTeamsReductionPtr,
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
RL,
ShuffleAndReduceFn,
Expand Down Expand Up @@ -3654,42 +3671,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
CGOpenMPRuntime::processRequiresDirective(D);
}

void CGOpenMPRuntimeGPU::clear() {

if (!TeamsReductions.empty()) {
ASTContext &C = CGM.getContext();
RecordDecl *StaticRD = C.buildImplicitRecord(
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
StaticRD->startDefinition();
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
QualType RecTy = C.getRecordType(TeamReductionRec);
auto *Field = FieldDecl::Create(
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
StaticRD->addDecl(Field);
}
StaticRD->completeDefinition();
QualType StaticTy = C.getRecordType(StaticRD);
llvm::Type *LLVMReductionsBufferTy =
CGM.getTypes().ConvertTypeForMem(StaticTy);
// FIXME: nvlink does not handle weak linkage correctly (object with the
// different size are reported as erroneous).
// Restore CommonLinkage as soon as nvlink is fixed.
auto *GV = new llvm::GlobalVariable(
CGM.getModule(), LLVMReductionsBufferTy,
/*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
llvm::Constant::getNullValue(LLVMReductionsBufferTy),
"_openmp_teams_reductions_buffer_$_");
KernelTeamsReductionPtr->setInitializer(
llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
CGM.VoidPtrTy));
}
CGOpenMPRuntime::clear();
}

llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Module *M = &CGF.CGM.getModule();
Expand Down
2 changes: 0 additions & 2 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {

public:
explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
void clear() override;

bool isGPU() const override { return true; };

Expand Down Expand Up @@ -386,7 +385,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// Maps the function to the list of the globalized variables with their
/// addresses.
llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
/// List of the records with the list of fields for the reductions across the
/// teams. Used to build the intermediate buffer for the fast teams
/// reductions.
Expand Down

0 comments on commit f9a89e6

Please sign in to comment.