Introduce the initial support for OpenMP kernel language (#66844)
This patch starts the support for the OpenMP kernel language, which allows writing
OpenMP target regions in SIMT style, similar to kernel languages such as CUDA.
This first patch includes the `ompx_bare` clause for the `target teams` directive.
When `ompx_bare` is present, globalization is disabled, so local variables are not
globalized, and the runtime init/deinit function calls are not emitted. As a
consequence, almost all OpenMP executable directives, such as `parallel` and
`task`, are not supported in the region. This patch does not include the Sema
checks for that, so using them is undefined behavior. Simple directives, such as
`atomic`, can still be used. We provide a set of APIs (for C, prefixed with
`ompx_`; for C++, in the `ompx` namespace) to query the thread id, block id, etc.
Please refer to
https://tianshilei.me/wp-content/uploads/llvm-hpc-2023.pdf for more details.
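For illustration, here is a minimal sketch of what a bare kernel could look like. The specific accessor names (`ompx_block_id_x`, `ompx_block_dim_x`, `ompx_thread_id_x`) and the `<ompx.h>` header are assumptions based on the description above, not something this patch adds; the real spellings may differ.

```c++
#include <ompx.h> // assumed device-side header exposing the ompx_* query APIs

void vector_add(float *a, const float *b, const float *c, int n) {
  int nteams = (n + 255) / 256;
  // In a bare kernel every thread executes the region body directly, so the
  // global index is composed from block/thread ids exactly as in CUDA.
  // The ompx_* calls below are assumed names for the thread/block queries.
#pragma omp target teams ompx_bare num_teams(nteams) thread_limit(256) \
    map(tofrom : a[:n]) map(to : b[:n], c[:n])
  {
    int i = ompx_block_id_x() * ompx_block_dim_x() + ompx_thread_id_x();
    if (i < n)
      a[i] = b[i] + c[i];
  }
}
```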
shiltian committed Oct 5, 2023
1 parent 7a46baa commit d6254e1
Showing 20 changed files with 1,145 additions and 657 deletions.
21 changes: 21 additions & 0 deletions clang/include/clang/AST/OpenMPClause.h
@@ -9220,6 +9220,27 @@ class OMPXAttributeClause
}
};

/// This represents 'ompx_bare' clause in the '#pragma omp target teams ...'
/// directive.
///
/// \code
/// #pragma omp target teams ompx_bare
/// \endcode
/// In this example directive '#pragma omp target teams' has a 'ompx_bare'
/// clause.
class OMPXBareClause : public OMPNoChildClause<llvm::omp::OMPC_ompx_bare> {
public:
/// Build 'ompx_bare' clause.
///
/// \param StartLoc Starting location of the clause.
/// \param EndLoc Ending location of the clause.
OMPXBareClause(SourceLocation StartLoc, SourceLocation EndLoc)
: OMPNoChildClause(StartLoc, EndLoc) {}

/// Build an empty clause.
OMPXBareClause() = default;
};

} // namespace clang

#endif // LLVM_CLANG_AST_OPENMPCLAUSE_H
5 changes: 5 additions & 0 deletions clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3890,6 +3890,11 @@ bool RecursiveASTVisitor<Derived>::VisitOMPXAttributeClause(
return true;
}

template <typename Derived>
bool RecursiveASTVisitor<Derived>::VisitOMPXBareClause(OMPXBareClause *C) {
return true;
}

// FIXME: look at the following tricky-seeming exprs to see if we
// need to recurse on anything. These are ones that have methods
// returning decls or qualtypes or nestednamespecifier -- though I'm
4 changes: 4 additions & 0 deletions clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1360,6 +1360,8 @@ def warn_clause_expected_string : Warning<
"expected string literal in 'clause %0' - ignoring">, InGroup<IgnoredPragmas>;
def err_omp_unexpected_clause : Error<
"unexpected OpenMP clause '%0' in directive '#pragma omp %1'">;
def err_omp_unexpected_clause_extension_only : Error<
"OpenMP clause '%0' is only available as extension, use '-fopenmp-extensions'">;
def err_omp_immediate_directive : Error<
"'#pragma omp %0' %select{|with '%2' clause }1cannot be an immediate substatement">;
def err_omp_expected_identifier_for_critical : Error<
@@ -1452,6 +1454,8 @@ def warn_unknown_declare_variant_isa_trait
"spelling or consider restricting the context selector with the "
"'arch' selector further">,
InGroup<SourceUsesOpenMP>;
def note_ompx_bare_clause : Note<
"OpenMP extension clause '%0' only allowed with '#pragma omp %1'">;
def note_omp_declare_variant_ctx_options
: Note<"context %select{set|selector|property}0 options are: %1">;
def warn_omp_declare_variant_expected
4 changes: 4 additions & 0 deletions clang/include/clang/Sema/Sema.h
@@ -12486,6 +12486,10 @@ class Sema final {
SourceLocation LParenLoc,
SourceLocation EndLoc);

/// Called on a well-formed 'ompx_bare' clause.
OMPClause *ActOnOpenMPXBareClause(SourceLocation StartLoc,
SourceLocation EndLoc);

/// The kind of conversion being performed.
enum CheckedConversionKind {
/// An implicit conversion.
5 changes: 5 additions & 0 deletions clang/lib/AST/OpenMPClause.cpp
@@ -170,6 +170,7 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
case OMPC_affinity:
case OMPC_when:
case OMPC_bind:
case OMPC_ompx_bare:
break;
default:
break;
@@ -2546,6 +2547,10 @@ void OMPClausePrinter::VisitOMPXAttributeClause(OMPXAttributeClause *Node) {
OS << ")";
}

void OMPClausePrinter::VisitOMPXBareClause(OMPXBareClause *Node) {
OS << "ompx_bare";
}

void OMPTraitInfo::getAsVariantMatchInfo(ASTContext &ASTCtx,
VariantMatchInfo &VMI) const {
for (const OMPTraitSet &Set : Sets) {
1 change: 1 addition & 0 deletions clang/lib/AST/StmtProfile.cpp
@@ -930,6 +930,7 @@ void OMPClauseProfiler::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
}
void OMPClauseProfiler::VisitOMPXAttributeClause(const OMPXAttributeClause *C) {
}
void OMPClauseProfiler::VisitOMPXBareClause(const OMPXBareClause *C) {}
} // namespace

void
59 changes: 42 additions & 17 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -551,10 +551,9 @@ CGOpenMPRuntimeGPU::getExecutionMode() const {
return CurrentExecutionMode;
}

static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
: CGOpenMPRuntimeGPU::Generic;
CGOpenMPRuntimeGPU::DataSharingMode
CGOpenMPRuntimeGPU::getDataSharingMode() const {
return CurrentDataSharingMode;
}

/// Check for inner (nested) SPMD construct, if any
@@ -752,6 +751,9 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
EntryFunctionState EST;
WrapperFunctionsMap.clear();

[[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
assert(!IsBareKernel && "bare kernel should not be at generic mode");

// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
@@ -760,15 +762,13 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
: EST(EST) {}
void Enter(CodeGenFunction &CGF) override {
auto &RT =
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
void Exit(CodeGenFunction &CGF) override {
auto &RT =
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
}
@@ -807,25 +807,39 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
EntryFunctionState EST;

bool IsBareKernel = D.getSingleClause<OMPXBareClause>();

// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
CGOpenMPRuntimeGPU &RT;
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
bool IsBareKernel;
DataSharingMode Mode;

public:
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
CGOpenMPRuntimeGPU::EntryFunctionState &EST)
: RT(RT), EST(EST) {}
CGOpenMPRuntimeGPU::EntryFunctionState &EST,
bool IsBareKernel)
: RT(RT), EST(EST), IsBareKernel(IsBareKernel),
Mode(RT.CurrentDataSharingMode) {}
void Enter(CodeGenFunction &CGF) override {
if (IsBareKernel) {
RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
return;
}
RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
void Exit(CodeGenFunction &CGF) override {
if (IsBareKernel) {
RT.CurrentDataSharingMode = Mode;
return;
}
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
}
} Action(*this, EST);
} Action(*this, EST, IsBareKernel);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
@@ -843,7 +857,8 @@ void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
assert(!ParentName.empty() && "Invalid target region parent name!");

bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
if (Mode)
bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
if (Mode || IsBareKernel)
emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);
else
Expand All @@ -863,6 +878,9 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
if (!CGM.getLangOpts().OpenMPIsTargetDevice)
llvm_unreachable("OpenMP can only handle device code.");

if (CGM.getLangOpts().OpenMPCUDAMode)
CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;

llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
return;
@@ -1030,7 +1048,7 @@ llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
SourceLocation Loc,
bool WithSPMDCheck) {
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic &&
getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
return;

@@ -1142,7 +1160,7 @@ void CGOpenMPRuntimeGPU::getKmpcFreeShared(

void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
bool WithSPMDCheck) {
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic &&
getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
return;

@@ -1178,11 +1196,18 @@ void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
if (!CGF.HaveInsertPoint())
return;

bool IsBareKernel = D.getSingleClause<OMPXBareClause>();

Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
/*Name=*/".zero.addr");
CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
// We don't emit any thread id function call in bare kernel, but because the
// outlined function has a pointer argument, we emit a nullptr here.
if (IsBareKernel)
OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
else
OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
@@ -3273,7 +3298,7 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(

void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
const Decl *D) {
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
return;

assert(D && "Expected function or captured|block decl.");
@@ -3382,7 +3407,7 @@ Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
VarTy, Align);
}

if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
return Address::invalid();

VD = VD->getCanonicalDecl();
29 changes: 18 additions & 11 deletions clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -32,6 +32,18 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// Unknown execution mode (orphaned directive).
EM_Unknown,
};

/// Target codegen is specialized based on two data-sharing modes: CUDA, in
/// which the local variables are actually global threadlocal, and Generic, in
/// which the local variables are placed in global memory if they may escape
/// their declaration context.
enum DataSharingMode {
/// CUDA data sharing mode.
DS_CUDA,
/// Generic data-sharing mode.
DS_Generic,
};

private:
/// Parallel outlined function work for workers to execute.
llvm::SmallVector<llvm::Function *, 16> Work;
@@ -42,6 +54,8 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {

ExecutionMode getExecutionMode() const;

DataSharingMode getDataSharingMode() const;

/// Get barrier to synchronize all threads in a block.
void syncCTAThreads(CodeGenFunction &CGF);

@@ -297,17 +311,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
Address getAddressOfLocalVariable(CodeGenFunction &CGF,
const VarDecl *VD) override;

/// Target codegen is specialized based on two data-sharing modes: CUDA, in
/// which the local variables are actually global threadlocal, and Generic, in
/// which the local variables are placed in global memory if they may escape
/// their declaration context.
enum DataSharingMode {
/// CUDA data sharing mode.
CUDA,
/// Generic data-sharing mode.
Generic,
};

/// Cleans up references to the objects in finished function.
///
void functionFinished(CodeGenFunction &CGF) override;
@@ -343,6 +346,10 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// to emit optimized code.
ExecutionMode CurrentExecutionMode = EM_Unknown;

/// Track the data sharing mode when codegening directives within a target
/// region.
DataSharingMode CurrentDataSharingMode = DataSharingMode::DS_Generic;

/// true if currently emitting code for target/teams/distribute region, false
/// - otherwise.
bool IsInTTDRegion = false;
11 changes: 11 additions & 0 deletions clang/lib/Parse/ParseOpenMP.cpp
@@ -3416,6 +3416,17 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
case OMPC_ompx_attribute:
Clause = ParseOpenMPOMPXAttributesClause(WrongDirective);
break;
case OMPC_ompx_bare:
if (WrongDirective)
Diag(Tok, diag::note_ompx_bare_clause)
<< getOpenMPClauseName(CKind) << "target teams";
if (!ErrorFound && !getLangOpts().OpenMPExtensions) {
Diag(Tok, diag::err_omp_unexpected_clause_extension_only)
<< getOpenMPClauseName(CKind) << getOpenMPDirectiveName(DKind);
ErrorFound = true;
}
Clause = ParseOpenMPClause(CKind, WrongDirective);
break;
default:
break;
}
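For reference, a sketch of how this gating is expected to surface to users, based only on the new diagnostic definitions and the parser change above (not verified compiler output):

```c++
// Sketch only; the expected wording follows the DiagnosticParseKinds.td
// entries introduced in this patch.

// 1) Without -fopenmp-extensions, the clause is rejected:
//      error: OpenMP clause 'ompx_bare' is only available as extension,
//             use '-fopenmp-extensions'
void gated() {
#pragma omp target teams ompx_bare
  {}
}

// 2) On a directive other than 'target teams', the usual "unexpected clause"
//    error is emitted, accompanied by the new note:
//      note: OpenMP extension clause 'ompx_bare' only allowed with
//            '#pragma omp target teams'
void wrong_directive() {
#pragma omp target ompx_bare
  {}
}
```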
8 changes: 8 additions & 0 deletions clang/lib/Sema/SemaOpenMP.cpp
@@ -17553,6 +17553,9 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind,
case OMPC_partial:
Res = ActOnOpenMPPartialClause(nullptr, StartLoc, /*LParenLoc=*/{}, EndLoc);
break;
case OMPC_ompx_bare:
Res = ActOnOpenMPXBareClause(StartLoc, EndLoc);
break;
case OMPC_if:
case OMPC_final:
case OMPC_num_threads:
@@ -24279,3 +24282,8 @@ OMPClause *Sema::ActOnOpenMPXAttributeClause(ArrayRef<const Attr *> Attrs,
SourceLocation EndLoc) {
return new (Context) OMPXAttributeClause(Attrs, StartLoc, LParenLoc, EndLoc);
}

OMPClause *Sema::ActOnOpenMPXBareClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return new (Context) OMPXBareClause(StartLoc, EndLoc);
}
14 changes: 14 additions & 0 deletions clang/lib/Sema/TreeTransform.h
@@ -2391,6 +2391,15 @@ class TreeTransform {
EndLoc);
}

/// Build a new OpenMP 'ompx_bare' clause.
///
/// By default, performs semantic analysis to build the new OpenMP clause.
/// Subclasses may override this routine to provide different behavior.
OMPClause *RebuildOMPXBareClause(SourceLocation StartLoc,
SourceLocation EndLoc) {
return getSema().ActOnOpenMPXBareClause(StartLoc, EndLoc);
}

/// Build a new OpenMP 'align' clause.
///
/// By default, performs semantic analysis to build the new OpenMP clause.
@@ -10807,6 +10816,11 @@ TreeTransform<Derived>::TransformOMPXAttributeClause(OMPXAttributeClause *C) {
NewAttrs, C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
}

template <typename Derived>
OMPClause *TreeTransform<Derived>::TransformOMPXBareClause(OMPXBareClause *C) {
return getDerived().RebuildOMPXBareClause(C->getBeginLoc(), C->getEndLoc());
}

//===----------------------------------------------------------------------===//
// Expression transformation
//===----------------------------------------------------------------------===//
5 changes: 5 additions & 0 deletions clang/lib/Serialization/ASTReader.cpp
@@ -10447,6 +10447,9 @@ OMPClause *OMPClauseReader::readClause() {
case llvm::omp::OMPC_ompx_attribute:
C = new (Context) OMPXAttributeClause();
break;
case llvm::omp::OMPC_ompx_bare:
C = new (Context) OMPXBareClause();
break;
#define OMP_CLAUSE_NO_CLASS(Enum, Str) \
case llvm::omp::Enum: \
break;
@@ -11548,6 +11551,8 @@ void OMPClauseReader::VisitOMPXAttributeClause(OMPXAttributeClause *C) {
C->setLocEnd(Record.readSourceLocation());
}

void OMPClauseReader::VisitOMPXBareClause(OMPXBareClause *C) {}

OMPTraitInfo *ASTRecordReader::readOMPTraitInfo() {
OMPTraitInfo &TI = getContext().getNewOMPTraitInfo();
TI.Sets.resize(readUInt32());
