Skip to content

Commit

Permalink
[OpenMP] Add option to make offloading mandatory
Browse files Browse the repository at this point in the history
Currently when we generate OpenMP offloading code we always make
fallback code for the CPU. This is necessary for implementing features
like conditional offloading and ensuring that unhandled pragmas don't
result in missing symbols. However, this is problematic for a few cases.
For offloading tests we can silently fail to the host without realizing
that offloading failed. Additionally, this makes it impossible to
provide interoperability to other offloading schemes like HIP or CUDA
because those methods do not provide any such host fallback guarantee.
This patch adds the `-fopenmp-offload-mandatory` flag to prevent
generating the fallback symbol on the CPU and instead replaces the
function with a dummy global and the failed branch with 'unreachable'.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D120353
  • Loading branch information
jhuber6 committed Feb 23, 2022
1 parent a4541fd commit 2b97b16
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 27 deletions.
1 change: 1 addition & 0 deletions clang/include/clang/Basic/LangOptions.def
Expand Up @@ -247,6 +247,7 @@ LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the
LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.")
LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.")
LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.")
LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.")
LANGOPT(RenderScript , 1, 0, "RenderScript")

LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device")
Expand Down
4 changes: 4 additions & 0 deletions clang/include/clang/Driver/Options.td
Expand Up @@ -2479,6 +2479,10 @@ def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
HelpText<"Assert no thread in a parallel region modifies an ICV">,
MarshallingInfoFlag<LangOpts<"OpenMPNoThreadState">>;
def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused]>,
HelpText<"Do not create a host fallback if offloading to the device fails.">,
MarshallingInfoFlag<LangOpts<"OpenMPOffloadMandatory">>;
defm openmp_target_new_runtime: BoolFOption<"openmp-target-new-runtime",
LangOpts<"OpenMPTargetNewRuntime">, DefaultTrue,
PosFlag<SetTrue, [CC1Option], "Use the new bitcode library for OpenMP offloading">,
Expand Down
72 changes: 46 additions & 26 deletions clang/lib/CodeGen/CGOpenMPRuntime.cpp
Expand Up @@ -6538,6 +6538,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
// mangled name of the function that encloses the target region and BB is the
// line number of the target region.

const bool BuildOutlinedFn = CGM.getLangOpts().OpenMPIsDevice ||
!CGM.getLangOpts().OpenMPOffloadMandatory;
unsigned DeviceID;
unsigned FileID;
unsigned Line;
Expand All @@ -6556,7 +6558,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen, EntryFnName);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);

OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc());
if (BuildOutlinedFn)
OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc());

// If this target outline function is not an offload entry, we don't need to
// register it.
Expand Down Expand Up @@ -6588,26 +6591,38 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
llvm::Constant::getNullValue(CGM.Int8Ty), Name);
}

// If we do not allow host fallback we still need a named address to use.
llvm::Constant *TargetRegionEntryAddr = OutlinedFn;
if (!BuildOutlinedFn) {
assert(!CGM.getModule().getGlobalVariable(EntryFnName, true) &&
"Named kernel already exists?");
TargetRegionEntryAddr = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage,
llvm::Constant::getNullValue(CGM.Int8Ty), EntryFnName);
}

// Register the information for the entry associated with this target region.
OffloadEntriesInfoManager.registerTargetRegionEntryInfo(
DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID,
DeviceID, FileID, ParentName, Line, TargetRegionEntryAddr, OutlinedFnID,
OffloadEntriesInfoManagerTy::OMPTargetRegionEntryTargetRegion);

// Add NumTeams and ThreadLimit attributes to the outlined GPU function
int32_t DefaultValTeams = -1;
getNumTeamsExprForTargetDirective(CGF, D, DefaultValTeams);
if (DefaultValTeams > 0) {
if (DefaultValTeams > 0 && OutlinedFn) {
OutlinedFn->addFnAttr("omp_target_num_teams",
std::to_string(DefaultValTeams));
}
int32_t DefaultValThreads = -1;
getNumThreadsExprForTargetDirective(CGF, D, DefaultValThreads);
if (DefaultValThreads > 0) {
if (DefaultValThreads > 0 && OutlinedFn) {
OutlinedFn->addFnAttr("omp_target_thread_limit",
std::to_string(DefaultValThreads));
}

CGM.getTargetCodeGenInfo().setTargetAttributes(nullptr, OutlinedFn, CGM);
if (BuildOutlinedFn)
CGM.getTargetCodeGenInfo().setTargetAttributes(nullptr, OutlinedFn, CGM);
}

/// Checks if the expression is constant or does not have non-trivial function
Expand Down Expand Up @@ -10324,7 +10339,10 @@ void CGOpenMPRuntime::emitTargetCall(
if (!CGF.HaveInsertPoint())
return;

assert(OutlinedFn && "Invalid outlined function!");
const bool OffloadingMandatory = !CGM.getLangOpts().OpenMPIsDevice &&
CGM.getLangOpts().OpenMPOffloadMandatory;

assert((OffloadingMandatory || OutlinedFn) && "Invalid outlined function!");

const bool RequiresOuterTask = D.hasClausesOfKind<OMPDependClause>() ||
D.hasClausesOfKind<OMPNowaitClause>();
Expand All @@ -10339,18 +10357,28 @@ void CGOpenMPRuntime::emitTargetCall(
CodeGenFunction::OMPTargetDataInfo InputInfo;
llvm::Value *MapTypesArray = nullptr;
llvm::Value *MapNamesArray = nullptr;
// Fill up the pointer arrays and transfer execution to the device.
auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo,
&MapTypesArray, &MapNamesArray, &CS, RequiresOuterTask,
&CapturedVars,
SizeEmitter](CodeGenFunction &CGF, PrePostActionTy &) {
if (Device.getInt() == OMPC_DEVICE_ancestor) {
// Reverse offloading is not supported, so just execute on the host.
// Generate code for the host fallback function.
auto &&FallbackGen = [this, OutlinedFn, OutlinedFnID, &D, &CapturedVars,
RequiresOuterTask, &CS,
OffloadingMandatory](CodeGenFunction &CGF) {
if (OffloadingMandatory) {
CGF.Builder.CreateUnreachable();
} else {
if (RequiresOuterTask) {
CapturedVars.clear();
CGF.GenerateOpenMPCapturedVars(CS, CapturedVars);
}
emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars);
}
};
// Fill up the pointer arrays and transfer execution to the device.
auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo,
&MapTypesArray, &MapNamesArray, &CS, RequiresOuterTask,
&CapturedVars, SizeEmitter,
FallbackGen](CodeGenFunction &CGF, PrePostActionTy &) {
if (Device.getInt() == OMPC_DEVICE_ancestor) {
// Reverse offloading is not supported, so just execute on the host.
FallbackGen(CGF);
return;
}

Expand Down Expand Up @@ -10494,25 +10522,17 @@ void CGOpenMPRuntime::emitTargetCall(
CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

CGF.EmitBlock(OffloadFailedBlock);
if (RequiresOuterTask) {
CapturedVars.clear();
CGF.GenerateOpenMPCapturedVars(CS, CapturedVars);
}
emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars);
FallbackGen(CGF);

CGF.EmitBranch(OffloadContBlock);

CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true);
};

// Notify that the host version must be executed.
auto &&ElseGen = [this, &D, OutlinedFn, &CS, &CapturedVars,
RequiresOuterTask](CodeGenFunction &CGF,
PrePostActionTy &) {
if (RequiresOuterTask) {
CapturedVars.clear();
CGF.GenerateOpenMPCapturedVars(CS, CapturedVars);
}
emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars);
auto &&ElseGen = [this, &D, OutlinedFn, &CS, &CapturedVars, RequiresOuterTask,
FallbackGen](CodeGenFunction &CGF, PrePostActionTy &) {
FallbackGen(CGF);
};

auto &&TargetThenGen = [this, &ThenGen, &D, &InputInfo, &MapTypesArray,
Expand Down
7 changes: 7 additions & 0 deletions clang/lib/CodeGen/CGStmtOpenMP.cpp
Expand Up @@ -6289,6 +6289,13 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF,
if (CGM.getLangOpts().OMPTargetTriples.empty())
IsOffloadEntry = false;

if (CGM.getLangOpts().OpenMPOffloadMandatory && !IsOffloadEntry) {
unsigned DiagID = CGM.getDiags().getCustomDiagID(
DiagnosticsEngine::Error,
"No offloading entry generated while offloading is mandatory.");
CGM.getDiags().Report(DiagID);
}

assert(CGF.CurFuncDecl && "No parent declaration for target region!");
StringRef ParentName;
// In case we have Ctors/Dtors we use the complete type variant to produce
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Driver/ToolChains/Clang.cpp
Expand Up @@ -5997,6 +5997,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fopenmp-assume-threads-oversubscription");
if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state))
CmdArgs.push_back("-fopenmp-assume-no-thread-state");
if (Args.hasArg(options::OPT_fopenmp_offload_mandatory))
CmdArgs.push_back("-fopenmp-offload-mandatory");
break;
default:
// By default, if Clang doesn't know how to generate useful OpenMP code
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/Sema/SemaOpenMP.cpp
Expand Up @@ -2517,7 +2517,7 @@ void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller,
<< HostDevTy;
return;
}
if (!LangOpts.OpenMPIsDevice && DevTy &&
if (!LangOpts.OpenMPIsDevice && !LangOpts.OpenMPOffloadMandatory && DevTy &&
*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) {
// Diagnose nohost function called during host codegen.
StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName(
Expand Down
90 changes: 90 additions & 0 deletions clang/test/OpenMP/target_offload_mandatory_codegen.cpp
@@ -0,0 +1,90 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+"
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-offload-mandatory -emit-llvm %s -o - | FileCheck %s --check-prefix=MANDATORY
// expected-no-diagnostics

void foo() {} // Declared target without device_type: emitted for both host and device.
#pragma omp declare target(foo)

void bar() {} // device_type(nohost): device-only; calling it from host fallback would be an error.
#pragma omp declare target device_type(nohost) to(bar)

void host() { // Unconditional offload: the failed branch must lower to 'unreachable', not a host fallback call.
#pragma omp target
{ bar(); } // NOTE: same-line comments only — region IDs below encode source line numbers (_l12).
}

void host_if(bool cond) { // Conditional offload: the omp_if.else (host) path must also be 'unreachable'.
#pragma omp target if(cond)
{ bar(); } // NOTE: same-line comments only — region IDs below encode source line numbers (_l17).
}

void host_dev(int device) { // device() clause: device id is sign-extended and passed to __tgt_target_mapper; failure is still 'unreachable'.
#pragma omp target device(device)
{ bar(); } // NOTE: same-line comments only — region IDs below encode source line numbers (_l22).
}
// MANDATORY-LABEL: define {{[^@]+}}@_Z3foov
// MANDATORY-SAME: () #[[ATTR0:[0-9]+]] {
// MANDATORY-NEXT: entry:
// MANDATORY-NEXT: ret void
//
//
// MANDATORY-LABEL: define {{[^@]+}}@_Z4hostv
// MANDATORY-SAME: () #[[ATTR0]] {
// MANDATORY-NEXT: entry:
// MANDATORY-NEXT: [[TMP0:%.*]] = call i32 @__tgt_target_mapper(%struct.ident_t* @[[GLOB1:[0-9]+]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4hostv_l12.region_id, i32 0, i8** null, i8** null, i64* null, i64* null, i8** null, i8** null)
// MANDATORY-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0
// MANDATORY-NEXT: br i1 [[TMP1]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
// MANDATORY: omp_offload.failed:
// MANDATORY-NEXT: unreachable
// MANDATORY: omp_offload.cont:
// MANDATORY-NEXT: ret void
//
//
// MANDATORY-LABEL: define {{[^@]+}}@_Z7host_ifb
// MANDATORY-SAME: (i1 noundef zeroext [[COND:%.*]]) #[[ATTR0]] {
// MANDATORY-NEXT: entry:
// MANDATORY-NEXT: [[COND_ADDR:%.*]] = alloca i8, align 1
// MANDATORY-NEXT: [[FROMBOOL:%.*]] = zext i1 [[COND]] to i8
// MANDATORY-NEXT: store i8 [[FROMBOOL]], i8* [[COND_ADDR]], align 1
// MANDATORY-NEXT: [[TMP0:%.*]] = load i8, i8* [[COND_ADDR]], align 1
// MANDATORY-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1
// MANDATORY-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
// MANDATORY: omp_if.then:
// MANDATORY-NEXT: [[TMP1:%.*]] = call i32 @__tgt_target_mapper(%struct.ident_t* @[[GLOB1]], i64 -1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7host_ifb_l17.region_id, i32 0, i8** null, i8** null, i64* null, i64* null, i8** null, i8** null)
// MANDATORY-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
// MANDATORY-NEXT: br i1 [[TMP2]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
// MANDATORY: omp_offload.failed:
// MANDATORY-NEXT: unreachable
// MANDATORY: omp_offload.cont:
// MANDATORY-NEXT: br label [[OMP_IF_END:%.*]]
// MANDATORY: omp_if.else:
// MANDATORY-NEXT: unreachable
// MANDATORY: omp_if.end:
// MANDATORY-NEXT: ret void
//
//
// MANDATORY-LABEL: define {{[^@]+}}@_Z8host_devi
// MANDATORY-SAME: (i32 noundef signext [[DEVICE:%.*]]) #[[ATTR0]] {
// MANDATORY-NEXT: entry:
// MANDATORY-NEXT: [[DEVICE_ADDR:%.*]] = alloca i32, align 4
// MANDATORY-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
// MANDATORY-NEXT: store i32 [[DEVICE]], i32* [[DEVICE_ADDR]], align 4
// MANDATORY-NEXT: [[TMP0:%.*]] = load i32, i32* [[DEVICE_ADDR]], align 4
// MANDATORY-NEXT: store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
// MANDATORY-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
// MANDATORY-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
// MANDATORY-NEXT: [[TMP3:%.*]] = call i32 @__tgt_target_mapper(%struct.ident_t* @[[GLOB1]], i64 [[TMP2]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8host_devi_l22.region_id, i32 0, i8** null, i8** null, i64* null, i64* null, i8** null, i8** null)
// MANDATORY-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
// MANDATORY-NEXT: br i1 [[TMP4]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
// MANDATORY: omp_offload.failed:
// MANDATORY-NEXT: unreachable
// MANDATORY: omp_offload.cont:
// MANDATORY-NEXT: ret void
//
//
// MANDATORY-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
// MANDATORY-SAME: () #[[ATTR3:[0-9]+]] {
// MANDATORY-NEXT: entry:
// MANDATORY-NEXT: call void @__tgt_register_requires(i64 1)
// MANDATORY-NEXT: ret void
//

0 comments on commit 2b97b16

Please sign in to comment.